[med-svn] [Git][med-team/kallisto][upstream] New upstream version 0.46.1+dfsg
Steffen Möller
gitlab at salsa.debian.org
Fri Nov 8 22:22:39 GMT 2019
Steffen Möller pushed to branch upstream at Debian Med / kallisto
Commits:
fc9b521a by Steffen Moeller at 2019-11-08T22:14:31Z
New upstream version 0.46.1+dfsg
- - - - -
6 changed files:
- .gitignore
- README.md
- src/ProcessReads.cpp
- src/ProcessReads.h
- src/common.h
- src/main.cpp
Changes:
=====================================
.gitignore
=====================================
@@ -18,3 +18,6 @@ CTestTestfile.cmake
debug/
release/
.vscode/
+ext/htslib/src/htslib-stamp
+ext/htslib/tmp/
+.snakemake
=====================================
README.md
=====================================
@@ -5,11 +5,11 @@ RNA-Seq data, or more generally of target sequences using high-throughput
sequencing reads. It is based on the novel idea of _pseudoalignment_ for
rapidly determining the compatibility of reads with targets, without the need
for alignment. On benchmarks with standard RNA-Seq data, __kallisto__ can
-quantify 30 million human reads in less than 3 minutes on a Mac desktop
+quantify 30 million human bulk RNA-seq reads in less than 3 minutes on a Mac desktop
computer using only the read sequences and a transcriptome index that
-itself takes less than 10 minutes to build. Pseudoalignment of reads
+itself takes than 10 minutes to build. Pseudoalignment of reads
preserves the key information needed for quantification, and __kallisto__
-is therefore not only fast, but also as accurate than existing
+is therefore not only fast, but also comparably accurate to other existing
quantification tools. In fact, because the pseudoalignment procedure is
robust to errors in the reads, in many benchmarks __kallisto__
significantly outperforms existing tools. The __kallisto__ algorithms are described in more detail in:
@@ -18,7 +18,9 @@ NL Bray, H Pimentel, P Melsted and L Pachter, [Near optimal probabilistic RNA-se
Scripts reproducing all the results of the paper are available [here](https://github.com/pachterlab/kallisto_paper_analysis).
-__kallisto__ quantified RNA-Seq can be analyzed with [__sleuth__](https://github.com/pachterlab/sleuth/).
+__kallisto__ quantified bulk RNA-Seq can be analyzed with [__sleuth__](https://github.com/pachterlab/sleuth/).
+
+__kallisto__ can be used together with [__bustools__](https://bustools.github.io/) to pre-process single-cell RNA-seq data. See the [kallistobus.tools](https://www.kallistobus.tools/) website for instructions.
## Manual
@@ -26,24 +28,15 @@ Please visit http://pachterlab.github.io/kallisto/manual.html for the manual.
## License
-Please read the license before using kallisto. The license is distributed with __kallisto__ in the file `license.txt` also viewable [here](https://pachterlab.github.io/kallisto/download).
-
-## Announcements
-
-There is a low traffic Google Group,
-[kallisto-sleuth-announcements](https://groups.google.com/d/forum/kallisto-sleuth-announcements)
-where we make announcements about new releases. This is a read-only mailing
-list.
+__kallisto__ is distributed under the BSD-2 license. The license is distributed with __kallisto__ in the file `license.txt`, which is also viewable [here](https://pachterlab.github.io/kallisto/download). Please read the license before using __kallisto__.
## Getting help
-For help running __kallisto__, please post to the [kallisto-sleuth-users
-Google Group](https://groups.google.com/d/forum/kallisto-sleuth-users).
+For help running __kallisto__, please post to the [kallisto-and-applications Google Group](https://groups.google.com/forum/#!forum/kallisto-and-applications).
## Reporting bugs
-Please report bugs to the [Github issues
-page](https://github.com/pachterlab/kallisto/issues)
+Please report bugs to the [Github issues page](https://github.com/pachterlab/kallisto/issues)
## Development and pull requests
=====================================
src/ProcessReads.cpp
=====================================
@@ -256,7 +256,7 @@ int64_t ProcessBUSReads(MasterProcessor& MP, const ProgramOptions& opt) {
if (nummapped == 0) {
std::cerr << "[~warn] no reads pseudoaligned." << std::endl;
}
-
+
return numreads;
}
@@ -565,7 +565,7 @@ void MasterProcessor::processAln(const EMAlgorithm& em, bool useEM = true) {
assert(opt.pseudobam);
pseudobatchf_in.open(opt.output + "/pseudoaln.bin", std::ios::in | std::ios::binary);
- SR.reset();
+ SR->reset();
std::vector<std::thread> workers;
for (int i = 0; i < opt.threads; i++) {
@@ -893,6 +893,7 @@ ReadProcessor::ReadProcessor(ReadProcessor && o) :
seqs(std::move(o.seqs)),
names(std::move(o.names)),
quals(std::move(o.quals)),
+ flags(std::move(o.flags)),
umis(std::move(o.umis)),
newEcs(std::move(o.newEcs)),
flens(std::move(o.flens)),
@@ -919,16 +920,16 @@ void ReadProcessor::operator()() {
if (batchSR.empty()) {
return;
} else {
- batchSR.fetchSequences(buffer, bufsize, seqs, names, quals, umis, readbatch_id, mp.opt.pseudobam );
+ batchSR.fetchSequences(buffer, bufsize, seqs, names, quals, flags, umis, readbatch_id, mp.opt.pseudobam );
}
} else {
std::lock_guard<std::mutex> lock(mp.reader_lock);
- if (mp.SR.empty()) {
+ if (mp.SR->empty()) {
// nothing to do
return;
} else {
// get new sequences
- mp.SR.fetchSequences(buffer, bufsize, seqs, names, quals, umis, readbatch_id, mp.opt.pseudobam || mp.opt.fusion);
+ mp.SR->fetchSequences(buffer, bufsize, seqs, names, quals, flags, umis, readbatch_id, mp.opt.pseudobam || mp.opt.fusion);
}
// release the reader lock
}
@@ -1222,7 +1223,7 @@ void ReadProcessor::clear() {
BUSProcessor::BUSProcessor(const KmerIndex& index, const ProgramOptions& opt, const MinCollector& tc, MasterProcessor& mp, int _id, int _local_id) :
- paired(!opt.single_end), tc(tc), index(index), mp(mp), id(_id), local_id(_local_id) {
+ paired(!opt.single_end), bam(opt.bam), num(opt.num), tc(tc), index(index), mp(mp), id(_id), local_id(_local_id) {
// initialize buffer
bufsize = mp.bufsize;
buffer = new char[bufsize];
@@ -1238,6 +1239,8 @@ BUSProcessor::BUSProcessor(const KmerIndex& index, const ProgramOptions& opt, co
BUSProcessor::BUSProcessor(BUSProcessor && o) :
paired(o.paired),
+ bam(o.bam),
+ num(o.num),
tc(o.tc),
index(o.index),
mp(o.mp),
@@ -1248,6 +1251,7 @@ BUSProcessor::BUSProcessor(BUSProcessor && o) :
seqs(std::move(o.seqs)),
names(std::move(o.names)),
quals(std::move(o.quals)),
+ flags(std::move(o.flags)),
newEcs(std::move(o.newEcs)),
flens(std::move(o.flens)),
bias5(std::move(o.bias5)),
@@ -1273,13 +1277,13 @@ void BUSProcessor::operator()() {
// grab the reader lock
{
std::lock_guard<std::mutex> lock(mp.reader_lock);
- if (mp.SR.empty()) {
+ if (mp.SR->empty()) {
// nothing to do
return;
} else {
// get new sequences
std::vector<std::string> umis;
- mp.SR.fetchSequences(buffer, bufsize, seqs, names, quals, umis, readbatch_id, false);
+ mp.SR->fetchSequences(buffer, bufsize, seqs, names, quals, flags, umis, readbatch_id, false);
}
// release the reader lock
}
@@ -1304,7 +1308,7 @@ void BUSProcessor::processBuffer() {
std::vector<std::pair<KmerEntry,int>> v,v2;
std::vector<int> vtmp;
std::vector<int> u;
-
+
u.reserve(1000);
v.reserve(1000);
vtmp.reserve(1000);
@@ -1326,9 +1330,17 @@ void BUSProcessor::processBuffer() {
// actually process the sequences
const BUSOptions& busopt = mp.opt.busOptions;
- int incf = busopt.nfiles-1;
+ int incf, jmax;
+ if (bam) {
+ incf = 1;
+ jmax = 2;
+ } else {
+ incf = busopt.nfiles-1;
+ jmax = busopt.nfiles;
+ }
+ //int incf = (bam) ? 1 : busopt.nfiles-1;
for (int i = 0; i + incf < seqs.size(); i++) {
- for (int j = 0; j < busopt.nfiles; j++) {
+ for (int j = 0; j < jmax /*(bam) ? 2 : busopt.nfiles*/; j++) {
s[j] = seqs[i+j].first;
l[j] = seqs[i+j].second;
}
@@ -1350,7 +1362,7 @@ void BUSProcessor::processBuffer() {
}
- auto &bcc = busopt.bc[0];
+// auto &bcc = busopt.bc[0];
int blen = 0;
bool bad_bc = false;
for (auto &bcc : busopt.bc) {
@@ -1401,6 +1413,9 @@ void BUSProcessor::processBuffer() {
b.flags |= (f) << 8;
b.count = 1;
//std::cout << std::string(s1,10) << "\t" << b.barcode << "\t" << std::string(s1+10,16) << "\t" << b.UMI << "\n";
+ if (num) {
+ b.flags = (uint32_t) flags[i / jmax];
+ }
//ec = tc.findEC(u);
@@ -1438,8 +1453,7 @@ AlnProcessor::AlnProcessor(const KmerIndex& index, const ProgramOptions& opt, Ma
buffer = new char[bufsize];
bambufsize = 1<<20;
bambuffer = new char[bambufsize]; // refactor this?
-
-
+
if (opt.batch_mode) {
/* need to check this later */
assert(id != -1);
@@ -1475,6 +1489,7 @@ AlnProcessor::AlnProcessor(AlnProcessor && o) :
seqs(std::move(o.seqs)),
names(std::move(o.names)),
quals(std::move(o.quals)),
+ flags(std::move(o.flags)),
umis(std::move(o.umis)),
batchSR(std::move(o.batchSR)) {
buffer = o.buffer;
@@ -1515,19 +1530,19 @@ void AlnProcessor::operator()() {
if (batchSR.empty()) {
return;
} else {
- batchSR.fetchSequences(buffer, bufsize, seqs, names, quals, umis, readbatch_id, true );
+ batchSR.fetchSequences(buffer, bufsize, seqs, names, quals, flags, umis, readbatch_id, true );
readPseudoAlignmentBatch(mp.pseudobatchf_in, pseudobatch);
assert(pseudobatch.batch_id == readbatch_id);
assert(pseudobatch.aln.size() == ((paired) ? seqs.size()/2 : seqs.size())); // sanity checks
}
} else {
std::lock_guard<std::mutex> lock(mp.reader_lock);
- if (mp.SR.empty()) {
+ if (mp.SR->empty()) {
// nothing to do
return;
} else {
// get new sequences
- mp.SR.fetchSequences(buffer, bufsize, seqs, names, quals, umis, readbatch_id, true);
+ mp.SR->fetchSequences(buffer, bufsize, seqs, names, quals, flags, umis, readbatch_id, true);
readPseudoAlignmentBatch(mp.pseudobatchf_in, pseudobatch);
assert(pseudobatch.batch_id == readbatch_id);
assert(pseudobatch.aln.size() == ((paired) ? seqs.size()/2 : seqs.size())); // sanity checks
@@ -2557,9 +2572,14 @@ int fillBamRecord(bam1_t &b, uint8_t* buf, const char *seq, const char *name, co
}
-/** -- sequence reader -- **/
+/** -- sequence readers -- **/
+
+void SequenceReader::reset() {
+ state = false;
+ readbatch_id = -1;
+}
-SequenceReader::~SequenceReader() {
+FastqSequenceReader::~FastqSequenceReader() {
for (auto &f : fp) {
if (f) {
gzclose(f);
@@ -2578,26 +2598,20 @@ SequenceReader::~SequenceReader() {
}
-void SequenceReader::reserveNfiles(int n) {
- fp.resize(nfiles);
- seq.resize(nfiles, nullptr);
- l.resize(nfiles, 0);
- nl.resize(nfiles, 0);
+bool FastqSequenceReader::empty() {
+ return (!state && current_file >= files.size());
}
-void SequenceReader::reset() {
- for (auto &f : fp) {
+void FastqSequenceReader::reset() {
+ SequenceReader::reset();
+
+ for (auto &f : fp) {
if (f) {
gzclose(f);
}
f = nullptr;
}
- for (auto &s : seq) {
- kseq_destroy(s);
- s = nullptr;
- }
-
if (f_umi && f_umi->is_open()) {
f_umi->close();
}
@@ -2612,15 +2626,24 @@ void SequenceReader::reset() {
}
current_file = 0;
- state = false;
- readbatch_id = -1;
+ for (auto &s : seq) {
+ kseq_destroy(s);
+ s = nullptr;
+ }
}
+void FastqSequenceReader::reserveNfiles(int n) {
+ fp.resize(nfiles);
+ l.resize(nfiles, 0);
+ nl.resize(nfiles, 0);
+ seq.resize(nfiles, nullptr);
+}
// returns true if there is more left to read from the files
-bool SequenceReader::fetchSequences(char *buf, const int limit, std::vector<std::pair<const char *, int> > &seqs,
+bool FastqSequenceReader::fetchSequences(char *buf, const int limit, std::vector<std::pair<const char *, int> > &seqs,
std::vector<std::pair<const char *, int> > &names,
std::vector<std::pair<const char *, int> > &quals,
+ std::vector<uint32_t>& flags,
std::vector<std::string> &umis, int& read_id,
bool full) {
@@ -2634,6 +2657,7 @@ bool SequenceReader::fetchSequences(char *buf, const int limit, std::vector<std:
names.clear();
quals.clear();
}
+ flags.clear();
bool usingUMIfiles = !umi_files.empty();
int umis_read = 0;
@@ -2678,7 +2702,7 @@ bool SequenceReader::fetchSequences(char *buf, const int limit, std::vector<std:
bool all_l = true;
int bufadd = nfiles;
for (int i = 0; i < nfiles; i++) {
- all_l = all_l && l[i] > 0;
+ all_l = all_l && l[i] >= 0;
bufadd += l[i]; // includes seq
}
if (all_l) {
@@ -2718,6 +2742,8 @@ bool SequenceReader::fetchSequences(char *buf, const int limit, std::vector<std:
ss >> umi;
umis.emplace_back(std::move(umi));
}
+
+ flags.push_back(numreads++);
} else {
return true; // read it next time
}
@@ -2732,27 +2758,146 @@ bool SequenceReader::fetchSequences(char *buf, const int limit, std::vector<std:
}
}
-bool SequenceReader::empty() {
- return (!state && current_file >= files.size());
-}
-
// move constructor
-SequenceReader::SequenceReader(SequenceReader&& o) :
+#if 1
+FastqSequenceReader::FastqSequenceReader(FastqSequenceReader&& o) :
+ nfiles(o.nfiles),
+ numreads(o.numreads),
fp(std::move(o.fp)),
- seq(std::move(o.seq)),
l(std::move(o.l)),
nl(std::move(o.nl)),
paired(o.paired),
- nfiles(o.nfiles),
files(std::move(o.files)),
umi_files(std::move(o.umi_files)),
f_umi(std::move(o.f_umi)),
- current_file(o.current_file),
- state(o.state) {
+ current_file(o.current_file) {
+
+ o.fp.resize(nfiles);
+ o.l.resize(nfiles, 0);
+ o.nl.resize(nfiles, 0);
+ o.state = false;
+}
+#endif
+#if 0
+FastqSequenceReader::FastqSequenceReader(FastqSequenceReader&& o) {
+ fp = (std::move(o.fp));
+ l = (std::move(o.l));
+ nl = (std::move(o.nl));
+ paired = (o.paired);
+ nfiles = (o.nfiles);
+ files = (std::move(o.files)),
+ umi_files = (std::move(o.umi_files));
+ f_umi = (std::move(o.f_umi));
+ current_file= (o.current_file);
+ seq = (std::move(o.seq));
o.fp.resize(nfiles);
- o.seq.resize(nfiles, nullptr);
o.l.resize(nfiles, 0);
o.nl.resize(nfiles, 0);
o.state = false;
+ o.seq.resize(nfiles, nullptr);
+}
+#endif
+const std::string BamSequenceReader::seq_enc = "=ACMGRSVTWYHKDBN";
+
+BamSequenceReader::~BamSequenceReader() {
+ if (fp) {
+ bgzf_close(fp);
+ }
+ if (head) {
+ bam_hdr_destroy(head);
+ }
+ if (rec) {
+ bam_destroy1(rec);
+ }
+}
+
+bool BamSequenceReader::empty() {
+ return !state;
+}
+
+void BamSequenceReader::reset() {
+ SequenceReader::reset();
+}
+
+void BamSequenceReader::reserveNfiles(int n) {
+}
+
+// returns true if there is more left to read from the files
+bool BamSequenceReader::fetchSequences(char *buf, const int limit, std::vector<std::pair<const char *, int> > &seqs,
+ std::vector<std::pair<const char *, int> > &names,
+ std::vector<std::pair<const char *, int> > &quals,
+ std::vector<uint32_t>& flags,
+ std::vector<std::string> &umis, int& read_id,
+ bool full) {
+
+ readbatch_id += 1; // increase the batch id
+ read_id = readbatch_id; // copy now because we are inside a lock
+ seqs.clear();
+ flags.clear();
+
+ int bufpos = 0;
+ while (true) {
+ if (!state) { // should we open a file
+ return false;
+ }
+
+ // the file is open and we have read into seq1 and seq2
+ if (err >= 0) {
+ if (!(rec->core.flag & BAM_FSECONDARY)) { // record is primary alignment
+ int bufadd = l_seq + l_bc + l_umi + 2;
+ // fits into the buffer
+ if (bufpos+bufadd < limit) {
+ char *pi;
+
+ pi = buf + bufpos;
+ memcpy(pi, bc, l_bc);
+ memcpy(pi + l_bc, umi, l_umi + 1);
+ seqs.emplace_back(pi, l_bc + l_umi);
+ bufpos += l_bc + l_umi + 1;
+
+ pi = buf + bufpos;
+ int len = (l_seq + 1) / 2;
+ if (l_seq % 2) --len;
+ int j = 0;
+ for (int i = 0; i < len; ++i, ++eseq) {
+ buf[bufpos++] = seq_enc[*eseq >> 4];
+ buf[bufpos++] = seq_enc[*eseq & 0x0F];
+ }
+ if (l_seq % 2) {
+ buf[bufpos++] = seq_enc[*eseq >> 4];
+ }
+ buf[bufpos++] = '\0';
+ seqs.emplace_back(pi, l_seq);
+
+ } else {
+ return true; // read it next time
+ }
+ }
+
+ // read for the next one
+ err = bam_read1(fp, rec);
+ if (err < 0) {
+ return true;
+ }
+ eseq = bam_get_seq(rec);
+ l_seq = rec->core.l_qseq;
+
+ bc = bam_aux2Z(bam_aux_get(rec, "CR"));
+ l_bc = 0;
+ for (char *pbc = bc; *pbc != '\0'; ++pbc) {
+ ++l_bc;
+ }
+
+ umi = bam_aux2Z(bam_aux_get(rec, "UR"));
+ l_umi = 0;
+ for (char *pumi = umi; *pumi != '\0'; ++pumi) {
+ ++l_umi;
+ }
+
+ } else {
+ state = false; // haven't opened file yet
+ }
+ }
}
+
=====================================
src/ProcessReads.h
=====================================
@@ -23,6 +23,7 @@
#include "GeneModel.h"
#include "BUSData.h"
#include "BUSTools.h"
+#include <htslib/sam.h>
#ifndef KSEQ_INIT_READY
@@ -41,9 +42,35 @@ class SequenceReader {
public:
SequenceReader(const ProgramOptions& opt) :
- paired(!opt.single_end), files(opt.files),
- f_umi(new std::ifstream{}),
- current_file(0), state(false), readbatch_id(-1) {
+ readbatch_id(-1) {};
+ SequenceReader() : state(false), readbatch_id(-1) {};
+ virtual ~SequenceReader() {}
+// SequenceReader(SequenceReader&& o);
+
+ virtual bool empty() = 0;
+ virtual void reset();
+ virtual void reserveNfiles(int n) = 0;
+ virtual bool fetchSequences(char *buf, const int limit, std::vector<std::pair<const char*, int>>& seqs,
+ std::vector<std::pair<const char*, int>>& names,
+ std::vector<std::pair<const char*, int>>& quals,
+ std::vector<uint32_t>& flags,
+ std::vector<std::string>& umis, int &readbatch_id,
+ bool full=false) = 0;
+
+
+public:
+ bool state; // is the file open
+ int readbatch_id = -1;
+};
+
+class FastqSequenceReader : public SequenceReader {
+public:
+
+ FastqSequenceReader(const ProgramOptions& opt) : SequenceReader(opt),
+ current_file(0), paired(!opt.single_end), files(opt.files),
+ f_umi(new std::ifstream{}) {
+ SequenceReader::state = false;
+
if (opt.bus_mode) {
nfiles = opt.busOptions.nfiles;
} else {
@@ -51,46 +78,106 @@ public:
}
reserveNfiles(nfiles);
}
- SequenceReader() :
+ FastqSequenceReader() : SequenceReader(),
paired(false),
f_umi(new std::ifstream{}),
- current_file(0), state(false), readbatch_id(-1) {}
- SequenceReader(SequenceReader&& o);
-
- bool empty();
+ current_file(0) {};
+ FastqSequenceReader(FastqSequenceReader &&o);
+ ~FastqSequenceReader();
+
+ bool empty();
void reset();
void reserveNfiles(int n);
- ~SequenceReader();
-
bool fetchSequences(char *buf, const int limit, std::vector<std::pair<const char*, int>>& seqs,
std::vector<std::pair<const char*, int>>& names,
std::vector<std::pair<const char*, int>>& quals,
+ std::vector<uint32_t>& flags,
std::vector<std::string>& umis, int &readbatch_id,
bool full=false);
public:
int nfiles = 1;
+ uint32_t numreads = 0;
std::vector<gzFile> fp;
- //gzFile fp1 = 0, fp2 = 0;
- std::vector<kseq_t*> seq;
std::vector<int> l;
std::vector<int> nl;
- //kseq_t *seq1 = 0, *seq2 = 0;
- //int l1,l2,nl1,nl2;
bool paired;
std::vector<std::string> files;
std::vector<std::string> umi_files;
std::unique_ptr<std::ifstream> f_umi;
int current_file;
- bool state; // is the file open
- int readbatch_id = -1;
+ std::vector<kseq_t*> seq;
+};
+
+class BamSequenceReader : public SequenceReader {
+public:
+
+ BamSequenceReader(const ProgramOptions& opt) :
+ SequenceReader(opt) {
+ SequenceReader::state = true;
+
+ fp = bgzf_open(opt.files[0].c_str(), "rb");
+ head = bam_hdr_read(fp);
+ rec = bam_init1();
+
+ err = bam_read1(fp, rec);
+ eseq = bam_get_seq(rec);
+ l_seq = rec->core.l_qseq;
+
+ bc = bam_aux2Z(bam_aux_get(rec, "CR"));
+ l_bc = 0;
+ for (char *pbc = bc; *pbc != '\0'; ++pbc) {
+ ++l_bc;
+ }
+
+ umi = bam_aux2Z(bam_aux_get(rec, "UR"));
+ l_umi = 0;
+ for (char *pumi = umi; *pumi != '\0'; ++pumi) {
+ ++l_umi;
+ }
+ }
+ BamSequenceReader() : SequenceReader() {};
+ BamSequenceReader(BamSequenceReader &&o);
+ ~BamSequenceReader();
+
+ bool empty();
+ void reset();
+ void reserveNfiles(int n);
+ bool fetchSequences(char *buf, const int limit, std::vector<std::pair<const char*, int>>& seqs,
+ std::vector<std::pair<const char*, int>>& names,
+ std::vector<std::pair<const char*, int>>& quals,
+ std::vector<uint32_t>& flags,
+ std::vector<std::string>& umis, int &readbatch_id,
+ bool full=false);
+
+public:
+ BGZF *fp;
+ bam_hdr_t *head;
+ bam1_t *rec;
+ uint8_t *eseq;
+ int32_t l_seq;
+ char *bc;
+ int l_bc;
+ char *umi;
+ int l_umi;
+ int err;
+ char seq[256]; // Is there a better limit?
+
+private:
+ static const std::string seq_enc;
};
class MasterProcessor {
public:
MasterProcessor (KmerIndex &index, const ProgramOptions& opt, MinCollector &tc, const Transcriptome& model)
- : tc(tc), index(index), model(model), bamfp(nullptr), bamfps(nullptr), bamh(nullptr), opt(opt), SR(opt), numreads(0)
+ : tc(tc), index(index), model(model), bamfp(nullptr), bamfps(nullptr), bamh(nullptr), opt(opt), numreads(0)
,nummapped(0), num_umi(0), bufsize(1ULL<<23), tlencount(0), biasCount(0), maxBiasCount((opt.bias) ? 1000000 : 0), last_pseudobatch_id (-1) {
+ if (opt.bam) {
+ SR = new BamSequenceReader(opt);
+ } else {
+ SR = new FastqSequenceReader(opt);
+ }
+
if (opt.batch_mode) {
memset(&bus_bc_len[0],0,33);
memset(&bus_umi_len[0],0,33);
@@ -136,13 +223,14 @@ public:
bam_hdr_destroy(bamh);
bamh = nullptr;
}
+ delete SR;
}
std::mutex reader_lock;
std::mutex writer_lock;
- SequenceReader SR;
+ SequenceReader *SR;
MinCollector& tc;
KmerIndex& index;
const Transcriptome& model;
@@ -205,7 +293,7 @@ public:
std::vector<std::pair<std::vector<int>, std::string>> new_ec_umi;
const KmerIndex& index;
MasterProcessor& mp;
- SequenceReader batchSR;
+ FastqSequenceReader batchSR;
int64_t numreads;
int id;
int local_id;
@@ -215,6 +303,7 @@ public:
std::vector<std::pair<const char*, int>> seqs;
std::vector<std::pair<const char*, int>> names;
std::vector<std::pair<const char*, int>> quals;
+ std::vector<uint32_t> flags;
std::vector<std::string> umis;
std::vector<std::vector<int>> newEcs;
std::vector<int> flens;
@@ -237,6 +326,8 @@ public:
size_t bufsize;
bool paired;
+ bool bam;
+ bool num;
const MinCollector& tc;
const KmerIndex& index;
MasterProcessor& mp;
@@ -251,6 +342,7 @@ public:
std::vector<std::pair<const char*, int>> seqs;
std::vector<std::pair<const char*, int>> names;
std::vector<std::pair<const char*, int>> quals;
+ std::vector<uint32_t> flags;
std::vector<std::vector<int>> newEcs;
std::vector<int> flens;
@@ -279,7 +371,7 @@ public:
const KmerIndex& index;
const EMAlgorithm& em;
MasterProcessor& mp;
- SequenceReader batchSR;
+ FastqSequenceReader batchSR;
int64_t numreads;
int id;
PseudoAlignmentBatch pseudobatch;
@@ -291,6 +383,7 @@ public:
std::vector<std::pair<const char*, int>> seqs;
std::vector<std::pair<const char*, int>> names;
std::vector<std::pair<const char*, int>> quals;
+ std::vector<uint32_t> flags;
std::vector<std::string> umis;
void operator()();
=====================================
src/common.h
=====================================
@@ -1,11 +1,11 @@
#ifndef KALLISTO_COMMON_H
#define KALLISTO_COMMON_H
-#define KALLISTO_VERSION "0.46.0"
+#define KALLISTO_VERSION "0.46.1"
#include <string>
#include <vector>
-
+#include <iostream>
#ifdef _WIN64
typedef unsigned int uint;
@@ -67,6 +67,8 @@ struct ProgramOptions {
bool bus_mode;
BUSOptions busOptions;
bool pseudo_quant;
+ bool bam;
+ bool num;
std::string batch_file_name;
std::vector<std::vector<std::string>> batch_files;
std::vector<std::string> batch_ids;
@@ -107,6 +109,8 @@ ProgramOptions() :
batch_mode(false),
bus_mode(false),
pseudo_quant(false),
+ bam(false),
+ num(false),
plaintext(false),
write_index(false),
single_end(false),
=====================================
src/main.cpp
=====================================
@@ -533,20 +533,24 @@ void ListSingleCellTechnologies() {
<< "CELSeq CEL-Seq" << endl
<< "CELSeq2 CEL-Seq version 2" << endl
<< "DropSeq DropSeq" << endl
- << "inDrops inDrops" << endl
+ << "inDropsv1 inDrops version 1 chemistry" << endl
+ << "inDropsv2 inDrops version 2 chemistry" << endl
+ << "inDropsv3 inDrops version 3 chemistry" << endl
<< "SCRBSeq SCRB-Seq" << endl
<< "SureCell SureCell for ddSEQ" << endl
<< endl;
}
void ParseOptionsBus(int argc, char **argv, ProgramOptions& opt) {
- const char *opt_string = "i:o:x:t:l";
+ const char *opt_string = "i:o:x:t:lbn";
static struct option long_options[] = {
{"index", required_argument, 0, 'i'},
{"output-dir", required_argument, 0, 'o'},
{"technology", required_argument, 0, 'x'},
{"list", no_argument, 0, 'l'},
{"threads", required_argument, 0, 't'},
+ {"bam", no_argument, 0, 'b'},
+ {"num", no_argument, 0, 'n'},
{0,0,0,0}
};
@@ -584,6 +588,14 @@ void ParseOptionsBus(int argc, char **argv, ProgramOptions& opt) {
stringstream(optarg) >> opt.threads;
break;
}
+ case 'b': {
+ opt.bam = true;
+ break;
+ }
+ case 'n': {
+ opt.num = true;
+ break;
+ }
default: break;
}
}
@@ -812,84 +824,183 @@ bool CheckOptionsBus(ProgramOptions& opt) {
ret = false;
} else {
auto& busopt = opt.busOptions;
-
- if (opt.technology == "10XV2") {
- busopt.nfiles = 2;
- busopt.seq = BUSOptionSubstr(1,0,0); // second file, entire string
- busopt.umi = BUSOptionSubstr(0,16,26); // first file [16:26]
- busopt.bc.push_back(BUSOptionSubstr(0,0,16));
- } else if (opt.technology == "10XV3") {
- busopt.nfiles = 2;
- busopt.seq = BUSOptionSubstr(1,0,0);
- busopt.umi = BUSOptionSubstr(0,16,28);
- busopt.bc.push_back(BUSOptionSubstr(0,0,16));
- } else if (opt.technology == "10XV1") {
- busopt.nfiles = 3;
- busopt.seq = BUSOptionSubstr(0,0,0);
- busopt.umi = BUSOptionSubstr(1,0,0);
- busopt.bc.push_back(BUSOptionSubstr(2,0,0));
- } else if (opt.technology == "SURECELL") {
- busopt.nfiles = 2;
- busopt.seq = BUSOptionSubstr(1,0,0);
- busopt.umi = BUSOptionSubstr(0,51,59);
- busopt.bc.push_back(BUSOptionSubstr(0,0,6));
- busopt.bc.push_back(BUSOptionSubstr(0,21,27));
- busopt.bc.push_back(BUSOptionSubstr(0,42,48));
- } else if (opt.technology == "DROPSEQ") {
- busopt.nfiles = 2;
- busopt.seq = BUSOptionSubstr(1,0,0);
- busopt.umi = BUSOptionSubstr(0,12,20);
- busopt.bc.push_back(BUSOptionSubstr(0,0,12));
- } else if (opt.technology == "INDROPS") {
- busopt.nfiles = 2;
- busopt.seq = BUSOptionSubstr(1,0,0);
- busopt.umi = BUSOptionSubstr(0,42,48);
- busopt.bc.push_back(BUSOptionSubstr(0,0,11));
- busopt.bc.push_back(BUSOptionSubstr(0,30,38));
- } else if (opt.technology == "CELSEQ") {
- busopt.nfiles = 2;
- busopt.seq = BUSOptionSubstr(1,0,0);
- busopt.umi = BUSOptionSubstr(0,8,12);
- busopt.bc.push_back(BUSOptionSubstr(0,0,8));
- } else if (opt.technology == "CELSEQ2") {
- busopt.nfiles = 2;
- busopt.seq = BUSOptionSubstr(1,0,0);
- busopt.umi = BUSOptionSubstr(0,0,6);
- busopt.bc.push_back(BUSOptionSubstr(0,6,12));
- } else if (opt.technology == "SCRBSEQ") {
- busopt.nfiles = 2;
- busopt.seq = BUSOptionSubstr(1,0,0);
- busopt.umi = BUSOptionSubstr(0,6,16);
- busopt.bc.push_back(BUSOptionSubstr(0,0,6));
- } else {
- vector<int> files;
- vector<BUSOptionSubstr> values;
- vector<BUSOptionSubstr> bcValues;
- vector<std::string> errorList;
- bool invalid = ParseTechnology(opt.technology, values, files, errorList, bcValues);
- if(!invalid) {
- busopt.nfiles = files.size();
- for(int i = 0; i < bcValues.size(); i++) {
- busopt.bc.push_back(bcValues[i]);
+
+ if (opt.bam) { // Note: only 10xV2 has been tested
+ busopt.nfiles = 1;
+ if (opt.technology == "10XV2") {
+ busopt.seq = BUSOptionSubstr(1,0,0); // second file, entire string
+ busopt.umi = BUSOptionSubstr(0,16,26); // first file [16:26]
+ busopt.bc.push_back(BUSOptionSubstr(0,0,16));
+ } else if (opt.technology == "10XV3") {
+ busopt.seq = BUSOptionSubstr(1,0,0);
+ busopt.umi = BUSOptionSubstr(0,16,28);
+ busopt.bc.push_back(BUSOptionSubstr(0,0,16));
+// } else if (opt.technology == "10XV1") {
+
+ } else if (opt.technology == "SURECELL") {
+ busopt.seq = BUSOptionSubstr(1,0,0);
+ busopt.umi = BUSOptionSubstr(0,18,26);
+ busopt.bc.push_back(BUSOptionSubstr(0,0,18));
+ } else if (opt.technology == "DROPSEQ") {
+ busopt.seq = BUSOptionSubstr(1,0,0);
+ busopt.umi = BUSOptionSubstr(0,12,20);
+ busopt.bc.push_back(BUSOptionSubstr(0,0,12));
+ } else if (opt.technology == "INDROPSV1") {
+ busopt.nfiles = 2;
+ busopt.seq = BUSOptionSubstr(1,0,0);
+ busopt.umi = BUSOptionSubstr(0,42,48);
+ busopt.bc.push_back(BUSOptionSubstr(0,0,11));
+ busopt.bc.push_back(BUSOptionSubstr(0,30,38));
+ } else if (opt.technology == "INDROPSV2") {
+ busopt.nfiles = 2;
+ busopt.seq = BUSOptionSubstr(0,0,0);
+ busopt.umi = BUSOptionSubstr(1,42,48);
+ busopt.bc.push_back(BUSOptionSubstr(1,0,11));
+ busopt.bc.push_back(BUSOptionSubstr(1,30,38));
+ } else if (opt.technology == "INDROPSV3") {
+ busopt.nfiles = 3;
+ busopt.seq = BUSOptionSubstr(2,0,0);
+ busopt.umi = BUSOptionSubstr(1,8,14);
+ busopt.bc.push_back(BUSOptionSubstr(0,0,8));
+ busopt.bc.push_back(BUSOptionSubstr(1,0,8));
+ } else if (opt.technology == "CELSEQ") {
+ busopt.seq = BUSOptionSubstr(1,0,0);
+ busopt.umi = BUSOptionSubstr(0,8,12);
+ busopt.bc.push_back(BUSOptionSubstr(0,0,8));
+ } else if (opt.technology == "CELSEQ2") {
+ busopt.seq = BUSOptionSubstr(1,0,0);
+ busopt.umi = BUSOptionSubstr(0,0,6);
+ busopt.bc.push_back(BUSOptionSubstr(0,6,12));
+ } else if (opt.technology == "SCRBSEQ") {
+ busopt.seq = BUSOptionSubstr(1,0,0);
+ busopt.umi = BUSOptionSubstr(0,6,16);
+ busopt.bc.push_back(BUSOptionSubstr(0,0,6));
+ } else if (opt.technology == "INDROPSV3") {
+ busopt.seq = BUSOptionSubstr(1,0,0);
+ busopt.umi = BUSOptionSubstr(0,0,6);
+ busopt.bc.push_back(BUSOptionSubstr(0,6,16));
+ } else {
+ vector<int> files;
+ vector<BUSOptionSubstr> values;
+ vector<BUSOptionSubstr> bcValues;
+ vector<std::string> errorList;
+ bool invalid = ParseTechnology(opt.technology, values, files, errorList, bcValues);
+ if(!invalid) {
+ busopt.nfiles = files.size();
+ for(int i = 0; i < bcValues.size(); i++) {
+ busopt.bc.push_back(bcValues[i]);
+ }
+ busopt.umi = values[0];
+ busopt.seq = values[1];
+ } else {
+ for(int j = 0; j < errorList.size(); j++) {
+ cerr << errorList[j] << endl;
+ }
+ cerr << "Unable to create technology: " << opt.technology << endl;
+ ret = false;
}
- busopt.umi = values[0];
- busopt.seq = values[1];
+ }
+ } else {
+ if (opt.technology == "10XV2") {
+ busopt.nfiles = 2;
+ busopt.seq = BUSOptionSubstr(1,0,0); // second file, entire string
+ busopt.umi = BUSOptionSubstr(0,16,26); // first file [16:26]
+ busopt.bc.push_back(BUSOptionSubstr(0,0,16));
+ } else if (opt.technology == "10XV3") {
+ busopt.nfiles = 2;
+ busopt.seq = BUSOptionSubstr(1,0,0);
+ busopt.umi = BUSOptionSubstr(0,16,28);
+ busopt.bc.push_back(BUSOptionSubstr(0,0,16));
+ } else if (opt.technology == "10XV1") {
+ busopt.nfiles = 3;
+ busopt.seq = BUSOptionSubstr(0,0,0);
+ busopt.umi = BUSOptionSubstr(1,0,0);
+ busopt.bc.push_back(BUSOptionSubstr(2,0,0));
+ } else if (opt.technology == "SURECELL") {
+ busopt.nfiles = 2;
+ busopt.seq = BUSOptionSubstr(1,0,0);
+ busopt.umi = BUSOptionSubstr(0,51,59);
+ busopt.bc.push_back(BUSOptionSubstr(0,0,6));
+ busopt.bc.push_back(BUSOptionSubstr(0,21,27));
+ busopt.bc.push_back(BUSOptionSubstr(0,42,48));
+ } else if (opt.technology == "DROPSEQ") {
+ busopt.nfiles = 2;
+ busopt.seq = BUSOptionSubstr(1,0,0);
+ busopt.umi = BUSOptionSubstr(0,12,20);
+ busopt.bc.push_back(BUSOptionSubstr(0,0,12));
+ } else if (opt.technology == "INDROPSV1") {
+ busopt.nfiles = 2;
+ busopt.seq = BUSOptionSubstr(1,0,0);
+ busopt.umi = BUSOptionSubstr(0,42,48);
+ busopt.bc.push_back(BUSOptionSubstr(0,0,11));
+ busopt.bc.push_back(BUSOptionSubstr(0,30,38));
+ } else if (opt.technology == "INDROPSV2") {
+ busopt.nfiles = 2;
+ busopt.seq = BUSOptionSubstr(0,0,0);
+ busopt.umi = BUSOptionSubstr(1,42,48);
+ busopt.bc.push_back(BUSOptionSubstr(1,0,11));
+ busopt.bc.push_back(BUSOptionSubstr(1,30,38));
+ } else if (opt.technology == "INDROPSV3") {
+ busopt.nfiles = 3;
+ busopt.seq = BUSOptionSubstr(2,0,0);
+ busopt.umi = BUSOptionSubstr(1,8,14);
+ busopt.bc.push_back(BUSOptionSubstr(0,0,8));
+ busopt.bc.push_back(BUSOptionSubstr(1,0,8));
+ } else if (opt.technology == "CELSEQ") {
+ busopt.nfiles = 2;
+ busopt.seq = BUSOptionSubstr(1,0,0);
+ busopt.umi = BUSOptionSubstr(0,8,12);
+ busopt.bc.push_back(BUSOptionSubstr(0,0,8));
+ } else if (opt.technology == "CELSEQ2") {
+ busopt.nfiles = 2;
+ busopt.seq = BUSOptionSubstr(1,0,0);
+ busopt.umi = BUSOptionSubstr(0,0,6);
+ busopt.bc.push_back(BUSOptionSubstr(0,6,12));
+ } else if (opt.technology == "SCRBSEQ") {
+ busopt.nfiles = 2;
+ busopt.seq = BUSOptionSubstr(1,0,0);
+ busopt.umi = BUSOptionSubstr(0,6,16);
+ busopt.bc.push_back(BUSOptionSubstr(0,0,6));
+ } else if (opt.technology == "INDROPSV3") {
+ busopt.nfiles = 3;
+ busopt.seq = BUSOptionSubstr(2,0,0);
+ busopt.umi = BUSOptionSubstr(1,8,14);
+ busopt.bc.push_back(BUSOptionSubstr(0,0,8));
+ busopt.bc.push_back(BUSOptionSubstr(1,0,8));
} else {
- for(int j = 0; j < errorList.size(); j++) {
- cerr << errorList[j] << endl;
+ vector<int> files;
+ vector<BUSOptionSubstr> values;
+ vector<BUSOptionSubstr> bcValues;
+ vector<std::string> errorList;
+ bool invalid = ParseTechnology(opt.technology, values, files, errorList, bcValues);
+ if(!invalid) {
+ busopt.nfiles = files.size();
+ for(int i = 0; i < bcValues.size(); i++) {
+ busopt.bc.push_back(bcValues[i]);
+ }
+ busopt.umi = values[0];
+ busopt.seq = values[1];
+ } else {
+ for(int j = 0; j < errorList.size(); j++) {
+ cerr << errorList[j] << endl;
+ }
+ cerr << "Unable to create technology: " << opt.technology << endl;
+ ret = false;
}
- cerr << "Unable to create technology: " << opt.technology << endl;
- ret = false;
}
}
}
- if (ret && opt.files.size() % opt.busOptions.nfiles != 0) {
+ if (ret && !opt.bam && opt.files.size() % opt.busOptions.nfiles != 0) {
cerr << "Error: Number of files (" << opt.files.size() << ") does not match number of input files required by "
<< "technology " << opt.technology << " (" << opt.busOptions.nfiles << ")" << endl;
ret = false;
}
+ if (opt.bam && opt.num) {
+ cerr << "Warning: --bam option was used, so --num option will be ignored" << endl;
+ }
+
return ret;
}
@@ -1535,7 +1646,9 @@ void usageBus() {
<< "-x, --technology=STRING Single-cell technology used " << endl << endl
<< "Optional arguments:" << endl
<< "-l, --list List all single-cell technologies supported" << endl
- << "-t, --threads=INT Number of threads to use (default: 1)" << endl;
+ << "-t, --threads=INT Number of threads to use (default: 1)" << endl
+ << "-b, --bam Input file is a BAM file" << endl
+ << "-n, --num Output number of read in flag column (incompatible with --bam)" << endl;
}
void usageIndex() {
@@ -1808,6 +1921,7 @@ int main(int argc, char *argv[]) {
for (int i = 0; i < index.num_trans; i++) {
num_unique += collection.counts[i];
}
+ num_pseudoaligned = 0;
for (int i = 0; i < collection.counts.size(); i++) {
num_pseudoaligned += collection.counts[i];
}
@@ -1953,6 +2067,10 @@ int main(int argc, char *argv[]) {
num_pseudoaligned += collection.counts[i];
}
+ if (num_pseudoaligned == 0) {
+ std::cerr << "[~warn] Warning, zero reads pseudoaligned check your input files and index" << std::endl;
+ }
+
plaintext_aux(
opt.output + "/run_info.json",
std::string(std::to_string(index.num_trans)),
@@ -1968,7 +2086,17 @@ int main(int argc, char *argv[]) {
plaintext_writer(opt.output + "/abundance.tsv", em.target_names_,
em.alpha_, em.eff_lens_, index.target_lens_);
- if (opt.bootstrap > 0) {
+ if (opt.bootstrap > 0 && num_pseudoaligned == 0) {
+ // this happens if nothing aligns, then we write an empty bootstrap file
+ for (int b = 0; b < opt.bootstrap; b++) {
+ if (!opt.plaintext) {
+ writer.write_bootstrap(em, b); // em is empty
+ } else {
+ plaintext_writer(opt.output + "/bs_abundance_" + std::to_string(b) + ".tsv",
+ em.target_names_, em.alpha_, em.eff_lens_, index.target_lens_); // re-use empty input
+ }
+ }
+ } else if (opt.bootstrap > 0 && num_pseudoaligned > 0) {
auto B = opt.bootstrap;
std::mt19937_64 rand;
rand.seed( opt.seed );
@@ -2016,6 +2144,9 @@ int main(int argc, char *argv[]) {
cerr << endl;
+ if (num_pseudoaligned == 0) {
+ exit(1); // exit with error
+ }
}
} else if (cmd == "quant-only") {
if (argc==2) {
@@ -2086,7 +2217,27 @@ int main(int argc, char *argv[]) {
em.alpha_, em.eff_lens_, index.target_lens_);
}
- if (opt.bootstrap > 0) {
+ int64_t num_pseudoaligned =0;
+
+ for (int i = 0; i < collection.counts.size(); i++) {
+ num_pseudoaligned += collection.counts[i];
+ }
+
+ if (num_pseudoaligned == 0) {
+ std::cerr << "[~warn] Warning, zero reads pseudoaligned check your input files and index" << std::endl;
+ }
+
+ if (opt.bootstrap > 0 && num_pseudoaligned == 0) {
+ // this happens if nothing aligns, then we write an empty bootstrap file
+ for (int b = 0; b < opt.bootstrap; b++) {
+ if (!opt.plaintext) {
+ writer.write_bootstrap(em, b); // em is empty
+ } else {
+ plaintext_writer(opt.output + "/bs_abundance_" + std::to_string(b) + ".tsv",
+ em.target_names_, em.alpha_, em.eff_lens_, index.target_lens_); // re-use empty input
+ }
+ }
+ } else if (opt.bootstrap > 0 && num_pseudoaligned > 0) {
std::cerr << "Bootstrapping!" << std::endl;
auto B = opt.bootstrap;
std::mt19937_64 rand;
@@ -2125,6 +2276,10 @@ int main(int argc, char *argv[]) {
}
}
cerr << endl;
+
+ if (num_pseudoaligned == 0) {
+ exit(1);
+ }
}
} else if (cmd == "pseudo") {
if (argc==2) {
View it on GitLab: https://salsa.debian.org/med-team/kallisto/commit/fc9b521a1cceb845cda67a025962923e00578ff8
--
View it on GitLab: https://salsa.debian.org/med-team/kallisto/commit/fc9b521a1cceb845cda67a025962923e00578ff8
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20191108/908ef443/attachment-0001.html>
More information about the debian-med-commit
mailing list