[med-svn] [Git][med-team/kallisto][upstream] New upstream version 0.46.1+dfsg

Steffen Möller gitlab at salsa.debian.org
Fri Nov 8 22:22:39 GMT 2019



Steffen Möller pushed to branch upstream at Debian Med / kallisto


Commits:
fc9b521a by Steffen Moeller at 2019-11-08T22:14:31Z
New upstream version 0.46.1+dfsg
- - - - -


6 changed files:

- .gitignore
- README.md
- src/ProcessReads.cpp
- src/ProcessReads.h
- src/common.h
- src/main.cpp


Changes:

=====================================
.gitignore
=====================================
@@ -18,3 +18,6 @@ CTestTestfile.cmake
 debug/
 release/
 .vscode/
+ext/htslib/src/htslib-stamp
+ext/htslib/tmp/
+.snakemake


=====================================
README.md
=====================================
@@ -5,11 +5,11 @@ RNA-Seq data, or more generally of target sequences using high-throughput
 sequencing reads. It is based on the novel idea of _pseudoalignment_ for
 rapidly determining the compatibility of reads with targets, without the need
 for alignment. On benchmarks with standard RNA-Seq data, __kallisto__ can
-quantify 30 million human reads in less than 3  minutes on a Mac desktop
+quantify 30 million human bulk RNA-seq reads in less than 3  minutes on a Mac desktop
 computer using only the read sequences and a transcriptome index that
-itself takes less than 10 minutes to build. Pseudoalignment of reads
+itself takes than 10 minutes to build. Pseudoalignment of reads
 preserves the key information needed for quantification, and __kallisto__
-is therefore not only fast, but also as accurate than existing
+is therefore not only fast, but also comparably accurate to other existing
 quantification tools. In fact, because the pseudoalignment procedure is
 robust to errors in the reads, in many benchmarks __kallisto__
 significantly outperforms existing tools. The __kallisto__ algorithms are described in more detail in:
@@ -18,7 +18,9 @@ NL Bray, H Pimentel, P Melsted and L Pachter, [Near optimal probabilistic RNA-se
 
 Scripts reproducing all the results of the paper are available [here](https://github.com/pachterlab/kallisto_paper_analysis).
 
-__kallisto__ quantified RNA-Seq can be analyzed with [__sleuth__](https://github.com/pachterlab/sleuth/).
+__kallisto__ quantified bulk RNA-Seq can be analyzed with [__sleuth__](https://github.com/pachterlab/sleuth/).
+
+__kallisto__ can be used together with [__bustools__](https://bustools.github.io/) to pre-process single-cell RNA-seq data. See the [kallistobus.tools](https://www.kallistobus.tools/) website for instructions.
 
 ## Manual
 
@@ -26,24 +28,15 @@ Please visit http://pachterlab.github.io/kallisto/manual.html for the manual.
 
 ## License
 
-Please read the license before using kallisto. The license is distributed with __kallisto__ in the file `license.txt` also viewable [here](https://pachterlab.github.io/kallisto/download).
-
-## Announcements
-
-There is a low traffic Google Group,
-[kallisto-sleuth-announcements](https://groups.google.com/d/forum/kallisto-sleuth-announcements)
-where we make announcements about new releases. This is a read-only mailing
-list.
+__kallisto__ is distributed under the BSD-2 license. The license is distributed with __kallisto__ in the file `license.txt`, which is also viewable [here](https://pachterlab.github.io/kallisto/download). Please read the license before using __kallisto__.
 
 ## Getting help
 
-For help running __kallisto__, please post to the [kallisto-sleuth-users
-Google Group](https://groups.google.com/d/forum/kallisto-sleuth-users).
+For help running __kallisto__, please post to the [kallisto-and-applications Google Group](https://groups.google.com/forum/#!forum/kallisto-and-applications).
 
 ## Reporting bugs
 
-Please report bugs to the [Github issues
-page](https://github.com/pachterlab/kallisto/issues)
+Please report bugs to the [Github issues page](https://github.com/pachterlab/kallisto/issues)
 
 ## Development and pull requests
 


=====================================
src/ProcessReads.cpp
=====================================
@@ -256,7 +256,7 @@ int64_t ProcessBUSReads(MasterProcessor& MP, const  ProgramOptions& opt) {
   if (nummapped == 0) {
     std::cerr << "[~warn] no reads pseudoaligned." << std::endl;
   }
-
+  
   return numreads;
 }
 
@@ -565,7 +565,7 @@ void MasterProcessor::processAln(const EMAlgorithm& em, bool useEM = true) {
 
   assert(opt.pseudobam);
   pseudobatchf_in.open(opt.output + "/pseudoaln.bin", std::ios::in | std::ios::binary);
-  SR.reset();
+  SR->reset();
 
   std::vector<std::thread> workers;
   for (int i = 0; i < opt.threads; i++) {
@@ -893,6 +893,7 @@ ReadProcessor::ReadProcessor(ReadProcessor && o) :
   seqs(std::move(o.seqs)),
   names(std::move(o.names)),
   quals(std::move(o.quals)),
+  flags(std::move(o.flags)),
   umis(std::move(o.umis)),
   newEcs(std::move(o.newEcs)),
   flens(std::move(o.flens)),
@@ -919,16 +920,16 @@ void ReadProcessor::operator()() {
       if (batchSR.empty()) {
         return;
       } else {
-        batchSR.fetchSequences(buffer, bufsize, seqs, names, quals, umis, readbatch_id, mp.opt.pseudobam );
+        batchSR.fetchSequences(buffer, bufsize, seqs, names, quals, flags, umis, readbatch_id, mp.opt.pseudobam );
       }
     } else {
       std::lock_guard<std::mutex> lock(mp.reader_lock);
-      if (mp.SR.empty()) {
+      if (mp.SR->empty()) {
         // nothing to do
         return;
       } else {
         // get new sequences
-        mp.SR.fetchSequences(buffer, bufsize, seqs, names, quals, umis, readbatch_id, mp.opt.pseudobam || mp.opt.fusion);
+        mp.SR->fetchSequences(buffer, bufsize, seqs, names, quals, flags, umis, readbatch_id, mp.opt.pseudobam || mp.opt.fusion);
       }
       // release the reader lock
     }
@@ -1222,7 +1223,7 @@ void ReadProcessor::clear() {
 
 
 BUSProcessor::BUSProcessor(const KmerIndex& index, const ProgramOptions& opt, const MinCollector& tc, MasterProcessor& mp, int _id, int _local_id) :
- paired(!opt.single_end), tc(tc), index(index), mp(mp), id(_id), local_id(_local_id) {
+ paired(!opt.single_end), bam(opt.bam), num(opt.num), tc(tc), index(index), mp(mp), id(_id), local_id(_local_id) {
    // initialize buffer
    bufsize = mp.bufsize;
    buffer = new char[bufsize];  
@@ -1238,6 +1239,8 @@ BUSProcessor::BUSProcessor(const KmerIndex& index, const ProgramOptions& opt, co
 
 BUSProcessor::BUSProcessor(BUSProcessor && o) :
   paired(o.paired),
+  bam(o.bam),
+  num(o.num),
   tc(o.tc),
   index(o.index),
   mp(o.mp),
@@ -1248,6 +1251,7 @@ BUSProcessor::BUSProcessor(BUSProcessor && o) :
   seqs(std::move(o.seqs)), 
   names(std::move(o.names)),
   quals(std::move(o.quals)),
+  flags(std::move(o.flags)),
   newEcs(std::move(o.newEcs)),
   flens(std::move(o.flens)),
   bias5(std::move(o.bias5)),
@@ -1273,13 +1277,13 @@ void BUSProcessor::operator()() {
     // grab the reader lock
     {
       std::lock_guard<std::mutex> lock(mp.reader_lock);
-      if (mp.SR.empty()) {
+      if (mp.SR->empty()) {
         // nothing to do
         return;
       } else {
         // get new sequences
         std::vector<std::string> umis;
-        mp.SR.fetchSequences(buffer, bufsize, seqs, names, quals, umis, readbatch_id, false);
+        mp.SR->fetchSequences(buffer, bufsize, seqs, names, quals, flags, umis, readbatch_id, false);
       }
       // release the reader lock
     }
@@ -1304,7 +1308,7 @@ void BUSProcessor::processBuffer() {
   std::vector<std::pair<KmerEntry,int>> v,v2;
   std::vector<int> vtmp;
   std::vector<int> u;
-
+  
   u.reserve(1000);
   v.reserve(1000);
   vtmp.reserve(1000);
@@ -1326,9 +1330,17 @@ void BUSProcessor::processBuffer() {
   
   // actually process the sequences
   const BUSOptions& busopt = mp.opt.busOptions;
-  int incf = busopt.nfiles-1;
+  int incf, jmax;
+  if (bam) {
+    incf = 1;
+    jmax = 2;
+  } else {
+    incf = busopt.nfiles-1;
+    jmax = busopt.nfiles;
+  }
+  //int incf = (bam) ? 1 : busopt.nfiles-1;
   for (int i = 0; i + incf < seqs.size(); i++) {
-    for (int j = 0; j < busopt.nfiles; j++) {
+    for (int j = 0; j < jmax /*(bam) ? 2 : busopt.nfiles*/; j++) {
       s[j] = seqs[i+j].first;
       l[j] = seqs[i+j].second;      
     }
@@ -1350,7 +1362,7 @@ void BUSProcessor::processBuffer() {
     }
 
     
-    auto &bcc = busopt.bc[0];
+//    auto &bcc = busopt.bc[0];
     int blen = 0;
     bool bad_bc = false;
     for (auto &bcc : busopt.bc) {
@@ -1401,6 +1413,9 @@ void BUSProcessor::processBuffer() {
       b.flags |= (f) << 8;
       b.count = 1;
       //std::cout << std::string(s1,10)  << "\t" << b.barcode << "\t" << std::string(s1+10,16) << "\t" << b.UMI << "\n";
+      if (num) {
+        b.flags = (uint32_t) flags[i / jmax];
+      }
 
       //ec = tc.findEC(u);
       
@@ -1438,8 +1453,7 @@ AlnProcessor::AlnProcessor(const KmerIndex& index, const ProgramOptions& opt, Ma
    buffer = new char[bufsize];
    bambufsize = 1<<20;
    bambuffer = new char[bambufsize]; // refactor this?
-
-
+   
    if (opt.batch_mode) {
      /* need to check this later */
      assert(id != -1);
@@ -1475,6 +1489,7 @@ AlnProcessor::AlnProcessor(AlnProcessor && o) :
   seqs(std::move(o.seqs)),
   names(std::move(o.names)),
   quals(std::move(o.quals)),
+  flags(std::move(o.flags)),
   umis(std::move(o.umis)),
   batchSR(std::move(o.batchSR)) {
     buffer = o.buffer;
@@ -1515,19 +1530,19 @@ void AlnProcessor::operator()() {
       if (batchSR.empty()) {
         return;
       } else {
-        batchSR.fetchSequences(buffer, bufsize, seqs, names, quals, umis, readbatch_id, true );
+        batchSR.fetchSequences(buffer, bufsize, seqs, names, quals, flags, umis, readbatch_id, true );
         readPseudoAlignmentBatch(mp.pseudobatchf_in, pseudobatch);
         assert(pseudobatch.batch_id == readbatch_id);
         assert(pseudobatch.aln.size() == ((paired) ? seqs.size()/2 : seqs.size())); // sanity checks
       }
     } else {
       std::lock_guard<std::mutex> lock(mp.reader_lock);
-      if (mp.SR.empty()) {
+      if (mp.SR->empty()) {
         // nothing to do
         return;
       } else {
         // get new sequences        
-        mp.SR.fetchSequences(buffer, bufsize, seqs, names, quals, umis, readbatch_id, true);
+        mp.SR->fetchSequences(buffer, bufsize, seqs, names, quals, flags, umis, readbatch_id, true);
         readPseudoAlignmentBatch(mp.pseudobatchf_in, pseudobatch);
         assert(pseudobatch.batch_id == readbatch_id);
         assert(pseudobatch.aln.size() == ((paired) ? seqs.size()/2 : seqs.size())); // sanity checks
@@ -2557,9 +2572,14 @@ int fillBamRecord(bam1_t &b, uint8_t* buf, const char *seq, const char *name, co
 }
 
 
-/** -- sequence reader -- **/
+/** -- sequence readers -- **/
+
+void SequenceReader::reset() {
+  state = false;
+  readbatch_id = -1;
+}
 
-SequenceReader::~SequenceReader() {
+FastqSequenceReader::~FastqSequenceReader() {
   for (auto &f : fp) {
     if (f) {
       gzclose(f);
@@ -2578,26 +2598,20 @@ SequenceReader::~SequenceReader() {
 }
 
 
-void SequenceReader::reserveNfiles(int n) {
-  fp.resize(nfiles);
-  seq.resize(nfiles, nullptr);
-  l.resize(nfiles, 0);
-  nl.resize(nfiles, 0);
+bool FastqSequenceReader::empty() {
+  return (!state && current_file >= files.size());
 }
 
-void SequenceReader::reset() {
-   for (auto &f : fp) {
+void FastqSequenceReader::reset() {
+  SequenceReader::reset();
+   
+  for (auto &f : fp) {
     if (f) {
       gzclose(f);
     }
     f = nullptr;
   }
 
-  for (auto &s : seq) {
-    kseq_destroy(s);
-    s = nullptr;
-  }
-
   if (f_umi && f_umi->is_open()) {
     f_umi->close();    
   }
@@ -2612,15 +2626,24 @@ void SequenceReader::reset() {
   }
   
   current_file = 0;
-  state = false;
-  readbatch_id = -1;
+  for (auto &s : seq) {
+    kseq_destroy(s);
+    s = nullptr;
+  }
 }
 
+void FastqSequenceReader::reserveNfiles(int n) {
+  fp.resize(nfiles);
+  l.resize(nfiles, 0);
+  nl.resize(nfiles, 0);
+  seq.resize(nfiles, nullptr);
+}
 
 // returns true if there is more left to read from the files
-bool SequenceReader::fetchSequences(char *buf, const int limit, std::vector<std::pair<const char *, int> > &seqs,
+bool FastqSequenceReader::fetchSequences(char *buf, const int limit, std::vector<std::pair<const char *, int> > &seqs,
   std::vector<std::pair<const char *, int> > &names,
   std::vector<std::pair<const char *, int> > &quals,
+  std::vector<uint32_t>& flags,
   std::vector<std::string> &umis, int& read_id,
   bool full) {
     
@@ -2634,6 +2657,7 @@ bool SequenceReader::fetchSequences(char *buf, const int limit, std::vector<std:
     names.clear();
     quals.clear();
   }
+  flags.clear();
    
   bool usingUMIfiles = !umi_files.empty();
   int umis_read = 0;
@@ -2678,7 +2702,7 @@ bool SequenceReader::fetchSequences(char *buf, const int limit, std::vector<std:
     bool all_l = true;
     int bufadd = nfiles;
     for (int i = 0; i < nfiles; i++) {
-      all_l = all_l && l[i] > 0;
+      all_l = all_l && l[i] >= 0;
       bufadd += l[i]; // includes seq
     }
     if (all_l) {      
@@ -2718,6 +2742,8 @@ bool SequenceReader::fetchSequences(char *buf, const int limit, std::vector<std:
           ss >> umi;
           umis.emplace_back(std::move(umi));
         }
+
+        flags.push_back(numreads++);
       } else {
         return true; // read it next time
       }
@@ -2732,27 +2758,146 @@ bool SequenceReader::fetchSequences(char *buf, const int limit, std::vector<std:
   }
 }
 
-bool SequenceReader::empty() {
-  return (!state && current_file >= files.size());
-}
-
 // move constructor
-SequenceReader::SequenceReader(SequenceReader&& o) :
+#if 1
+FastqSequenceReader::FastqSequenceReader(FastqSequenceReader&& o) :
+  nfiles(o.nfiles),
+  numreads(o.numreads),
   fp(std::move(o.fp)),
-  seq(std::move(o.seq)),
   l(std::move(o.l)),
   nl(std::move(o.nl)),
   paired(o.paired),
-  nfiles(o.nfiles),
   files(std::move(o.files)),
   umi_files(std::move(o.umi_files)),
   f_umi(std::move(o.f_umi)),
-  current_file(o.current_file),
-  state(o.state) {
+  current_file(o.current_file) {
+
+  o.fp.resize(nfiles);
+  o.l.resize(nfiles, 0);
+  o.nl.resize(nfiles, 0);
+  o.state = false;
+}
+#endif
+#if 0
+FastqSequenceReader::FastqSequenceReader(FastqSequenceReader&& o) {
+  fp          = (std::move(o.fp));
+  l           = (std::move(o.l));
+  nl          = (std::move(o.nl));
+  paired      = (o.paired);
+  nfiles      = (o.nfiles);
+  files       = (std::move(o.files)),
+  umi_files   = (std::move(o.umi_files));
+  f_umi       = (std::move(o.f_umi));
+  current_file= (o.current_file);
+  seq         = (std::move(o.seq));
 
   o.fp.resize(nfiles);
-  o.seq.resize(nfiles, nullptr);
   o.l.resize(nfiles, 0);
   o.nl.resize(nfiles, 0);
   o.state = false;
+  o.seq.resize(nfiles, nullptr);
+}
+#endif
+const std::string BamSequenceReader::seq_enc = "=ACMGRSVTWYHKDBN";
+
+BamSequenceReader::~BamSequenceReader() {
+  if (fp) {
+    bgzf_close(fp);
+  }
+  if (head) {
+    bam_hdr_destroy(head);
+  }
+  if (rec) {
+    bam_destroy1(rec);
+  }
+}
+
+bool BamSequenceReader::empty() {
+  return !state;
+}
+
+void BamSequenceReader::reset() {
+  SequenceReader::reset();
+}
+
+void BamSequenceReader::reserveNfiles(int n) {
+}
+
+// returns true if there is more left to read from the files
+bool BamSequenceReader::fetchSequences(char *buf, const int limit, std::vector<std::pair<const char *, int> > &seqs,
+  std::vector<std::pair<const char *, int> > &names,
+  std::vector<std::pair<const char *, int> > &quals,
+  std::vector<uint32_t>& flags,
+  std::vector<std::string> &umis, int& read_id,
+  bool full) {
+  
+  readbatch_id += 1; // increase the batch id
+  read_id = readbatch_id; // copy now because we are inside a lock
+  seqs.clear();
+  flags.clear();
+  
+  int bufpos = 0;
+  while (true) {
+    if (!state) { // should we open a file
+        return false;
+    }
+
+    // the file is open and we have read into seq1 and seq2
+    if (err >= 0) {
+      if (!(rec->core.flag & BAM_FSECONDARY)) { // record is primary alignment
+        int bufadd = l_seq + l_bc + l_umi + 2;
+        // fits into the buffer
+        if (bufpos+bufadd < limit) {
+          char *pi;
+
+          pi = buf + bufpos;
+          memcpy(pi, bc, l_bc);
+          memcpy(pi + l_bc, umi, l_umi + 1);
+          seqs.emplace_back(pi, l_bc + l_umi);
+          bufpos += l_bc + l_umi + 1;
+
+          pi = buf + bufpos;
+          int len = (l_seq + 1) / 2;
+          if (l_seq % 2) --len;
+          int j = 0;
+          for (int i = 0; i < len; ++i, ++eseq) {
+            buf[bufpos++] = seq_enc[*eseq >> 4];
+            buf[bufpos++] = seq_enc[*eseq & 0x0F];
+          }
+          if (l_seq % 2) {
+            buf[bufpos++] = seq_enc[*eseq >> 4];
+          }
+          buf[bufpos++] = '\0';
+          seqs.emplace_back(pi, l_seq);
+
+        } else {
+          return true; // read it next time
+        }
+      }
+      
+      // read for the next one
+      err = bam_read1(fp, rec);
+      if (err < 0) {
+        return true;
+      }
+      eseq = bam_get_seq(rec);
+      l_seq = rec->core.l_qseq;
+
+      bc = bam_aux2Z(bam_aux_get(rec, "CR"));
+      l_bc = 0;
+      for (char *pbc = bc; *pbc != '\0'; ++pbc) {
+        ++l_bc;
+      }
+
+      umi = bam_aux2Z(bam_aux_get(rec, "UR"));
+      l_umi = 0;
+      for (char *pumi = umi; *pumi != '\0'; ++pumi) {
+        ++l_umi;
+      }
+
+    } else {
+      state = false; // haven't opened file yet
+    }
+  }
 }
+


=====================================
src/ProcessReads.h
=====================================
@@ -23,6 +23,7 @@
 #include "GeneModel.h"
 #include "BUSData.h"
 #include "BUSTools.h"
+#include <htslib/sam.h>
 
 
 #ifndef KSEQ_INIT_READY
@@ -41,9 +42,35 @@ class SequenceReader {
 public:
 
   SequenceReader(const ProgramOptions& opt) :
-  paired(!opt.single_end), files(opt.files),
-  f_umi(new std::ifstream{}),
-  current_file(0), state(false), readbatch_id(-1) {
+  readbatch_id(-1) {};
+  SequenceReader() : state(false), readbatch_id(-1) {};
+  virtual ~SequenceReader() {}
+//  SequenceReader(SequenceReader&& o);
+  
+  virtual bool empty() = 0;
+  virtual void reset();
+  virtual void reserveNfiles(int n) = 0;
+  virtual bool fetchSequences(char *buf, const int limit, std::vector<std::pair<const char*, int>>& seqs,
+                      std::vector<std::pair<const char*, int>>& names,
+                      std::vector<std::pair<const char*, int>>& quals,
+                      std::vector<uint32_t>& flags,
+                      std::vector<std::string>& umis, int &readbatch_id,
+                      bool full=false) = 0;
+
+
+public:
+  bool state; // is the file open
+  int readbatch_id = -1;
+};
+
+class FastqSequenceReader : public SequenceReader {
+public:
+
+  FastqSequenceReader(const ProgramOptions& opt) : SequenceReader(opt),
+  current_file(0), paired(!opt.single_end), files(opt.files),
+  f_umi(new std::ifstream{}) {
+    SequenceReader::state = false;
+
     if (opt.bus_mode) {
       nfiles = opt.busOptions.nfiles;      
     } else {
@@ -51,46 +78,106 @@ public:
     }
     reserveNfiles(nfiles);
   }
-  SequenceReader() :
+  FastqSequenceReader() : SequenceReader(), 
   paired(false), 
   f_umi(new std::ifstream{}),
-  current_file(0), state(false), readbatch_id(-1) {}
-  SequenceReader(SequenceReader&& o);
-  
-  bool empty();
+  current_file(0) {};
+  FastqSequenceReader(FastqSequenceReader &&o);
+  ~FastqSequenceReader();
+
+  bool empty();  
   void reset();
   void reserveNfiles(int n);
-  ~SequenceReader();
-
   bool fetchSequences(char *buf, const int limit, std::vector<std::pair<const char*, int>>& seqs,
                       std::vector<std::pair<const char*, int>>& names,
                       std::vector<std::pair<const char*, int>>& quals,
+                      std::vector<uint32_t>& flags,
                       std::vector<std::string>& umis, int &readbatch_id,
                       bool full=false);
 
 public:
   int nfiles = 1;
+  uint32_t numreads = 0;
   std::vector<gzFile> fp;
-  //gzFile fp1 = 0, fp2 = 0;
-  std::vector<kseq_t*> seq;
   std::vector<int> l;
   std::vector<int> nl;
-  //kseq_t *seq1 = 0, *seq2 = 0;
-  //int l1,l2,nl1,nl2;
   bool paired;
   std::vector<std::string> files;
   std::vector<std::string> umi_files;
   std::unique_ptr<std::ifstream> f_umi;
   int current_file;
-  bool state; // is the file open
-  int readbatch_id = -1;
+  std::vector<kseq_t*> seq;
+};
+
+class BamSequenceReader : public SequenceReader {
+public:
+
+  BamSequenceReader(const ProgramOptions& opt) :
+  SequenceReader(opt) {
+    SequenceReader::state = true;
+
+    fp = bgzf_open(opt.files[0].c_str(), "rb");
+    head = bam_hdr_read(fp);
+    rec = bam_init1();
+    
+    err = bam_read1(fp, rec);
+    eseq = bam_get_seq(rec);
+    l_seq = rec->core.l_qseq;
+
+    bc = bam_aux2Z(bam_aux_get(rec, "CR"));
+    l_bc = 0;
+    for (char *pbc = bc; *pbc != '\0'; ++pbc) {
+      ++l_bc;
+    }
+
+    umi = bam_aux2Z(bam_aux_get(rec, "UR"));
+    l_umi = 0;
+    for (char *pumi = umi; *pumi != '\0'; ++pumi) {
+      ++l_umi;
+    }
+  }
+  BamSequenceReader() : SequenceReader() {};
+  BamSequenceReader(BamSequenceReader &&o);
+  ~BamSequenceReader();
+  
+  bool empty();
+  void reset();
+  void reserveNfiles(int n);
+  bool fetchSequences(char *buf, const int limit, std::vector<std::pair<const char*, int>>& seqs,
+                      std::vector<std::pair<const char*, int>>& names,
+                      std::vector<std::pair<const char*, int>>& quals,
+                      std::vector<uint32_t>& flags,
+                      std::vector<std::string>& umis, int &readbatch_id,
+                      bool full=false);
+
+public:
+  BGZF *fp;
+  bam_hdr_t *head;
+  bam1_t *rec;
+  uint8_t *eseq;
+  int32_t l_seq;
+  char *bc;
+  int l_bc;
+  char *umi;
+  int l_umi;
+  int err;
+  char seq[256]; // Is there a better limit?
+  
+private:
+  static const std::string seq_enc;
 };
 
 class MasterProcessor {
 public:
   MasterProcessor (KmerIndex &index, const ProgramOptions& opt, MinCollector &tc, const Transcriptome& model)
-    : tc(tc), index(index), model(model), bamfp(nullptr), bamfps(nullptr), bamh(nullptr), opt(opt), SR(opt), numreads(0)
+    : tc(tc), index(index), model(model), bamfp(nullptr), bamfps(nullptr), bamh(nullptr), opt(opt), numreads(0)
     ,nummapped(0), num_umi(0), bufsize(1ULL<<23), tlencount(0), biasCount(0), maxBiasCount((opt.bias) ? 1000000 : 0), last_pseudobatch_id (-1) { 
+      if (opt.bam) {
+        SR = new BamSequenceReader(opt);
+      } else {
+        SR = new FastqSequenceReader(opt);
+      }
+
       if (opt.batch_mode) {
         memset(&bus_bc_len[0],0,33);
         memset(&bus_umi_len[0],0,33);
@@ -136,13 +223,14 @@ public:
       bam_hdr_destroy(bamh);
       bamh = nullptr;
     }    
+    delete SR;
   }
 
   std::mutex reader_lock;
   std::mutex writer_lock;
 
 
-  SequenceReader SR;
+  SequenceReader *SR;
   MinCollector& tc;
   KmerIndex& index;
   const Transcriptome& model;
@@ -205,7 +293,7 @@ public:
   std::vector<std::pair<std::vector<int>, std::string>> new_ec_umi;
   const KmerIndex& index;
   MasterProcessor& mp;
-  SequenceReader batchSR;
+  FastqSequenceReader batchSR;
   int64_t numreads;
   int id;
   int local_id;
@@ -215,6 +303,7 @@ public:
   std::vector<std::pair<const char*, int>> seqs;
   std::vector<std::pair<const char*, int>> names;
   std::vector<std::pair<const char*, int>> quals;
+  std::vector<uint32_t> flags;
   std::vector<std::string> umis;
   std::vector<std::vector<int>> newEcs;
   std::vector<int> flens;
@@ -237,6 +326,8 @@ public:
   
   size_t bufsize;
   bool paired;
+  bool bam;
+  bool num;
   const MinCollector& tc;
   const KmerIndex& index;
   MasterProcessor& mp;
@@ -251,6 +342,7 @@ public:
   std::vector<std::pair<const char*, int>> seqs;
   std::vector<std::pair<const char*, int>> names;
   std::vector<std::pair<const char*, int>> quals;
+  std::vector<uint32_t> flags;
 
   std::vector<std::vector<int>> newEcs;
   std::vector<int> flens;
@@ -279,7 +371,7 @@ public:
   const KmerIndex& index;
   const EMAlgorithm& em;
   MasterProcessor& mp;
-  SequenceReader batchSR;
+  FastqSequenceReader batchSR;
   int64_t numreads;
   int id;
   PseudoAlignmentBatch pseudobatch;
@@ -291,6 +383,7 @@ public:
   std::vector<std::pair<const char*, int>> seqs;
   std::vector<std::pair<const char*, int>> names;
   std::vector<std::pair<const char*, int>> quals;
+  std::vector<uint32_t> flags;
   std::vector<std::string> umis;
 
   void operator()();


=====================================
src/common.h
=====================================
@@ -1,11 +1,11 @@
 #ifndef KALLISTO_COMMON_H
 #define KALLISTO_COMMON_H
 
-#define KALLISTO_VERSION "0.46.0"
+#define KALLISTO_VERSION "0.46.1"
 
 #include <string>
 #include <vector>
-
+#include <iostream>
 
 #ifdef _WIN64
 typedef unsigned int uint;
@@ -67,6 +67,8 @@ struct ProgramOptions {
   bool bus_mode;
   BUSOptions busOptions;
   bool pseudo_quant;
+  bool bam;
+  bool num;
   std::string batch_file_name;
   std::vector<std::vector<std::string>> batch_files;
   std::vector<std::string> batch_ids;
@@ -107,6 +109,8 @@ ProgramOptions() :
   batch_mode(false),
   bus_mode(false),
   pseudo_quant(false),
+  bam(false),
+  num(false),
   plaintext(false),
   write_index(false),
   single_end(false),


=====================================
src/main.cpp
=====================================
@@ -533,20 +533,24 @@ void ListSingleCellTechnologies() {
   << "CELSeq           CEL-Seq" << endl
   << "CELSeq2          CEL-Seq version 2" << endl
   << "DropSeq          DropSeq" << endl
-  << "inDrops          inDrops" << endl
+  << "inDropsv1        inDrops version 1 chemistry" << endl
+  << "inDropsv2        inDrops version 2 chemistry" << endl
+  << "inDropsv3        inDrops version 3 chemistry" << endl
   << "SCRBSeq          SCRB-Seq" << endl
   << "SureCell         SureCell for ddSEQ" << endl
   << endl;
  }
 
 void ParseOptionsBus(int argc, char **argv, ProgramOptions& opt) {
-  const char *opt_string = "i:o:x:t:l";
+  const char *opt_string = "i:o:x:t:lbn";
   static struct option long_options[] = {
     {"index", required_argument, 0, 'i'},
     {"output-dir", required_argument, 0, 'o'},
     {"technology", required_argument, 0, 'x'},
     {"list", no_argument, 0, 'l'},
     {"threads", required_argument, 0, 't'},
+    {"bam", no_argument, 0, 'b'},
+    {"num", no_argument, 0, 'n'},
     {0,0,0,0}
   };
 
@@ -584,6 +588,14 @@ void ParseOptionsBus(int argc, char **argv, ProgramOptions& opt) {
       stringstream(optarg) >> opt.threads;
       break;
     }
+    case 'b': {
+      opt.bam = true;
+      break;
+    }
+    case 'n': {
+      opt.num = true;
+      break;
+    }
     default: break;
     }
   }
@@ -812,84 +824,183 @@ bool CheckOptionsBus(ProgramOptions& opt) {
     ret = false;
   } else {
     auto& busopt = opt.busOptions;
-    
-    if (opt.technology == "10XV2") {
-      busopt.nfiles = 2;
-      busopt.seq = BUSOptionSubstr(1,0,0); // second file, entire string
-      busopt.umi = BUSOptionSubstr(0,16,26); // first file [16:26]
-      busopt.bc.push_back(BUSOptionSubstr(0,0,16));
-    } else if (opt.technology == "10XV3") {
-      busopt.nfiles = 2;
-      busopt.seq = BUSOptionSubstr(1,0,0);
-      busopt.umi = BUSOptionSubstr(0,16,28);
-      busopt.bc.push_back(BUSOptionSubstr(0,0,16));
-    } else if (opt.technology == "10XV1") {
-      busopt.nfiles = 3;
-      busopt.seq = BUSOptionSubstr(0,0,0);
-      busopt.umi = BUSOptionSubstr(1,0,0);
-      busopt.bc.push_back(BUSOptionSubstr(2,0,0));
-    } else if (opt.technology == "SURECELL") {
-      busopt.nfiles = 2;
-      busopt.seq = BUSOptionSubstr(1,0,0);
-      busopt.umi = BUSOptionSubstr(0,51,59);
-      busopt.bc.push_back(BUSOptionSubstr(0,0,6));
-      busopt.bc.push_back(BUSOptionSubstr(0,21,27));
-      busopt.bc.push_back(BUSOptionSubstr(0,42,48));
-    } else if (opt.technology == "DROPSEQ") {
-      busopt.nfiles = 2;
-      busopt.seq = BUSOptionSubstr(1,0,0);
-      busopt.umi = BUSOptionSubstr(0,12,20);
-      busopt.bc.push_back(BUSOptionSubstr(0,0,12));
-    } else if (opt.technology == "INDROPS") {
-      busopt.nfiles = 2;
-      busopt.seq = BUSOptionSubstr(1,0,0);
-      busopt.umi = BUSOptionSubstr(0,42,48);
-      busopt.bc.push_back(BUSOptionSubstr(0,0,11));
-      busopt.bc.push_back(BUSOptionSubstr(0,30,38));    
-    } else if (opt.technology == "CELSEQ") {
-      busopt.nfiles = 2;
-      busopt.seq = BUSOptionSubstr(1,0,0);
-      busopt.umi = BUSOptionSubstr(0,8,12);
-      busopt.bc.push_back(BUSOptionSubstr(0,0,8));
-    } else if (opt.technology == "CELSEQ2") {
-      busopt.nfiles = 2;
-      busopt.seq = BUSOptionSubstr(1,0,0);
-      busopt.umi = BUSOptionSubstr(0,0,6);
-      busopt.bc.push_back(BUSOptionSubstr(0,6,12));
-    } else if (opt.technology == "SCRBSEQ") {
-      busopt.nfiles = 2;
-      busopt.seq = BUSOptionSubstr(1,0,0);
-      busopt.umi = BUSOptionSubstr(0,6,16);
-      busopt.bc.push_back(BUSOptionSubstr(0,0,6));
-    } else {
-      vector<int> files;
-      vector<BUSOptionSubstr> values;
-      vector<BUSOptionSubstr> bcValues;
-      vector<std::string> errorList;
-      bool invalid = ParseTechnology(opt.technology, values, files, errorList, bcValues);
-      if(!invalid) {
-        busopt.nfiles = files.size(); 
-        for(int i = 0; i < bcValues.size(); i++) {
-          busopt.bc.push_back(bcValues[i]);
+   
+    if (opt.bam) { // Note: only 10xV2 has been tested
+      busopt.nfiles = 1;
+      if (opt.technology == "10XV2") {
+        busopt.seq = BUSOptionSubstr(1,0,0); // second file, entire string
+        busopt.umi = BUSOptionSubstr(0,16,26); // first file [16:26]
+        busopt.bc.push_back(BUSOptionSubstr(0,0,16));
+      } else if (opt.technology == "10XV3") {
+        busopt.seq = BUSOptionSubstr(1,0,0);
+        busopt.umi = BUSOptionSubstr(0,16,28);
+        busopt.bc.push_back(BUSOptionSubstr(0,0,16));
+//      } else if (opt.technology == "10XV1") {
+
+      } else if (opt.technology == "SURECELL") {
+        busopt.seq = BUSOptionSubstr(1,0,0);
+        busopt.umi = BUSOptionSubstr(0,18,26);
+        busopt.bc.push_back(BUSOptionSubstr(0,0,18));
+      } else if (opt.technology == "DROPSEQ") {
+        busopt.seq = BUSOptionSubstr(1,0,0);
+        busopt.umi = BUSOptionSubstr(0,12,20);
+        busopt.bc.push_back(BUSOptionSubstr(0,0,12));
+      } else if (opt.technology == "INDROPSV1") {
+        busopt.nfiles = 2;
+        busopt.seq = BUSOptionSubstr(1,0,0);
+        busopt.umi = BUSOptionSubstr(0,42,48);
+        busopt.bc.push_back(BUSOptionSubstr(0,0,11));
+        busopt.bc.push_back(BUSOptionSubstr(0,30,38));  
+      } else if (opt.technology == "INDROPSV2") {
+        busopt.nfiles = 2;
+        busopt.seq = BUSOptionSubstr(0,0,0);
+        busopt.umi = BUSOptionSubstr(1,42,48);
+        busopt.bc.push_back(BUSOptionSubstr(1,0,11));
+        busopt.bc.push_back(BUSOptionSubstr(1,30,38));  
+      } else if (opt.technology == "INDROPSV3") {
+        busopt.nfiles = 3;
+        busopt.seq = BUSOptionSubstr(2,0,0);
+        busopt.umi = BUSOptionSubstr(1,8,14);
+        busopt.bc.push_back(BUSOptionSubstr(0,0,8));
+        busopt.bc.push_back(BUSOptionSubstr(1,0,8));
+      } else if (opt.technology == "CELSEQ") {
+        busopt.seq = BUSOptionSubstr(1,0,0);
+        busopt.umi = BUSOptionSubstr(0,8,12);
+        busopt.bc.push_back(BUSOptionSubstr(0,0,8));
+      } else if (opt.technology == "CELSEQ2") {
+        busopt.seq = BUSOptionSubstr(1,0,0);
+        busopt.umi = BUSOptionSubstr(0,0,6);
+        busopt.bc.push_back(BUSOptionSubstr(0,6,12));
+      } else if (opt.technology == "SCRBSEQ") {
+        busopt.seq = BUSOptionSubstr(1,0,0);
+        busopt.umi = BUSOptionSubstr(0,6,16);
+        busopt.bc.push_back(BUSOptionSubstr(0,0,6));
+      } else if (opt.technology == "INDROPSV3") {
+        busopt.seq = BUSOptionSubstr(1,0,0);
+        busopt.umi = BUSOptionSubstr(0,0,6);
+        busopt.bc.push_back(BUSOptionSubstr(0,6,16));
+      } else {
+        vector<int> files;
+        vector<BUSOptionSubstr> values;
+        vector<BUSOptionSubstr> bcValues;
+        vector<std::string> errorList;
+        bool invalid = ParseTechnology(opt.technology, values, files, errorList, bcValues);
+        if(!invalid) {
+          busopt.nfiles = files.size(); 
+          for(int i = 0; i < bcValues.size(); i++) {
+            busopt.bc.push_back(bcValues[i]);
+          }
+          busopt.umi = values[0];
+          busopt.seq = values[1];
+        } else {
+          for(int j = 0; j < errorList.size(); j++) {
+            cerr << errorList[j] << endl;
+          }
+          cerr << "Unable to create technology: " << opt.technology << endl;
+          ret = false;
         }
-        busopt.umi = values[0];
-        busopt.seq = values[1];
+      }
+    } else {
+      if (opt.technology == "10XV2") {
+        busopt.nfiles = 2;
+        busopt.seq = BUSOptionSubstr(1,0,0); // second file, entire string
+        busopt.umi = BUSOptionSubstr(0,16,26); // first file [16:26]
+        busopt.bc.push_back(BUSOptionSubstr(0,0,16));
+      } else if (opt.technology == "10XV3") {
+        busopt.nfiles = 2;
+        busopt.seq = BUSOptionSubstr(1,0,0);
+        busopt.umi = BUSOptionSubstr(0,16,28);
+        busopt.bc.push_back(BUSOptionSubstr(0,0,16));
+      } else if (opt.technology == "10XV1") {
+        busopt.nfiles = 3;
+        busopt.seq = BUSOptionSubstr(0,0,0);
+        busopt.umi = BUSOptionSubstr(1,0,0);
+        busopt.bc.push_back(BUSOptionSubstr(2,0,0));
+      } else if (opt.technology == "SURECELL") {
+        busopt.nfiles = 2;
+        busopt.seq = BUSOptionSubstr(1,0,0);
+        busopt.umi = BUSOptionSubstr(0,51,59);
+        busopt.bc.push_back(BUSOptionSubstr(0,0,6));
+        busopt.bc.push_back(BUSOptionSubstr(0,21,27));
+        busopt.bc.push_back(BUSOptionSubstr(0,42,48));
+      } else if (opt.technology == "DROPSEQ") {
+        busopt.nfiles = 2;
+        busopt.seq = BUSOptionSubstr(1,0,0);
+        busopt.umi = BUSOptionSubstr(0,12,20);
+        busopt.bc.push_back(BUSOptionSubstr(0,0,12));
+      } else if (opt.technology == "INDROPSV1") {
+        busopt.nfiles = 2;
+        busopt.seq = BUSOptionSubstr(1,0,0);
+        busopt.umi = BUSOptionSubstr(0,42,48);
+        busopt.bc.push_back(BUSOptionSubstr(0,0,11));
+        busopt.bc.push_back(BUSOptionSubstr(0,30,38));  
+      } else if (opt.technology == "INDROPSV2") {
+        busopt.nfiles = 2;
+        busopt.seq = BUSOptionSubstr(0,0,0);
+        busopt.umi = BUSOptionSubstr(1,42,48);
+        busopt.bc.push_back(BUSOptionSubstr(1,0,11));
+        busopt.bc.push_back(BUSOptionSubstr(1,30,38));  
+      } else if (opt.technology == "INDROPSV3") {
+        busopt.nfiles = 3;
+        busopt.seq = BUSOptionSubstr(2,0,0);
+        busopt.umi = BUSOptionSubstr(1,8,14);
+        busopt.bc.push_back(BUSOptionSubstr(0,0,8));
+        busopt.bc.push_back(BUSOptionSubstr(1,0,8));
+      } else if (opt.technology == "CELSEQ") {
+        busopt.nfiles = 2;
+        busopt.seq = BUSOptionSubstr(1,0,0);
+        busopt.umi = BUSOptionSubstr(0,8,12);
+        busopt.bc.push_back(BUSOptionSubstr(0,0,8));
+      } else if (opt.technology == "CELSEQ2") {
+        busopt.nfiles = 2;
+        busopt.seq = BUSOptionSubstr(1,0,0);
+        busopt.umi = BUSOptionSubstr(0,0,6);
+        busopt.bc.push_back(BUSOptionSubstr(0,6,12));
+      } else if (opt.technology == "SCRBSEQ") {
+        busopt.nfiles = 2;
+        busopt.seq = BUSOptionSubstr(1,0,0);
+        busopt.umi = BUSOptionSubstr(0,6,16);
+        busopt.bc.push_back(BUSOptionSubstr(0,0,6));
+      } else if (opt.technology == "INDROPSV3") {
+        busopt.nfiles = 3;
+        busopt.seq = BUSOptionSubstr(2,0,0);
+        busopt.umi = BUSOptionSubstr(1,8,14);
+        busopt.bc.push_back(BUSOptionSubstr(0,0,8));
+        busopt.bc.push_back(BUSOptionSubstr(1,0,8));
       } else {
-        for(int j = 0; j < errorList.size(); j++) {
-          cerr << errorList[j] << endl;
+        vector<int> files;
+        vector<BUSOptionSubstr> values;
+        vector<BUSOptionSubstr> bcValues;
+        vector<std::string> errorList;
+        bool invalid = ParseTechnology(opt.technology, values, files, errorList, bcValues);
+        if(!invalid) {
+          busopt.nfiles = files.size(); 
+          for(int i = 0; i < bcValues.size(); i++) {
+            busopt.bc.push_back(bcValues[i]);
+          }
+          busopt.umi = values[0];
+          busopt.seq = values[1];
+        } else {
+          for(int j = 0; j < errorList.size(); j++) {
+            cerr << errorList[j] << endl;
+          }
+          cerr << "Unable to create technology: " << opt.technology << endl;
+          ret = false;
         }
-        cerr << "Unable to create technology: " << opt.technology << endl;
-        ret = false;
       }
     }
   }
 
-  if (ret && opt.files.size() %  opt.busOptions.nfiles != 0) {
+  if (ret && !opt.bam && opt.files.size() %  opt.busOptions.nfiles != 0) {
     cerr << "Error: Number of files (" << opt.files.size() << ") does not match number of input files required by "
     << "technology " << opt.technology << " (" << opt.busOptions.nfiles << ")" << endl;
     ret = false;
   }
 
+  if (opt.bam && opt.num) {
+    cerr << "Warning: --bam option was used, so --num option will be ignored" << endl;
+  }
+
   return ret;
 }
 
@@ -1535,7 +1646,9 @@ void usageBus() {
        << "-x, --technology=STRING       Single-cell technology used " << endl << endl
        << "Optional arguments:" << endl
        << "-l, --list                    List all single-cell technologies supported" << endl
-       << "-t, --threads=INT             Number of threads to use (default: 1)" << endl;
+       << "-t, --threads=INT             Number of threads to use (default: 1)" << endl
+       << "-b, --bam                     Input file is a BAM file" << endl
+       << "-n, --num                     Output number of read in flag column (incompatible with --bam)" << endl;
 }
 
 void usageIndex() {
@@ -1808,6 +1921,7 @@ int main(int argc, char *argv[]) {
         for (int i = 0; i < index.num_trans; i++) {
           num_unique += collection.counts[i];          
         }
+        num_pseudoaligned = 0;
         for (int i = 0; i < collection.counts.size(); i++) {
           num_pseudoaligned += collection.counts[i];
         }
@@ -1953,6 +2067,10 @@ int main(int argc, char *argv[]) {
           num_pseudoaligned += collection.counts[i];
         }
 
+        if (num_pseudoaligned == 0) {
+          std::cerr << "[~warn] Warning, zero reads pseudoaligned check your input files and index" << std::endl;
+        }
+
         plaintext_aux(
             opt.output + "/run_info.json",
             std::string(std::to_string(index.num_trans)),
@@ -1968,7 +2086,17 @@ int main(int argc, char *argv[]) {
         plaintext_writer(opt.output + "/abundance.tsv", em.target_names_,
             em.alpha_, em.eff_lens_, index.target_lens_);
 
-        if (opt.bootstrap > 0) {
+        if (opt.bootstrap > 0 && num_pseudoaligned == 0) {
+          // this happens if nothing aligns, then we write an empty bootstrap file
+          for (int b = 0; b < opt.bootstrap; b++) {
+            if (!opt.plaintext) {
+              writer.write_bootstrap(em, b); // em is empty
+            } else {
+              plaintext_writer(opt.output + "/bs_abundance_" + std::to_string(b) + ".tsv",
+                    em.target_names_, em.alpha_, em.eff_lens_, index.target_lens_); // re-use empty input
+            }
+          }
+        } else if (opt.bootstrap > 0 && num_pseudoaligned > 0) {
           auto B = opt.bootstrap;
           std::mt19937_64 rand;
           rand.seed( opt.seed );
@@ -2016,6 +2144,9 @@ int main(int argc, char *argv[]) {
         
 
         cerr << endl;
+        if (num_pseudoaligned == 0) {
+          exit(1); // exit with error
+        }
       }
     } else if (cmd == "quant-only") {
       if (argc==2) {
@@ -2086,7 +2217,27 @@ int main(int argc, char *argv[]) {
               em.alpha_, em.eff_lens_, index.target_lens_);
         }
 
-        if (opt.bootstrap > 0) {
+        int64_t num_pseudoaligned =0;
+
+        for (int i = 0; i < collection.counts.size(); i++) {
+          num_pseudoaligned += collection.counts[i];
+        }
+
+        if (num_pseudoaligned == 0) {
+          std::cerr << "[~warn] Warning, zero reads pseudoaligned check your input files and index" << std::endl;
+        }
+
+        if (opt.bootstrap > 0 && num_pseudoaligned == 0) {
+          // this happens if nothing aligns, then we write an empty bootstrap file
+          for (int b = 0; b < opt.bootstrap; b++) {
+            if (!opt.plaintext) {
+              writer.write_bootstrap(em, b); // em is empty
+            } else {
+              plaintext_writer(opt.output + "/bs_abundance_" + std::to_string(b) + ".tsv",
+                    em.target_names_, em.alpha_, em.eff_lens_, index.target_lens_); // re-use empty input
+            }
+          }
+        } else if (opt.bootstrap > 0 && num_pseudoaligned > 0) {
           std::cerr << "Bootstrapping!" << std::endl;
           auto B = opt.bootstrap;
           std::mt19937_64 rand;
@@ -2125,6 +2276,10 @@ int main(int argc, char *argv[]) {
           }
         }
         cerr << endl;
+
+        if (num_pseudoaligned == 0) {
+          exit(1);
+        }
       }
     } else if (cmd == "pseudo") {
       if (argc==2) {



View it on GitLab: https://salsa.debian.org/med-team/kallisto/commit/fc9b521a1cceb845cda67a025962923e00578ff8

-- 
View it on GitLab: https://salsa.debian.org/med-team/kallisto/commit/fc9b521a1cceb845cda67a025962923e00578ff8
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20191108/908ef443/attachment-0001.html>


More information about the debian-med-commit mailing list