[med-svn] [Git][med-team/bustools][upstream] New upstream version 0.45.1+dfsg

Andreas Tille (@tille) gitlab at salsa.debian.org
Thu Oct 16 10:34:51 BST 2025



Andreas Tille pushed to branch upstream at Debian Med / bustools


Commits:
67e48431 by Andreas Tille at 2025-10-16T11:20:40+02:00
New upstream version 0.45.1+dfsg
- - - - -


11 changed files:

- + .make_bustools_binaries_linux.sh
- + .make_bustools_binaries_mac.sh
- + .make_bustools_binaries_windows.sh
- src/BUSData.cpp
- src/CMakeLists.txt
- src/Common.hpp
- src/bustools_correct.cpp
- src/bustools_count.cpp
- src/bustools_extract.cpp
- src/bustools_inspect.cpp
- src/bustools_main.cpp


Changes:

=====================================
.make_bustools_binaries_linux.sh
=====================================
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+# Setup docker
+
+# sudo apt install docker.io  # version 20.10.25-0ubuntu1~20.04.1
+# sudo groupadd docker
+# sudo usermod -aG docker ${USER}
+
+# Exit and log back in
+
+uid=$(id -u $USER|tr -d "\n" )
+gid=$(id -g $USER|tr -d "\n" )
+docker run --rm dockbuild/centos7:latest > ./dockbuild
+chmod +x dockbuild
+./dockbuild
+docker run --rm dockbuild/centos7-devtoolset7-gcc7:latest > dockbuild-centos7-devtoolset7-gcc7-latest
+docker run -ti -v ./:/work -e BUILDER_UID=$uid -e BUILDER_GID=$gid -e BUILDER_USER=$USER -e BUILDER_GROUP=$USER --platform linux dockbuild/centos7-devtoolset7-gcc7:latest \
+	bash -c "rm -rf bustools && git clone https://github.com/BUStools/bustools && cd bustools && mkdir build && cd build && cmake .. && make && mv src/bustools ../ && cd ../../"
+
+rm -rf bustools_linux-master.tar.gz
+tar --no-xattrs --exclude='._*' -czvf bustools_linux-master.tar.gz bustools/bustools bustools/README.md bustools/LICENSE
+
+


=====================================
.make_bustools_binaries_mac.sh
=====================================
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# Run this on a Mac
+
+git clone https://github.com/BUStools/bustools
+cd bustools && mkdir build && cd build
+cmake ..
+make
+mv src/bustools ../
+cd ../../
+tar --no-xattrs --exclude='._*' -czvf bustools_mac-master.tar.gz bustools/bustools bustools/README.md bustools/LICENSE
+
+


=====================================
.make_bustools_binaries_windows.sh
=====================================
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+# Setup docker
+
+# sudo apt install docker.io  # version 20.10.25-0ubuntu1~20.04.1
+# sudo groupadd docker
+# sudo usermod -aG docker ${USER}
+
+# Exit and log back in
+
+
+docker run --rm dockcross/windows-static-x64:20221217-6afd127 > ./dockcross-windows-static-x64
+chmod +x ./dockcross-windows-static-x64
+./dockcross-windows-static-x64 bash -c "rm -rf zlib-1.3.1.tar.gz && wget http://www.zlib.net/zlib-1.3.1.tar.gz && tar -xvzf zlib-1.3.1.tar.gz && cd zlib-1.3.1 && ./configure --static && make && cd .. && rm -rf bustools && git clone https://github.com/BUStools/bustools && cd bustools && mkdir build && cd build && cmake ..  -DZLIB_LIBRARY=/work/zlib-1.3.1/libz.a -DZLIB_INCLUDE_DIR=/work/zlib-1.3.1/ && make && mv src/bustools.exe ../ && cd ../../"
+
+rm -rf bustools_windows-master.zip
+zip bustools_windows-master.zip bustools/bustools.exe bustools/README.md bustools/LICENSE
+


=====================================
src/BUSData.cpp
=====================================
@@ -6,6 +6,7 @@
 #include <sstream>
 #include <iostream>
 #include <limits>
+#include <unordered_set>
 
 uint64_t stringToBinary(const std::string &s, uint32_t &flag) {
   return stringToBinary(s.c_str(), s.size(), flag);
@@ -361,14 +362,17 @@ bool parseGenes(const std::string &filename, const u_map_<std::string, int32_t>
 
   std::string line, t;
   line.reserve(10000);
+  bool ret = true;
 
   int i = 0;
+  std::unordered_set<std::string> txs_set; // Set of transcripts found in t2g file
   while (std::getline(inf,line)) {
     std::stringstream ss(line);
     std::string txp, gene;
     ss >> txp >> gene;
     auto it = txnames.find(txp);
     if (it != txnames.end()) {
+      txs_set.insert(txp);
       auto i = it->second;
       auto git = genenames.find(gene);
       auto gi = -1;
@@ -379,10 +383,14 @@ bool parseGenes(const std::string &filename, const u_map_<std::string, int32_t>
         gi = git->second;
       }
       genemap[i] = gi;
+    } else {
+      ret = false;
     }
   }
 
-  return true;
+  if (txs_set.size() != txnames.size()) ret = false; // number of transcripts in t2g file and transcripts file don't match up
+
+  return ret;
 }
 
 bool parseGenesList(const std::string& filename, std::vector<std::string>& geneNames) {


=====================================
src/CMakeLists.txt
=====================================
@@ -28,7 +28,13 @@ endif(LINK MATCHES static)
  target_link_libraries(bustools ${ZLIB_LIBRARIES})
 
  if ( ZLIB_FOUND )
-     include_directories( ${ZLIB_INCLUDE_DIRS} )
+    if (DEFINED ZLIB_INCLUDE_DIRS)
+        include_directories( ${ZLIB_INCLUDE_DIRS} )
+    elseif (DEFINED ZLIB_INCLUDE_DIR)
+        include_directories( ${ZLIB_INCLUDE_DIR} )
+    else()
+        message(FATAL_ERROR "zlib found but no include directories are set.")
+    endif()
  else()
    message(FATAL_ERROR "zlib not found. Required for reading FASTQ files" )
  endif( ZLIB_FOUND )


=====================================
src/Common.hpp
=====================================
@@ -12,7 +12,7 @@
 #include "roaring.hh"
 #include "hash.hpp"
 
-#define BUSTOOLS_VERSION "0.43.2"
+#define BUSTOOLS_VERSION "0.45.1"
 
 #define u_map_ std::unordered_map
 enum CAPTURE_TYPE : char
@@ -88,6 +88,7 @@ struct Bustools_opt
   bool split_correct = false;
   bool barcode_replacement = false;
   bool parse_error = false;
+  bool no_correct = false;
 
   /* predict */
   std::string predict_input; //specified the same way as the output for count - count and histogram filenames will be created from this
@@ -113,6 +114,10 @@ struct Bustools_opt
   bool text_dumppad = false;
   bool text_showall = false;
 
+  /* extract */
+  bool extract_exclude = false;
+  bool extract_include = true;
+
   /* linker */
   int start, end;
 


=====================================
src/bustools_correct.cpp
=====================================
@@ -866,6 +866,10 @@ void bustools_correct(Bustools_opt &opt) {
             stat_white_++;
             correction |= b_;
           } else {
+            if (opt.no_correct) { // Option to skip correction and keep only barcodes in the list
+              stat_uncorr_++;
+              break;
+            }
             auto lower_mask = lower_upper_mask_vec[j].first;
             auto upper_mask = lower_upper_mask_vec[j].second;
             auto bc2 = bc2_vec[j];


=====================================
src/bustools_count.cpp
=====================================
@@ -31,7 +31,7 @@ void bustools_count(Bustools_opt &opt) {
   std::vector<int32_t> genemap(txnames.size(), -1);
   u_map_<std::string, int32_t> genenames;
   if (!parseGenes(opt.count_genes, txnames, genemap, genenames)) {
-    std::cerr << "Warning: Some transcripts exist in the transcripts file but not in the transcript-to-gene mapping file; this will likely cause errors." << std::endl;
+    std::cerr << "Warning: Some transcripts exist in the transcripts file but not in the transcript-to-gene mapping file (or vice versa); this may cause errors." << std::endl;
   }
   
   parseECs(opt.count_ecs, h);
@@ -170,7 +170,7 @@ void bustools_count(Bustools_opt &opt) {
       // v[i..j-1] share the same UMI
       ecs.resize(0);
       for (size_t k = i; k < j; k++) {
-        ecs.push_back(v[k].ec);
+        if (k == i || v[k].ec != v[k-1].ec) ecs.push_back(v[k].ec);
       }
       
       if (opt.umi_gene_collapse) {
@@ -257,7 +257,7 @@ void bustools_count(Bustools_opt &opt) {
           break;
         }
       }
-      double val = j-i;
+      size_t val = j-i;
       auto which_mtx = intersect_ecs_with_subset_txs(column_v[i], ecmap, tx_split_lookup);
       auto& of_ = which_mtx == COUNT_DEFAULT ? of : (which_mtx == COUNT_SPLIT ? of_2 : of_A);
       auto& n_entries_ = which_mtx == COUNT_DEFAULT ? n_entries : (which_mtx == COUNT_SPLIT ? n_entries_2 : n_entries_A);
@@ -295,7 +295,7 @@ void bustools_count(Bustools_opt &opt) {
       ecs.resize(0);
       uint32_t counts = 0;
       for (size_t k = i; k < j; k++) {
-        ecs.push_back(v[k].ec);
+        if (k == i || v[k].ec != v[k-1].ec) ecs.push_back(v[k].ec);
         counts += v[k].count;
       }
 
@@ -582,7 +582,7 @@ void bustools_count(Bustools_opt &opt) {
       auto which_mtx = intersect_ecs_with_subset_txs(column_vp[i].first, ecmap, tx_split_lookup);
       auto& of_ = which_mtx == COUNT_DEFAULT ? of : (which_mtx == COUNT_SPLIT ? of_2 : of_A);
       auto& n_entries_ = which_mtx == COUNT_DEFAULT ? n_entries : (which_mtx == COUNT_SPLIT ? n_entries_2 : n_entries_A);
-      of_ << n_rows << " " << (column_vp[i].first+1) << " " << val << "\n";
+      of_ << n_rows << " " << (column_vp[i].first+1) << " " << std::to_string(static_cast<size_t>(val)) << "\n";
       n_entries_++;
       i = j; // increment
     }
@@ -646,6 +646,14 @@ void bustools_count(Bustools_opt &opt) {
   delete[] p; p = nullptr;
 
   if (!opt.count_collapse) {
+    for (const auto& pair : txnames) { // Create (single-element) equivalence classes for transcripts without one
+	int32_t val = pair.second;
+	std::vector<int32_t> key;
+	key.push_back(val);
+	if (ecmapinv.find(key) == ecmapinv.end()) {
+	  ecmap.push_back(key);
+	}
+    }
     n_cols = ecmap.size();
   } else {
     n_cols = genenames.size();


=====================================
src/bustools_extract.cpp
=====================================
@@ -18,7 +18,7 @@ inline bool open_fastqs(
   
   for (int i = 0; i < opt.nFastqs; ++i) {
     gzclose(outFastq[i]);
-    outFastq[i] = gzopen(std::string(opt.output + "/" + std::to_string(iFastq + 1) + ".fastq.gz").c_str(), "w");
+    outFastq[i] = gzopen(std::string(opt.output + "/" + std::to_string(iFastq + 1) + ".fastq.gz").c_str(), "w1");
     gzclose(inFastq[i]);
     inFastq[i] = gzopen(opt.fastq[iFastq].c_str(), "r");
   
@@ -26,10 +26,7 @@ inline bool open_fastqs(
       kseq_destroy(seq[i]);
     }
     seq[i] = kseq_init(inFastq[i]);
-    if (kseq_read(seq[i]) < 0) {
-      return false;
-    }
-    
+        
     ++iFastq;
   }
   return true;
@@ -59,54 +56,25 @@ void bustools_extract(const Bustools_opt &opt) {
   std::vector<kseq_t *> seq(opt.nFastqs, nullptr);
   uint32_t iRead = 0;
   size_t iFastq = 0;
-  if (!open_fastqs(outFastq, inFastq, seq, opt, iFastq)) {
-    std::cerr << "Error reading FASTQ " << opt.fastq[iFastq] << std::endl;
-    goto end_extract;
-  }
+  uint32_t lastFlag = 0;
 
-  while (true) {
-    in.read((char *) p, N * sizeof(BUSData));
-    size_t rc = in.gcount() / sizeof(BUSData);
-    if (rc == 0) {
-      break;
-    }
-    nr += rc;
-    for (size_t i = 0; i < rc; ++i) {
-      while (iRead < p[i].flags) {
-        for (const auto &s : seq) {
-          int err_kseq_read = kseq_read(s);
-          if (err_kseq_read == -1) { // Reached EOF
-            if (iFastq == opt.fastq.size()) { // Done with all files
-              std::cerr << "Warning: number of reads in FASTQs was less than number of reads in BUS file" << std::endl;
-              goto end_extract;
-            } else {
-              if (!open_fastqs(outFastq, inFastq, seq, opt, iFastq)) {
-                std::cerr << "Error: cannot read FASTQ " << opt.fastq[iFastq] << std::endl;
-                goto end_extract;
-              }
-            }
-          } else if (err_kseq_read == -2) {
-            std::cerr << "Error: truncated FASTQ" << std::endl;
-            goto end_extract;
-          }
-        }
-        ++iRead;
-      }
-
-      if (iRead > p[i].flags) {
-        std::cerr << "BUS file not sorted by flag" << std::endl;
-        goto end_extract;
-      }
 
-      for (int i = 0; i < opt.nFastqs; ++i) {
+  auto write_seq_to_file = [&opt, &buf, &outFastq] (std::vector<kseq_t *> &seq) {
+    for (int i = 0; i < opt.nFastqs; ++i) {
         int bufLen = 1; // Already have @ character in buffer
         
         memcpy(buf + bufLen, seq[i]->name.s, seq[i]->name.l);
         bufLen += seq[i]->name.l;
-        
-        memcpy(buf + bufLen, seq[i]->comment.s, seq[i]->comment.l);
-        bufLen += seq[i]->comment.l;
-        
+
+        // Only add space and comment if the comment is not empty
+        if (seq[i]->comment.l > 0) {
+            // Add space between name and comment
+            buf[bufLen++] = ' ';
+
+            memcpy(buf + bufLen, seq[i]->comment.s, seq[i]->comment.l);
+            bufLen += seq[i]->comment.l;
+        }
+
         buf[bufLen++] = '\n';
 
         memcpy(buf + bufLen, seq[i]->seq.s, seq[i]->seq.l);
@@ -114,13 +82,6 @@ void bustools_extract(const Bustools_opt &opt) {
         
         buf[bufLen++] = '\n';
         buf[bufLen++] = '+';
-
-        memcpy(buf + bufLen, seq[i]->name.s, seq[i]->name.l);
-        bufLen += seq[i]->name.l;
-        
-        memcpy(buf + bufLen, seq[i]->comment.s, seq[i]->comment.l);
-        bufLen += seq[i]->comment.l;
-        
         buf[bufLen++] = '\n';
 
         memcpy(buf + bufLen, seq[i]->qual.s, seq[i]->qual.l);
@@ -128,14 +89,124 @@ void bustools_extract(const Bustools_opt &opt) {
         
         buf[bufLen++] = '\n';
 
-        if (gzwrite(outFastq[i], buf, bufLen) != bufLen) {
-          std::cerr << "Error writing to FASTQ" << std::endl;
+        if (gzwrite(outFastq[i], buf, bufLen) != bufLen) {          
+          return false;
+        }
+      }
+      return true;
+  };
+
+  bool tail = false;
+  bool finished = false;
+  size_t iFlag = 0;
+  size_t rc = 0;
+  
+  if (!open_fastqs(outFastq, inFastq, seq, opt, iFastq)) {
+    std::cerr << "Error reading FASTQ " << opt.fastq[iFastq] << std::endl;
+    goto end_extract;
+  }
+
+  // fill in the first N BUS records
+  in.read((char *) p, N * sizeof(BUSData));
+  rc = in.gcount() / sizeof(BUSData);  
+  nr += rc;
+  tail = rc==0;
+
+  while (true) {
+    // fill the next read
+
+    for (int si = 0; si < seq.size(); ++si) {
+      const auto &s = seq[si];
+      int err_kseq_read = kseq_read(s);
+      if (err_kseq_read == -1) { // Reached EOF
+        if (si != 0) {
+          std::cerr << "Error: truncated FASTQ" << std::endl;
           goto end_extract;
+        } else {
+          // let's make sure that all the files are also EOF
+          for (int sii = 1; sii < seq.size(); ++sii) {
+            int err_kseq_read2 = kseq_read(seq[sii]);
+            if (err_kseq_read2 != -1) {
+              std::cerr << "Error: truncated FASTQ" << std::endl;
+              goto end_extract;
+            }
+          }
+        }
+        // check if we are done with all files
+        if (iFastq == opt.fastq.size()) { // Done with all files
+          finished = true;
+          break;
+        } else {
+          if (!open_fastqs(outFastq, inFastq, seq, opt, iFastq)) {
+            std::cerr << "Error: cannot read FASTQ " << opt.fastq[iFastq] << std::endl;
+            goto end_extract;
+          }
+
+          // read the first read 
+          err_kseq_read = kseq_read(seq[si]);
+          if (err_kseq_read == -1) {
+            finished = true;
+            break;
+          }
         }
       }
+      
+      if (err_kseq_read == -2) {
+        std::cerr << "Error: truncated FASTQ" << std::endl;
+        goto end_extract;
+      }
+    } 
+
+    if (finished) {
+      break;
+    }
+
+    // inclusion: check if the current read matches the next unprocessed flag
+    if (opt.extract_include && iRead == p[iFlag].flags) {
+      if (!write_seq_to_file(seq)) {
+        std::cerr << "Error writing to FASTQ" << std::endl;
+        goto end_extract;
+      }
+    }
+  
+    // exclusion: make sure the current read does not match the next unprocessed flag, or that we are in tail mode
+    if (opt.extract_exclude && (iRead < p[iFlag].flags || tail)) {
+      if (!write_seq_to_file(seq)) {
+        std::cerr << "Error writing to FASTQ" << std::endl;
+        goto end_extract;
+      }
+    }
+
+    // if we have not exhausted the BUS records and the current read matches the current flag, advance to the next flag
+    if (!tail && iRead == p[iFlag].flags) {
+      // read the next flag from the next bus record
+      iFlag++;
+      
+      if (iFlag == rc)  {
+        // read the next batch of bus
+        in.read((char *) p, N * sizeof(BUSData));
+        rc = in.gcount() / sizeof(BUSData);
+        nr += rc;
+        iFlag = 0;
+        tail = rc==0;
+      } 
+    }
+    
+    ++iRead;
+
+    if (finished) {
+      if (iFlag < rc) {
+        std::cerr << "Warning: number of reads in FASTQs was less than number of reads in BUS file" << std::endl;
+        goto end_extract;
+      }
+      break;
     }
+
+    
   }
 
+  
+
   std::cerr << "Read in " << nr << " BUS records" << std::endl;
 
 end_extract:


=====================================
src/bustools_inspect.cpp
=====================================
@@ -57,7 +57,11 @@ void bustools_inspect(Bustools_opt &opt) {
     std::string inp;
     uint32_t flag; // Unused
     while (std::getline(wl, inp)) {
-      whitelist.insert(stringToBinary(inp, flag));
+      std::string str = inp;
+      str.erase(std::remove_if(str.begin(), str.end(), [](unsigned char c) {
+        return c == ' ' || c == '\t'; // Remove spaces and tabs (e.g. if we have split barcodes in our list)
+      }), str.end());
+      whitelist.insert(stringToBinary(str, flag));
     }
     wl.close();
   }


=====================================
src/bustools_main.cpp
=====================================
@@ -610,7 +610,8 @@ void parse_ProgramOptions_fromtext(int argc, char **argv, Bustools_opt& opt) {
 
 void parse_ProgramOptions_correct(int argc, char **argv, Bustools_opt &opt)
 {
-  
+ 
+  int nocorrect_flag = 0; 
   const char *opt_string = "o:w:d:spr";
   static struct option long_options[] = {
     {"output", required_argument, 0, 'o'},
@@ -620,6 +621,7 @@ void parse_ProgramOptions_correct(int argc, char **argv, Bustools_opt &opt)
     {"split", no_argument, 0, 's'},
     {"pipe", no_argument, 0, 'p'},
     {"replace", no_argument, 0, 'r'},
+    {"nocorrect", no_argument, &nocorrect_flag, 1},
     {0, 0, 0, 0}};
   
   int option_index = 0, c;
@@ -668,6 +670,9 @@ void parse_ProgramOptions_correct(int argc, char **argv, Bustools_opt &opt)
   {
     opt.stream_in = true;
   }
+  if (nocorrect_flag) {
+    opt.no_correct = true;
+  }
 }
 
 void parse_ProgramOptions_whitelist(int argc, char **argv, Bustools_opt &opt)
@@ -989,6 +994,8 @@ void parse_ProgramOptions_extract(int argc, char **argv, Bustools_opt &opt)
     {"fastq", required_argument, 0, 'f'},
     {"nFastqs", required_argument, 0, 'N'},
     {"pipe", no_argument, 0, 'p'},
+    {"exclude", no_argument, 0, 'x'},
+    {"include", no_argument, 0, 'i'},
     {0, 0, 0, 0}};
   
   int option_index = 0, c;
@@ -1012,6 +1019,13 @@ void parse_ProgramOptions_extract(int argc, char **argv, Bustools_opt &opt)
     case '?':
       opt.parse_error = true;
       break;
+    case 'x':
+      opt.extract_exclude = true;
+      opt.extract_include = false;
+      break;
+    case 'i':
+      opt.extract_include = true;
+      break;
     default:
       break;
     }
@@ -2551,6 +2565,12 @@ bool check_ProgramOptions_extract(Bustools_opt &opt)
       ret = false;
     }
   }
+
+  if (opt.extract_exclude && opt.extract_include)
+  {
+    std::cerr << "Error: cannot specify both --exclude and --include" << std::endl;
+    ret = false;
+  }
   
   return ret;
 }
@@ -2774,6 +2794,7 @@ void Bustools_correct_Usage()
             << "-p, --pipe            Write to standard output" << std::endl
             << "-d, --dump            Dump uncorrected to corrected barcodes (optional)" << std::endl
             << "-r, --replace         The file of on-list barcodes is a barcode replacement file" << std::endl
+            << "    --nocorrect       Skip barcode error correction and only keep perfect matches to on-list" << std::endl
             << std::endl;
 }
 
@@ -2904,6 +2925,8 @@ void Bustools_extract_Usage()
             << "-o, --output          Output directory for FASTQ files" << std::endl
             << "-f, --fastq           FASTQ file(s) from which to extract reads (comma-separated list)" << std::endl
             << "-N, --nFastqs         Number of FASTQ file(s) per run" << std::endl
+            << "-x, --exclude         Exclude reads in the BUS file from the specified FASTQ file(s)" << std::endl
+            << "-i, --include         Include reads in the BUS file from the specified FASTQ file(s)" << std::endl
             << std::endl;
 }
 



View it on GitLab: https://salsa.debian.org/med-team/bustools/-/commit/67e484313cd30a88bcd140497ae269ed10313dc9

-- 
View it on GitLab: https://salsa.debian.org/med-team/bustools/-/commit/67e484313cd30a88bcd140497ae269ed10313dc9
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20251016/7efe573d/attachment-0001.htm>


More information about the debian-med-commit mailing list