[med-svn] [Git][med-team/bustools][upstream] New upstream version 0.45.1+dfsg
Andreas Tille (@tille)
gitlab at salsa.debian.org
Thu Oct 16 10:34:51 BST 2025
Andreas Tille pushed to branch upstream at Debian Med / bustools
Commits:
67e48431 by Andreas Tille at 2025-10-16T11:20:40+02:00
New upstream version 0.45.1+dfsg
- - - - -
11 changed files:
- + .make_bustools_binaries_linux.sh
- + .make_bustools_binaries_mac.sh
- + .make_bustools_binaries_windows.sh
- src/BUSData.cpp
- src/CMakeLists.txt
- src/Common.hpp
- src/bustools_correct.cpp
- src/bustools_count.cpp
- src/bustools_extract.cpp
- src/bustools_inspect.cpp
- src/bustools_main.cpp
Changes:
=====================================
.make_bustools_binaries_linux.sh
=====================================
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+# Setup docker
+
+# sudo apt install docker.io # version 20.10.25-0ubuntu1~20.04.1
+# sudo groupadd docker
+# sudo usermod -aG docker ${USER}
+
+# Exit and log back in
+
+uid=$(id -u $USER|tr -d "\n" )
+gid=$(id -g $USER|tr -d "\n" )
+docker run --rm dockbuild/centos7:latest > ./dockbuild
+chmod +x dockbuild
+./dockbuild
+docker run --rm dockbuild/centos7-devtoolset7-gcc7:latest > dockbuild-centos7-devtoolset7-gcc7-latest
+docker run -ti -v ./:/work -e BUILDER_UID=$uid -e BUILDER_GID=$gid -e BUILDER_USER=$USER -e BUILDER_GROUP=$USER --platform linux dockbuild/centos7-devtoolset7-gcc7:latest \
+ bash -c "rm -rf bustools && git clone https://github.com/BUStools/bustools && cd bustools && mkdir build && cd build && cmake .. && make && mv src/bustools ../ && cd ../../"
+
+rm -rf bustools_linux-master.tar.gz
+tar --no-xattrs --exclude='._*' -czvf bustools_linux-master.tar.gz bustools/bustools bustools/README.md bustools/LICENSE
+
+
=====================================
.make_bustools_binaries_mac.sh
=====================================
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# Run this on a Mac
+
+git clone https://github.com/BUStools/bustools
+cd bustools && mkdir build && cd build
+cmake ..
+make
+mv src/bustools ../
+cd ../../
+tar --no-xattrs --exclude='._*' -czvf bustools_mac-master.tar.gz bustools/bustools bustools/README.md bustools/LICENSE
+
+
=====================================
.make_bustools_binaries_windows.sh
=====================================
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+# Setup docker
+
+# sudo apt install docker.io # version 20.10.25-0ubuntu1~20.04.1
+# sudo groupadd docker
+# sudo usermod -aG docker ${USER}
+
+# Exit and log back in
+
+
+docker run --rm dockcross/windows-static-x64:20221217-6afd127 > ./dockcross-windows-static-x64
+chmod +x ./dockcross-windows-static-x64
+./dockcross-windows-static-x64 bash -c "rm -rf zlib-1.3.1.tar.gz && wget http://www.zlib.net/zlib-1.3.1.tar.gz && tar -xvzf zlib-1.3.1.tar.gz && cd zlib-1.3.1 && ./configure --static && make && cd .. && rm -rf bustools && git clone https://github.com/BUStools/bustools && cd bustools && mkdir build && cd build && cmake .. -DZLIB_LIBRARY=/work/zlib-1.3.1/libz.a -DZLIB_INCLUDE_DIR=/work/zlib-1.3.1/ && make && mv src/bustools.exe ../ && cd ../../"
+
+rm -rf bustools_windows-master.zip
+zip bustools_windows-master.zip bustools/bustools.exe bustools/README.md bustools/LICENSE
+
=====================================
src/BUSData.cpp
=====================================
@@ -6,6 +6,7 @@
#include <sstream>
#include <iostream>
#include <limits>
+#include <unordered_set>
uint64_t stringToBinary(const std::string &s, uint32_t &flag) {
return stringToBinary(s.c_str(), s.size(), flag);
@@ -361,14 +362,17 @@ bool parseGenes(const std::string &filename, const u_map_<std::string, int32_t>
std::string line, t;
line.reserve(10000);
+ bool ret = true;
int i = 0;
+ std::unordered_set<std::string> txs_set; // Set of transcripts found in t2g file
while (std::getline(inf,line)) {
std::stringstream ss(line);
std::string txp, gene;
ss >> txp >> gene;
auto it = txnames.find(txp);
if (it != txnames.end()) {
+ txs_set.insert(txp);
auto i = it->second;
auto git = genenames.find(gene);
auto gi = -1;
@@ -379,10 +383,14 @@ bool parseGenes(const std::string &filename, const u_map_<std::string, int32_t>
gi = git->second;
}
genemap[i] = gi;
+ } else {
+ ret = false;
}
}
- return true;
+ if (txs_set.size() != txnames.size()) ret = false; // number of transcripts in t2g file and transcripts file don't match up
+
+ return ret;
}
bool parseGenesList(const std::string& filename, std::vector<std::string>& geneNames) {
=====================================
src/CMakeLists.txt
=====================================
@@ -28,7 +28,13 @@ endif(LINK MATCHES static)
target_link_libraries(bustools ${ZLIB_LIBRARIES})
if ( ZLIB_FOUND )
- include_directories( ${ZLIB_INCLUDE_DIRS} )
+ if (DEFINED ZLIB_INCLUDE_DIRS)
+ include_directories( ${ZLIB_INCLUDE_DIRS} )
+ elseif (DEFINED ZLIB_INCLUDE_DIR)
+ include_directories( ${ZLIB_INCLUDE_DIR} )
+ else()
+ message(FATAL_ERROR "zlib found but no include directories are set.")
+ endif()
else()
message(FATAL_ERROR "zlib not found. Required for reading FASTQ files" )
endif( ZLIB_FOUND )
=====================================
src/Common.hpp
=====================================
@@ -12,7 +12,7 @@
#include "roaring.hh"
#include "hash.hpp"
-#define BUSTOOLS_VERSION "0.43.2"
+#define BUSTOOLS_VERSION "0.45.1"
#define u_map_ std::unordered_map
enum CAPTURE_TYPE : char
@@ -88,6 +88,7 @@ struct Bustools_opt
bool split_correct = false;
bool barcode_replacement = false;
bool parse_error = false;
+ bool no_correct = false;
/* predict */
std::string predict_input; //specified the same way as the output for count - count and histogram filenames will be created from this
@@ -113,6 +114,10 @@ struct Bustools_opt
bool text_dumppad = false;
bool text_showall = false;
+ /* extract */
+ bool extract_exclude = false;
+ bool extract_include = true;
+
/* linker */
int start, end;
=====================================
src/bustools_correct.cpp
=====================================
@@ -866,6 +866,10 @@ void bustools_correct(Bustools_opt &opt) {
stat_white_++;
correction |= b_;
} else {
+ if (opt.no_correct) { // Option to skip correction and keep only barcodes in the list
+ stat_uncorr_++;
+ break;
+ }
auto lower_mask = lower_upper_mask_vec[j].first;
auto upper_mask = lower_upper_mask_vec[j].second;
auto bc2 = bc2_vec[j];
=====================================
src/bustools_count.cpp
=====================================
@@ -31,7 +31,7 @@ void bustools_count(Bustools_opt &opt) {
std::vector<int32_t> genemap(txnames.size(), -1);
u_map_<std::string, int32_t> genenames;
if (!parseGenes(opt.count_genes, txnames, genemap, genenames)) {
- std::cerr << "Warning: Some transcripts exist in the transcripts file but not in the transcript-to-gene mapping file; this will likely cause errors." << std::endl;
+ std::cerr << "Warning: Some transcripts exist in the transcripts file but not in the transcript-to-gene mapping file (or vice versa); this may cause errors." << std::endl;
}
parseECs(opt.count_ecs, h);
@@ -170,7 +170,7 @@ void bustools_count(Bustools_opt &opt) {
// v[i..j-1] share the same UMI
ecs.resize(0);
for (size_t k = i; k < j; k++) {
- ecs.push_back(v[k].ec);
+ if (k == i || v[k].ec != v[k-1].ec) ecs.push_back(v[k].ec);
}
if (opt.umi_gene_collapse) {
@@ -257,7 +257,7 @@ void bustools_count(Bustools_opt &opt) {
break;
}
}
- double val = j-i;
+ size_t val = j-i;
auto which_mtx = intersect_ecs_with_subset_txs(column_v[i], ecmap, tx_split_lookup);
auto& of_ = which_mtx == COUNT_DEFAULT ? of : (which_mtx == COUNT_SPLIT ? of_2 : of_A);
auto& n_entries_ = which_mtx == COUNT_DEFAULT ? n_entries : (which_mtx == COUNT_SPLIT ? n_entries_2 : n_entries_A);
@@ -295,7 +295,7 @@ void bustools_count(Bustools_opt &opt) {
ecs.resize(0);
uint32_t counts = 0;
for (size_t k = i; k < j; k++) {
- ecs.push_back(v[k].ec);
+ if (k == i || v[k].ec != v[k-1].ec) ecs.push_back(v[k].ec);
counts += v[k].count;
}
@@ -582,7 +582,7 @@ void bustools_count(Bustools_opt &opt) {
auto which_mtx = intersect_ecs_with_subset_txs(column_vp[i].first, ecmap, tx_split_lookup);
auto& of_ = which_mtx == COUNT_DEFAULT ? of : (which_mtx == COUNT_SPLIT ? of_2 : of_A);
auto& n_entries_ = which_mtx == COUNT_DEFAULT ? n_entries : (which_mtx == COUNT_SPLIT ? n_entries_2 : n_entries_A);
- of_ << n_rows << " " << (column_vp[i].first+1) << " " << val << "\n";
+ of_ << n_rows << " " << (column_vp[i].first+1) << " " << std::to_string(static_cast<size_t>(val)) << "\n";
n_entries_++;
i = j; // increment
}
@@ -646,6 +646,14 @@ void bustools_count(Bustools_opt &opt) {
delete[] p; p = nullptr;
if (!opt.count_collapse) {
+ for (const auto& pair : txnames) { // Create (single-element) equivalence classes for transcripts without one
+ int32_t val = pair.second;
+ std::vector<int32_t> key;
+ key.push_back(val);
+ if (ecmapinv.find(key) == ecmapinv.end()) {
+ ecmap.push_back(key);
+ }
+ }
n_cols = ecmap.size();
} else {
n_cols = genenames.size();
=====================================
src/bustools_extract.cpp
=====================================
@@ -18,7 +18,7 @@ inline bool open_fastqs(
for (int i = 0; i < opt.nFastqs; ++i) {
gzclose(outFastq[i]);
- outFastq[i] = gzopen(std::string(opt.output + "/" + std::to_string(iFastq + 1) + ".fastq.gz").c_str(), "w");
+ outFastq[i] = gzopen(std::string(opt.output + "/" + std::to_string(iFastq + 1) + ".fastq.gz").c_str(), "w1");
gzclose(inFastq[i]);
inFastq[i] = gzopen(opt.fastq[iFastq].c_str(), "r");
@@ -26,10 +26,7 @@ inline bool open_fastqs(
kseq_destroy(seq[i]);
}
seq[i] = kseq_init(inFastq[i]);
- if (kseq_read(seq[i]) < 0) {
- return false;
- }
-
+
++iFastq;
}
return true;
@@ -59,54 +56,25 @@ void bustools_extract(const Bustools_opt &opt) {
std::vector<kseq_t *> seq(opt.nFastqs, nullptr);
uint32_t iRead = 0;
size_t iFastq = 0;
- if (!open_fastqs(outFastq, inFastq, seq, opt, iFastq)) {
- std::cerr << "Error reading FASTQ " << opt.fastq[iFastq] << std::endl;
- goto end_extract;
- }
+ uint32_t lastFlag = 0;
- while (true) {
- in.read((char *) p, N * sizeof(BUSData));
- size_t rc = in.gcount() / sizeof(BUSData);
- if (rc == 0) {
- break;
- }
- nr += rc;
- for (size_t i = 0; i < rc; ++i) {
- while (iRead < p[i].flags) {
- for (const auto &s : seq) {
- int err_kseq_read = kseq_read(s);
- if (err_kseq_read == -1) { // Reached EOF
- if (iFastq == opt.fastq.size()) { // Done with all files
- std::cerr << "Warning: number of reads in FASTQs was less than number of reads in BUS file" << std::endl;
- goto end_extract;
- } else {
- if (!open_fastqs(outFastq, inFastq, seq, opt, iFastq)) {
- std::cerr << "Error: cannot read FASTQ " << opt.fastq[iFastq] << std::endl;
- goto end_extract;
- }
- }
- } else if (err_kseq_read == -2) {
- std::cerr << "Error: truncated FASTQ" << std::endl;
- goto end_extract;
- }
- }
- ++iRead;
- }
-
- if (iRead > p[i].flags) {
- std::cerr << "BUS file not sorted by flag" << std::endl;
- goto end_extract;
- }
- for (int i = 0; i < opt.nFastqs; ++i) {
+ auto write_seq_to_file = [&opt, &buf, &outFastq] (std::vector<kseq_t *> &seq) {
+ for (int i = 0; i < opt.nFastqs; ++i) {
int bufLen = 1; // Already have @ character in buffer
memcpy(buf + bufLen, seq[i]->name.s, seq[i]->name.l);
bufLen += seq[i]->name.l;
-
- memcpy(buf + bufLen, seq[i]->comment.s, seq[i]->comment.l);
- bufLen += seq[i]->comment.l;
-
+
+ // Only add space and comment if the comment is not empty
+ if (seq[i]->comment.l > 0) {
+ // Add space between name and comment
+ buf[bufLen++] = ' ';
+
+ memcpy(buf + bufLen, seq[i]->comment.s, seq[i]->comment.l);
+ bufLen += seq[i]->comment.l;
+ }
+
buf[bufLen++] = '\n';
memcpy(buf + bufLen, seq[i]->seq.s, seq[i]->seq.l);
@@ -114,13 +82,6 @@ void bustools_extract(const Bustools_opt &opt) {
buf[bufLen++] = '\n';
buf[bufLen++] = '+';
-
- memcpy(buf + bufLen, seq[i]->name.s, seq[i]->name.l);
- bufLen += seq[i]->name.l;
-
- memcpy(buf + bufLen, seq[i]->comment.s, seq[i]->comment.l);
- bufLen += seq[i]->comment.l;
-
buf[bufLen++] = '\n';
memcpy(buf + bufLen, seq[i]->qual.s, seq[i]->qual.l);
@@ -128,14 +89,124 @@ void bustools_extract(const Bustools_opt &opt) {
buf[bufLen++] = '\n';
- if (gzwrite(outFastq[i], buf, bufLen) != bufLen) {
- std::cerr << "Error writing to FASTQ" << std::endl;
+ if (gzwrite(outFastq[i], buf, bufLen) != bufLen) {
+ return false;
+ }
+ }
+ return true;
+ };
+
+ bool tail = false;
+ bool finished = false;
+ size_t iFlag = 0;
+ size_t rc = 0;
+
+ if (!open_fastqs(outFastq, inFastq, seq, opt, iFastq)) {
+ std::cerr << "Error reading FASTQ " << opt.fastq[iFastq] << std::endl;
+ goto end_extract;
+ }
+
+ // fill in the first N BUS records
+ in.read((char *) p, N * sizeof(BUSData));
+ rc = in.gcount() / sizeof(BUSData);
+ nr += rc;
+ tail = rc==0;
+
+ while (true) {
+ // fill the next read
+
+ for (int si = 0; si < seq.size(); ++si) {
+ const auto &s = seq[si];
+ int err_kseq_read = kseq_read(s);
+ if (err_kseq_read == -1) { // Reached EOF
+ if (si != 0) {
+ std::cerr << "Error: truncated FASTQ" << std::endl;
goto end_extract;
+ } else {
+ // let's make sure that all the files are also EOF
+ for (int sii = 1; sii < seq.size(); ++sii) {
+ int err_kseq_read2 = kseq_read(seq[sii]);
+ if (err_kseq_read2 != -1) {
+ std::cerr << "Error: truncated FASTQ" << std::endl;
+ goto end_extract;
+ }
+ }
+ }
+ // check if we are done with all files
+ if (iFastq == opt.fastq.size()) { // Done with all files
+ finished = true;
+ break;
+ } else {
+ if (!open_fastqs(outFastq, inFastq, seq, opt, iFastq)) {
+ std::cerr << "Error: cannot read FASTQ " << opt.fastq[iFastq] << std::endl;
+ goto end_extract;
+ }
+
+ // read the first read
+ err_kseq_read = kseq_read(seq[si]);
+ if (err_kseq_read == -1) {
+ finished = true;
+ break;
+ }
}
}
+
+ if (err_kseq_read == -2) {
+ std::cerr << "Error: truncated FASTQ" << std::endl;
+ goto end_extract;
+ }
+ }
+
+ if (finished) {
+ break;
+ }
+
+ // inclusion, check if the current read matches the next unprocessed flag
+ if (opt.extract_include && iRead == p[iFlag].flags) {
+ if (!write_seq_to_file(seq)) {
+ std::cerr << "Error writing to FASTQ" << std::endl;
+ goto end_extract;
+ }
+ }
+
+ // exclusion, make sure the current read does not match the next unprocessed flag or that we are in tail mode (no BUS records left)
+ if (opt.extract_exclude && (iRead < p[iFlag].flags || tail)) {
+ if (!write_seq_to_file(seq)) {
+ std::cerr << "Error writing to FASTQ" << std::endl;
+ goto end_extract;
+ }
+ }
+
+ // if we have not exhausted the BUS records and the current read matches the next flag, advance to the next BUS record
+ if (!tail && iRead == p[iFlag].flags) {
+ // read the next flag from the next bus record
+ iFlag++;
+
+ if (iFlag == rc) {
+ // read the next batch of bus
+ in.read((char *) p, N * sizeof(BUSData));
+ rc = in.gcount() / sizeof(BUSData);
+ nr += rc;
+ iFlag = 0;
+ tail = rc==0;
+ }
+ }
+
+ ++iRead;
+
+ if (finished) {
+ if (iFlag < rc) {
+ std::cerr << "Warning: number of reads in FASTQs was less than number of reads in BUS file" << std::endl;
+ goto end_extract;
+ }
+ break;
}
+
+
}
+
+
std::cerr << "Read in " << nr << " BUS records" << std::endl;
end_extract:
=====================================
src/bustools_inspect.cpp
=====================================
@@ -57,7 +57,11 @@ void bustools_inspect(Bustools_opt &opt) {
std::string inp;
uint32_t flag; // Unused
while (std::getline(wl, inp)) {
- whitelist.insert(stringToBinary(inp, flag));
+ std::string str = inp;
+ str.erase(std::remove_if(str.begin(), str.end(), [](unsigned char c) {
+ return c == ' ' || c == '\t'; // Remove spaces and tabs (e.g. if we have split barcodes in our list)
+ }), str.end());
+ whitelist.insert(stringToBinary(str, flag));
}
wl.close();
}
=====================================
src/bustools_main.cpp
=====================================
@@ -610,7 +610,8 @@ void parse_ProgramOptions_fromtext(int argc, char **argv, Bustools_opt& opt) {
void parse_ProgramOptions_correct(int argc, char **argv, Bustools_opt &opt)
{
-
+
+ int nocorrect_flag = 0;
const char *opt_string = "o:w:d:spr";
static struct option long_options[] = {
{"output", required_argument, 0, 'o'},
@@ -620,6 +621,7 @@ void parse_ProgramOptions_correct(int argc, char **argv, Bustools_opt &opt)
{"split", no_argument, 0, 's'},
{"pipe", no_argument, 0, 'p'},
{"replace", no_argument, 0, 'r'},
+ {"nocorrect", no_argument, &nocorrect_flag, 1},
{0, 0, 0, 0}};
int option_index = 0, c;
@@ -668,6 +670,9 @@ void parse_ProgramOptions_correct(int argc, char **argv, Bustools_opt &opt)
{
opt.stream_in = true;
}
+ if (nocorrect_flag) {
+ opt.no_correct = true;
+ }
}
void parse_ProgramOptions_whitelist(int argc, char **argv, Bustools_opt &opt)
@@ -989,6 +994,8 @@ void parse_ProgramOptions_extract(int argc, char **argv, Bustools_opt &opt)
{"fastq", required_argument, 0, 'f'},
{"nFastqs", required_argument, 0, 'N'},
{"pipe", no_argument, 0, 'p'},
+ {"exclude", no_argument, 0, 'x'},
+ {"include", no_argument, 0, 'i'},
{0, 0, 0, 0}};
int option_index = 0, c;
@@ -1012,6 +1019,13 @@ void parse_ProgramOptions_extract(int argc, char **argv, Bustools_opt &opt)
case '?':
opt.parse_error = true;
break;
+ case 'x':
+ opt.extract_exclude = true;
+ opt.extract_include = false;
+ break;
+ case 'i':
+ opt.extract_include = true;
+ break;
default:
break;
}
@@ -2551,6 +2565,12 @@ bool check_ProgramOptions_extract(Bustools_opt &opt)
ret = false;
}
}
+
+ if (opt.extract_exclude && opt.extract_include)
+ {
+ std::cerr << "Error: cannot specify both --exclude and --include" << std::endl;
+ ret = false;
+ }
return ret;
}
@@ -2774,6 +2794,7 @@ void Bustools_correct_Usage()
<< "-p, --pipe Write to standard output" << std::endl
<< "-d, --dump Dump uncorrected to corrected barcodes (optional)" << std::endl
<< "-r, --replace The file of on-list barcodes is a barcode replacement file" << std::endl
+ << " --nocorrect Skip barcode error correction and only keep perfect matches to on-list" << std::endl
<< std::endl;
}
@@ -2904,6 +2925,8 @@ void Bustools_extract_Usage()
<< "-o, --output Output directory for FASTQ files" << std::endl
<< "-f, --fastq FASTQ file(s) from which to extract reads (comma-separated list)" << std::endl
<< "-N, --nFastqs Number of FASTQ file(s) per run" << std::endl
+ << "-x, --exclude Exclude reads in the BUS file from the specified FASTQ file(s)" << std::endl
+ << "-i, --include Include reads in the BUS file from the specified FASTQ file(s)" << std::endl
<< std::endl;
}
View it on GitLab: https://salsa.debian.org/med-team/bustools/-/commit/67e484313cd30a88bcd140497ae269ed10313dc9
--
View it on GitLab: https://salsa.debian.org/med-team/bustools/-/commit/67e484313cd30a88bcd140497ae269ed10313dc9
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20251016/7efe573d/attachment-0001.htm>
More information about the debian-med-commit mailing list