[med-svn] [Git][med-team/bowtie2][upstream] New upstream version 2.3.4.3
Alexandre Mestiashvili
gitlab at salsa.debian.org
Tue Sep 25 09:59:49 BST 2018
Alexandre Mestiashvili pushed to branch upstream at Debian Med / bowtie2
Commits:
be7df110 by Alexandre Mestiashvili at 2018-09-25T07:56:20Z
New upstream version 2.3.4.3
- - - - -
15 changed files:
- CMakeLists.txt
- NEWS
- VERSION
- bowtie2-build
- bowtie2-inspect
- bt2_build.cpp
- bt2_inspect.cpp
- bt2_search.cpp
- doc/website/manual.ssi
- doc/website/recent_news.ssi
- doc/website/rhsidebar.ssi
- + example/reads/conversion_utilities.sh
- formats.h
- pat.cpp
- pat.h
Changes:
=====================================
CMakeLists.txt
=====================================
@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 2.8)
cmake_policy(SET CMP0048 NEW)
cmake_policy(SET CMP0005 NEW)
-project(bowtie2 LANGUAGES CXX VERSION "2.3.4.2")
+project(bowtie2 LANGUAGES CXX VERSION "2.3.4.3")
enable_testing()
=====================================
NEWS
=====================================
@@ -19,6 +19,14 @@ Please report any issues to the Bowtie 2 Github page or using the Sourceforge bu
Version Release History
=======================
+Version 2.3.4.3 - Sep 17, 2018
+
+ * Fixed an issue causing `bowtie2-build` and `bowtie2-inspect`
+ to output incomplete help text.
+ * Fixed an issue causing `bowtie2-inspect` to crash.
+ * Fixed an issue preventing `bowtie2` from processing paired and/or
+ unpaired FASTQ reads together with interleaved FASTQ reads.
+
Version 2.3.4.2 - Aug 7, 2018
* Fixed issue causing `bowtie2` to fail in `--fast-local` mode.
=====================================
VERSION
=====================================
@@ -1 +1 @@
-2.3.4.2
+2.3.4.3
=====================================
bowtie2-build
=====================================
@@ -29,15 +29,15 @@ import subprocess
from collections import deque
def main():
- parser = argparse.ArgumentParser()
+ parser = argparse.ArgumentParser(add_help = False)
+
+ parser.add_argument('--large-index', action='store_true')
+ parser.add_argument('--verbose', action='store_true')
group = parser.add_mutually_exclusive_group()
group.add_argument('--debug', action='store_true')
group.add_argument('--sanitized', action='store_true')
- parser.add_argument('--verbose', action='store_true')
- parser.add_argument('--large-index', action='store_true')
-
logging.basicConfig(level=logging.ERROR,
format='%(levelname)s: %(message)s'
)
@@ -51,8 +51,12 @@ def main():
build_bin_spec = os.path.join(ex_path,build_bin_s)
script_options, argv = parser.parse_known_args()
+ print_help = False
argv = deque(argv)
+ if '-h' in argv or '--help' in argv:
+ print_help = True
+
if script_options.verbose:
logging.getLogger().setLevel(logging.INFO)
=====================================
bowtie2-inspect
=====================================
@@ -22,6 +22,7 @@
import os
import imp
+import sys
import inspect
import logging
import argparse
@@ -29,7 +30,7 @@ import subprocess
from collections import deque
def main():
- parser = argparse.ArgumentParser()
+ parser = argparse.ArgumentParser(add_help = False)
group = parser.add_mutually_exclusive_group()
group.add_argument('--debug', action='store_true')
=====================================
bt2_build.cpp
=====================================
@@ -142,7 +142,10 @@ static void printUsage(ostream& out) {
<< " <reference_in>)" << endl;
if(wrapper == "basic-0") {
out << " --large-index force generated index to be 'large', even if ref" << endl
- << " has fewer than 4 billion nucleotides" << endl;
+ << " has fewer than 4 billion nucleotides" << endl
+ << " --debug use the debug binary; slower, assertions enabled" << endl
+ << " --sanitized use sanitized binary; slower, uses ASan and/or UBSan" << endl
+ << " --verbose log the issued command" << endl;
}
out << " -a/--noauto disable automatic -p/--bmax/--dcv memory-fitting" << endl
<< " -p/--packed use packed strings internally; slower, less memory" << endl
=====================================
bt2_inspect.cpp
=====================================
@@ -76,14 +76,16 @@ static void printUsage(ostream& out) {
<< "Options:" << endl;
if(wrapper == "basic-0") {
out << " --large-index force inspection of the 'large' index, even if a" << endl
- << " 'small' one is present." << endl;
+ << " 'small' one is present." << endl
+ << " --debug use the debug binary; slower, assertions enabled" << endl
+ << " --sanitized use sanitized binary; slower, uses ASan and/or UBSan" << endl
+ << " --verbose log the issued command" << endl;
}
out << " -a/--across <int> Number of characters across in FASTA output (default: 60)" << endl
<< " -n/--names Print reference sequence names only" << endl
<< " -s/--summary Print summary incl. ref names, lengths, index properties" << endl
<< " -v/--verbose Verbose output (for debugging)" << endl
<< " -h/--help print detailed description of tool and its options" << endl
- << " --help print this usage message" << endl
;
if(wrapper.empty()) {
cerr << endl
=====================================
bt2_search.cpp
=====================================
@@ -84,6 +84,7 @@ static bool startVerbose; // be talkative at startup
int gQuiet; // print nothing but the alignments
static int sanityCheck; // enable expensive sanity checks
static int format; // default read format is FASTQ
+static bool interleaved; // reads are interleaved
static string origString; // reference text, or filename(s)
static int seed; // srandom() seed
static int timing; // whether to report basic timing data
@@ -278,6 +279,7 @@ static void resetOptions() {
gQuiet = false;
sanityCheck = 0; // enable expensive sanity checks
format = FASTQ; // default read format is FASTQ
+ interleaved = false; // reads are not interleaved by default
origString = ""; // reference text, or filename(s)
seed = 0; // srandom() seed
timing = 0; // whether to report basic timing data
@@ -999,7 +1001,12 @@ static void parseOption(int next_option, const char *arg) {
case ARG_ONETWO: tokenize(arg, ",", mates12); format = TAB_MATE5; break;
case ARG_TAB5: tokenize(arg, ",", mates12); format = TAB_MATE5; break;
case ARG_TAB6: tokenize(arg, ",", mates12); format = TAB_MATE6; break;
- case ARG_INTERLEAVED_FASTQ: tokenize(arg, ",", mates12); format = INTERLEAVED; break;
+ case ARG_INTERLEAVED_FASTQ: {
+ tokenize(arg, ",", mates12);
+ format = FASTQ;
+ interleaved = true;
+ break;
+ }
case 'b': {
format = BAM;
saw_bam = true;
@@ -1735,7 +1742,7 @@ createPatsrcFactory(
int tid)
{
PatternSourcePerThreadFactory *patsrcFact;
- patsrcFact = new PatternSourcePerThreadFactory(patcomp, pp);
+ patsrcFact = new PatternSourcePerThreadFactory(patcomp, pp, tid);
assert(patsrcFact != NULL);
return patsrcFact;
}
@@ -4782,6 +4789,7 @@ static void driver(
}
PatternParams pp(
format, // file format
+ interleaved, // some or all of the reads are interleaved
fileParallel, // true -> wrap files with separate PairedPatternSources
seed, // pseudo-random seed
readsPerBatch, // # reads in a light parsing batch
=====================================
doc/website/manual.ssi
=====================================
@@ -152,7 +152,7 @@ Fedora, CentOS
<pre><code>yum check-update</code></pre>
</td>
<td>
-<p>yum search tbb</p>
+<pre><code>yum search tbb</code></pre>
</td>
<td>
<pre><code>yum install tbb-devel.x86_64</code></pre>
=====================================
doc/website/recent_news.ssi
=====================================
@@ -1,17 +1,22 @@
-<h2>Version 2.3.4.2 - August 07, 1987</h2>
+<h2>Version 2.3.4.3 - September 17, 2018</h2>
<ul>
- <li>Fixed issue causing <code>bowtie2</code> to fail in <code><a href="bowtie2-options-fast-local">--fast-local</a></code> mode.</li>
- <li>Fixed issue causing <code><a href="bowtie2-options-soft-clipped-unmapped-tlen>--soft-clipped-unmapped-tlen</a></code> to be a positional argument.</li>
- <li>New option <code><a href="bowtie2-options-trim-to">--trim-to</a> N</code> causes <code>bowtie2</code> to trim reads longer than <code>N</code> bases to exactly <code>N</code> bases. Can trim from either 3' or 5' end, e.g. <code><a href="bowtie2-options-trim-to">--trim-to</a> 5:30</code> trims reads to 30 bases, truncating at the 5' end.</li>
- <li>Updated <a href="#building-from-source">"Building from source"</a> manual section with additional instructions on installing TBB.</li>
+ <li>Fixed an issue causing <code>bowtie2-build</code> and <code>bowtie2-inspect</code> to output incomplete help text.</li>
+ <li>Fixed an issue causing <code>bowtie2-align</code> to crash.</li>
+ <li>Fixed an issue preventing <code>bowtie2</code> from processing paired and/or unpaired FASTQ reads together with interleaved FASTQ reads.</li>
+</ul>
+
+<h2>Version 2.3.4.2 - August 07, 2018</h2>
+<ul>
+ <li>Fixed issue causing <code>bowtie2</code> to fail in <code><a href="manual.shtml#bowtie2-options-fast-local">--fast-local</a></code> mode.</li>
+ <li>Fixed issue causing <code><a href="manual.shtml#bowtie2-options-soft-clipped-unmapped-tlen">--soft-clipped-unmapped-tlen</a></code> to be a positional argument.</li>
+ <li>New option <code><a href="manual.shtml#bowtie2-options-trim-to">--trim-to</a> N</code> causes <code>bowtie2</code> to trim reads longer than <code>N</code> bases to exactly <code>N</code> bases. Can trim from either 3' or 5' end, e.g. <code><a href="manual.shtml#bowtie2-options-trim-to">--trim-to</a> 5:30</code> trims reads to 30 bases, truncating at the 5' end.</li>
+ <li>Updated <a href="manual.shtml#building-from-source">"Building from source"</a> manual section with additional instructions on installing TBB.</li>
<li>Several other updates to manual, including new mentions of <a href="http://bioconda.github.io">Bioconda</a> and <a href="https://biocontainers.pro">Biocontainers</a>.</li>
<li>Fixed an issue preventing <code>bowtie2</code> from processing more than one pattern source when running single threaded.</li>
<li>Fixed an issue causing <code>bowtie2</code> and <code>bowtie2-inspect</code> to crash if the index contains a gap-only segment.</li>
<li>Added <i>experimental</i> BAM input mode <code>-b</code>. Works only with unpaired input reads and BAM files that are sorted by read name (<code>samtools sort -n</code>). BAM input mode also supports the following options:<li>
- <ul>
- <li><code>--preserve-sam-tags</code>: Preserve any optional fields present in BAM record</li>
- <li><code>--align-paired-reads</code>: Paired-end mode for BAM files</li>
- </ul>
+     <code>--preserve-sam-tags</code>: Preserve any optional fields present in BAM record</li>
+     <code>--align-paired-reads</code>: Paired-end mode for BAM files</li>
<li>Add <i>experimental</i> CMake support</li>
</ul>
=====================================
doc/website/rhsidebar.ssi
=====================================
@@ -18,10 +18,10 @@
</tr>
<tr>
<td>
- <a href="https://sourceforge.net/projects/bowtie-bio/files/bowtie2/2.3.4.2">Bowtie2 2.3.4.2</a>
+ <a href="https://sourceforge.net/projects/bowtie-bio/files/bowtie2/2.3.4.3">Bowtie2 2.3.4.3</a>
</td>
<td align="right">
- 08/07/18
+ 09/17/18
</td>
</tr>
<tr>
@@ -164,6 +164,7 @@
<h2>Related publications</h2>
<div class="box">
<ul>
+ <li style="font-size: x-small; line-height: 130%">Langmead B, Wilks C., Antonescu V., Charles R. <a href="https://doi.org/10.1093/bioinformatics/bty648"><b>Scaling read aligners to hundreds of threads on general-purpose processors</b></a>. <i><a href="https://academic.oup.com/bioinformatics">Bioinformatics</a></i>. bty648.<br/><br/></li>
<li style="font-size: x-small; line-height: 130%">Langmead B, Salzberg S. <a href="http://www.nature.com/nmeth/journal/v9/n4/full/nmeth.1923.html"><b>Fast gapped-read alignment with Bowtie 2</b></a>. <i><a href="http://www.nature.com/nmeth">Nature Methods</a></i>. 2012, 9:357-359.<br/><br/></li>
<li style="font-size: x-small; line-height: 130%">Langmead B, Trapnell C, Pop M, Salzberg SL. <a href="http://genomebiology.com/2009/10/3/R25"><b>Ultrafast and memory-efficient alignment of short DNA sequences to the human genome</b></a>. <i><a href="http://genomebiology.com">Genome Biology</a></i> <b>10</b>:R25.<br/><br/></li>
</ul>
=====================================
example/reads/conversion_utilities.sh
=====================================
@@ -0,0 +1,7 @@
+# aliases for converting sample read files
+# `source conversion_utilities.sh` to add them to your environment
+
+alias fastq_to_fasta="sed 'N;x;N;N;x;s/@/>/"
+alias paired_to_tab5="paste <(sed 'N;x;N;g;N;s/\n/ /g' reads_1.fq) <(sed -n 'n;h;n;g;N;s/\n/ /g;p' reads_2.fq) > reads_12.tab5"
+alias paired_to_tab6="paste <(sed 'N;x;N;g;N;s/\n/ /g' reads_1.fq) <(sed 'N;x;N;g;N;s/\n/ /g' reads_2.fq) > reads_12.tab6"
+alias paired_to_interleaved="paste -d'\n' <(sed 'N;N;N;s/\n/ /g' reads_1.fq) <(sed 'N;N;N;s/\n/ /g' reads_2.fq) | tr '\t' '\n' > reads_12.fq"
=====================================
formats.h
=====================================
@@ -30,7 +30,6 @@ enum file_format {
FASTA = 1,
FASTA_CONT,
FASTQ,
- INTERLEAVED,
BAM,
TAB_MATE5,
TAB_MATE6,
@@ -44,7 +43,6 @@ static const std::string file_format_names[] = {
"FASTA",
"FASTA sampling",
"FASTQ",
- "Interleaved FASTQ",
"BAM",
"Tabbed mated",
"Raw",
=====================================
pat.cpp
=====================================
@@ -100,9 +100,8 @@ PatternSource* PatternSource::patsrcFromStrings(
case FASTA: return new FastaPatternSource(qs, p);
case FASTA_CONT: return new FastaContinuousPatternSource(qs, p);
case RAW: return new RawPatternSource(qs, p);
- case FASTQ: return new FastqPatternSource(qs, p);
+ case FASTQ: return new FastqPatternSource(qs, p, p.interleaved);
case BAM: return new BAMPatternSource(qs, p);
- case INTERLEAVED: return new FastqPatternSource(qs, p, true /* interleaved */);
case TAB_MATE5: return new TabbedPatternSource(qs, p, false);
case TAB_MATE6: return new TabbedPatternSource(qs, p, true);
case CMDLINE: return new VectorPatternSource(qs, p);
@@ -232,14 +231,13 @@ pair<bool, int> DualPatternComposer::nextBatch(PerThreadReadBuf& pt) {
pt,
true, // batch A (or pairs)
true); // grab lock below
- bool done = res.first;
- if(!done && res.second == 0) {
+ if(res.second == 0 && cur < srca_->size() - 1) {
ThreadSafe ts(mutex_m);
if(cur + 1 > cur_) cur_++;
cur = cur_; // Move on to next PatternSource
continue; // on to next pair of PatternSources
}
- return make_pair(done, res.second);
+ return make_pair(res.first && cur == srca_->size() - 1, res.second);
} else {
pair<bool, int> resa, resb;
// Lock to ensure that this thread gets parallel reads
@@ -295,12 +293,11 @@ PatternComposer* PatternComposer::setupPatternComposer(
const EList<string>& q, // qualities associated with singles
const EList<string>& q1, // qualities associated with m1
const EList<string>& q2, // qualities associated with m2
- const PatternParams& p, // read-in parameters
+ PatternParams& p, // read-in parameters
bool verbose) // be talkative?
{
EList<PatternSource*>* a = new EList<PatternSource*>();
EList<PatternSource*>* b = new EList<PatternSource*>();
- EList<PatternSource*>* ab = new EList<PatternSource*>();
// Create list of pattern sources for paired reads appearing
// interleaved in a single file
for(size_t i = 0; i < m12.size(); i++) {
@@ -312,7 +309,9 @@ PatternComposer* PatternComposer::setupPatternComposer(
tmp.push_back(m12[i]);
assert_eq(1, tmp.size());
}
- ab->push_back(PatternSource::patsrcFromStrings(p, *qs));
+ a->push_back(PatternSource::patsrcFromStrings(p, *qs));
+ b->push_back(NULL);
+ p.interleaved = false;
if(!p.fileParallel) {
break;
}
@@ -376,16 +375,7 @@ PatternComposer* PatternComposer::setupPatternComposer(
}
PatternComposer *patsrc = NULL;
- if(m12.size() > 0) {
- patsrc = new SoloPatternComposer(ab, p);
- for(size_t i = 0; i < a->size(); i++) delete (*a)[i];
- for(size_t i = 0; i < b->size(); i++) delete (*b)[i];
- delete a; delete b;
- } else {
- patsrc = new DualPatternComposer(a, b, p);
- for(size_t i = 0; i < ab->size(); i++) delete (*ab)[i];
- delete ab;
- }
+ patsrc = new DualPatternComposer(a, b, p);
return patsrc;
}
@@ -471,9 +461,15 @@ void CFilePatternSource::open() {
}
else {
const char* filename = infiles_[filecur_].c_str();
- compressed_ = is_gzipped_file(filename);
- if (compressed_) {
+ const char* ext = filename + infiles_[filecur_].length();
+
+ // Only temporary until we have a separate BGZFPatternSource
+ while (ext != filename && *--ext != '.') ;
+ bool bam_file = ext != filename && strncmp(ext, ".bam", 4) == 0;
+
+ if (!bam_file && is_gzipped_file(filename)) {
zfp_ = gzopen(filename, "rb");
+ compressed_ = true;
} else {
fp_ = fopen(filename, "rb");
}
@@ -1146,6 +1142,12 @@ const int BAMPatternSource::offset[] = {
32, //read_name
};
+const uint8_t BAMPatternSource::EOF_MARKER[] = {
+ 0x1f, 0x8b, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff,
+ 0x06, 0x00, 0x42, 0x43, 0x02, 0x00, 0x1b, 0x00, 0x03, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
bool BAMPatternSource::parse_bam_header() {
char magic[4];
@@ -1203,58 +1205,281 @@ bool BAMPatternSource::parse_bam_header() {
return true;
}
-std::pair<bool, int> BAMPatternSource::nextBatchFromFile(PerThreadReadBuf& pt,
- bool batch_a, unsigned readi) {
- if (first_) {
- first_ = false;
- if (!parse_bam_header()) {
- std::cerr << "Unable to parse BAM file" << std::endl;
- return make_pair(true, 0);
+uint16_t BAMPatternSource::nextBGZFBlockFromFile(BGZF& b) {
+ fread(&b.hdr, sizeof(b.hdr), 1, fp_);
+ if (feof_unlocked(fp_) || ferror_unlocked(fp_)) {
+ return 0;
+ }
+
+ uint8_t *extra = new uint8_t[b.hdr.xlen];
+ fread(extra, b.hdr.xlen, 1, fp_);
+ if (ferror_unlocked(fp_)) {
+ return 0;
+ }
+
+ if (memcmp(EOF_MARKER, &b.hdr, sizeof(b.hdr)) == 0
+ && memcmp(EOF_MARKER + sizeof(b.hdr), extra, 6 /* sizeof BAM subfield */) == 0)
+ {
+ delete[] extra;
+ fclose(fp_);
+ return 0;
+ }
+
+ uint16_t bsize = 0;
+ for (uint16_t i = 0; i < b.hdr.xlen;) {
+ if (extra[0] == 66 && extra[1] == 67) {
+ bsize = *((uint16_t *)(extra + 4));
+ bsize -= (b.hdr.xlen + 19);
+ break;
}
+ i = i + 2;
+ uint16_t sub_field_len = *((uint16_t *)(extra + 2));
+ i = i + 2 + sub_field_len;
+ }
+
+ delete[] extra;
+
+ if (bsize == 0) {
+ return 0;
+ }
+
+ fread(b.cdata, bsize, 1, fp_);
+ if (ferror_unlocked(fp_)) {
+ return 0;
}
+ fread(&b.ftr, sizeof b.ftr, 1, fp_);
+ if (ferror_unlocked(fp_)) {
+ return 0;
+ }
+
+ return bsize;
+}
+
+std::pair<bool, int> BAMPatternSource::nextBatch(PerThreadReadBuf& pt, bool batch_a, bool lock) {
+ uint16_t cdata_len;
+
+ unsigned nread = 0;
+ bool done = false;
+
+ pt.setReadId(readCnt_);
+
+ do {
+ if (bam_batch_indexes_[pt.tid_] >= bam_batches_[pt.tid_].size()) {
+ BGZF& block = blocks_[pt.tid_];
+ std::vector<uint8_t>& batch = bam_batches_[pt.tid_];
+ if (lock) {
+ ThreadSafe ts(mutex);
+ if (first_) {
+ nextBGZFBlockFromFile(block);
+ first_ = false;
+ }
+ cdata_len = nextBGZFBlockFromFile(block);
+ } else {
+ cdata_len = nextBGZFBlockFromFile(block);
+ }
+
+ if (cdata_len == 0) {
+ ThreadSafe ts(orphan_mates_mutex_);
+ get_orphaned_pairs(pt.bufa_, pt.bufb_, pt.max_buf_, nread);
+ return make_pair(nread == 0, nread);
+ }
+
+ bam_batch_indexes_[pt.tid_] = 0;
+
+ batch.resize(block.ftr.isize);
+
+ int ret_code = decompress_bgzf_block(&batch[0], block.ftr.isize, block.cdata, cdata_len);
+
+ if (ret_code != Z_OK) {
+ return make_pair(true, 0);
+ }
+
+ uLong crc = crc32(0L, Z_NULL, 0);
+ crc = crc32(crc, &batch[0], batch.size());
+ assert(crc == block.ftr.crc32);
+ }
+
+ std::pair<bool, int> ret = get_alignments(pt, batch_a, nread);
+
+ done = ret.first;
+ } while (!done && nread < pt.max_buf_);
+
+ readCnt_ += 1;
+
+ return make_pair(done, nread);
+}
+
+std::pair<bool, int> BAMPatternSource::get_alignments(PerThreadReadBuf& pt, bool batch_a, unsigned& readi) {
+ size_t& i = bam_batch_indexes_[pt.tid_];
bool done = false;
bool read1 = true;
while (readi < pt.max_buf_) {
- int r;
+ if (i >= bam_batches_[pt.tid_].size()) {
+ return make_pair(false, readi);
+ }
+
uint16_t flag;
int32_t block_size;
EList<Read>& readbuf = pp_.align_paired_reads && !read1 ? pt.bufb_ : pt.bufa_;
- if ((r = zread(&block_size, sizeof(block_size))) != sizeof(block_size)) {
- return make_pair(true, readi);
- }
- if (readbuf[readi].readOrigBuf.length() < block_size) {
- readbuf[readi].readOrigBuf.resize(block_size);
- }
- if (zread(readbuf[readi].readOrigBuf.wbuf(), block_size) != block_size) {
- done = true;
- break;
+ memcpy(&block_size, &bam_batches_[pt.tid_][0] + i, sizeof(block_size));
+ if (block_size <= 0) {
+ return make_pair(done, readi);
}
- memcpy(&flag, readbuf[readi].readOrigBuf.buf() + offset[BAMField::flag], sizeof(flag));
+ i += sizeof(block_size);
+
+ memcpy(&flag, &bam_batches_[pt.tid_][0] + i + offset[BAMField::flag], sizeof(flag));
if (!pp_.align_paired_reads && ((flag & 0x40) != 0 || (flag & 0x80) != 0)) {
readbuf[readi].readOrigBuf.clear();
+ i += block_size;
continue;
}
+
if (pp_.align_paired_reads && ((flag & 0x40) == 0 && (flag & 0x80) == 0)) {
readbuf[readi].readOrigBuf.clear();
+ i += block_size;
continue;
}
- if (pp_.align_paired_reads && read1 && (flag & 0x40) == 0) {
- std::cerr << "Paired reads are out of order" << std::endl;
- return make_pair(true, readi == 0 ? readi : readi-1);
+
+ if (pp_.align_paired_reads && (((flag & 0x40) != 0
+ && i + block_size == bam_batches_[pt.tid_].size())
+ || ((flag & 0x80) != 0 && i == sizeof(block_size))))
+ {
+ ThreadSafe ts(orphan_mates_mutex_);
+ store_orphan_mate(&bam_batches_[pt.tid_][0] + i, block_size);
+ i += block_size;
+ get_orphaned_pairs(pt.bufa_, pt.bufb_, pt.max_buf_, readi);
+ } else {
+ readbuf[readi].readOrigBuf.resize(block_size);
+
+ memcpy(readbuf[readi].readOrigBuf.wbuf(), &bam_batches_[pt.tid_][0] + i, block_size);
+ i += block_size;
+
+ read1 = !read1;
+ readi = (pp_.align_paired_reads
+ && pt.bufb_[readi].readOrigBuf.length() == 0) ? readi : readi + 1;
}
- if (pp_.align_paired_reads && !read1 && (flag & 0x80) == 0) {
- std::cerr << "Paired reads are out of order" << std::endl;
- return make_pair(true, readi == 0 ? readi : readi-1);
+ }
+
+ return make_pair(done, readi);
+}
+
+void BAMPatternSource::store_orphan_mate(const uint8_t* r, size_t read_len) {
+ uint8_t flag;
+ memcpy(&flag, r + offset[BAMField::flag], sizeof(flag));
+
+ std::vector<orphan_mate_t>&
+ orphan_mates = (flag & 0x40) != 0 ? orphan_mate1s : orphan_mate2s;
+
+ size_t i;
+ for (i = 0; i < orphan_mates.size() && !orphan_mates[i].empty(); ++i) ;
+
+ if (i == orphan_mates.size()) {
+ orphan_mates.resize(orphan_mates.size() * 2);
+ }
+
+ orphan_mate_t& mate = orphan_mates[i];
+
+ if (mate.data == NULL || mate.cap < read_len) {
+ mate.data = new uint8_t[read_len];
+ mate.cap = read_len;
+ }
+
+ if (mate.cap < read_len) {
+ mate.data = new uint8_t[read_len];
+ mate.cap = read_len;
+ }
+
+ mate.size = read_len;
+
+ memcpy(mate.data, r, read_len);
+}
+
+int BAMPatternSource::compare_read_names(const void* m1, const void* m2) {
+ const orphan_mate_t* mate1 = (const orphan_mate_t*)m1;
+ const orphan_mate_t* mate2 = (const orphan_mate_t*)m2;
+
+ const char* r1 = (const char *)(mate1->data + offset[BAMField::read_name]);
+ const char* r2 = (const char *)(mate2->data + offset[BAMField::read_name]);
+
+ return strcmp(r1, r2);
+}
+
+bool BAMPatternSource::compare_read_names2(const orphan_mate_t& m1, const orphan_mate_t& m2) {
+ if (m1.empty()) {
+ return false;
+ }
+
+ if (m2.empty()) {
+ return true;
+ }
+
+ const char* r1 = (const char *)(m1.data + offset[BAMField::read_name]);
+ const char* r2 = (const char *)(m2.data + offset[BAMField::read_name]);
+
+ return strcmp(r1, r2);
+}
+
+void BAMPatternSource::get_orphaned_pairs(EList<Read>& buf_a, EList<Read>& buf_b, const size_t max_buf, unsigned& readi) {
+ std::sort(orphan_mate1s.begin(), orphan_mate1s.end(), &BAMPatternSource::compare_read_names2);
+ std::sort(orphan_mate2s.begin(), orphan_mate2s.end(), &BAMPatternSource::compare_read_names2);
+
+ size_t lim1, lim2;
+ for (lim1 = 0; lim1 < orphan_mate1s.size() && !orphan_mate1s[lim1].empty(); lim1++) ;
+ for (lim2 = 0; lim2 < orphan_mate2s.size() && !orphan_mate2s[lim2].empty(); lim2++) ;
+
+ if (lim2 == 0) {
+ return;
+ }
+
+ for (size_t i = 0; i < lim1 && readi < max_buf; ++i) {
+ orphan_mate_t* mate1 = &orphan_mate1s[i];
+ orphan_mate_t* mate2 = (orphan_mate_t*)bsearch(mate1, &orphan_mate2s[0], lim2,
+ sizeof(orphan_mate2s[0]), compare_read_names);
+
+ if (mate2 != NULL) {
+ Read& ra = buf_a[readi];
+ Read& rb = buf_b[readi];
+
+ ra.readOrigBuf.resize(mate1->size);
+ rb.readOrigBuf.resize(mate2->size);
+
+ memcpy(ra.readOrigBuf.wbuf(), mate1->data, mate1->size);
+ memcpy(rb.readOrigBuf.wbuf(), mate2->data, mate2->size);
+
+ mate1->reset();
+ mate2->reset();
+
+ readi++;
}
+ }
+}
- read1 = !read1;
- readi = (pp_.align_paired_reads
- && pt.bufb_[readi].readOrigBuf.length() == 0) ? readi : readi + 1;
+int BAMPatternSource::decompress_bgzf_block(uint8_t *dst, size_t dst_len, uint8_t *src, size_t src_len) {
+ z_stream strm;
+
+ strm.zalloc = Z_NULL;
+ strm.zfree = Z_NULL;
+ strm.opaque = Z_NULL;
+
+ strm.avail_in = src_len;
+ strm.next_in = src;
+ strm.avail_out = dst_len;
+ strm.next_out = dst;
+
+ int ret = inflateInit2(&strm, -8);
+ if (ret != Z_OK) {
+ return ret;
}
- return make_pair(done, readi);
+
+ ret = inflate(&strm, Z_FINISH);
+ if (ret != Z_STREAM_END) {
+ return ret;
+ }
+
+ return inflateEnd(&strm);
}
bool BAMPatternSource::parse(Read& ra, Read& rb, TReadId rdid) const {
=====================================
pat.h
=====================================
@@ -26,6 +26,7 @@
#include <cassert>
#include <string>
#include <ctype.h>
+#include <vector>
#include "alphabet.h"
#include "assert_helpers.h"
#include "random_source.h"
@@ -54,6 +55,7 @@ struct PatternParams {
PatternParams(
int format_,
+ bool interleaved_,
bool fileParallel_,
uint32_t seed_,
size_t max_buf_,
@@ -71,6 +73,7 @@ struct PatternParams {
bool preserve_sam_tags_,
bool align_paired_reads_) :
format(format_),
+ interleaved(interleaved_),
fileParallel(fileParallel_),
seed(seed_),
max_buf(max_buf_),
@@ -89,6 +92,7 @@ struct PatternParams {
align_paired_reads(align_paired_reads_) { }
int format; // file format
+ bool interleaved; // some or all of the FASTQ reads are interleaved
bool fileParallel; // true -> wrap files with separate PatternComposers
uint32_t seed; // pseudo-random seed
size_t max_buf; // number of reads to buffer in one read
@@ -112,11 +116,12 @@ struct PatternParams {
*/
struct PerThreadReadBuf {
- PerThreadReadBuf(size_t max_buf) :
+ PerThreadReadBuf(size_t max_buf, int tid) :
max_buf_(max_buf),
bufa_(max_buf),
bufb_(max_buf),
- rdid_()
+ rdid_(),
+ tid_(tid)
{
bufa_.resize(max_buf);
bufb_.resize(max_buf);
@@ -185,6 +190,7 @@ struct PerThreadReadBuf {
EList<Read> bufb_; // Read buffer for mate bs
size_t cur_buf_; // Read buffer currently active
TReadId rdid_; // index of read at offset 0 of bufa_/bufb_
+ int tid_;
};
extern void wrongQualityFormat(const BTString& read_name);
@@ -682,7 +688,7 @@ public:
FastqPatternSource(
const EList<std::string>& infiles,
- const PatternParams& p, bool interleaved = false) :
+ const PatternParams& p, bool interleaved) :
CFilePatternSource(infiles, p),
first_(true),
interleaved_(interleaved) { }
@@ -719,6 +725,63 @@ protected:
};
class BAMPatternSource : public CFilePatternSource {
+ struct hdr_t {
+ uint8_t id1;
+ uint8_t id2;
+ uint8_t cm;
+ uint8_t flg;
+ uint32_t mtime;
+ uint8_t xfl;
+ uint8_t os;
+ uint16_t xlen;
+ };
+
+ struct ftr_t {
+ uint32_t crc32;
+ uint32_t isize;
+ };
+
+ struct BGZF {
+ hdr_t hdr;
+ uint8_t cdata[1 << 16];
+ ftr_t ftr;
+ };
+
+ struct orphan_mate_t {
+ orphan_mate_t() :
+ data(NULL),
+ size(0),
+ cap(0) {}
+
+ void reset() {
+ size = 0;
+ }
+
+ bool empty() const {
+ return size == 0;
+ }
+
+ uint8_t* data;
+ uint16_t size;
+ uint16_t cap;
+ };
+
+ struct BAMField {
+ enum aln_rec_field_name {
+ refID,
+ pos,
+ l_read_name,
+ mapq,
+ bin,
+ n_cigar_op,
+ flag,
+ l_seq,
+ next_refID,
+ next_pos,
+ tlen,
+ read_name,
+ };
+ };
public:
@@ -726,7 +789,19 @@ public:
const EList<std::string>& infiles,
const PatternParams& p) :
CFilePatternSource(infiles, p),
- first_(true) {}
+ first_(true),
+ blocks_(p.nthreads),
+ bam_batches_(p.nthreads),
+ bam_batch_indexes_(p.nthreads),
+ orphan_mate1s(p.nthreads * 2),
+ orphan_mate2s(p.nthreads * 2),
+ orphan_mates_mutex_(),
+ pp_(p) {
+ // uncompressed size of BGZF block is limited to 2**16 bytes
+ for (size_t i = 0; i < bam_batches_.size(); ++i) {
+ bam_batches_[i].reserve(1 << 16);
+ }
+ }
virtual void reset() {
first_ = true;
@@ -738,14 +813,27 @@ public:
*/
virtual bool parse(Read& ra, Read& rb, TReadId rdid) const;
+ ~BAMPatternSource() {
+ // only temporary until c++11
+ for (size_t i = 0; i < orphan_mate1s.size(); i++) {
+ if (orphan_mate1s[i].data != NULL) {
+ delete[] orphan_mate1s[i].data;
+ }
+ }
+
+ for (size_t i = 0; i < orphan_mate2s.size(); i++) {
+ if (orphan_mate2s[i].data != NULL) {
+ delete[] orphan_mate2s[i].data;
+ }
+ }
+ }
+
+
protected:
- /**
- * Light-parse a batch into the given buffer.
- */
- virtual std::pair<bool, int> nextBatchFromFile(PerThreadReadBuf& pt, bool batch_a,
- unsigned readi);
+ virtual std::pair<bool, int> nextBatch(PerThreadReadBuf& pt, bool batch_a, bool lock = true);
+ uint16_t nextBGZFBlockFromFile(BGZF& block);
/**
* Reset state to be ready for the next file.
@@ -760,24 +848,36 @@ private:
bool parse_bam_header();
- struct BAMField {
- enum aln_rec_field_name {
- refID,
- pos,
- l_read_name,
- mapq,
- bin,
- n_cigar_op,
- flag,
- l_seq,
- next_refID,
- next_pos,
- tlen,
- read_name,
- };
- };
+ virtual std::pair<bool, int> nextBatchFromFile(PerThreadReadBuf&, bool, unsigned) {
+ return make_pair(true, 0);
+ }
+
+ int decompress_bgzf_block(uint8_t *dst, size_t dst_len, uint8_t *src, size_t src_len);
+
+ std::pair<bool, int> get_alignments(PerThreadReadBuf& pt, bool batch_a, unsigned& readi);
+
+ void store_orphan_mate(const uint8_t* read, size_t read_len);
+
+ void get_orphaned_pairs(EList<Read>& buf_a, EList<Read>& buf_b, const size_t max_buf, unsigned& readi);
+
+ size_t get_matching_read(const uint8_t* rec);
+
+ static int compare_read_names(const void* m1, const void* m2);
+
+ static bool compare_read_names2(const orphan_mate_t& m1, const orphan_mate_t& m2);
static const int offset[];
+ static const uint8_t EOF_MARKER[];
+
+ std::vector<BGZF> blocks_;
+ std::vector<std::vector<uint8_t> > bam_batches_;
+ std::vector<size_t> bam_batch_indexes_;
+
+ std::vector<orphan_mate_t> orphan_mate1s;
+ std::vector<orphan_mate_t> orphan_mate2s;
+ MUTEX_T orphan_mates_mutex_;
+
+ PatternParams pp_;
};
/**
@@ -861,7 +961,7 @@ public:
const EList<std::string>& q, // qualities associated with singles
const EList<std::string>& q1, // qualities associated with m1
const EList<std::string>& q2, // qualities associated with m2
- const PatternParams& p, // read-in params
+ PatternParams& p, // read-in params
bool verbose); // be talkative?
protected:
@@ -1017,9 +1117,9 @@ public:
PatternSourcePerThread(
PatternComposer& composer,
- const PatternParams& pp) :
+ const PatternParams& pp, int tid) :
composer_(composer),
- buf_(pp.max_buf),
+ buf_(pp.max_buf, tid),
pp_(pp),
last_batch_(false),
last_batch_size_(0) { }
@@ -1107,15 +1207,16 @@ class PatternSourcePerThreadFactory {
public:
PatternSourcePerThreadFactory(
PatternComposer& composer,
- const PatternParams& pp) :
+ const PatternParams& pp, int tid) :
composer_(composer),
- pp_(pp) { }
+ pp_(pp),
+ tid_(tid) { }
/**
* Create a new heap-allocated PatternSourcePerThreads.
*/
virtual PatternSourcePerThread* create() const {
- return new PatternSourcePerThread(composer_, pp_);
+ return new PatternSourcePerThread(composer_, pp_, tid_);
}
/**
@@ -1125,7 +1226,7 @@ public:
virtual EList<PatternSourcePerThread*>* create(uint32_t n) const {
EList<PatternSourcePerThread*>* v = new EList<PatternSourcePerThread*>;
for(size_t i = 0; i < n; i++) {
- v->push_back(new PatternSourcePerThread(composer_, pp_));
+ v->push_back(new PatternSourcePerThread(composer_, pp_, tid_));
assert(v->back() != NULL);
}
return v;
@@ -1135,6 +1236,7 @@ private:
/// Container for obtaining paired reads from PatternSources
PatternComposer& composer_;
const PatternParams& pp_;
+ int tid_;
};
#endif /*PAT_H_*/
View it on GitLab: https://salsa.debian.org/med-team/bowtie2/commit/be7df110efc3562e1282c236658e1caefa7c134b
--
View it on GitLab: https://salsa.debian.org/med-team/bowtie2/commit/be7df110efc3562e1282c236658e1caefa7c134b
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20180925/62dc25b5/attachment-0001.html>
More information about the debian-med-commit
mailing list