[med-svn] [bowtie2] 01/05: New upstream version 2.3.1
Alex Mestiashvili
malex-guest at moszumanska.debian.org
Tue Mar 7 11:57:49 UTC 2017
This is an automated email from the git hooks/post-receive script.
malex-guest pushed a commit to branch master
in repository bowtie2.
commit b224a1e79a213864732cf644ee347207713a78cc
Author: Alexandre Mestiashvili <alex at biotec.tu-dresden.de>
Date: Tue Mar 7 10:37:50 2017 +0100
New upstream version 2.3.1
---
MANUAL | 19 ++++
MANUAL.markdown | 40 +++++++
Makefile | 5 +-
NEWS | 28 ++++-
VERSION | 2 +-
bowtie2 | 8 +-
bowtie_main.cpp | 241 +++++++++++++++++++++++++++++++++++++++++--
bt2_idx.h | 6 +-
bt2_search.cpp | 7 +-
doc/manual.html | 24 +++++
doc/website/manual.ssi | 23 ++++-
doc/website/old_news.ssi | 40 +++++++
doc/website/recent_news.ssi | 54 ++--------
doc/website/rhsidebar.ssi | 13 +--
formats.h | 1 +
opts.h | 3 +-
pat.cpp | 145 +++++++++++++++++---------
pat.h | 182 +++++++++++++++++++-------------
pthreadGC2.dll | Bin 60273 -> 0 bytes
read_qseq.cpp | 8 +-
scoring.h | 4 +-
scripts/test/simple_tests.pl | 27 +++--
scripts/test/simple_tests.sh | 3 +-
23 files changed, 676 insertions(+), 207 deletions(-)
diff --git a/MANUAL b/MANUAL
index 359bb68..b1395ac 100644
--- a/MANUAL
+++ b/MANUAL
@@ -863,6 +863,25 @@ Reads (specified with `<m1>`, `<m2>`, `<s>`) are FASTQ files. FASTQ files
usually have extension `.fq` or `.fastq`. FASTQ is the default format. See
also: `--solexa-quals` and `--int-quals`.
+ --interleaved
+
+Reads interleaved FASTQ files where the first two records (8 lines)
+represent a mate pair.
+
+ --tab5
+
+Each read or pair is on a single line. An unpaired read line is
+[name]\t[seq]\t[qual]\n. A paired-end read line is
+[name]\t[seq1]\t[qual1]\t[seq2]\t[qual2]\n. An input file can be a
+mix of unpaired and paired-end reads and Bowtie 2 recognizes each
+according to the number of fields, handling each as it should.
+
+ --tab6
+
+Similar to `--tab5` except, for paired-end reads, the second end can have a
+different name from the first:
+[name1]\t[seq1]\t[qual1]\t[name2]\t[seq2]\t[qual2]\n
+
--qseq
Reads (specified with `<m1>`, `<m2>`, `<s>`) are QSEQ files. QSEQ files usually
diff --git a/MANUAL.markdown b/MANUAL.markdown
index 2ec2828..7378b1a 100644
--- a/MANUAL.markdown
+++ b/MANUAL.markdown
@@ -921,6 +921,46 @@ usually have extension `.fq` or `.fastq`. FASTQ is the default format. See
also: [`--solexa-quals`] and [`--int-quals`].
</td></tr>
+<tr><td id="bowtie2-options-interleaved">
+
+[`--interleaved`]: #bowtie2-options-interleaved
+
+ --interleaved
+
+</td><td>
+
+Reads interleaved FASTQ files where the first two records (8 lines)
+represent a mate pair.
+
+</td></tr>
+<tr><td id="bowtie2-options-tab5">
+
+[`--tab5`]: #bowtie2-options-tab5
+
+ --tab5
+
+</td><td>
+
+Each read or pair is on a single line. An unpaired read line is
+[name]\t[seq]\t[qual]\n. A paired-end read line is
+[name]\t[seq1]\t[qual1]\t[seq2]\t[qual2]\n. An input file can be a
+mix of unpaired and paired-end reads and Bowtie 2 recognizes each
+according to the number of fields, handling each as it should.
+
+</td></tr>
+<tr><td id="bowtie2-options-tab6">
+
+[`--tab6`]: #bowtie2-options-tab6
+
+ --tab6
+
+</td><td>
+
+Similar to [`--tab5`] except, for paired-end reads, the second end can have a
+different name from the first:
+[name1]\t[seq1]\t[qual1]\t[name2]\t[seq2]\t[qual2]\n
+
+</td></tr>
<tr><td id="bowtie2-options-qseq">
[`--qseq`]: #bowtie2-options-qseq
diff --git a/Makefile b/Makefile
index f2aa557..cca55fb 100644
--- a/Makefile
+++ b/Makefile
@@ -25,6 +25,7 @@ prefix = /usr/local
bindir = $(prefix)/bin
INC =
+LIBS = -lreadline -ltermcap -lz
GCC_PREFIX = $(shell dirname `which gcc`)
GCC_SUFFIX =
CC ?= $(GCC_PREFIX)/gcc$(GCC_SUFFIX)
@@ -94,10 +95,10 @@ endif
#default is to use Intel TBB
ifneq (1,$(NO_TBB))
- LIBS = $(PTHREAD_LIB) -ltbb -ltbbmalloc_proxy
+ LIBS += $(PTHREAD_LIB) -ltbb -ltbbmalloc_proxy
override EXTRA_FLAGS += -DWITH_TBB
else
- LIBS = $(PTHREAD_LIB)
+ LIBS += $(PTHREAD_LIB)
endif
SEARCH_LIBS =
BUILD_LIBS =
diff --git a/NEWS b/NEWS
index c52d521..de7e1a6 100644
--- a/NEWS
+++ b/NEWS
@@ -3,7 +3,7 @@ Bowtie 2 NEWS
Bowtie 2 is now available for download from the project website,
http://bowtie-bio.sf.net/bowtie2. 2.0.0-beta1 is the first version released to
-the public and 2.3.0 is the latest version. Bowtie 2 is licensed under
+the public and 2.3.1 is the latest version. Bowtie 2 is licensed under
the GPLv3 license. See `LICENSE' file for details.
Reporting Issues
@@ -16,6 +16,32 @@ Please report any issues using the Sourceforge bug tracker:
Version Release History
=======================
+Version 2.3.1 - Mar 04, 2017
+Please note that as of this release Bowtie 2 now has dependencies on zlib and readline libraries.
+Make sure that all dependencies are met before attempting to build from source.
+
+ * Added native support for gzipped read files. The wrapper
+ script is no longer responsible for decompression. This
+ simplifies the wrapper and improves speed and thread scalability
+ for gzipped inputs.
+ * Fixed a bug that caused `bowtie2-build` to crash when the
+ first FASTA sequence contains all Ns.
+ * Add support for interleaved FASTQ format (`-—interleaved`)
+ * Empty FASTQ inputs would yield an error in Bowtie 2 2.3.0,
+ whereas previous versions would simply align 0 reads and report
+ the SAM header as usual. This version returns to the pre-2.3.0
+ behavior, resolving a compatibility issue between TopHat2 and
+ Bowtie 2 2.3.0.
+ * Fixed a bug whereby combining `-—un-conc*` with `-k` or `-a`
+ would cause `bowtie2` to print duplicate reads in one or both
+ of the `--un-conc*` output files, causing the ends to be
+ misaligned.
+ * The default `--score-min` for `--local` mode is now `G,20,8`.
+ That was the stated default in the documentation for a while,
+ but the actual default was `G,0,10` for many versions. Now the
+ default matches the documentation and, we find, yields more
+ accurate alignments than `G,0,10`
+
Version 2.3.0 - Dec 13, 2016
* Code related to read parsing was completely rewritten to improve
scalability to many threads. In short, the critical section is
diff --git a/VERSION b/VERSION
index 276cbf9..2bf1c1c 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.3.0
+2.3.1
diff --git a/bowtie2 b/bowtie2
index 12e3567..16a0175 100755
--- a/bowtie2
+++ b/bowtie2
@@ -217,6 +217,10 @@ for(my $i = 0; $i < scalar(@bt2_args); $i++) {
last;
}
}
+ if ($arg =~ /(bz2|lz4)$/) {
+ push @bt2w_args, ("-U", $arg);
+ $bt2_args[$i] = undef;
+ }
}
# If the user asked us to redirect some reads to files, or to suppress
# unaligned reads, then we need to capture the output from Bowtie 2 and pass it
@@ -312,7 +316,7 @@ sub cat_file($$) {
sub wrapInput($$$) {
my ($unps, $mate1s, $mate2s) = @_;
for my $fn (@$unps, @$mate1s, @$mate2s) {
- return 1 if $fn =~ /\.gz$/ || $fn =~ /\.bz2$/ || $fn =~ /\.lz4$/;
+ return 1 if $fn =~ /\.bz2$/ || $fn =~ /\.lz4$/;
}
return 0;
}
@@ -546,7 +550,7 @@ if(defined($cap_out)) {
my $fl = substr($_, $tab1_i, $tab2_i - $tab1_i);
my $unal = ($fl & 4) != 0;
$filt = 1 if $no_unal && $unal;
- if($passthru) {
+ if($passthru && ($fl & 256) == 0) {
if(scalar(keys %read_fhs) == 0) {
# Next line is read with some whitespace escaped
my $l = <BT>;
diff --git a/bowtie_main.cpp b/bowtie_main.cpp
index 844c8ca..3e39acf 100644
--- a/bowtie_main.cpp
+++ b/bowtie_main.cpp
@@ -19,10 +19,23 @@
#include <iostream>
#include <fstream>
+#include <sstream>
+#include <iomanip>
#include <string.h>
#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
#include "tokenize.h"
-#include "ds.h"
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <vector>
+#include <iterator>
+#include <algorithm>
+#include <readline/readline.h>
+#include <readline/history.h>
using namespace std;
@@ -39,23 +52,237 @@ extern "C" {
* per line, and will dispatch each batch of arguments one at a time to
* bowtie.
*/
+
+static volatile sig_atomic_t done = false;
+
+static const char *options[] = {
+"--al", "--al-conc", "--dpad", "--end-to-end",
+"--fast", "--fast-local", "--fr", "--gbar",
+"--ignore-quals", "--int-quals", "--local", "--ma",
+"--met", "--met-file", "--met-stderr", "--mm",
+"--mp", "--n-ceil", "--no-1mm-upfront", "--no-contain",
+"--no-discordant", "--no-dovetail", "--no-head", "--no-mixed",
+"--no-overlap", "--no-sq", "--no-unal", "--nofw",
+"--non-deterministic", "--norc", "--np", "--omit-sec-seq",
+"--phred33", "--phred64", "--qc-filter", "--qseq",
+"--quiet", "--rdg", "--reorder", "--rfg",
+"--rg", "--rg-id", "--score-min", "--seed",
+"--sensitive", "--sensitive-local", "--un", "--un-conc",
+"--un-gz", "--version", "--very-fast", "--very-fast-local",
+"--very-sensitive", "--very-sensitive-local", "-3", "-5",
+"-D", "-I", "-L", "-N",
+"-R", "-X", "-a", "-c",
+"-f", "-h", "-i", "-k",
+"-p", "-q", "-r", "-s",
+"-t", "-u", "-1", "-2",
+"-S", "-U", "--all", "--ff",
+"--help", "--maxins", "--minins", "--rf",
+"--skip", "--threads", "--time", "--trim3",
+"--trim5", "--upto", NULL
+};
+
+
+static bool isdirectory(const char *path) {
+ struct stat statbuf;
+ if(stat(path, &statbuf) != 0) {
+ perror("stat");
+ return true;
+ }
+ return S_ISDIR(statbuf.st_mode);
+}
+
+static char *optgen(const char *text, int state) {
+ static int list_index, len;
+ const char *name = NULL;
+
+ if (!state) {
+ list_index = 0;
+ len = strlen(text);
+ }
+
+ name = rl_filename_completion_function(text, state);
+ if (name != NULL) {
+ rl_completion_append_character = isdirectory(name) ? '/': ' ';
+ return strdup(name);
+ }
+
+ if (text[0] == '-') {
+ while ((name = options[list_index++])) {
+ if (strncmp(name, text, len) == 0) {
+ return strdup(name);
+ }
+ }
+ }
+ return NULL;
+}
+
+static char **optcomplete(const char *text, int start, int end) {
+ rl_attempted_completion_over = 1;
+ return rl_completion_matches(text, optgen);
+}
+
+static void rlinit() {
+ rl_attempted_completion_function = optcomplete;
+}
+
+static void handler(int sig) {
+ done = true;
+}
+
+static int _getline(istream *in, char *buf, size_t len) {
+ if (in == &cin) {
+ char *input = readline("bowtie2> ");
+ if (!input) {
+ buf[0] = '\0';
+ }
+ else {
+ strncpy(buf, input, len-1);
+ add_history(buf);
+ free(input);
+ }
+ }
+ else {
+ in->getline(buf, len-1);
+ if (!in->good())
+ buf[0] = '\0';
+ }
+ buf[len-1] = '\0';
+ return strlen(buf);
+}
+
+static int createfifo(vector<char *>& v) {
+ const char *pattern = "/tmp/btfifo.XX";
+ char *fn = (char *)malloc(strlen(pattern)+1);
+ memset(fn, 0, strlen(pattern)+1);
+ strcpy(fn, pattern);
+ mktemp(fn);
+
+ if (mkfifo(fn, S_IRUSR | S_IWUSR) == -1) {
+ perror("mkfifo");
+ return 1;
+ }
+ v.push_back(fn);
+ return 0;
+}
+
+void printargs(const char **args, size_t size) {
+ copy(args, args+size, ostream_iterator<string>(cout, " "));
+ cout << endl;
+}
+
+bool called_from_wrapper(int argc, const char **argv) {
+ if (argc > 2 && strcmp(argv[1], "--wrapper") == 0
+ && strcmp(argv[2], "basic-0") == 0)
+ return true;
+ return false;
+}
+
int main(int argc, const char **argv) {
- if(argc > 2 && strcmp(argv[1], "-A") == 0) {
- const char *file = argv[2];
+ int offset = called_from_wrapper(argc, argv) ? 3 : 1;
+
+ if(argc > offset + 1 && strcmp(argv[offset], "-A") == 0) {
+ const char *file = argv[offset+1];
ifstream in;
- in.open(file);
+ istream *inptr = ∈
+ if (strcmp(file, "-") == 0) {
+ inptr = &cin;
+ }
+ else {
+ in.open(file);
+ }
char buf[4096];
int lastret = -1;
- while(in.getline(buf, 4095)) {
- EList<string> args;
+
+ rlinit();
+ while(_getline(inptr, buf, 4096)) {
+ done = false;
+ vector<string> args;
args.push_back(string(argv[0]));
+
+ if (offset > 1) {
+ args.push_back(string(argv[1]));
+ args.push_back(string(argv[2]));
+ }
+
tokenize(buf, " \t", args);
const char **myargs = (const char**)malloc(sizeof(char*)*args.size());
+ vector<char *> fifonames;
+ int sam_outfile_pos = -1;
+
for(size_t i = 0; i < args.size(); i++) {
+ if (args[i] == "_") {
+ if (i > 0 && args[i-1] == "-S") {
+ sam_outfile_pos = i;
+ }
+ else {
+ createfifo(fifonames);
+ args[i] = fifonames.back();
+ }
+ }
myargs[i] = args[i].c_str();
}
if(args.size() == 1) continue;
- lastret = bowtie((int)args.size(), myargs);
+
+ if (fifonames.size() > 0) {
+ struct sigaction sa;
+ sigemptyset(&sa.sa_mask);
+ sa.sa_flags = 0;
+ sa.sa_handler = handler;
+
+ if (sigaction(SIGINT, &sa, NULL) == -1
+ || signal(SIGPIPE, SIG_IGN) == SIG_ERR) {
+ perror("sigaction");
+ exit(EXIT_FAILURE);
+ }
+
+ struct pollfd *pollfds = (struct pollfd *)calloc(fifonames.size(), sizeof(struct pollfd));
+ if (pollfds == NULL) {
+ perror("calloc");
+ exit(EXIT_FAILURE);
+ }
+
+ for (size_t i = 0; i < fifonames.size(); i++) {
+ pollfds[i].fd = open(fifonames[i], O_NONBLOCK | O_EXCL);
+ pollfds[i].events = POLLIN;
+ }
+
+ printargs(myargs, args.size());
+
+ for (int count = 0;; count++) {
+ size_t r = poll(pollfds, fifonames.size(), -1);
+ // wait until all fifos are ready
+ if (r != fifonames.size()) {
+ if (done)
+ break;
+ continue;
+ }
+ if (sam_outfile_pos >= 0) {
+ ostringstream os;
+ os << "out" << getpid() << "_" << setfill('0') << setw(3) << count << ".sam";
+ myargs[sam_outfile_pos] = os.str().c_str();
+ printargs(myargs, args.size());
+ }
+
+ lastret = bowtie((int)args.size(), myargs);
+
+ // replace the args shuffled by getopt
+ for (size_t i = 0; i < args.size(); i++) {
+ myargs[i] = args[i].c_str();
+ }
+ }
+
+ for (size_t i = 0; i < fifonames.size(); i++) {
+ if (close(pollfds[i].fd))
+ perror("close");
+ if (remove(fifonames[i]))
+ perror("remove");
+ free(fifonames[i]);
+ }
+ free(pollfds);
+ }
+ else {
+ lastret = bowtie((int)args.size(), myargs);
+ }
free(myargs);
}
if(lastret == -1) {
diff --git a/bt2_idx.h b/bt2_idx.h
index 0b63148..e2d5083 100644
--- a/bt2_idx.h
+++ b/bt2_idx.h
@@ -2595,9 +2595,11 @@ void Ebwt::joinToDisk(
if(npat >= 0) {
writeU<TIndexOffU>(out1, this->plen()[npat], this->toBe());
}
- npat++;
- this->plen()[npat] = (szs[i].len + szs[i].off);
+ this->plen()[++npat] = (szs[i].len + szs[i].off);
} else {
+ // edge case, but we could get here with npat == -1
+ // e.g. when building from a reference of all Ns
+ if (npat < 0) npat = 0;
this->plen()[npat] += (szs[i].len + szs[i].off);
}
}
diff --git a/bt2_search.cpp b/bt2_search.cpp
index 783f544..e5e02c6 100644
--- a/bt2_search.cpp
+++ b/bt2_search.cpp
@@ -477,6 +477,7 @@ static struct option long_options[] = {
{(char*)"12", required_argument, 0, ARG_ONETWO},
{(char*)"tab5", required_argument, 0, ARG_TAB5},
{(char*)"tab6", required_argument, 0, ARG_TAB6},
+ {(char*)"interleaved", required_argument, 0, ARG_INTERLEAVED_FASTQ},
{(char*)"phred33-quals", no_argument, 0, ARG_PHRED33},
{(char*)"phred64-quals", no_argument, 0, ARG_PHRED64},
{(char*)"phred33", no_argument, 0, ARG_PHRED33},
@@ -685,6 +686,9 @@ static void printUsage(ostream& out) {
<< endl
<< " Input:" << endl
<< " -q query input files are FASTQ .fq/.fastq (default)" << endl
+ << " --interleaved query input files are interleaved paired-end FASTQ reads" << endl
+ << " --tab5 query input files are TAB5 .tab5" << endl
+ << " --tab6 query input files are TAB6 .tab6" << endl
<< " --qseq query input files are in Illumina's qseq format" << endl
<< " -f query input files are (multi-)FASTA .fa/.mfa" << endl
<< " -r query input files are raw one-sequence-per-line" << endl
@@ -753,7 +757,7 @@ static void printUsage(ostream& out) {
<< " --fr/--rf/--ff -1, -2 mates align fw/rev, rev/fw, fw/fw (--fr)" << endl
<< " --no-mixed suppress unpaired alignments for paired reads" << endl
<< " --no-discordant suppress discordant alignments for paired reads" << endl
- << " --no-dovetail not concordant when mates extend past each other" << endl
+ << " --dovetail concordant when mates extend past each other" << endl
<< " --no-contain not concordant when one mate alignment contains other" << endl
<< " --no-overlap not concordant when mates overlap at all" << endl
<< endl
@@ -935,6 +939,7 @@ static void parseOption(int next_option, const char *arg) {
case ARG_ONETWO: tokenize(arg, ",", mates12); format = TAB_MATE5; break;
case ARG_TAB5: tokenize(arg, ",", mates12); format = TAB_MATE5; break;
case ARG_TAB6: tokenize(arg, ",", mates12); format = TAB_MATE6; break;
+ case ARG_INTERLEAVED_FASTQ: tokenize(arg, ",", mates12); format = INTERLEAVED; break;
case 'f': format = FASTA; break;
case 'F': {
format = FASTA_CONT;
diff --git a/doc/manual.html b/doc/manual.html
index 0b2ba00..fade3a3 100644
--- a/doc/manual.html
+++ b/doc/manual.html
@@ -358,6 +358,30 @@ Reference: GCAGATTATATGAGTCAGCTACGATATTGTTTGGGGTGACACATTACGCGTCTTTGAC</code></pr
</td>
</tr>
<tr>
+<td id="bowtie2-options-interleaved">
+<pre><code>--interleaved</code></pre>
+</td>
+<td>
+<p>Reads interleaved FASTQ files where the first two records (8 lines) represent a mate pair.</p>
+</td>
+</tr>
+<tr>
+<td id="bowtie2-options-tab5">
+<pre><code>--tab5</code></pre>
+</td>
+<td>
+<p>Each read or pair is on a single line. An unpaired read line is [name]. A paired-end read line is [name]. An input file can be a mix of unpaired and paired-end reads and Bowtie 2 recognizes each according to the number of fields, handling each as it should.</p>
+</td>
+</tr>
+<tr>
+<td id="bowtie2-options-tab6">
+<pre><code>--tab6</code></pre>
+</td>
+<td>
+<p>Similar to <a href="#bowtie2-options-tab5"><code>--tab5</code></a> except, for paired-end reads, the second end can have a different name from the first: [name1]</p>
+</td>
+</tr>
+<tr>
<td id="bowtie2-options-qseq">
<pre><code>--qseq</code></pre>
</td>
diff --git a/doc/website/manual.ssi b/doc/website/manual.ssi
index 038aa6a..7c50b2a 100644
--- a/doc/website/manual.ssi
+++ b/doc/website/manual.ssi
@@ -1,5 +1,5 @@
<h1>Table of Contents</h1>
-<p> Version <b>2.3.0</b></p>
+<p> Version <b>2.3.1</b></p>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
@@ -347,6 +347,27 @@ Reference: GCAGATTATATGAGTCAGCTACGATATTGTTTGGGGTGACACATTACGCGTCTTTGAC</code></pr
<p>Reads (specified with <code><m1></code>, <code><m2></code>, <code><s></code>) are FASTQ files. FASTQ files usually have extension <code>.fq</code> or <code>.fastq</code>. FASTQ is the default format. See also: <a href="#bowtie2-options-solexa-quals"><code>--solexa-quals</code></a> and <a href="#bowtie2-options-int-quals"><code>--int-quals</code></a>.</p>
</td></tr>
+<tr><td id="bowtie2-options-interleaved">
+
+<pre><code>--interleaved</code></pre>
+</td><td>
+
+<p>Reads interleaved FASTQ files where the first two records (8 lines) represent a mate pair.</p>
+</td></tr>
+<tr><td id="bowtie2-options-tab5">
+
+<pre><code>--tab5</code></pre>
+</td><td>
+
+<p>Each read or pair is on a single line. An unpaired read line is [name]. A paired-end read line is [name]. An input file can be a mix of unpaired and paired-end reads and Bowtie 2 recognizes each according to the number of fields, handling each as it should.</p>
+</td></tr>
+<tr><td id="bowtie2-options-tab6">
+
+<pre><code>--tab6</code></pre>
+</td><td>
+
+<p>Similar to <a href="#bowtie2-options-tab5"><code>--tab5</code></a> except, for paired-end reads, the second end can have a different name from the first: [name1]</p>
+</td></tr>
<tr><td id="bowtie2-options-qseq">
<pre><code>--qseq</code></pre>
diff --git a/doc/website/old_news.ssi b/doc/website/old_news.ssi
index dc0983b..7df0a31 100644
--- a/doc/website/old_news.ssi
+++ b/doc/website/old_news.ssi
@@ -1,3 +1,43 @@
+<h2>Version 2.2.4 - Oct 22, 2014</h2>
+<ul>
+ <li>Fixed a Mavericks OSX specific bug caused by some linkage ambiguities.</li>
+ <li>Added lz4 compression option for the wrapper.</li>
+ <li>Fixed the vanishing <tt><a href="manual.shtml#bowtie2-options-no-unal">--no-unal</a></tt> help line.</li>
+ <li>Added the static linkage for MinGW builds.</li>
+ <li>Added extra seed-hit output.</li>
+ <li>Fixed missing 0-length read in fastq <tt>--passthrough</tt> output.</li>
+ <li>Fixed an issue that would cause different output in <tt>-a</tt> mode depending on random seed.</li>
+</ul>
+
+<h2>Version 2.2.3 - May 30, 2014</h2>
+<ul>
+ <li>Fixed a bug that made loading an index into memory crash sometimes.</li>
+ <li>Fixed a silent failure to warn the user in case the <tt><a href="manual.shtml#bowtie2-options-x">-x</a></tt> option is missing.</li>
+ <li>Updated <tt><a href="manual.shtml#bowtie2-options-al">--al</a></tt>, <tt><a href="manual.shtml#bowtie2-options-un">--un</a></tt>, <tt><a href="manual.shtml#bowtie2-options-al-conc">al-conc</a></tt> and <tt><a href="manual.shtml#bowtie2-options-un-conc">un-conc</a></tt> options to avoid confusion in cases where the user does not provide a base file name.</li>
+ <li>Fixed a spurious assert that made bowtie2-inspect debug fail.</li>
+</ul>
+
+<h2>Version 2.2.2 - April 10, 2014</h2>
+<ul>
+ <li>Improved performance for cases where the reference contains ambiguous
+ or masked nucleobases represented by Ns. </li>
+</ul>
+
+<h2>Version 2.2.1 - February 28, 2014</h2>
+<ul>
+ <li>Improved way in which index files are loaded for alignment. Should fix
+ efficiency problems on some filesystems.</li>
+ <li>Fixed a bug that made older systems unable to correctly deal with bowtie
+ relative symbolic links.</li>
+ <li>Fixed a bug that, for very big indexes, could determine to determine file
+ offsets correctly.</li>
+ <li>Fixed a bug where using <tt><a href="manual.shtml#bowtie2-options-no-unal">--no-unal</a></tt> option incorrectly suppressed
+ <tt><a href="manual.shtml#bowtie2-options-un">--un</a></tt>/<tt><a href="manual.shtml#bowtie2-options-un-conc">--un-conc</a></tt> output.</li>
+ <li>Dropped a perl dependency that could cause problems on old systems.</li>
+ <li>Added <tt><a href="manual.shtml#bowtie2-options-no-1mm-upfront">--no-1mm-upfront</a></tt> option and clarified documentation for parameters
+ governing the multiseed heuristic.</li>
+</ul>
+
<h2>Bowtie 2 on GitHub - February 4, 2014</h2>
<ul>
<li>Bowtie 2 source now lives in a <a href="https://github.com/BenLangmead/bowtie2">public GitHub repository</a>.</li>
diff --git a/doc/website/recent_news.ssi b/doc/website/recent_news.ssi
index 958a2fa..07103af 100644
--- a/doc/website/recent_news.ssi
+++ b/doc/website/recent_news.ssi
@@ -1,3 +1,13 @@
+<h2>Version 2.3.1 - Mar 03, 2017</h2>
+<p>Please note that as of this release Bowtie 2 now has dependencies on zlib and readline libraries. Make sure that all dependencies are met before attempting to build from source.</p>
+<ul>
+ <li>Added native support for gzipped read files. The wrapper script is no longer responsible for decompression. This simplifies the wrapper and improves speed and thread scalability for gzipped inputs.</li>
+ <li>Fixed a bug that caused <tt>'bowtie2-build'</tt> to crash when the first FASTA sequence contains all Ns.</li>
+ <li>Add support for interleaved FASTQ format <tt><a href="manual.shtml#bowtie2-options-interleaved">-—interleaved</a></tt>.</li>
+ <li>Empty FASTQ inputs would yield an error in Bowtie 2 2.3.0, whereas previous versions would simply align 0 reads and report the SAM header as usual. This version returns to the pre-2.3.0 behavior, resolving a compatibility issue between TopHat2 and Bowtie 2 2.3.0.</li>
+ <li>Fixed a bug whereby combining <tt>'-—un-conc*</tt> with <tt>'-k'</tt> or <tt>'-a'</tt> would cause <tt>'bowtie2'</tt> to print duplicate reads in one or both of the <tt>'--un-conc*'</tt> output files, causing the ends to be misaligned.</li>
+ <li>The default <tt>'--score-min'</tt> for <tt>'--local'</tt> mode is now <tt>'G,20,8'</tt>. That was the stated default in the documentation for a while, but the actual default was <tt>'G,0,10'</tt> for many versions. Now the default matches the documentation and, we find, yields more accurate alignments than <tt>'G,0,10'</tt></li>
+</ul>
<h2>Version 2.3.0 - Dec 13, 2016</h2>
<p>This is a major release with some larger and many smaller changes. These notes emphasize the large changes. See commit history for details.</p>
<ul>
@@ -58,47 +68,3 @@
<ul>
<li>Lighter is an extremely fast and memory-efficient program for correcting sequencing errors in DNA sequencing data. For details on how error correction can help improve the speed and accuracy of downstream analysis tools, see the <a href="http://genomebiology.com/2014/15/11/509">paper in Genome Biology</a>. Source and software <a href="https://github.com/mourisl/Lighter">available at GitHub</a>.
</ul>
-
-
-<h2>Version 2.2.4 - Oct 22, 2014</h2>
-<ul>
- <li>Fixed a Mavericks OSX specific bug caused by some linkage ambiguities.</li>
- <li>Added lz4 compression option for the wrapper.</li>
- <li>Fixed the vanishing <tt><a href="manual.shtml#bowtie2-options-no-unal">--no-unal</a></tt> help line.</li>
- <li>Added the static linkage for MinGW builds.</li>
- <li>Added extra seed-hit output.</li>
- <li>Fixed missing 0-length read in fastq <tt>--passthrough</tt> output.</li>
- <li>Fixed an issue that would cause different output in <tt>-a</tt> mode depending on random seed.</li>
-</ul>
-
-<h2>Version 2.2.3 - May 30, 2014</h2>
-<ul>
- <li>Fixed a bug that made loading an index into memory crash sometimes.</li>
- <li>Fixed a silent failure to warn the user in case the <tt><a href="manual.shtml#bowtie2-options-x">-x</a></tt> option is missing.</li>
- <li>Updated <tt><a href="manual.shtml#bowtie2-options-al">--al</a></tt>, <tt><a href="manual.shtml#bowtie2-options-un">--un</a></tt>, <tt><a href="manual.shtml#bowtie2-options-al-conc">al-conc</a></tt> and <tt><a href="manual.shtml#bowtie2-options-un-conc">un-conc</a></tt> options to avoid confusion in cases where the user does not provide a base file name.</li>
- <li>Fixed a spurious assert that made bowtie2-inspect debug fail.</li>
-</ul>
-
-<h2>Version 2.2.2 - April 10, 2014</h2>
-<ul>
- <li>Improved performance for cases where the reference contains ambiguous
- or masked nucleobases represented by Ns. </li>
-</ul>
-
-<h2>Version 2.2.1 - February 28, 2014</h2>
-<ul>
- <li>Improved way in which index files are loaded for alignment. Should fix
- efficiency problems on some filesystems.</li>
- <li>Fixed a bug that made older systems unable to correctly deal with bowtie
- relative symbolic links.</li>
- <li>Fixed a bug that, for very big indexes, could determine to determine file
- offsets correctly.</li>
- <li>Fixed a bug where using <tt><a href="manual.shtml#bowtie2-options-no-unal">--no-unal</a></tt> option incorrectly suppressed
- <tt><a href="manual.shtml#bowtie2-options-un">--un</a></tt>/<tt><a href="manual.shtml#bowtie2-options-un-conc">--un-conc</a></tt> output.</li>
- <li>Dropped a perl dependency that could cause problems on old systems.</li>
- <li>Added <tt><a href="manual.shtml#bowtie2-options-no-1mm-upfront">--no-1mm-upfront</a></tt> option and clarified documentation for parameters
- governing the multiseed heuristic.</li>
-</ul>
-
-
-
diff --git a/doc/website/rhsidebar.ssi b/doc/website/rhsidebar.ssi
index ba11d60..8f976dc 100644
--- a/doc/website/rhsidebar.ssi
+++ b/doc/website/rhsidebar.ssi
@@ -18,10 +18,10 @@
</tr>
<tr>
<td>
- <a href="https://sourceforge.net/projects/bowtie-bio/files/bowtie2/2.3.0">Bowtie2 2.3.0</a>
+ <a href="https://sourceforge.net/projects/bowtie-bio/files/bowtie2/2.3.1">Bowtie2 2.3.1</a>
</td>
<td align="right">
- 12/13/16
+ 03/04/17
</td>
</tr>
<tr>
@@ -36,7 +36,7 @@
<ul>
<li><a href="https://github.com/BenLangmead/bowtie2">Bowtie GitHub repository</a></li>
<li><a href="https://github.com/BenLangmead/bowtie2/issues">Report an issue</a></li>
- <li><a href="https://lists.sourceforge.net/lists/listinfo/bowtie-bio-announce">Bowtie Mailing list</a></li>
+ <!--<li><a href="https://lists.sourceforge.net/lists/listinfo/bowtie-bio-announce">Bowtie Mailing list</a></li>-->
</ul>
</div>
<h2>Related Tools</h2>
@@ -173,24 +173,25 @@
<ul>
<li><a href="http://www.cs.jhu.edu/~langmea/index.shtml">Ben Langmead</a></li>
<li><a href="http://www.ccb.jhu.edu/people/infphilo">Daehwan Kim</a></li>
- <li>Valentin Antonescu</li>
+ <li>Rone Charles</li>
<li>Chris Wilks</li>
+ <li>Valentin Antonescu</li>
</ul>
</div>
+<!--
<h2>Other Documentation</h2>
<div class="box">
<ul>
<li>Bio. of Genomes poster, 5/11 (<a href="http://www.cbcb.umd.edu/~langmead/bog2011_poster.ppt">.ppt</a>, <a href="http://www.cbcb.umd.edu/~langmead/bog2011_poster.pdf">.pdf</a>)</li>
</ul>
</div>
+-->
<h2>Related links</h2>
<div class="box">
<ul>
<li><a href="http://scholar.google.com/scholar?oi=bibs&hl=en&cites=4636016874467197548">Papers citing Bowtie 2</a></li>
<li><a href="http://www.jhu.edu/">Johns Hopkins University</a></li>
<li><a href="http://www.cs.jhu.edu/">JHU Computer Science</a></li>
- <li><a href="http://www.biostat.jhsph.edu/">JHSPH Biostatistics</a></li>
- <li><a href="http://seqanswers.com/">SEQanswers</a></li>
</ul>
</div>
</div> <!-- End of "rightside" -->
diff --git a/formats.h b/formats.h
index 7627827..5067939 100644
--- a/formats.h
+++ b/formats.h
@@ -30,6 +30,7 @@ enum file_format {
FASTA = 1,
FASTA_CONT,
FASTQ,
+ INTERLEAVED,
TAB_MATE5,
TAB_MATE6,
RAW,
diff --git a/opts.h b/opts.h
index a90669b..e57aa26 100644
--- a/opts.h
+++ b/opts.h
@@ -151,7 +151,8 @@ enum {
ARG_DESC_PRIORITIZE, // --desc-prioritize
ARG_DESC_FMOPS, // --desc-fmops
ARG_LOG_DP, // --log-dp
- ARG_LOG_DP_OPP // --log-dp-opp
+ ARG_LOG_DP_OPP, // --log-dp-opp
+ ARG_INTERLEAVED_FASTQ // --interleaved
};
#endif
diff --git a/pat.cpp b/pat.cpp
index 782130f..e5f387b 100644
--- a/pat.cpp
+++ b/pat.cpp
@@ -89,6 +89,7 @@ PatternSource* PatternSource::patsrcFromStrings(
case FASTA_CONT: return new FastaContinuousPatternSource(qs, p);
case RAW: return new RawPatternSource(qs, p);
case FASTQ: return new FastqPatternSource(qs, p);
+ case INTERLEAVED: return new FastqPatternSource(qs, p, true /* interleaved */);
case TAB_MATE5: return new TabbedPatternSource(qs, p, false);
case TAB_MATE6: return new TabbedPatternSource(qs, p, true);
case CMDLINE: return new VectorPatternSource(qs, p);
@@ -166,7 +167,7 @@ pair<bool, bool> PatternSourcePerThread::nextReadPair() {
} else {
finalize(buf_.read_a());
}
- bool this_is_last = buf_.cur_buf_ == last_batch_size_-1;
+ bool this_is_last = buf_.cur_buf_ == static_cast<unsigned int>(last_batch_size_-1);
return make_pair(true, this_is_last ? last_batch_ : false);
}
@@ -373,14 +374,14 @@ PatternComposer* PatternComposer::setupPatternComposer(
}
void PatternComposer::free_EList_pmembers( const EList<PatternSource*> &elist) {
- for (size_t i = 0; i < elist.size(); i++)
- if (elist[i] != NULL)
- delete elist[i];
+ for (size_t i = 0; i < elist.size(); i++)
+ if (elist[i] != NULL)
+ delete elist[i];
}
/**
* Fill Read with the sequence, quality and name for the next
- * read in the list of read files. This function gets called by
+ * read in the list of read files. This function gets called by
* all the search threads, so we must handle synchronization.
*
* Returns pair<bool, int> where bool indicates whether we're
@@ -425,24 +426,44 @@ pair<bool, int> CFilePatternSource::nextBatch(
void CFilePatternSource::open() {
if(is_open_) {
is_open_ = false;
- fclose(fp_);
- fp_ = NULL;
+ if (compressed_) {
+ gzclose(zfp_);
+ zfp_ = NULL;
+ }
+ else {
+ fclose(fp_);
+ fp_ = NULL;
+ }
}
while(filecur_ < infiles_.size()) {
if(infiles_[filecur_] == "-") {
- fp_ = stdin;
- } else if((fp_ = fopen(infiles_[filecur_].c_str(), "rb")) == NULL) {
- if(!errs_[filecur_]) {
- cerr << "Warning: Could not open read file \""
- << infiles_[filecur_].c_str()
- << "\" for reading; skipping..." << endl;
- errs_[filecur_] = true;
+ // always assume that data from stdin is compressed
+ compressed_ = true;
+ int fn = dup(fileno(stdin));
+ zfp_ = gzdopen(fn, "rb");
+ }
+ else {
+ compressed_ = false;
+ if (is_gzipped_file(infiles_[filecur_])) {
+ compressed_ = true;
+ zfp_ = gzopen(infiles_[filecur_].c_str(), "rb");
+ }
+ else {
+ fp_ = fopen(infiles_[filecur_].c_str(), "rb");
+ }
+ if((compressed_ && zfp_ == NULL) || (!compressed_ && fp_ == NULL)) {
+ if(!errs_[filecur_]) {
+ cerr << "Warning: Could not open read file \""
+ << infiles_[filecur_].c_str()
+ << "\" for reading; skipping..." << endl;
+ errs_[filecur_] = true;
+ }
+ filecur_++;
+ continue;
}
- filecur_++;
- continue;
}
is_open_ = true;
- setvbuf(fp_, buf_, _IOFBF, 64*1024);
+ compressed_ ? gzbuffer(zfp_, 64*1024) : setvbuf(fp_, buf_, _IOFBF, 64*1024);
return;
}
cerr << "Error: No input read files were valid" << endl;
@@ -499,7 +520,7 @@ VectorPatternSource::VectorPatternSource(
/**
* Read next batch. However, batch concept is not very applicable for this
* PatternSource where all the info has already been parsed into the fields
- * in the contsructor. This essentially modifies the pt as though we read
+ * in the contsructor. This essentially modifies the pt as though we read
* in some number of patterns.
*/
pair<bool, int> VectorPatternSource::nextBatch(
@@ -622,9 +643,12 @@ pair<bool, int> FastaPatternSource::nextBatchFromFile(
int c;
EList<Read>& readbuf = batch_a ? pt.bufa_ : pt.bufb_;
if(first_) {
- c = getc_unlocked(fp_);
+ c = getc_wrapper();
+ if (c == EOF) {
+ return make_pair(true, 0);
+ }
while(c == '\r' || c == '\n') {
- c = getc_unlocked(fp_);
+ c = getc_wrapper();
}
if(c != '>') {
cerr << "Error: reads file does not look like a FASTA file" << endl;
@@ -634,16 +658,13 @@ pair<bool, int> FastaPatternSource::nextBatchFromFile(
}
bool done = false;
size_t readi = 0;
- if (feof(fp_)) {
- done = true;
- }
// Read until we run out of input or until we've filled the buffer
for(; readi < pt.max_buf_ && !done; readi++) {
Read::TBuf& buf = readbuf[readi].readOrigBuf;
buf.clear();
buf.append('>');
while(true) {
- c = getc_unlocked(fp_);
+ c = getc_wrapper();
if(c < 0 || c == '>') {
done = c < 0;
break;
@@ -659,7 +680,7 @@ pair<bool, int> FastaPatternSource::nextBatchFromFile(
*/
bool FastaPatternSource::parse(Read& r, Read& rb, TReadId rdid) const {
// We assume the light parser has put the raw data for the separate ends
- // into separate Read objects. That doesn't have to be the case, but
+ // into separate Read objects. That doesn't have to be the case, but
// that's how we've chosen to do it for FastqPatternSource
assert(!r.readOrigBuf.empty());
assert(r.empty());
@@ -729,7 +750,7 @@ bool FastaPatternSource::parse(Read& r, Read& rb, TReadId rdid) const {
* 1. Reads are substrings of a longer FASTA input string
* 2. Reads may overlap w/r/t the longer FASTA string
* 3. Read names depend on the most recently observed FASTA
- * record name
+ * record name
*/
pair<bool, int> FastaContinuousPatternSource::nextBatchFromFile(
PerThreadReadBuf& pt,
@@ -739,13 +760,13 @@ pair<bool, int> FastaContinuousPatternSource::nextBatchFromFile(
EList<Read>& readbuf = batch_a ? pt.bufa_ : pt.bufb_;
size_t readi = 0;
while(readi < pt.max_buf_) {
- c = getc_unlocked(fp_);
+ c = getc_wrapper();
if(c < 0) {
break;
}
if(c == '>') {
resetForNextFile();
- c = getc_unlocked(fp_);
+ c = getc_wrapper();
bool sawSpace = false;
while(c != '\n' && c != '\r') {
if(!sawSpace) {
@@ -754,10 +775,10 @@ pair<bool, int> FastaContinuousPatternSource::nextBatchFromFile(
if(!sawSpace) {
name_prefix_buf_.append(c);
}
- c = getc_unlocked(fp_);
+ c = getc_wrapper();
}
while(c == '\n' || c == '\r') {
- c = getc_unlocked(fp_);
+ c = getc_wrapper();
}
if(c < 0) {
break;
@@ -869,7 +890,7 @@ bool FastaContinuousPatternSource::parse(
/**
- * "Light" parser. This is inside the critical section, so the key is to do
+ * "Light" parser. This is inside the critical section, so the key is to do
* just enough parsing so that another function downstream (finalize()) can do
* the rest of the parsing. Really this function's only job is to stick every
* for lines worth of the input file into a buffer (r.readOrigBuf). finalize()
@@ -880,28 +901,32 @@ pair<bool, int> FastqPatternSource::nextBatchFromFile(
bool batch_a)
{
int c;
- EList<Read>& readbuf = batch_a ? pt.bufa_ : pt.bufb_;
+ EList<Read>* readbuf = batch_a ? &pt.bufa_ : &pt.bufb_;
if(first_) {
- c = getc_unlocked(fp_);
+ c = getc_wrapper();
+ if (c == EOF) {
+ return make_pair(true, 0);
+ }
while(c == '\r' || c == '\n') {
- c = getc_unlocked(fp_);
+ c = getc_wrapper();
}
if(c != '@') {
cerr << "Error: reads file does not look like a FASTQ file" << endl;
throw 1;
}
first_ = false;
- readbuf[0].readOrigBuf.append('@');
+ (*readbuf)[0].readOrigBuf.append('@');
}
+
bool done = false, aborted = false;
size_t readi = 0;
// Read until we run out of input or until we've filled the buffer
- for(; readi < pt.max_buf_ && !done; readi++) {
- Read::TBuf& buf = readbuf[readi].readOrigBuf;
+ while (readi < pt.max_buf_ && !done) {
+ Read::TBuf& buf = (*readbuf)[readi].readOrigBuf;
assert(readi == 0 || buf.empty());
int newlines = 4;
while(newlines) {
- c = getc_unlocked(fp_);
+ c = getc_wrapper();
done = c < 0;
if(c == '\n' || (done && newlines == 1)) {
// Saw newline, or EOF that we're
@@ -909,11 +934,29 @@ pair<bool, int> FastqPatternSource::nextBatchFromFile(
newlines--;
c = '\n';
} else if(done) {
- aborted = true; // Unexpected EOF
+ // account for newline at the end of the file
+ if (newlines == 4) {
+ newlines = 0;
+ }
+ else {
+ aborted = true; // Unexpected EOF
+ }
break;
}
buf.append(c);
}
+ if (c > 0) {
+ if (interleaved_) {
+ // alternate between read buffers
+ batch_a = !batch_a;
+ readbuf = batch_a ? &pt.bufa_ : &pt.bufb_;
+ // increment read counter after each pair gets read
+ readi = batch_a ? readi+1 : readi;
+ }
+ else {
+ readi++;
+ }
+ }
}
if(aborted) {
readi--;
@@ -926,7 +969,7 @@ pair<bool, int> FastqPatternSource::nextBatchFromFile(
*/
bool FastqPatternSource::parse(Read &r, Read& rb, TReadId rdid) const {
// We assume the light parser has put the raw data for the separate ends
- // into separate Read objects. That doesn't have to be the case, but
+ // into separate Read objects. That doesn't have to be the case, but
// that's how we've chosen to do it for FastqPatternSource
assert(!r.readOrigBuf.empty());
assert(r.empty());
@@ -1043,9 +1086,9 @@ pair<bool, int> TabbedPatternSource::nextBatchFromFile(
PerThreadReadBuf& pt,
bool batch_a)
{
- int c = getc_unlocked(fp_);
+ int c = getc_wrapper();
while(c >= 0 && (c == '\n' || c == '\r')) {
- c = getc_unlocked(fp_);
+ c = getc_wrapper();
}
EList<Read>& readbuf = batch_a ? pt.bufa_ : pt.bufb_;
size_t readi = 0;
@@ -1054,10 +1097,10 @@ pair<bool, int> TabbedPatternSource::nextBatchFromFile(
readbuf[readi].readOrigBuf.clear();
while(c >= 0 && c != '\n' && c != '\r') {
readbuf[readi].readOrigBuf.append(c);
- c = getc_unlocked(fp_);
+ c = getc_wrapper();
}
while(c >= 0 && (c == '\n' || c == '\r')) {
- c = getc_unlocked(fp_);
+ c = getc_wrapper();
}
}
return make_pair(c < 0, readi);
@@ -1177,9 +1220,9 @@ pair<bool, int> RawPatternSource::nextBatchFromFile(
PerThreadReadBuf& pt,
bool batch_a)
{
- int c = getc_unlocked(fp_);
+ int c = getc_wrapper();
while(c >= 0 && (c == '\n' || c == '\r')) {
- c = getc_unlocked(fp_);
+ c = getc_wrapper();
}
EList<Read>& readbuf = batch_a ? pt.bufa_ : pt.bufb_;
size_t readi = 0;
@@ -1188,15 +1231,15 @@ pair<bool, int> RawPatternSource::nextBatchFromFile(
readbuf[readi].readOrigBuf.clear();
while(c >= 0 && c != '\n' && c != '\r') {
readbuf[readi].readOrigBuf.append(c);
- c = getc_unlocked(fp_);
+ c = getc_wrapper();
}
while(c >= 0 && (c == '\n' || c == '\r')) {
- c = getc_unlocked(fp_);
+ c = getc_wrapper();
}
}
// incase a valid character is consumed between batches
- if (c >= 0 && c != '\n' && c != '\r') {
- ungetc(c, fp_);
+ if (c >= 0 && c != '\n' && c != '\r') {
+ ungetc_wrapper(c);
}
return make_pair(c < 0, readi);
}
@@ -1252,7 +1295,7 @@ bool RawPatternSource::parse(Read& r, Read& rb, TReadId rdid) const {
void wrongQualityFormat(const BTString& read_name) {
cerr << "Error: Encountered one or more spaces while parsing the quality "
- << "string for read " << read_name << ". If this is a FASTQ file "
+ << "string for read " << read_name << ". If this is a FASTQ file "
<< "with integer (non-ASCII-encoded) qualities, try re-running with "
<< "the --integer-quals option." << endl;
throw 1;
diff --git a/pat.h b/pat.h
index e5071db..a2556c7 100644
--- a/pat.h
+++ b/pat.h
@@ -20,6 +20,9 @@
#ifndef PAT_H_
#define PAT_H_
+#include <stdio.h>
+#include <sys/stat.h>
+#include <zlib.h>
#include <cassert>
#include <string>
#include <ctype.h>
@@ -75,18 +78,18 @@ struct PatternParams {
nthreads(nthreads_),
fixName(fixName_) { }
- int format; // file format
- bool fileParallel; // true -> wrap files with separate PatternComposers
- uint32_t seed; // pseudo-random seed
- size_t max_buf; // number of reads to buffer in one read
- bool solexa64; // true -> qualities are on solexa64 scale
- bool phred64; // true -> qualities are on phred64 scale
- bool intQuals; // true -> qualities are space-separated numbers
- int sampleLen; // length of sampled reads for FastaContinuous...
- int sampleFreq; // frequency of sampled reads for FastaContinuous...
- size_t skip; // skip the first 'skip' patterns
- int nthreads; // number of threads for locking
- bool fixName; //
+ int format; // file format
+ bool fileParallel; // true -> wrap files with separate PatternComposers
+ uint32_t seed; // pseudo-random seed
+ size_t max_buf; // number of reads to buffer in one read
+ bool solexa64; // true -> qualities are on solexa64 scale
+ bool phred64; // true -> qualities are on phred64 scale
+ bool intQuals; // true -> qualities are space-separated numbers
+ int sampleLen; // length of sampled reads for FastaContinuous...
+ int sampleFreq; // frequency of sampled reads for FastaContinuous...
+ size_t skip; // skip the first 'skip' patterns
+ int nthreads; // number of threads for locking
+ bool fixName; //
};
/**
@@ -163,10 +166,10 @@ struct PerThreadReadBuf {
}
const size_t max_buf_; // max # reads to read into buffer at once
- EList<Read> bufa_; // Read buffer for mate as
- EList<Read> bufb_; // Read buffer for mate bs
- size_t cur_buf_; // Read buffer currently active
- TReadId rdid_; // index of read at offset 0 of bufa_/bufb_
+ EList<Read> bufa_; // Read buffer for mate as
+ EList<Read> bufb_; // Read buffer for mate bs
+ size_t cur_buf_; // Read buffer currently active
+ TReadId rdid_; // index of read at offset 0 of bufa_/bufb_
};
extern void wrongQualityFormat(const BTString& read_name);
@@ -256,7 +259,7 @@ public:
/**
* Read next batch. However, batch concept is not very applicable for this
* PatternSource where all the info has already been parsed into the fields
- * in the contsructor. This essentially modifies the pt as though we read
+ * in the contsructor. This essentially modifies the pt as though we read
* in some number of patterns.
*/
virtual std::pair<bool, int> nextBatch(
@@ -280,12 +283,12 @@ public:
private:
- size_t cur_; // index for first read of next batch
- size_t skip_; // # reads to skip
- bool paired_; // whether reads are paired
- EList<string> tokbuf_; // buffer for storing parsed tokens
+ size_t cur_; // index for first read of next batch
+ size_t skip_; // # reads to skip
+ bool paired_; // whether reads are paired
+ EList<string> tokbuf_; // buffer for storing parsed tokens
EList<Read::TBuf> bufs_; // per-read buffers
- char nametmp_[20]; // temp buffer for constructing name
+ char nametmp_[20]; // temp buffer for constructing name
};
/**
@@ -303,9 +306,11 @@ public:
infiles_(infiles),
filecur_(0),
fp_(NULL),
+ zfp_(NULL),
is_open_(false),
skip_(p.skip),
- first_(true)
+ first_(true),
+ compressed_(false)
{
assert_gt(infiles.size(), 0);
errs_.resize(infiles_.size());
@@ -319,14 +324,20 @@ public:
*/
virtual ~CFilePatternSource() {
if(is_open_) {
- assert(fp_ != NULL);
- fclose(fp_);
+ if (compressed_) {
+ assert(zfp_ != NULL);
+ gzclose(zfp_);
+ }
+ else {
+ assert(fp_ != NULL);
+ fclose(fp_);
+ }
}
}
/**
* Fill Read with the sequence, quality and name for the next
- * read in the list of read files. This function gets called by
+ * read in the list of read files. This function gets called by
* all the search threads, so we must handle synchronization.
*
* Returns pair<bool, int> where bool indicates whether we're
@@ -352,7 +363,7 @@ protected:
/**
* Light-parse a batch of unpaired reads from current file into the given
- * buffer. Called from CFilePatternSource.nextBatch().
+ * buffer. Called from CFilePatternSource.nextBatch().
*/
virtual std::pair<bool, int> nextBatchFromFile(
PerThreadReadBuf& pt,
@@ -367,15 +378,42 @@ protected:
* Open the next file in the list of input files.
*/
void open();
+
+ int getc_wrapper() {
+ return compressed_ ? gzgetc(zfp_) : getc_unlocked(fp_);
+ }
+
+ int ungetc_wrapper(int c) {
+ return compressed_ ? gzungetc(c, zfp_) : ungetc(c, fp_);
+ }
+
+ bool is_gzipped_file(const std::string& filename) {
+ struct stat s;
+ if (stat(filename.c_str(), &s) != 0) {
+ perror("stat");
+ }
+ else {
+ if (S_ISFIFO(s.st_mode))
+ return true;
+ }
+ size_t pos = filename.find_last_of(".");
+ std::string ext = (pos == std::string::npos) ? "" : filename.substr(pos + 1);
+ if (ext == "" || ext == "gz" || ext == "Z") {
+ return true;
+ }
+ return false;
+ }
EList<std::string> infiles_; // filenames for read files
- EList<bool> errs_; // whether we've already printed an error for each file
- size_t filecur_; // index into infiles_ of next file to read
- FILE *fp_; // read file currently being read from
- bool is_open_; // whether fp_ is currently open
- TReadId skip_; // number of reads to skip
- bool first_; // parsing first record in first file?
- char buf_[64*1024]; // file buffer
+ EList<bool> errs_; // whether we've already printed an error for each file
+ size_t filecur_; // index into infiles_ of next file to read
+ FILE *fp_; // read file currently being read from
+ gzFile zfp_;
+ bool is_open_; // whether fp_ is currently open
+ TReadId skip_; // number of reads to skip
+ bool first_; // parsing first record in first file?
+ char buf_[64*1024]; // file buffer
+ bool compressed_;
};
/**
@@ -469,10 +507,10 @@ protected:
PerThreadReadBuf& pt,
bool batch_a);
- bool solQuals_; // base qualities are log odds
+ bool solQuals_; // base qualities are log odds
bool phred64Quals_; // base qualities are on -64 scale
- bool intQuals_; // base qualities are space-separated strings
- bool secondName_; // true if --tab6, false if --tab5
+ bool intQuals_; // base qualities are space-separated strings
+ bool secondName_; // true if --tab6, false if --tab5
};
/**
@@ -487,8 +525,8 @@ protected:
* 5. X coordinate of spot
* 6. Y coordinate of spot
* 7. Index: "Index sequence or 0. For no indexing, or for a file that
- * has not been demultiplexed yet, this field should have a value of
- * 0."
+ * has not been demultiplexed yet, this field should have a value of
+ * 0."
* 8. Read number: 1 for unpaired, 1 or 2 for paired
* 9. Sequence
* 10. Quality
@@ -520,9 +558,9 @@ protected:
PerThreadReadBuf& pt,
bool batch_a);
- bool solQuals_; // base qualities are log odds
+ bool solQuals_; // base qualities are log odds
bool phred64Quals_; // base qualities are on -64 scale
- bool intQuals_; // base qualities are space-separated strings
+ bool intQuals_; // base qualities are space-separated strings
EList<std::string> qualToks_;
};
@@ -581,19 +619,19 @@ protected:
private:
const size_t length_; /// length of reads to generate
const size_t freq_; /// frequency to sample reads
- size_t eat_; /// number of characters we need to skip before
- /// we have flushed all of the ambiguous or
- /// non-existent characters out of our read
- /// window
- bool beginning_; /// skipping over the first read length?
- char buf_[1024]; /// FASTA sequence buffer
+ size_t eat_; /// number of characters we need to skip before
+ /// we have flushed all of the ambiguous or
+ /// non-existent characters out of our read
+ /// window
+ bool beginning_; /// skipping over the first read length?
+ char buf_[1024]; /// FASTA sequence buffer
Read::TBuf name_prefix_buf_; /// FASTA sequence name buffer
char name_int_buf_[20]; /// for composing offsets for names
- size_t bufCur_; /// buffer cursor; points to where we should
- /// insert the next character
+ size_t bufCur_; /// buffer cursor; points to where we should
+ /// insert the next character
uint64_t subReadCnt_;/// number to subtract from readCnt_ to get
- /// the pat id to output (so it resets to 0 for
- /// each new sequence)
+ /// the pat id to output (so it resets to 0 for
+ /// each new sequence)
};
/**
@@ -606,12 +644,13 @@ public:
FastqPatternSource(
const EList<std::string>& infiles,
- const PatternParams& p) :
+ const PatternParams& p, bool interleaved = false) :
CFilePatternSource(infiles, p),
first_(true),
solQuals_(p.solexa64),
phred64Quals_(p.phred64),
- intQuals_(p.intQuals) { }
+ intQuals_(p.intQuals),
+ interleaved_(interleaved) { }
virtual void reset() {
first_ = true;
@@ -639,14 +678,15 @@ protected:
first_ = true;
}
- bool first_; // parsing first read in file
- bool solQuals_; // base qualities are log odds
+ bool first_; // parsing first read in file
+ bool solQuals_; // base qualities are log odds
bool phred64Quals_; // base qualities are on -64 scale
- bool intQuals_; // base qualities are space-separated strings
+ bool intQuals_; // base qualities are space-separated strings
+ bool interleaved_; // fastq reads are interleaved
};
/**
- * Read a Raw-format file (one sequence per line). No quality strings
+ * Read a Raw-format file (one sequence per line). No quality strings
* allowed. All qualities are assumed to be 'I' (40 on the Phred-33
* scale).
*/
@@ -718,15 +758,15 @@ public:
* dispense them.
*/
static PatternComposer* setupPatternComposer(
- const EList<std::string>& si, // singles, from argv
- const EList<std::string>& m1, // mate1's, from -1 arg
- const EList<std::string>& m2, // mate2's, from -2 arg
- const EList<std::string>& m12, // both mates on each line, from --12
- const EList<std::string>& q, // qualities associated with singles
- const EList<std::string>& q1, // qualities associated with m1
- const EList<std::string>& q2, // qualities associated with m2
- const PatternParams& p, // read-in params
- bool verbose); // be talkative?
+ const EList<std::string>& si, // singles, from argv
+ const EList<std::string>& m1, // mate1's, from -1 arg
+ const EList<std::string>& m2, // mate2's, from -2 arg
+ const EList<std::string>& m12, // both mates on each line, from --12
+ const EList<std::string>& q, // qualities associated with singles
+ const EList<std::string>& q1, // qualities associated with m1
+ const EList<std::string>& q2, // qualities associated with m2
+ const PatternParams& p, // read-in params
+ bool verbose); // be talkative?
protected:
@@ -814,7 +854,7 @@ public:
// srca_ and srcb_ must be parallel
assert_eq(srca_->size(), srcb_->size());
for(size_t i = 0; i < srca_->size(); i++) {
- // Can't have NULL first-mate sources. Second-mate sources
+ // Can't have NULL first-mate sources. Second-mate sources
// can be NULL, in the case when the corresponding first-
// mate source is unpaired.
assert((*srca_)[i] != NULL);
@@ -937,10 +977,10 @@ private:
}
PatternComposer& composer_; // pattern composer
- PerThreadReadBuf buf_; // read data buffer
- const PatternParams& pp_; // pattern-related parameters
- bool last_batch_; // true if this is final batch
- int last_batch_size_; // # reads read in previous batch
+ PerThreadReadBuf buf_; // read data buffer
+ const PatternParams& pp_; // pattern-related parameters
+ bool last_batch_; // true if this is final batch
+ int last_batch_size_; // # reads read in previous batch
};
/**
diff --git a/pthreadGC2.dll b/pthreadGC2.dll
deleted file mode 100644
index 8b9116c..0000000
Binary files a/pthreadGC2.dll and /dev/null differ
diff --git a/read_qseq.cpp b/read_qseq.cpp
index be1e868..286e28d 100644
--- a/read_qseq.cpp
+++ b/read_qseq.cpp
@@ -55,9 +55,9 @@ pair<bool, int> QseqPatternSource::nextBatchFromFile(
PerThreadReadBuf& pt,
bool batch_a)
{
- int c = getc_unlocked(fp_);
+ int c = getc_wrapper();
while(c >= 0 && (c == '\n' || c == '\r')) {
- c = getc_unlocked(fp_);
+ c = getc_wrapper();
}
EList<Read>& readbuf = batch_a ? pt.bufa_ : pt.bufb_;
size_t readi = 0;
@@ -66,10 +66,10 @@ pair<bool, int> QseqPatternSource::nextBatchFromFile(
readbuf[readi].readOrigBuf.clear();
while(c >= 0 && c != '\n' && c != '\r') {
readbuf[readi].readOrigBuf.append(c);
- c = getc_unlocked(fp_);
+ c = getc_wrapper();
}
while(c >= 0 && (c == '\n' || c == '\r')) {
- c = getc_unlocked(fp_);
+ c = getc_wrapper();
}
}
return make_pair(c < 0, readi);
diff --git a/scoring.h b/scoring.h
index 2ac50a4..54038a6 100644
--- a/scoring.h
+++ b/scoring.h
@@ -51,8 +51,8 @@
// Linear coefficient a
#define DEFAULT_MIN_LINEAR (-0.6f)
// Different defaults for --local mode
-#define DEFAULT_MIN_CONST_LOCAL (1.0f)
-#define DEFAULT_MIN_LINEAR_LOCAL (10.0f)
+#define DEFAULT_MIN_CONST_LOCAL (20.0f)
+#define DEFAULT_MIN_LINEAR_LOCAL (8.0f)
// Constant coefficient b in linear function f(x) = ax + b determining
// maximum permitted number of Ns f in a read before it is filtered &
diff --git a/scripts/test/simple_tests.pl b/scripts/test/simple_tests.pl
index c27744e..6cc1fb4 100644
--- a/scripts/test/simple_tests.pl
+++ b/scripts/test/simple_tests.pl
@@ -36,10 +36,12 @@ use Test::Deep;
my $bowtie2 = "";
my $bowtie2_build = "";
+my $compressed;
GetOptions(
- "bowtie2=s" => \$bowtie2,
- "bowtie2-build=s" => \$bowtie2_build) || die "Bad options";
+ "compressed-reads" => \$compressed,
+ "bowtie2=s" => \$bowtie2,
+ "bowtie2-build=s" => \$bowtie2_build) || die "Bad options";
if(! -x $bowtie2 || ! -x $bowtie2_build) {
my $bowtie2_dir = `dirname $bowtie2`;
@@ -4294,8 +4296,8 @@ sub writeReads($$$$$$$$$) {
$fq1,
$fq2) = @_;
- open(FQ1, ">$fq1") || die "Could not open '$fq1' for writing";
- open(FQ2, ">$fq2") || die "Could not open '$fq2' for writing";
+ open(FQ1, defined($compressed) ? "| gzip -c >$fq1.gz" : ">$fq1") || die "Could not open '$fq1' for writing";
+ open(FQ2, defined($compressed) ? "| gzip -c >$fq2.gz" : ">$fq2") || die "Could not open '$fq2' for writing";
my $pe = (defined($mate1s) && $mate1s ne "");
if($pe) {
for (0..scalar(@$mate1s)-1) {
@@ -4392,7 +4394,7 @@ my $idx_type = "";
$formatarg = "-q";
$ext = ".fq";
} elsif($read_file_format eq "tabbed") {
- $formatarg = "--12";
+ $formatarg = "--tab5";
$ext = ".tab";
} elsif($read_file_format eq "cline_reads") {
$formatarg = "-c";
@@ -4417,9 +4419,14 @@ my $idx_type = "";
die "Bad format: $read_file_format";
}
if($formatarg ne "-c") {
+ my $cmd = ">";
+ if (defined($compressed)) {
+ $cmd = "| gzip -c" . $cmd;
+ $ext = $ext . ".gz";
+ }
if(defined($read_file)) {
# Unpaired
- open(RD, ">.simple_tests$ext") || die;
+ open(RD, $cmd . ".simple_tests$ext") || die;
print RD $read_file;
close(RD);
$readarg = ".simple_tests$ext";
@@ -4427,10 +4434,10 @@ my $idx_type = "";
defined($mate1_file) || die;
defined($mate2_file) || die;
# Paired
- open(M1, ">.simple_tests.1$ext") || die;
+ open(M1, $cmd . ".simple_tests.1$ext") || die;
print M1 $mate1_file;
close(M1);
- open(M2, ">.simple_tests.2$ext") || die;
+ open(M2, $cmd . ".simple_tests.2$ext") || die;
print M2 $mate2_file;
close(M2);
$mate1arg = ".simple_tests.1$ext";
@@ -4448,8 +4455,8 @@ my $idx_type = "";
$names,
".simple_tests.1.fq",
".simple_tests.2.fq");
- $mate1arg = ".simple_tests.1.fq";
- $mate2arg = ".simple_tests.2.fq";
+ $mate1arg = defined($compressed) ? ".simple_tests.1.fq.gz" : ".simple_tests.1.fq";
+ $mate2arg = defined($compressed) ? ".simple_tests.2.fq.gz" : ".simple_tests.2.fq";
$formatarg = "-q";
$readarg = $mate1arg;
}
diff --git a/scripts/test/simple_tests.sh b/scripts/test/simple_tests.sh
index 90b13f2..b71ade5 100644
--- a/scripts/test/simple_tests.sh
+++ b/scripts/test/simple_tests.sh
@@ -31,4 +31,5 @@ make $* bowtie2-align-s \
bowtie2-build-l-debug && \
perl scripts/test/simple_tests.pl \
--bowtie2=./bowtie2 \
- --bowtie2-build=./bowtie2-build
+ --bowtie2-build=./bowtie2-build \
+ --compressed
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/bowtie2.git
More information about the debian-med-commit
mailing list