[med-svn] [bowtie2] 01/05: New upstream version 2.3.1

Alex Mestiashvili malex-guest at moszumanska.debian.org
Tue Mar 7 11:57:49 UTC 2017


This is an automated email from the git hooks/post-receive script.

malex-guest pushed a commit to branch master
in repository bowtie2.

commit b224a1e79a213864732cf644ee347207713a78cc
Author: Alexandre Mestiashvili <alex at biotec.tu-dresden.de>
Date:   Tue Mar 7 10:37:50 2017 +0100

    New upstream version 2.3.1
---
 MANUAL                       |  19 ++++
 MANUAL.markdown              |  40 +++++++
 Makefile                     |   5 +-
 NEWS                         |  28 ++++-
 VERSION                      |   2 +-
 bowtie2                      |   8 +-
 bowtie_main.cpp              | 241 +++++++++++++++++++++++++++++++++++++++++--
 bt2_idx.h                    |   6 +-
 bt2_search.cpp               |   7 +-
 doc/manual.html              |  24 +++++
 doc/website/manual.ssi       |  23 ++++-
 doc/website/old_news.ssi     |  40 +++++++
 doc/website/recent_news.ssi  |  54 ++--------
 doc/website/rhsidebar.ssi    |  13 +--
 formats.h                    |   1 +
 opts.h                       |   3 +-
 pat.cpp                      | 145 +++++++++++++++++---------
 pat.h                        | 182 +++++++++++++++++++-------------
 pthreadGC2.dll               | Bin 60273 -> 0 bytes
 read_qseq.cpp                |   8 +-
 scoring.h                    |   4 +-
 scripts/test/simple_tests.pl |  27 +++--
 scripts/test/simple_tests.sh |   3 +-
 23 files changed, 676 insertions(+), 207 deletions(-)

diff --git a/MANUAL b/MANUAL
index 359bb68..b1395ac 100644
--- a/MANUAL
+++ b/MANUAL
@@ -863,6 +863,25 @@ Reads (specified with `<m1>`, `<m2>`, `<s>`) are FASTQ files.  FASTQ files
 usually have extension `.fq` or `.fastq`.  FASTQ is the default format.  See
 also: `--solexa-quals` and `--int-quals`.
 
+    --interleaved
+
+Reads interleaved FASTQ files where the first two records (8 lines)
+represent a mate pair.
+
+    --tab5
+
+Each read or pair is on a single line. An unpaired read line is
+[name]\t[seq]\t[qual]\n. A paired-end read line is
+[name]\t[seq1]\t[qual1]\t[seq2]\t[qual2]\n. An input file can be a
+mix of unpaired and paired-end reads and Bowtie 2 recognizes each
+according to the number of fields, handling each as it should.
+
+    --tab6
+
+Similar to `--tab5` except, for paired-end reads, the second end can have a
+different name from the first:
+[name1]\t[seq1]\t[qual1]\t[name2]\t[seq2]\t[qual2]\n
+
     --qseq
 
 Reads (specified with `<m1>`, `<m2>`, `<s>`) are QSEQ files.  QSEQ files usually
diff --git a/MANUAL.markdown b/MANUAL.markdown
index 2ec2828..7378b1a 100644
--- a/MANUAL.markdown
+++ b/MANUAL.markdown
@@ -921,6 +921,46 @@ usually have extension `.fq` or `.fastq`.  FASTQ is the default format.  See
 also: [`--solexa-quals`] and [`--int-quals`].
 
 </td></tr>
+<tr><td id="bowtie2-options-interleaved">
+
+[`--interleaved`]: #bowtie2-options-interleaved
+
+    --interleaved
+
+</td><td>
+
+Reads interleaved FASTQ files where the first two records (8 lines)
+represent a mate pair.
+
+</td></tr>
+<tr><td id="bowtie2-options-tab5">
+
+[`--tab5`]: #bowtie2-options-tab5
+
+    --tab5
+
+</td><td>
+
+Each read or pair is on a single line. An unpaired read line is
+[name]\t[seq]\t[qual]\n. A paired-end read line is
+[name]\t[seq1]\t[qual1]\t[seq2]\t[qual2]\n. An input file can be a
+mix of unpaired and paired-end reads and Bowtie 2 recognizes each
+according to the number of fields, handling each as it should.
+
+</td></tr>
+<tr><td id="bowtie2-options-tab6">
+
+[`--tab6`]: #bowtie2-options-tab6
+
+    --tab6
+
+</td><td>
+
+Similar to [`--tab5`] except, for paired-end reads, the second end can have a
+different name from the first:
+[name1]\t[seq1]\t[qual1]\t[name2]\t[seq2]\t[qual2]\n
+
+</td></tr>
 <tr><td id="bowtie2-options-qseq">
 
 [`--qseq`]: #bowtie2-options-qseq
diff --git a/Makefile b/Makefile
index f2aa557..cca55fb 100644
--- a/Makefile
+++ b/Makefile
@@ -25,6 +25,7 @@ prefix = /usr/local
 bindir = $(prefix)/bin
 
 INC =
+LIBS = -lreadline -ltermcap -lz
 GCC_PREFIX = $(shell dirname `which gcc`)
 GCC_SUFFIX =
 CC ?= $(GCC_PREFIX)/gcc$(GCC_SUFFIX)
@@ -94,10 +95,10 @@ endif
 
 #default is to use Intel TBB
 ifneq (1,$(NO_TBB))
-	LIBS = $(PTHREAD_LIB) -ltbb -ltbbmalloc_proxy
+	LIBS += $(PTHREAD_LIB) -ltbb -ltbbmalloc_proxy
 	override EXTRA_FLAGS += -DWITH_TBB
 else
-	LIBS = $(PTHREAD_LIB)
+	LIBS += $(PTHREAD_LIB)
 endif
 SEARCH_LIBS =
 BUILD_LIBS =
diff --git a/NEWS b/NEWS
index c52d521..de7e1a6 100644
--- a/NEWS
+++ b/NEWS
@@ -3,7 +3,7 @@ Bowtie 2 NEWS
 
 Bowtie 2 is now available for download from the project website,
 http://bowtie-bio.sf.net/bowtie2.  2.0.0-beta1 is the first version released to
-the public and 2.3.0 is the latest version.  Bowtie 2 is licensed under
+the public and 2.3.1 is the latest version.  Bowtie 2 is licensed under
 the GPLv3 license.  See `LICENSE' file for details.
 
 Reporting Issues
@@ -16,6 +16,32 @@ Please report any issues using the Sourceforge bug tracker:
 Version Release History
 =======================
 
+Version 2.3.1 - Mar 04, 2017
+Please note that as of this release Bowtie 2 now has dependencies on zlib and readline libraries.
+Make sure that all dependencies are met before attempting to build from source.
+
+    * Added native support for gzipped read files.  The wrapper
+      script is no longer responsible for decompression.  This
+      simplifies the wrapper and improves speed and thread scalability
+      for gzipped inputs.
+    * Fixed a bug that caused `bowtie2-build` to crash when the
+      first FASTA sequence contains all Ns.
+    * Add support for interleaved FASTQ format (`-—interleaved`)
+    * Empty FASTQ inputs would yield an error in Bowtie 2 2.3.0,
+      whereas previous versions would simply align 0 reads and report
+      the SAM header as usual.  This version returns to the pre-2.3.0
+      behavior, resolving a compatibility issue between TopHat2 and
+      Bowtie 2 2.3.0.
+    * Fixed a bug whereby combining `-—un-conc*` with `-k` or `-a`
+      would cause `bowtie2` to print duplicate reads in one or both
+      of the `--un-conc*` output files, causing the ends to be
+      misaligned.
+    * The default `--score-min` for `--local` mode is now `G,20,8`.
+      That was the stated default in the documentation for a while,
+      but the actual default was `G,0,10` for many versions.  Now the
+      default matches the documentation and, we find, yields more
+      accurate alignments than `G,0,10`
+
 Version 2.3.0 - Dec 13, 2016
    * Code related to read parsing was completely rewritten to improve
      scalability to many threads.  In short, the critical section is
diff --git a/VERSION b/VERSION
index 276cbf9..2bf1c1c 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.3.0
+2.3.1
diff --git a/bowtie2 b/bowtie2
index 12e3567..16a0175 100755
--- a/bowtie2
+++ b/bowtie2
@@ -217,6 +217,10 @@ for(my $i = 0; $i < scalar(@bt2_args); $i++) {
 			last;
 		}
 	}
+    if ($arg =~ /(bz2|lz4)$/) {
+        push @bt2w_args, ("-U", $arg);
+        $bt2_args[$i] = undef;
+    }
 }
 # If the user asked us to redirect some reads to files, or to suppress
 # unaligned reads, then we need to capture the output from Bowtie 2 and pass it
@@ -312,7 +316,7 @@ sub cat_file($$) {
 sub wrapInput($$$) {
 	my ($unps, $mate1s, $mate2s) = @_;
 	for my $fn (@$unps, @$mate1s, @$mate2s) {
-		return 1 if $fn =~ /\.gz$/ || $fn =~ /\.bz2$/ || $fn =~ /\.lz4$/;
+		return 1 if $fn =~ /\.bz2$/ || $fn =~ /\.lz4$/;
 	}
 	return 0;
 }
@@ -546,7 +550,7 @@ if(defined($cap_out)) {
 			my $fl = substr($_, $tab1_i, $tab2_i - $tab1_i);
 			my $unal = ($fl & 4) != 0;
 			$filt = 1 if $no_unal && $unal;
-			if($passthru) {
+			if($passthru && ($fl & 256) == 0) {
 				if(scalar(keys %read_fhs) == 0) {
 					# Next line is read with some whitespace escaped
 					my $l = <BT>;
diff --git a/bowtie_main.cpp b/bowtie_main.cpp
index 844c8ca..3e39acf 100644
--- a/bowtie_main.cpp
+++ b/bowtie_main.cpp
@@ -19,10 +19,23 @@
 
 #include <iostream>
 #include <fstream>
+#include <sstream>
+#include <iomanip>
 #include <string.h>
 #include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
 #include "tokenize.h"
-#include "ds.h"
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <vector>
+#include <iterator>
+#include <algorithm>
+#include <readline/readline.h>
+#include <readline/history.h>
 
 using namespace std;
 
@@ -39,23 +52,237 @@ extern "C" {
  * per line, and will dispatch each batch of arguments one at a time to
  * bowtie.
  */
+
+static volatile sig_atomic_t done = false;
+
+static const char *options[] = {
+"--al",                 "--al-conc",               "--dpad",            "--end-to-end",
+"--fast",               "--fast-local",            "--fr",              "--gbar",
+"--ignore-quals",       "--int-quals",             "--local",           "--ma",
+"--met",                "--met-file",              "--met-stderr",      "--mm",
+"--mp",                 "--n-ceil",                "--no-1mm-upfront",  "--no-contain",
+"--no-discordant",      "--no-dovetail",           "--no-head",         "--no-mixed",
+"--no-overlap",         "--no-sq",                 "--no-unal",         "--nofw",
+"--non-deterministic",  "--norc",                  "--np",              "--omit-sec-seq",
+"--phred33",            "--phred64",               "--qc-filter",       "--qseq",
+"--quiet",              "--rdg",                   "--reorder",         "--rfg",
+"--rg",                 "--rg-id",                 "--score-min",       "--seed",
+"--sensitive",          "--sensitive-local",       "--un",              "--un-conc",
+"--un-gz",              "--version",               "--very-fast",       "--very-fast-local",
+"--very-sensitive",     "--very-sensitive-local",  "-3",                "-5",
+"-D",                   "-I",                      "-L",                "-N",
+"-R",                   "-X",                      "-a",                "-c",
+"-f",                   "-h",                      "-i",                "-k",
+"-p",                   "-q",                      "-r",                "-s",
+"-t",                   "-u",                      "-1",                "-2",
+"-S",                   "-U",                      "--all",             "--ff",
+"--help",               "--maxins",                "--minins",          "--rf",
+"--skip",               "--threads",               "--time",            "--trim3",
+"--trim5",              "--upto",                  NULL
+};
+
+
+static bool isdirectory(const char *path) {
+	struct stat statbuf;
+	if(stat(path, &statbuf) != 0) {
+		perror("stat");
+		return true;
+	}
+	return S_ISDIR(statbuf.st_mode);
+}
+
+static char *optgen(const char *text, int state) {
+	static int list_index, len;
+	const char *name = NULL;
+
+	if (!state) {
+		list_index = 0;
+		len = strlen(text);
+	}
+
+	name = rl_filename_completion_function(text, state);
+	if (name != NULL) {
+		rl_completion_append_character = isdirectory(name) ? '/': ' ';
+		return strdup(name);
+	}
+
+	if (text[0] == '-') {
+		while ((name = options[list_index++])) {
+			if (strncmp(name, text, len) == 0) {
+				return strdup(name);
+			}
+		}
+	}
+	return NULL;
+}
+
+static char **optcomplete(const char *text, int start, int end) {
+	rl_attempted_completion_over = 1;
+	return rl_completion_matches(text, optgen);
+}
+
+static void rlinit() {
+	rl_attempted_completion_function = optcomplete;
+}
+
+static void handler(int sig) {
+	done = true;
+}
+
+static int _getline(istream *in, char *buf, size_t len) {
+	if (in == &cin) {
+		char *input = readline("bowtie2> ");
+		if (!input) {
+			buf[0] = '\0';
+		}
+		else {
+			strncpy(buf, input, len-1);
+			add_history(buf);
+			free(input);
+		}
+	}
+	else {
+		in->getline(buf, len-1);
+		if (!in->good())
+			buf[0] = '\0';
+	}
+	buf[len-1] = '\0';
+	return strlen(buf);
+}
+
+static int createfifo(vector<char *>& v) {
+	const char *pattern = "/tmp/btfifo.XX";
+	char *fn = (char *)malloc(strlen(pattern)+1);
+	memset(fn, 0, strlen(pattern)+1);
+	strcpy(fn, pattern);
+	mktemp(fn);
+
+	if (mkfifo(fn, S_IRUSR | S_IWUSR) == -1) {
+		perror("mkfifo");
+		return 1;
+	}
+	v.push_back(fn);
+	return 0; 
+}
+
+void printargs(const char **args, size_t size) {
+	copy(args, args+size, ostream_iterator<string>(cout, " "));
+	cout << endl;
+}
+
+bool called_from_wrapper(int argc, const char **argv) {
+	if (argc > 2 && strcmp(argv[1], "--wrapper") == 0
+			&& strcmp(argv[2], "basic-0") == 0)
+		return true;
+	return false;
+}
+
 int main(int argc, const char **argv) {
-	if(argc > 2 && strcmp(argv[1], "-A") == 0) {
-		const char *file = argv[2];
+	int offset = called_from_wrapper(argc, argv) ? 3 : 1;
+
+	if(argc > offset + 1 && strcmp(argv[offset], "-A") == 0) {
+		const char *file = argv[offset+1];
 		ifstream in;
-		in.open(file);
+		istream *inptr = ∈
+		if (strcmp(file, "-") == 0) {
+			inptr = &cin;
+		}
+		else {
+			in.open(file);
+		}
 		char buf[4096];
 		int lastret = -1;
-		while(in.getline(buf, 4095)) {
-			EList<string> args;
+
+		rlinit();
+		while(_getline(inptr, buf, 4096)) {
+			done = false;
+			vector<string> args;
 			args.push_back(string(argv[0]));
+
+			if (offset > 1) {
+				args.push_back(string(argv[1]));
+				args.push_back(string(argv[2]));
+			}
+
 			tokenize(buf, " \t", args);
 			const char **myargs = (const char**)malloc(sizeof(char*)*args.size());
+			vector<char *> fifonames;
+			int sam_outfile_pos = -1;
+
 			for(size_t i = 0; i < args.size(); i++) {
+				if (args[i] == "_") {
+					if (i > 0 && args[i-1] == "-S") {
+						sam_outfile_pos = i;
+					}
+					else {
+						createfifo(fifonames);
+						args[i] = fifonames.back();
+					}
+				}
 				myargs[i] = args[i].c_str();
 			}
 			if(args.size() == 1) continue;
-			lastret = bowtie((int)args.size(), myargs);
+
+			if (fifonames.size() > 0) {
+				struct sigaction sa;
+				sigemptyset(&sa.sa_mask);
+				sa.sa_flags = 0;
+				sa.sa_handler = handler;
+
+				if (sigaction(SIGINT, &sa, NULL) == -1
+						|| signal(SIGPIPE, SIG_IGN) == SIG_ERR) {
+					perror("sigaction");
+					exit(EXIT_FAILURE);
+				}
+
+				struct pollfd *pollfds = (struct pollfd *)calloc(fifonames.size(), sizeof(struct pollfd));
+				if (pollfds == NULL) {
+					perror("calloc");
+					exit(EXIT_FAILURE);
+				}
+
+				for (size_t i = 0; i < fifonames.size(); i++) {
+					pollfds[i].fd = open(fifonames[i], O_NONBLOCK | O_EXCL);
+					pollfds[i].events = POLLIN;
+				}
+
+				printargs(myargs, args.size());
+
+				for (int count = 0;; count++) {
+					size_t r = poll(pollfds, fifonames.size(), -1);
+					// wait until all fifos are ready
+					if (r != fifonames.size()) {
+						if (done)
+							break;
+						continue;
+					}
+					if (sam_outfile_pos >= 0) {
+						ostringstream os;
+						os << "out" << getpid() << "_" << setfill('0') << setw(3) << count << ".sam"; 
+						myargs[sam_outfile_pos] = os.str().c_str();
+						printargs(myargs, args.size());
+					}
+
+					lastret = bowtie((int)args.size(), myargs);
+
+					// replace the args shuffled by getopt
+					for (size_t i = 0; i < args.size(); i++) {
+						myargs[i] = args[i].c_str();
+					}
+				}
+
+				for (size_t i = 0; i < fifonames.size(); i++) {
+					if (close(pollfds[i].fd))
+						perror("close");
+					if (remove(fifonames[i]))
+						perror("remove");
+					free(fifonames[i]);
+				}
+				free(pollfds);
+			}
+			else {
+				lastret = bowtie((int)args.size(), myargs);
+			}
 			free(myargs);
 		}
 		if(lastret == -1) {
diff --git a/bt2_idx.h b/bt2_idx.h
index 0b63148..e2d5083 100644
--- a/bt2_idx.h
+++ b/bt2_idx.h
@@ -2595,9 +2595,11 @@ void Ebwt::joinToDisk(
 			if(npat >= 0) {
 				writeU<TIndexOffU>(out1, this->plen()[npat], this->toBe());
 			}
-			npat++;
-			this->plen()[npat] = (szs[i].len + szs[i].off);
+			this->plen()[++npat] = (szs[i].len + szs[i].off);
 		} else {
+			// edge case, but we could get here with npat == -1
+			// e.g. when building from a reference of all Ns
+			if (npat < 0) npat = 0;
 			this->plen()[npat] += (szs[i].len + szs[i].off);
 		}
 	}
diff --git a/bt2_search.cpp b/bt2_search.cpp
index 783f544..e5e02c6 100644
--- a/bt2_search.cpp
+++ b/bt2_search.cpp
@@ -477,6 +477,7 @@ static struct option long_options[] = {
 	{(char*)"12",           required_argument, 0,            ARG_ONETWO},
 	{(char*)"tab5",         required_argument, 0,            ARG_TAB5},
 	{(char*)"tab6",         required_argument, 0,            ARG_TAB6},
+	{(char*)"interleaved",  required_argument, 0,            ARG_INTERLEAVED_FASTQ},
 	{(char*)"phred33-quals", no_argument,      0,            ARG_PHRED33},
 	{(char*)"phred64-quals", no_argument,      0,            ARG_PHRED64},
 	{(char*)"phred33",       no_argument,      0,            ARG_PHRED33},
@@ -685,6 +686,9 @@ static void printUsage(ostream& out) {
 		<< endl
 	    << " Input:" << endl
 	    << "  -q                 query input files are FASTQ .fq/.fastq (default)" << endl
+		<< "  --interleaved      query input files are interleaved paired-end FASTQ reads" << endl
+		<< "  --tab5             query input files are TAB5 .tab5" << endl
+		<< "  --tab6             query input files are TAB6 .tab6" << endl
 	    << "  --qseq             query input files are in Illumina's qseq format" << endl
 	    << "  -f                 query input files are (multi-)FASTA .fa/.mfa" << endl
 	    << "  -r                 query input files are raw one-sequence-per-line" << endl
@@ -753,7 +757,7 @@ static void printUsage(ostream& out) {
 	    << "  --fr/--rf/--ff     -1, -2 mates align fw/rev, rev/fw, fw/fw (--fr)" << endl
 		<< "  --no-mixed         suppress unpaired alignments for paired reads" << endl
 		<< "  --no-discordant    suppress discordant alignments for paired reads" << endl
-		<< "  --no-dovetail      not concordant when mates extend past each other" << endl
+		<< "  --dovetail         concordant when mates extend past each other" << endl
 		<< "  --no-contain       not concordant when one mate alignment contains other" << endl
 		<< "  --no-overlap       not concordant when mates overlap at all" << endl
 		<< endl
@@ -935,6 +939,7 @@ static void parseOption(int next_option, const char *arg) {
 		case ARG_ONETWO: tokenize(arg, ",", mates12); format = TAB_MATE5; break;
 		case ARG_TAB5:   tokenize(arg, ",", mates12); format = TAB_MATE5; break;
 		case ARG_TAB6:   tokenize(arg, ",", mates12); format = TAB_MATE6; break;
+		case ARG_INTERLEAVED_FASTQ: tokenize(arg, ",", mates12); format = INTERLEAVED; break;
 		case 'f': format = FASTA; break;
 		case 'F': {
 			format = FASTA_CONT;
diff --git a/doc/manual.html b/doc/manual.html
index 0b2ba00..fade3a3 100644
--- a/doc/manual.html
+++ b/doc/manual.html
@@ -358,6 +358,30 @@ Reference: GCAGATTATATGAGTCAGCTACGATATTGTTTGGGGTGACACATTACGCGTCTTTGAC</code></pr
 </td>
 </tr>
 <tr>
+<td id="bowtie2-options-interleaved">
+<pre><code>--interleaved</code></pre>
+</td>
+<td>
+<p>Reads interleaved FASTQ files where the first two records (8 lines) represent a mate pair.</p>
+</td>
+</tr>
+<tr>
+<td id="bowtie2-options-tab5">
+<pre><code>--tab5</code></pre>
+</td>
+<td>
+<p>Each read or pair is on a single line. An unpaired read line is [name]. A paired-end read line is [name]. An input file can be a mix of unpaired and paired-end reads and Bowtie 2 recognizes each according to the number of fields, handling each as it should.</p>
+</td>
+</tr>
+<tr>
+<td id="bowtie2-options-tab6">
+<pre><code>--tab6</code></pre>
+</td>
+<td>
+<p>Similar to <a href="#bowtie2-options-tab5"><code>--tab5</code></a> except, for paired-end reads, the second end can have a different name from the first: [name1]</p>
+</td>
+</tr>
+<tr>
 <td id="bowtie2-options-qseq">
 <pre><code>--qseq</code></pre>
 </td>
diff --git a/doc/website/manual.ssi b/doc/website/manual.ssi
index 038aa6a..7c50b2a 100644
--- a/doc/website/manual.ssi
+++ b/doc/website/manual.ssi
@@ -1,5 +1,5 @@
 <h1>Table of Contents</h1>
-<p> Version <b>2.3.0</b></p>
+<p> Version <b>2.3.1</b></p>
 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
 <html xmlns="http://www.w3.org/1999/xhtml">
 <head>
@@ -347,6 +347,27 @@ Reference: GCAGATTATATGAGTCAGCTACGATATTGTTTGGGGTGACACATTACGCGTCTTTGAC</code></pr
 
 <p>Reads (specified with <code><m1></code>, <code><m2></code>, <code><s></code>) are FASTQ files. FASTQ files usually have extension <code>.fq</code> or <code>.fastq</code>. FASTQ is the default format. See also: <a href="#bowtie2-options-solexa-quals"><code>--solexa-quals</code></a> and <a href="#bowtie2-options-int-quals"><code>--int-quals</code></a>.</p>
 </td></tr>
+<tr><td id="bowtie2-options-interleaved">
+
+<pre><code>--interleaved</code></pre>
+</td><td>
+
+<p>Reads interleaved FASTQ files where the first two records (8 lines) represent a mate pair.</p>
+</td></tr>
+<tr><td id="bowtie2-options-tab5">
+
+<pre><code>--tab5</code></pre>
+</td><td>
+
+<p>Each read or pair is on a single line. An unpaired read line is [name]. A paired-end read line is [name]. An input file can be a mix of unpaired and paired-end reads and Bowtie 2 recognizes each according to the number of fields, handling each as it should.</p>
+</td></tr>
+<tr><td id="bowtie2-options-tab6">
+
+<pre><code>--tab6</code></pre>
+</td><td>
+
+<p>Similar to <a href="#bowtie2-options-tab5"><code>--tab5</code></a> except, for paired-end reads, the second end can have a different name from the first: [name1]</p>
+</td></tr>
 <tr><td id="bowtie2-options-qseq">
 
 <pre><code>--qseq</code></pre>
diff --git a/doc/website/old_news.ssi b/doc/website/old_news.ssi
index dc0983b..7df0a31 100644
--- a/doc/website/old_news.ssi
+++ b/doc/website/old_news.ssi
@@ -1,3 +1,43 @@
+<h2>Version 2.2.4 - Oct 22, 2014</h2>
+<ul>
+   <li>Fixed a Mavericks OSX specific bug caused by some linkage ambiguities.</li>
+   <li>Added lz4 compression option for the wrapper.</li>
+   <li>Fixed the vanishing <tt><a href="manual.shtml#bowtie2-options-no-unal">--no-unal</a></tt> help line.</li>
+   <li>Added the static linkage for MinGW builds.</li>
+   <li>Added extra seed-hit output.</li>
+   <li>Fixed missing 0-length read in fastq <tt>--passthrough</tt> output.</li>
+   <li>Fixed an issue that would cause different output in <tt>-a</tt> mode depending on random seed.</li>
+</ul>
+
+<h2>Version 2.2.3 - May 30, 2014</h2>
+<ul>
+   <li>Fixed a bug that made loading an index into memory crash sometimes.</li>
+   <li>Fixed a silent failure to warn the user in case the <tt><a href="manual.shtml#bowtie2-options-x">-x</a></tt> option is missing.</li>
+   <li>Updated <tt><a href="manual.shtml#bowtie2-options-al">--al</a></tt>, <tt><a href="manual.shtml#bowtie2-options-un">--un</a></tt>, <tt><a href="manual.shtml#bowtie2-options-al-conc">al-conc</a></tt> and <tt><a href="manual.shtml#bowtie2-options-un-conc">un-conc</a></tt> options to avoid confusion in cases where the user does not provide a base file name.</li>
+   <li>Fixed a spurious assert that made bowtie2-inspect debug fail.</li>
+</ul>
+
+<h2>Version 2.2.2 - April 10, 2014</h2>
+<ul>
+   <li>Improved performance for cases where the reference contains ambiguous
+     or masked nucleobases represented by Ns. </li>
+</ul>
+
+<h2>Version 2.2.1 - February 28, 2014</h2>
+<ul>
+   <li>Improved way in which index files are loaded for alignment.  Should fix
+     efficiency problems on some filesystems.</li>
+   <li>Fixed a bug that made older systems unable to correctly deal with bowtie
+     relative symbolic links.</li>
+   <li>Fixed a bug that, for very big indexes, could determine to determine file
+     offsets correctly.</li>
+   <li>Fixed a bug where using <tt><a href="manual.shtml#bowtie2-options-no-unal">--no-unal</a></tt> option incorrectly suppressed
+     <tt><a href="manual.shtml#bowtie2-options-un">--un</a></tt>/<tt><a href="manual.shtml#bowtie2-options-un-conc">--un-conc</a></tt> output.</li>
+   <li>Dropped a perl dependency that could cause problems on old systems.</li>
+   <li>Added <tt><a href="manual.shtml#bowtie2-options-no-1mm-upfront">--no-1mm-upfront</a></tt> option and clarified documentation for parameters
+     governing the multiseed heuristic.</li>
+</ul>
+
 <h2>Bowtie 2 on GitHub - February 4, 2014</h2>
 <ul>
    <li>Bowtie 2 source now lives in a <a href="https://github.com/BenLangmead/bowtie2">public GitHub repository</a>.</li>
diff --git a/doc/website/recent_news.ssi b/doc/website/recent_news.ssi
index 958a2fa..07103af 100644
--- a/doc/website/recent_news.ssi
+++ b/doc/website/recent_news.ssi
@@ -1,3 +1,13 @@
+<h2>Version 2.3.1 - Mar 03, 2017</h2>
+<p>Please note that as of this release Bowtie 2 now has dependencies on zlib and readline libraries. Make sure that all dependencies are met before attempting to build from source.</p>
+<ul>
+    <li>Added native support for gzipped read files. The wrapper script is no longer responsible for decompression.  This simplifies the wrapper and improves speed and thread scalability for gzipped inputs.</li>
+    <li>Fixed a bug that caused <tt>'bowtie2-build'</tt> to crash when the first FASTA sequence contains all Ns.</li>
+    <li>Add support for interleaved FASTQ format <tt><a href="manual.shtml#bowtie2-options-interleaved">-—interleaved</a></tt>.</li>
+    <li>Empty FASTQ inputs would yield an error in Bowtie 2 2.3.0, whereas previous versions would simply align 0 reads and report the SAM header as usual.  This version returns to the pre-2.3.0 behavior, resolving a compatibility issue between TopHat2 and Bowtie 2 2.3.0.</li>
+    <li>Fixed a bug whereby combining <tt>'-—un-conc*</tt> with <tt>'-k'</tt> or <tt>'-a'</tt> would cause <tt>'bowtie2'</tt> to print duplicate reads in one or both of the <tt>'--un-conc*'</tt> output files, causing the ends to be misaligned.</li>
+    <li>The default <tt>'--score-min'</tt> for <tt>'--local'</tt> mode is now <tt>'G,20,8'</tt>. That was the stated default in the documentation for a while, but the actual default was <tt>'G,0,10'</tt> for many versions. Now the default matches the documentation and, we find, yields more accurate alignments than <tt>'G,0,10'</tt></li>
+</ul>
 <h2>Version 2.3.0 - Dec 13, 2016</h2>
 <p>This is a major release with some larger and many smaller changes.  These notes emphasize the large changes.  See commit history for details.</p>
 <ul>
@@ -58,47 +68,3 @@
 <ul>
    <li>Lighter is an extremely fast and memory-efficient program for correcting sequencing errors in DNA sequencing data.  For details on how error correction can help improve the speed and accuracy of downstream analysis tools, see the <a href="http://genomebiology.com/2014/15/11/509">paper in Genome Biology</a>.  Source and software <a href="https://github.com/mourisl/Lighter">available at GitHub</a>.
 </ul>
-
-
-<h2>Version 2.2.4 - Oct 22, 2014</h2>
-<ul>
-   <li>Fixed a Mavericks OSX specific bug caused by some linkage ambiguities.</li>
-   <li>Added lz4 compression option for the wrapper.</li>
-   <li>Fixed the vanishing <tt><a href="manual.shtml#bowtie2-options-no-unal">--no-unal</a></tt> help line.</li>
-   <li>Added the static linkage for MinGW builds.</li>
-   <li>Added extra seed-hit output.</li>
-   <li>Fixed missing 0-length read in fastq <tt>--passthrough</tt> output.</li>
-   <li>Fixed an issue that would cause different output in <tt>-a</tt> mode depending on random seed.</li>
-</ul>
-
-<h2>Version 2.2.3 - May 30, 2014</h2>
-<ul>
-   <li>Fixed a bug that made loading an index into memory crash sometimes.</li>
-   <li>Fixed a silent failure to warn the user in case the <tt><a href="manual.shtml#bowtie2-options-x">-x</a></tt> option is missing.</li>
-   <li>Updated <tt><a href="manual.shtml#bowtie2-options-al">--al</a></tt>, <tt><a href="manual.shtml#bowtie2-options-un">--un</a></tt>, <tt><a href="manual.shtml#bowtie2-options-al-conc">al-conc</a></tt> and <tt><a href="manual.shtml#bowtie2-options-un-conc">un-conc</a></tt> options to avoid confusion in cases where the user does not provide a base file name.</li>
-   <li>Fixed a spurious assert that made bowtie2-inspect debug fail.</li>
-</ul>
-
-<h2>Version 2.2.2 - April 10, 2014</h2>
-<ul>
-   <li>Improved performance for cases where the reference contains ambiguous 
-     or masked nucleobases represented by Ns. </li>
-</ul>
-
-<h2>Version 2.2.1 - February 28, 2014</h2>
-<ul>
-   <li>Improved way in which index files are loaded for alignment.  Should fix
-     efficiency problems on some filesystems.</li>
-   <li>Fixed a bug that made older systems unable to correctly deal with bowtie 
-     relative symbolic links.</li>
-   <li>Fixed a bug that, for very big indexes, could determine to determine file
-     offsets correctly.</li>
-   <li>Fixed a bug where using <tt><a href="manual.shtml#bowtie2-options-no-unal">--no-unal</a></tt> option incorrectly suppressed
-     <tt><a href="manual.shtml#bowtie2-options-un">--un</a></tt>/<tt><a href="manual.shtml#bowtie2-options-un-conc">--un-conc</a></tt> output.</li>
-   <li>Dropped a perl dependency that could cause problems on old systems.</li>
-   <li>Added <tt><a href="manual.shtml#bowtie2-options-no-1mm-upfront">--no-1mm-upfront</a></tt> option and clarified documentation for parameters
-     governing the multiseed heuristic.</li>
-</ul>
-
-
-
diff --git a/doc/website/rhsidebar.ssi b/doc/website/rhsidebar.ssi
index ba11d60..8f976dc 100644
--- a/doc/website/rhsidebar.ssi
+++ b/doc/website/rhsidebar.ssi
@@ -18,10 +18,10 @@
         </tr>
       <tr>
       <td>
-        <a href="https://sourceforge.net/projects/bowtie-bio/files/bowtie2/2.3.0">Bowtie2 2.3.0</a>
+        <a href="https://sourceforge.net/projects/bowtie-bio/files/bowtie2/2.3.1">Bowtie2 2.3.1</a>
       </td>
       <td align="right">
-        12/13/16 
+        03/04/17 
       </td>
       </tr>
       <tr>
@@ -36,7 +36,7 @@
     <ul>
       <li><a href="https://github.com/BenLangmead/bowtie2">Bowtie GitHub repository</a></li>
       <li><a href="https://github.com/BenLangmead/bowtie2/issues">Report an issue</a></li>
-      <li><a href="https://lists.sourceforge.net/lists/listinfo/bowtie-bio-announce">Bowtie Mailing list</a></li>
+      <!--<li><a href="https://lists.sourceforge.net/lists/listinfo/bowtie-bio-announce">Bowtie Mailing list</a></li>-->
     </ul>
   </div>
   <h2>Related Tools</h2>
@@ -173,24 +173,25 @@
     <ul>
       <li><a href="http://www.cs.jhu.edu/~langmea/index.shtml">Ben Langmead</a></li>
       <li><a href="http://www.ccb.jhu.edu/people/infphilo">Daehwan Kim</a></li>
-      <li>Valentin Antonescu</li>
+      <li>Rone Charles</li>
       <li>Chris Wilks</li>
+      <li>Valentin Antonescu</li>
     </ul>
   </div>
+<!--
   <h2>Other Documentation</h2>
   <div class="box">
     <ul>
       <li>Bio. of Genomes poster, 5/11 (<a href="http://www.cbcb.umd.edu/~langmead/bog2011_poster.ppt">.ppt</a>, <a href="http://www.cbcb.umd.edu/~langmead/bog2011_poster.pdf">.pdf</a>)</li>
     </ul>
   </div>
+-->
   <h2>Related links</h2>
   <div class="box">
     <ul>
 	  <li><a href="http://scholar.google.com/scholar?oi=bibs&hl=en&cites=4636016874467197548">Papers citing Bowtie 2</a></li>
       <li><a href="http://www.jhu.edu/">Johns Hopkins University</a></li>
       <li><a href="http://www.cs.jhu.edu/">JHU Computer Science</a></li>
-      <li><a href="http://www.biostat.jhsph.edu/">JHSPH Biostatistics</a></li>
-      <li><a href="http://seqanswers.com/">SEQanswers</a></li>
     </ul>
   </div>
 </div> <!-- End of "rightside" -->
diff --git a/formats.h b/formats.h
index 7627827..5067939 100644
--- a/formats.h
+++ b/formats.h
@@ -30,6 +30,7 @@ enum file_format {
 	FASTA = 1,
 	FASTA_CONT,
 	FASTQ,
+	INTERLEAVED,
 	TAB_MATE5,
 	TAB_MATE6,
 	RAW,
diff --git a/opts.h b/opts.h
index a90669b..e57aa26 100644
--- a/opts.h
+++ b/opts.h
@@ -151,7 +151,8 @@ enum {
 	ARG_DESC_PRIORITIZE,        // --desc-prioritize
 	ARG_DESC_FMOPS,             // --desc-fmops
 	ARG_LOG_DP,                 // --log-dp
-	ARG_LOG_DP_OPP              // --log-dp-opp
+	ARG_LOG_DP_OPP,             // --log-dp-opp
+	ARG_INTERLEAVED_FASTQ       // --interleaved
 };
 
 #endif
diff --git a/pat.cpp b/pat.cpp
index 782130f..e5f387b 100644
--- a/pat.cpp
+++ b/pat.cpp
@@ -89,6 +89,7 @@ PatternSource* PatternSource::patsrcFromStrings(
 		case FASTA_CONT:  return new FastaContinuousPatternSource(qs, p);
 		case RAW:         return new RawPatternSource(qs, p);
 		case FASTQ:       return new FastqPatternSource(qs, p);
+		case INTERLEAVED: return new FastqPatternSource(qs, p, true /* interleaved */);
 		case TAB_MATE5:   return new TabbedPatternSource(qs, p, false);
 		case TAB_MATE6:   return new TabbedPatternSource(qs, p, true);
 		case CMDLINE:     return new VectorPatternSource(qs, p);
@@ -166,7 +167,7 @@ pair<bool, bool> PatternSourcePerThread::nextReadPair() {
 	} else {
 		finalize(buf_.read_a());
 	}
-	bool this_is_last = buf_.cur_buf_ == last_batch_size_-1;
+	bool this_is_last = buf_.cur_buf_ == static_cast<unsigned int>(last_batch_size_-1);
 	return make_pair(true, this_is_last ? last_batch_ : false);
 }
 
@@ -373,14 +374,14 @@ PatternComposer* PatternComposer::setupPatternComposer(
 }
 
 void PatternComposer::free_EList_pmembers( const EList<PatternSource*> &elist) {
-    for (size_t i = 0; i < elist.size(); i++)
-        if (elist[i] != NULL)
-            delete elist[i];
+	for (size_t i = 0; i < elist.size(); i++)
+		if (elist[i] != NULL)
+			delete elist[i];
 }
 
 /**
  * Fill Read with the sequence, quality and name for the next
- * read in the list of read files.  This function gets called by
+ * read in the list of read files. This function gets called by
  * all the search threads, so we must handle synchronization.
  *
  * Returns pair<bool, int> where bool indicates whether we're
@@ -425,24 +426,44 @@ pair<bool, int> CFilePatternSource::nextBatch(
 void CFilePatternSource::open() {
 	if(is_open_) {
 		is_open_ = false;
-		fclose(fp_);
-		fp_ = NULL;
+		if (compressed_) {
+			gzclose(zfp_);
+			zfp_ = NULL;
+		}
+		else {
+			fclose(fp_);
+			fp_ = NULL;
+		}
 	}
 	while(filecur_ < infiles_.size()) {
 		if(infiles_[filecur_] == "-") {
-			fp_ = stdin;
-		} else if((fp_ = fopen(infiles_[filecur_].c_str(), "rb")) == NULL) {
-			if(!errs_[filecur_]) {
-				cerr << "Warning: Could not open read file \""
-				<< infiles_[filecur_].c_str()
-				<< "\" for reading; skipping..." << endl;
-				errs_[filecur_] = true;
+			// always assume that data from stdin is compressed
+			compressed_ = true;
+			int fn = dup(fileno(stdin));
+			zfp_ = gzdopen(fn, "rb");
+		}
+		else {
+			compressed_ = false;
+			if (is_gzipped_file(infiles_[filecur_])) {
+				compressed_ = true;
+				zfp_ = gzopen(infiles_[filecur_].c_str(), "rb");
+			}
+			else {
+				fp_ = fopen(infiles_[filecur_].c_str(), "rb");
+			}
+			if((compressed_ && zfp_ == NULL) || (!compressed_ && fp_ == NULL)) {
+				if(!errs_[filecur_]) {
+					cerr << "Warning: Could not open read file \""
+					<< infiles_[filecur_].c_str()
+					<< "\" for reading; skipping..." << endl;
+					errs_[filecur_] = true;
+				}
+				filecur_++;
+				continue;
 			}
-			filecur_++;
-			continue;
 		}
 		is_open_ = true;
-		setvbuf(fp_, buf_, _IOFBF, 64*1024);
+		compressed_ ? gzbuffer(zfp_, 64*1024) : setvbuf(fp_, buf_, _IOFBF, 64*1024);
 		return;
 	}
 	cerr << "Error: No input read files were valid" << endl;
@@ -499,7 +520,7 @@ VectorPatternSource::VectorPatternSource(
 /**
  * Read next batch.  However, batch concept is not very applicable for this
  * PatternSource where all the info has already been parsed into the fields
- * in the contsructor.  This essentially modifies the pt as though we read
+ * in the contsructor.	This essentially modifies the pt as though we read
  * in some number of patterns.
  */
 pair<bool, int> VectorPatternSource::nextBatch(
@@ -622,9 +643,12 @@ pair<bool, int> FastaPatternSource::nextBatchFromFile(
 	int c;
 	EList<Read>& readbuf = batch_a ? pt.bufa_ : pt.bufb_;
 	if(first_) {
-		c = getc_unlocked(fp_);
+		c = getc_wrapper();
+		if (c == EOF) {
+			return make_pair(true, 0);
+		}
 		while(c == '\r' || c == '\n') {
-			c = getc_unlocked(fp_);
+			c = getc_wrapper();
 		}
 		if(c != '>') {
 			cerr << "Error: reads file does not look like a FASTA file" << endl;
@@ -634,16 +658,13 @@ pair<bool, int> FastaPatternSource::nextBatchFromFile(
 	}
 	bool done = false;
 	size_t readi = 0;
-	if (feof(fp_)) {
-		done = true;
-	}
 	// Read until we run out of input or until we've filled the buffer
 	for(; readi < pt.max_buf_ && !done; readi++) {
 		Read::TBuf& buf = readbuf[readi].readOrigBuf;
 		buf.clear();
 		buf.append('>');
 		while(true) {
-			c = getc_unlocked(fp_);
+			c = getc_wrapper();
 			if(c < 0 || c == '>') {
 				done = c < 0;
 				break;
@@ -659,7 +680,7 @@ pair<bool, int> FastaPatternSource::nextBatchFromFile(
  */
 bool FastaPatternSource::parse(Read& r, Read& rb, TReadId rdid) const {
 	// We assume the light parser has put the raw data for the separate ends
-	// into separate Read objects.  That doesn't have to be the case, but
+	// into separate Read objects.	That doesn't have to be the case, but
 	// that's how we've chosen to do it for FastqPatternSource
 	assert(!r.readOrigBuf.empty());
 	assert(r.empty());
@@ -729,7 +750,7 @@ bool FastaPatternSource::parse(Read& r, Read& rb, TReadId rdid) const {
  * 1. Reads are substrings of a longer FASTA input string
  * 2. Reads may overlap w/r/t the longer FASTA string
  * 3. Read names depend on the most recently observed FASTA
- *    record name
+ *	  record name
  */
 pair<bool, int> FastaContinuousPatternSource::nextBatchFromFile(
 	PerThreadReadBuf& pt,
@@ -739,13 +760,13 @@ pair<bool, int> FastaContinuousPatternSource::nextBatchFromFile(
 	EList<Read>& readbuf = batch_a ? pt.bufa_ : pt.bufb_;
 	size_t readi = 0;
 	while(readi < pt.max_buf_) {
-		c = getc_unlocked(fp_);
+		c = getc_wrapper();
 		if(c < 0) {
 			break;
 		}
 		if(c == '>') {
 			resetForNextFile();
-			c = getc_unlocked(fp_);
+			c = getc_wrapper();
 			bool sawSpace = false;
 			while(c != '\n' && c != '\r') {
 				if(!sawSpace) {
@@ -754,10 +775,10 @@ pair<bool, int> FastaContinuousPatternSource::nextBatchFromFile(
 				if(!sawSpace) {
 					name_prefix_buf_.append(c);
 				}
-				c = getc_unlocked(fp_);
+				c = getc_wrapper();
 			}
 			while(c == '\n' || c == '\r') {
-				c = getc_unlocked(fp_);
+				c = getc_wrapper();
 			}
 			if(c < 0) {
 				break;
@@ -869,7 +890,7 @@ bool FastaContinuousPatternSource::parse(
 
 
 /**
- * "Light" parser.  This is inside the critical section, so the key is to do
+ * "Light" parser. This is inside the critical section, so the key is to do
  * just enough parsing so that another function downstream (finalize()) can do
  * the rest of the parsing.  Really this function's only job is to stick every
  * for lines worth of the input file into a buffer (r.readOrigBuf).  finalize()
@@ -880,28 +901,32 @@ pair<bool, int> FastqPatternSource::nextBatchFromFile(
 	bool batch_a)
 {
 	int c;
-	EList<Read>& readbuf = batch_a ? pt.bufa_ : pt.bufb_;
+	EList<Read>* readbuf = batch_a ? &pt.bufa_ : &pt.bufb_;
 	if(first_) {
-		c = getc_unlocked(fp_);
+		c = getc_wrapper();
+		if (c == EOF) {
+			return make_pair(true, 0);
+		}
 		while(c == '\r' || c == '\n') {
-			c = getc_unlocked(fp_);
+			c = getc_wrapper();
 		}
 		if(c != '@') {
 			cerr << "Error: reads file does not look like a FASTQ file" << endl;
 			throw 1;
 		}
 		first_ = false;
-		readbuf[0].readOrigBuf.append('@');
+		(*readbuf)[0].readOrigBuf.append('@');
 	}
+
 	bool done = false, aborted = false;
 	size_t readi = 0;
 	// Read until we run out of input or until we've filled the buffer
-	for(; readi < pt.max_buf_ && !done; readi++) {
-		Read::TBuf& buf = readbuf[readi].readOrigBuf;
+	while (readi < pt.max_buf_ && !done) {
+		Read::TBuf& buf = (*readbuf)[readi].readOrigBuf;
 		assert(readi == 0 || buf.empty());
 		int newlines = 4;
 		while(newlines) {
-			c = getc_unlocked(fp_);
+			c = getc_wrapper();
 			done = c < 0;
 			if(c == '\n' || (done && newlines == 1)) {
 				// Saw newline, or EOF that we're
@@ -909,11 +934,29 @@ pair<bool, int> FastqPatternSource::nextBatchFromFile(
 				newlines--;
 				c = '\n';
 			} else if(done) {
-				aborted = true; // Unexpected EOF
+				// account for newline at the end of the file
+				if (newlines == 4) {
+					newlines = 0;
+				}
+				else {
+					aborted = true; // Unexpected EOF
+				}
 				break;
 			}
 			buf.append(c);
 		}
+		if (c > 0) {
+			if (interleaved_) {
+				// alternate between read buffers
+				batch_a = !batch_a;
+				readbuf = batch_a ? &pt.bufa_ : &pt.bufb_;
+				// increment read counter after each pair gets read
+				readi = batch_a ? readi+1 : readi;
+			}
+			else {
+				readi++;
+			}
+		}
 	}
 	if(aborted) {
 		readi--;
@@ -926,7 +969,7 @@ pair<bool, int> FastqPatternSource::nextBatchFromFile(
  */
 bool FastqPatternSource::parse(Read &r, Read& rb, TReadId rdid) const {
 	// We assume the light parser has put the raw data for the separate ends
-	// into separate Read objects.  That doesn't have to be the case, but
+	// into separate Read objects. That doesn't have to be the case, but
 	// that's how we've chosen to do it for FastqPatternSource
 	assert(!r.readOrigBuf.empty());
 	assert(r.empty());
@@ -1043,9 +1086,9 @@ pair<bool, int> TabbedPatternSource::nextBatchFromFile(
 	PerThreadReadBuf& pt,
 	bool batch_a)
 {
-	int c = getc_unlocked(fp_);
+	int c = getc_wrapper();
 	while(c >= 0 && (c == '\n' || c == '\r')) {
-		c = getc_unlocked(fp_);
+		c = getc_wrapper();
 	}
 	EList<Read>& readbuf = batch_a ? pt.bufa_ : pt.bufb_;
 	size_t readi = 0;
@@ -1054,10 +1097,10 @@ pair<bool, int> TabbedPatternSource::nextBatchFromFile(
 		readbuf[readi].readOrigBuf.clear();
 		while(c >= 0 && c != '\n' && c != '\r') {
 			readbuf[readi].readOrigBuf.append(c);
-			c = getc_unlocked(fp_);
+			c = getc_wrapper();
 		}
 		while(c >= 0 && (c == '\n' || c == '\r')) {
-			c = getc_unlocked(fp_);
+			c = getc_wrapper();
 		}
 	}
 	return make_pair(c < 0, readi);
@@ -1177,9 +1220,9 @@ pair<bool, int> RawPatternSource::nextBatchFromFile(
 	PerThreadReadBuf& pt,
 	bool batch_a)
 {
-	int c = getc_unlocked(fp_);
+	int c = getc_wrapper();
 	while(c >= 0 && (c == '\n' || c == '\r')) {
-		c = getc_unlocked(fp_);
+		c = getc_wrapper();
 	}
 	EList<Read>& readbuf = batch_a ? pt.bufa_ : pt.bufb_;
 	size_t readi = 0;
@@ -1188,15 +1231,15 @@ pair<bool, int> RawPatternSource::nextBatchFromFile(
 		readbuf[readi].readOrigBuf.clear();
 		while(c >= 0 && c != '\n' && c != '\r') {
 			readbuf[readi].readOrigBuf.append(c);
-			c = getc_unlocked(fp_);
+			c = getc_wrapper();
 		}
 		while(c >= 0 && (c == '\n' || c == '\r')) {
-			c = getc_unlocked(fp_);
+			c = getc_wrapper();
 		}
 	}
 	// incase a valid character is consumed between batches
-    if (c >= 0 && c != '\n' && c != '\r') {
-		ungetc(c, fp_);
+	if (c >= 0 && c != '\n' && c != '\r') {
+		ungetc_wrapper(c);
 	}
 	return make_pair(c < 0, readi);
 }
@@ -1252,7 +1295,7 @@ bool RawPatternSource::parse(Read& r, Read& rb, TReadId rdid) const {
 
 void wrongQualityFormat(const BTString& read_name) {
 	cerr << "Error: Encountered one or more spaces while parsing the quality "
-	     << "string for read " << read_name << ".  If this is a FASTQ file "
+		 << "string for read " << read_name << ".  If this is a FASTQ file "
 		 << "with integer (non-ASCII-encoded) qualities, try re-running with "
 		 << "the --integer-quals option." << endl;
 	throw 1;
diff --git a/pat.h b/pat.h
index e5071db..a2556c7 100644
--- a/pat.h
+++ b/pat.h
@@ -20,6 +20,9 @@
 #ifndef PAT_H_
 #define PAT_H_
 
+#include <stdio.h>
+#include <sys/stat.h>
+#include <zlib.h>
 #include <cassert>
 #include <string>
 #include <ctype.h>
@@ -75,18 +78,18 @@ struct PatternParams {
 		nthreads(nthreads_),
 		fixName(fixName_) { }
 
-	int format;           // file format
-	bool fileParallel;    // true -> wrap files with separate PatternComposers
-	uint32_t seed;        // pseudo-random seed
-	size_t max_buf;       // number of reads to buffer in one read
-	bool solexa64;        // true -> qualities are on solexa64 scale
-	bool phred64;         // true -> qualities are on phred64 scale
-	bool intQuals;        // true -> qualities are space-separated numbers
-	int sampleLen;        // length of sampled reads for FastaContinuous...
-	int sampleFreq;       // frequency of sampled reads for FastaContinuous...
-	size_t skip;          // skip the first 'skip' patterns
-	int nthreads;         // number of threads for locking
-	bool fixName;         //
+	int format;			  // file format
+	bool fileParallel;	  // true -> wrap files with separate PatternComposers
+	uint32_t seed;		  // pseudo-random seed
+	size_t max_buf;		  // number of reads to buffer in one read
+	bool solexa64;		  // true -> qualities are on solexa64 scale
+	bool phred64;		  // true -> qualities are on phred64 scale
+	bool intQuals;		  // true -> qualities are space-separated numbers
+	int sampleLen;		  // length of sampled reads for FastaContinuous...
+	int sampleFreq;		  // frequency of sampled reads for FastaContinuous...
+	size_t skip;		  // skip the first 'skip' patterns
+	int nthreads;		  // number of threads for locking
+	bool fixName;		  //
 };
 
 /**
@@ -163,10 +166,10 @@ struct PerThreadReadBuf {
 	}
 	
 	const size_t max_buf_; // max # reads to read into buffer at once
-	EList<Read> bufa_;     // Read buffer for mate as
-	EList<Read> bufb_;     // Read buffer for mate bs
-	size_t cur_buf_;       // Read buffer currently active
-	TReadId rdid_;         // index of read at offset 0 of bufa_/bufb_
+	EList<Read> bufa_;	   // Read buffer for mate as
+	EList<Read> bufb_;	   // Read buffer for mate bs
+	size_t cur_buf_;	   // Read buffer currently active
+	TReadId rdid_;		   // index of read at offset 0 of bufa_/bufb_
 };
 
 extern void wrongQualityFormat(const BTString& read_name);
@@ -256,7 +259,7 @@ public:
 	/**
 	 * Read next batch.  However, batch concept is not very applicable for this
 	 * PatternSource where all the info has already been parsed into the fields
-	 * in the contsructor.  This essentially modifies the pt as though we read
+	 * in the contsructor.	This essentially modifies the pt as though we read
 	 * in some number of patterns.
 	 */
 	virtual std::pair<bool, int> nextBatch(
@@ -280,12 +283,12 @@ public:
 	
 private:
 
-	size_t cur_;               // index for first read of next batch
-	size_t skip_;              // # reads to skip
-	bool paired_;              // whether reads are paired
-	EList<string> tokbuf_;     // buffer for storing parsed tokens
+	size_t cur_;			   // index for first read of next batch
+	size_t skip_;			   // # reads to skip
+	bool paired_;			   // whether reads are paired
+	EList<string> tokbuf_;	   // buffer for storing parsed tokens
 	EList<Read::TBuf> bufs_;   // per-read buffers
-	char nametmp_[20];         // temp buffer for constructing name
+	char nametmp_[20];		   // temp buffer for constructing name
 };
 
 /**
@@ -303,9 +306,11 @@ public:
 		infiles_(infiles),
 		filecur_(0),
 		fp_(NULL),
+		zfp_(NULL),
 		is_open_(false),
 		skip_(p.skip),
-		first_(true)
+		first_(true),
+		compressed_(false)
 	{
 		assert_gt(infiles.size(), 0);
 		errs_.resize(infiles_.size());
@@ -319,14 +324,20 @@ public:
 	 */
 	virtual ~CFilePatternSource() {
 		if(is_open_) {
-			assert(fp_ != NULL);
-			fclose(fp_);
+			if (compressed_) {
+				assert(zfp_ != NULL);
+				gzclose(zfp_);
+			}
+			else {
+				assert(fp_ != NULL);
+				fclose(fp_);
+			}
 		}
 	}
 
 	/**
 	 * Fill Read with the sequence, quality and name for the next
-	 * read in the list of read files.  This function gets called by
+	 * read in the list of read files.	This function gets called by
 	 * all the search threads, so we must handle synchronization.
 	 *
 	 * Returns pair<bool, int> where bool indicates whether we're
@@ -352,7 +363,7 @@ protected:
 
 	/**
 	 * Light-parse a batch of unpaired reads from current file into the given
-	 * buffer.  Called from CFilePatternSource.nextBatch().
+	 * buffer.	Called from CFilePatternSource.nextBatch().
 	 */
 	virtual std::pair<bool, int> nextBatchFromFile(
 		PerThreadReadBuf& pt,
@@ -367,15 +378,42 @@ protected:
 	 * Open the next file in the list of input files.
 	 */
 	void open();
+
+	int getc_wrapper() {
+		return compressed_ ? gzgetc(zfp_) : getc_unlocked(fp_);
+	}
+
+	int ungetc_wrapper(int c) {
+		return compressed_ ? gzungetc(c, zfp_) : ungetc(c, fp_);
+	}
+
+	bool is_gzipped_file(const std::string& filename) {
+		struct stat s;
+		if (stat(filename.c_str(), &s) != 0) {
+			perror("stat");
+		}
+		else {
+			if (S_ISFIFO(s.st_mode))
+				return true;
+		}
+		size_t pos = filename.find_last_of(".");
+		std::string ext = (pos == std::string::npos) ? "" : filename.substr(pos + 1);
+		if (ext == "" || ext == "gz" || ext == "Z") {
+			return true;
+		}
+		return false;
+	}
 	
 	EList<std::string> infiles_;  // filenames for read files
-	EList<bool> errs_;       // whether we've already printed an error for each file
-	size_t filecur_;         // index into infiles_ of next file to read
-	FILE *fp_;               // read file currently being read from
-	bool is_open_;           // whether fp_ is currently open
-	TReadId skip_;           // number of reads to skip
-	bool first_;             // parsing first record in first file?
-	char buf_[64*1024];      // file buffer
+	EList<bool> errs_;		 // whether we've already printed an error for each file
+	size_t filecur_;		 // index into infiles_ of next file to read
+	FILE *fp_;				 // read file currently being read from
+	gzFile zfp_;
+	bool is_open_;			 // whether fp_ is currently open
+	TReadId skip_;			 // number of reads to skip
+	bool first_;			 // parsing first record in first file?
+	char buf_[64*1024];		 // file buffer
+	bool compressed_;
 };
 
 /**
@@ -469,10 +507,10 @@ protected:
 		PerThreadReadBuf& pt,
 		bool batch_a);
 
-	bool solQuals_;     // base qualities are log odds
+	bool solQuals_;		// base qualities are log odds
 	bool phred64Quals_; // base qualities are on -64 scale
-	bool intQuals_;     // base qualities are space-separated strings
-	bool secondName_;   // true if --tab6, false if --tab5
+	bool intQuals_;		// base qualities are space-separated strings
+	bool secondName_;	// true if --tab6, false if --tab5
 };
 
 /**
@@ -487,8 +525,8 @@ protected:
  * 5. X coordinate of spot
  * 6. Y coordinate of spot
  * 7. Index: "Index sequence or 0. For no indexing, or for a file that
- *    has not been demultiplexed yet, this field should have a value of
- *    0."
+ *	  has not been demultiplexed yet, this field should have a value of
+ *	  0."
  * 8. Read number: 1 for unpaired, 1 or 2 for paired
  * 9. Sequence
  * 10. Quality
@@ -520,9 +558,9 @@ protected:
 		PerThreadReadBuf& pt,
 		bool batch_a);
 
-	bool solQuals_;     // base qualities are log odds
+	bool solQuals_;		// base qualities are log odds
 	bool phred64Quals_; // base qualities are on -64 scale
-	bool intQuals_;     // base qualities are space-separated strings
+	bool intQuals_;		// base qualities are space-separated strings
 	EList<std::string> qualToks_;
 };
 
@@ -581,19 +619,19 @@ protected:
 private:
 	const size_t length_; /// length of reads to generate
 	const size_t freq_;   /// frequency to sample reads
-	size_t eat_;        /// number of characters we need to skip before
-	                    /// we have flushed all of the ambiguous or
-	                    /// non-existent characters out of our read
-	                    /// window
-	bool beginning_;    /// skipping over the first read length?
-	char buf_[1024];    /// FASTA sequence buffer
+	size_t eat_;		/// number of characters we need to skip before
+						/// we have flushed all of the ambiguous or
+						/// non-existent characters out of our read
+						/// window
+	bool beginning_;	/// skipping over the first read length?
+	char buf_[1024];	/// FASTA sequence buffer
 	Read::TBuf name_prefix_buf_; /// FASTA sequence name buffer
 	char name_int_buf_[20]; /// for composing offsets for names
-	size_t bufCur_;     /// buffer cursor; points to where we should
-	                    /// insert the next character
+	size_t bufCur_;		/// buffer cursor; points to where we should
+						/// insert the next character
 	uint64_t subReadCnt_;/// number to subtract from readCnt_ to get
-	                    /// the pat id to output (so it resets to 0 for
-	                    /// each new sequence)
+						/// the pat id to output (so it resets to 0 for
+						/// each new sequence)
 };
 
 /**
@@ -606,12 +644,13 @@ public:
 
 	FastqPatternSource(
 		const EList<std::string>& infiles,
-		const PatternParams& p) :
+		const PatternParams& p, bool interleaved = false) :
 		CFilePatternSource(infiles, p),
 		first_(true),
 		solQuals_(p.solexa64),
 		phred64Quals_(p.phred64),
-		intQuals_(p.intQuals) { }
+		intQuals_(p.intQuals),
+		interleaved_(interleaved) { }
 	
 	virtual void reset() {
 		first_ = true;
@@ -639,14 +678,15 @@ protected:
 		first_ = true;
 	}
 
-	bool first_;        // parsing first read in file
-	bool solQuals_;     // base qualities are log odds
+	bool first_;		// parsing first read in file
+	bool solQuals_;		// base qualities are log odds
 	bool phred64Quals_; // base qualities are on -64 scale
-	bool intQuals_;     // base qualities are space-separated strings
+	bool intQuals_;		// base qualities are space-separated strings
+	bool interleaved_;	// fastq reads are interleaved
 };
 
 /**
- * Read a Raw-format file (one sequence per line).  No quality strings
+ * Read a Raw-format file (one sequence per line).	No quality strings
  * allowed.  All qualities are assumed to be 'I' (40 on the Phred-33
  * scale).
  */
@@ -718,15 +758,15 @@ public:
 	 * dispense them.
 	 */
 	static PatternComposer* setupPatternComposer(
-		const EList<std::string>& si,    // singles, from argv
-		const EList<std::string>& m1,    // mate1's, from -1 arg
-		const EList<std::string>& m2,    // mate2's, from -2 arg
-		const EList<std::string>& m12,   // both mates on each line, from --12
-		const EList<std::string>& q,     // qualities associated with singles
-		const EList<std::string>& q1,    // qualities associated with m1
-		const EList<std::string>& q2,    // qualities associated with m2
-		const PatternParams& p,     // read-in params
-		bool verbose);              // be talkative?
+		const EList<std::string>& si,	 // singles, from argv
+		const EList<std::string>& m1,	 // mate1's, from -1 arg
+		const EList<std::string>& m2,	 // mate2's, from -2 arg
+		const EList<std::string>& m12,	 // both mates on each line, from --12
+		const EList<std::string>& q,	 // qualities associated with singles
+		const EList<std::string>& q1,	 // qualities associated with m1
+		const EList<std::string>& q2,	 // qualities associated with m2
+		const PatternParams& p,		// read-in params
+		bool verbose);				// be talkative?
 	
 protected:
 	
@@ -814,7 +854,7 @@ public:
 		// srca_ and srcb_ must be parallel
 		assert_eq(srca_->size(), srcb_->size());
 		for(size_t i = 0; i < srca_->size(); i++) {
-			// Can't have NULL first-mate sources.  Second-mate sources
+			// Can't have NULL first-mate sources.	Second-mate sources
 			// can be NULL, in the case when the corresponding first-
 			// mate source is unpaired.
 			assert((*srca_)[i] != NULL);
@@ -937,10 +977,10 @@ private:
 	}
 
 	PatternComposer& composer_; // pattern composer
-	PerThreadReadBuf buf_;      // read data buffer
-	const PatternParams& pp_;   // pattern-related parameters
-	bool last_batch_;           // true if this is final batch
-	int last_batch_size_;       // # reads read in previous batch
+	PerThreadReadBuf buf_;		// read data buffer
+	const PatternParams& pp_;	// pattern-related parameters
+	bool last_batch_;			// true if this is final batch
+	int last_batch_size_;		// # reads read in previous batch
 };
 
 /**
diff --git a/pthreadGC2.dll b/pthreadGC2.dll
deleted file mode 100644
index 8b9116c..0000000
Binary files a/pthreadGC2.dll and /dev/null differ
diff --git a/read_qseq.cpp b/read_qseq.cpp
index be1e868..286e28d 100644
--- a/read_qseq.cpp
+++ b/read_qseq.cpp
@@ -55,9 +55,9 @@ pair<bool, int> QseqPatternSource::nextBatchFromFile(
 	PerThreadReadBuf& pt,
 	bool batch_a)
 {
-	int c = getc_unlocked(fp_);
+	int c = getc_wrapper();
 	while(c >= 0 && (c == '\n' || c == '\r')) {
-		c = getc_unlocked(fp_);
+		c = getc_wrapper();
 	}
 	EList<Read>& readbuf = batch_a ? pt.bufa_ : pt.bufb_;
 	size_t readi = 0;
@@ -66,10 +66,10 @@ pair<bool, int> QseqPatternSource::nextBatchFromFile(
 		readbuf[readi].readOrigBuf.clear();
 		while(c >= 0 && c != '\n' && c != '\r') {
 			readbuf[readi].readOrigBuf.append(c);
-			c = getc_unlocked(fp_);
+			c = getc_wrapper();
 		}
 		while(c >= 0 && (c == '\n' || c == '\r')) {
-			c = getc_unlocked(fp_);
+			c = getc_wrapper();
 		}
 	}
 	return make_pair(c < 0, readi);
diff --git a/scoring.h b/scoring.h
index 2ac50a4..54038a6 100644
--- a/scoring.h
+++ b/scoring.h
@@ -51,8 +51,8 @@
 // Linear coefficient a
 #define DEFAULT_MIN_LINEAR (-0.6f)
 // Different defaults for --local mode
-#define DEFAULT_MIN_CONST_LOCAL (1.0f)
-#define DEFAULT_MIN_LINEAR_LOCAL (10.0f)
+#define DEFAULT_MIN_CONST_LOCAL (20.0f)
+#define DEFAULT_MIN_LINEAR_LOCAL (8.0f)
 
 // Constant coefficient b in linear function f(x) = ax + b determining
 // maximum permitted number of Ns f in a read before it is filtered &
diff --git a/scripts/test/simple_tests.pl b/scripts/test/simple_tests.pl
index c27744e..6cc1fb4 100644
--- a/scripts/test/simple_tests.pl
+++ b/scripts/test/simple_tests.pl
@@ -36,10 +36,12 @@ use Test::Deep;
 
 my $bowtie2 = "";
 my $bowtie2_build = "";
+my $compressed;
 
 GetOptions(
-	"bowtie2=s"       => \$bowtie2,
-	"bowtie2-build=s" => \$bowtie2_build) || die "Bad options";
+	"compressed-reads" => \$compressed,
+	"bowtie2=s"        => \$bowtie2,
+	"bowtie2-build=s"  => \$bowtie2_build) || die "Bad options";
 
 if(! -x $bowtie2 || ! -x $bowtie2_build) {
 	my $bowtie2_dir = `dirname $bowtie2`;
@@ -4294,8 +4296,8 @@ sub writeReads($$$$$$$$$) {
 		$fq1,
 		$fq2) = @_;
 
-	open(FQ1, ">$fq1") || die "Could not open '$fq1' for writing";
-	open(FQ2, ">$fq2") || die "Could not open '$fq2' for writing";
+	open(FQ1, defined($compressed) ? "| gzip -c >$fq1.gz" : ">$fq1") || die "Could not open '$fq1' for writing";
+	open(FQ2, defined($compressed) ? "| gzip -c >$fq2.gz" : ">$fq2") || die "Could not open '$fq2' for writing";
 	my $pe = (defined($mate1s) && $mate1s ne "");
 	if($pe) {
 		for (0..scalar(@$mate1s)-1) {
@@ -4392,7 +4394,7 @@ my  $idx_type = "";
 			$formatarg = "-q";
 			$ext = ".fq";
 		} elsif($read_file_format eq "tabbed") {
-			$formatarg = "--12";
+			$formatarg = "--tab5";
 			$ext = ".tab";
 		} elsif($read_file_format eq "cline_reads") {
 			$formatarg = "-c";
@@ -4417,9 +4419,14 @@ my  $idx_type = "";
 			die "Bad format: $read_file_format";
 		}
 		if($formatarg ne "-c") {
+			my $cmd = ">";
+			if (defined($compressed)) {
+				$cmd = "| gzip -c" . $cmd;
+				$ext = $ext . ".gz";
+			}
 			if(defined($read_file)) {
 				# Unpaired
-				open(RD, ">.simple_tests$ext") || die;
+				open(RD, $cmd . ".simple_tests$ext") || die;
 				print RD $read_file;
 				close(RD);
 				$readarg = ".simple_tests$ext";
@@ -4427,10 +4434,10 @@ my  $idx_type = "";
 				defined($mate1_file) || die;
 				defined($mate2_file) || die;
 				# Paired
-				open(M1, ">.simple_tests.1$ext") || die;
+				open(M1, $cmd . ".simple_tests.1$ext") || die;
 				print M1 $mate1_file;
 				close(M1);
-				open(M2, ">.simple_tests.2$ext") || die;
+				open(M2, $cmd . ".simple_tests.2$ext") || die;
 				print M2 $mate2_file;
 				close(M2);
 				$mate1arg = ".simple_tests.1$ext";
@@ -4448,8 +4455,8 @@ my  $idx_type = "";
 			$names,
 			".simple_tests.1.fq",
 			".simple_tests.2.fq");
-		$mate1arg = ".simple_tests.1.fq";
-		$mate2arg = ".simple_tests.2.fq";
+		$mate1arg = defined($compressed) ? ".simple_tests.1.fq.gz" : ".simple_tests.1.fq";
+		$mate2arg = defined($compressed) ? ".simple_tests.2.fq.gz" : ".simple_tests.2.fq";
 		$formatarg = "-q";
 		$readarg = $mate1arg;
 	}
diff --git a/scripts/test/simple_tests.sh b/scripts/test/simple_tests.sh
index 90b13f2..b71ade5 100644
--- a/scripts/test/simple_tests.sh
+++ b/scripts/test/simple_tests.sh
@@ -31,4 +31,5 @@ make $* bowtie2-align-s \
 		bowtie2-build-l-debug && \
 perl scripts/test/simple_tests.pl \
 	--bowtie2=./bowtie2 \
-	--bowtie2-build=./bowtie2-build
+	--bowtie2-build=./bowtie2-build \
+	--compressed

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/bowtie2.git



More information about the debian-med-commit mailing list