[med-svn] [bowtie2] 01/04: Imported Upstream version 2.2.0

Fri Feb 21 13:01:40 UTC 2014

This is an automated email from the git hooks/post-receive script.

malex-guest pushed a commit to branch master
in repository bowtie2.

commit 706918cb26f1f3f89ae41b4011aa5c5b16659d9f
Author: Alexandre Mestiashvili <alex at biotec.tu-dresden.de>
Date:   Thu Feb 20 12:17:23 2014 +0100

    Imported Upstream version 2.2.0
---
 MANUAL                         | 124 ++++--
 MANUAL.markdown                | 108 +++--
 Makefile                       | 163 +++++--
 NEWS                           |  27 +-
 VERSION                        |   2 +-
 aligner_cache.cpp              |  16 +-
 aligner_cache.h                |  77 ++--
 aligner_driver.cpp             | 186 +++++++-
 aligner_driver.h               |  84 +++-
 aligner_result.cpp             |   6 +-
 aligner_result.h               |  56 +--
 aligner_seed.cpp               | 112 ++---
 aligner_seed.h                 |  77 ++--
 aligner_seed2.cpp              | 355 ++++++++++-----
 aligner_seed2.h                | 490 +++++++++++----------
 aligner_sw.cpp                 |  47 +-
 aligner_sw.h                   |  21 +-
 aligner_sw_driver.cpp          | 119 +++---
 aligner_sw_driver.h            |  10 +-
 aln_sink.cpp                   | 120 ++++--
 aln_sink.h                     |  25 +-
 binary_sa_search.h             |  30 +-
 blockwise_sa.h                 | 206 ++++-----
 bowtie2                        | 183 +++++---
 bowtie2-build                  |  98 +++++
 bowtie2-inspect                |  74 ++++
 bt2_build.cpp                  | 125 ++++--
 bt2_dp.cpp                     | 788 ++++++++++++++++++++++++++++++++++
 bt2_idx.cpp                    |  88 ++--
 bt2_idx.h                      | 949 +++++++++++++++++++++++++----------------
 bt2_inspect.cpp                |  52 ++-
 bt2_io.cpp                     | 401 ++++++++---------
 bt2_search.cpp                 | 120 ++++--
 bt2_util.cpp                   |  50 +--
 multikey_qsort.cpp => btypes.h |  30 +-
 diff_sample.h                  | 127 +++---
 doc/manual.html                |  34 +-
 dp_framer.h                    |   6 +
 ds.h                           |  31 +-
 endian_swap.h                  |  52 ++-
 fast_mutex.h                   |   0
 filebuf.h                      |   4 +-
 group_walk.h                   | 230 +++++-----
 ls.h                           |   2 +-
 mm.h                           |   2 +
 multikey_qsort.cpp             |   2 +-
 multikey_qsort.h               |  85 ++--
 opts.h                         |   6 +-
 pat.cpp                        |   6 +
 pat.h                          |  11 +-
 processor_support.h            |  70 +++
 random_source.h                |  17 +
 random_util.h                  |   2 +-
 ref_read.cpp                   |  36 +-
 ref_read.h                     |  57 ++-
 reference.cpp                  | 140 +++---
 reference.h                    |  20 +-
 sam.cpp                        |  43 +-
 sam.h                          |   3 +-
 scripts/infer_fraglen.pl       |  29 +-
 scripts/make_e_coli.sh         |   4 +-
 shmem.h                        |   3 +
 sse_util.h                     |   5 +
 third_party/cpuid.h            | 187 ++++++++
 tinythread.cpp                 |   0
 tinythread.h                   |   0
 word_io.h                      | 209 +++++++--
 zbox.h                         |  10 +-
 68 files changed, 4781 insertions(+), 2071 deletions(-)

diff --git a/MANUAL b/MANUAL
index a022d83..6ab7437 100644
--- a/MANUAL
+++ b/MANUAL
@@ -133,10 +133,10 @@ Obtaining Bowtie 2
 ==================
 
 Download Bowtie 2 sources and binaries from the [Download] section of the
-Sourceforge site.  Binaries are available for Intel architectures (`i386` and
-`x86_64`) running Linux, and Mac OS X.  A 32-bit version is available for
-Windows.  If you plan to compile Bowtie 2 yourself, make sure to get the source
-package, i.e., the filename that ends in "-source.zip".
+Sourceforge site.  Binaries are available for the Intel `x86_64` architecture
+running Linux, Mac OS X, and Windows.  If you plan to compile Bowtie 2 yourself,
+make sure to get the source package, i.e., the filename that ends in
+"-source.zip".
 
 Building from source
 --------------------
@@ -396,7 +396,8 @@ instance, when seeking [structural variants].
 The expected relative orientation of the mates is set using the `--ff`,
 `--fr`, or `--rf` options.  The expected range of inter-mates distances (as
 measured from the furthest extremes of the mates; also called "outer distance")
-is set with the `-I` and `-X` options.
+is set with the `-I` and `-X` options.  Note that setting `-I` and `-X`
+far apart makes Bowtie 2 slower.  See documentation for `-I` and `-X`.
 
 To declare that a pair aligns discordantly, Bowtie 2 requires that both mates
 align uniquely.  This is a conservative threshold, but this is often desirable
@@ -730,27 +731,35 @@ For datasets consisting of pairs, the summary might look like this:
 
 The indentation indicates how subtotals relate to totals.
 
-Wrapper
--------
+Wrapper scripts
+---------------
 
-The `bowtie2` executable is actually a Perl wrapper script that calls the
-compiled `bowtie2-align` binary.  It is recommended that you always run the
-`bowtie2` wrapper and not run `bowtie2-align` directly.
+The `bowtie2`, `bowtie2-build` and `bowtie2-inspect` executables are actually 
+wrapper scripts that call binary programs as appropriate.  The wrappers shield
+users from having to distinguish between "small" and "large" index formats,
+discussed briefly in the following section.  Also, the `bowtie2` wrapper
+provides some key functionality, like the ability to handle compressed inputs,
+and the functionality for `--un`, `--al` and related options.
 
-Performance tuning
-------------------
+It is recommended that you always run the bowtie2 wrappers and not run the
+binaries directly.
 
-1.  Use 64-bit version if possible
+Small and large indexes
+-----------------------
 
-    The 64-bit version of Bowtie 2 is faster than the 32-bit version, owing to
-    its use of 64-bit arithmetic.  If possible, download the 64-bit binaries for
-    Bowtie 2 and run on a 64-bit computer.  If you are building Bowtie 2 from
-    sources, you may need to pass the `-m64` option to `g++` to compile the
-    64-bit version; you can do this by including `BITS=64` in the arguments to
-    the `make` command; e.g.: `make BITS=64 bowtie2`.  To determine whether your
-    version of bowtie is 64-bit or 32-bit, run `bowtie2 --version`.
+`bowtie2-build` can index reference genomes of any size.  For genomes less than
+about 4 billion nucleotides in length, `bowtie2-build` builds a "small" index
+using 32-bit numbers in various parts of the index.  When the genome is longer,
+`bowtie2-build` builds a "large" index using 64-bit numbers.  Small indexes are
+stored in files with the `.bt2` extension, and large indexes are stored in
+files with the `.bt2l` extension.  The user need not worry about whether a
+particular index is small or large; the wrapper scripts will automatically build
+and use the appropriate index.
 
-2.  If your computer has multiple processors/cores, use `-p`
+Performance tuning
+------------------
+
+1.  If your computer has multiple processors/cores, use `-p`
 
     The `-p` option causes Bowtie 2 to launch a specified number of parallel
     search threads.  Each thread runs on a different processor/core and all
@@ -758,6 +767,23 @@ Performance tuning
     approximately a multiple of the number of threads (though in practice,
     speedup is somewhat worse than linear).
 
+2.  If reporting many alignments per read, try reducing
+    `bowtie2-build --offrate`
+
+    If you are using `-k` or `-a` options and Bowtie 2 is reporting many
+    alignments per read, using an index with a denser SA sample can speed
+    things up considerably.  To do this, specify a smaller-than-default
+    `-o`/`--offrate` value when running `bowtie2-build`.
+    A denser SA sample yields a larger index, but is also particularly
+    effective at speeding up alignment when many alignments are reported per
+    read.
+
+3.  If `bowtie2` "thrashes", try increasing `bowtie2-build --offrate`
+
+    If `bowtie2` runs very slowly on a relatively low-memory computer, try
+    setting `-o`/`--offrate` to a *larger* value when building the index.
+    This decreases the memory footprint of the index.
+
 Command Line
 ------------
 
@@ -1143,7 +1169,15 @@ specified and a paired-end alignment consists of two 20-bp alignments in the
 appropriate orientation with a 20-bp gap between them, that alignment is
 considered valid (as long as `-X` is also satisfied).  A 19-bp gap would not
 be valid in that case.  If trimming options `-3` or `-5` are also used, the
-`-I` constraint is applied with respect to the untrimmed mates.  Default: 0.
+`-I` constraint is applied with respect to the untrimmed mates.
+
+The larger the difference between `-I` and `-X`, the slower Bowtie 2 will
+run.  This is because larger differences between `-I` and `-X` require that
+Bowtie 2 scan a larger window to determine if a concordant alignment exists.
+For typical fragment length ranges (200 to 400 nucleotides), Bowtie 2 is very
+efficient.
+
+Default: 0 (essentially imposing no minimum) 
 
     -X/--maxins <int>
 
@@ -1153,7 +1187,15 @@ proper orientation with a 60-bp gap between them, that alignment is considered
 valid (as long as `-I` is also satisfied).  A 61-bp gap would not be valid in
 that case.  If trimming options `-3` or `-5` are also used, the `-X`
 constraint is applied with respect to the untrimmed mates, not the trimmed
-mates.  Default: 500.
+mates.
+
+The larger the difference between `-I` and `-X`, the slower Bowtie 2 will
+run.  This is because larger differences between `-I` and `-X` require that
+Bowtie 2 scan a larger window to determine if a concordant alignment exists.
+For typical fragment length ranges (200 to 400 nucleotides), Bowtie 2 is very
+efficient.
+
+Default: 500.
 
     --fr/--rf/--ff
 
@@ -1494,10 +1536,12 @@ alignment:
 
         XS:i:<N>
 
-    Alignment score for second-best alignment.  Can be negative.  Can be greater
-    than 0 in `--local` mode (but not in `--end-to-end` mode).  Only present
-    if the SAM record is for an aligned read and more than one alignment was
-    found for the read.
+    Alignment score for the best-scoring alignment found other than the
+	alignment reported.  Can be negative.  Can be greater than 0 in `--local`
+	mode (but not in `--end-to-end` mode).  Only present if the SAM record is
+	for an aligned read and more than one alignment was found for the read.
+	Note that, when the read is part of a concordantly-aligned pair, this score
+	could be greater than `AS:i`.
 
         YS:i:<N>
 
@@ -1559,7 +1603,8 @@ The `bowtie2-build` indexer
 
 `bowtie2-build` builds a Bowtie index from a set of DNA sequences.
 `bowtie2-build` outputs a set of 6 files with suffixes `.1.bt2`, `.2.bt2`,
-`.3.bt2`, `.4.bt2`, `.rev.1.bt2`, and `.rev.2.bt2`.  These files together
+`.3.bt2`, `.4.bt2`, `.rev.1.bt2`, and `.rev.2.bt2`.  In the case of a large 
+index these suffixes will have a `bt2l` termination.  These files together
 constitute the index: they are all that is needed to align reads to that
 reference.  The original sequence FASTA files are no longer used by Bowtie 2
 once the index is built.
@@ -1582,19 +1627,11 @@ profitable trade-offs depending on the application.  They have been set to
 defaults that are reasonable for most cases according to our experiments.  See
 [Performance tuning] for details.
 
-Because `bowtie2-build` uses 32-bit pointers internally, it can handle up to a
-theoretical maximum of 2^32-1 (somewhat more than 4 billion) characters in an
-index, though, with other constraints, the actual ceiling is somewhat less than
-that.  If your reference exceeds 2^32-1 characters, `bowtie2-build` will print
-an error message and abort.  To resolve this, divide your reference sequences
-into smaller batches and/or chunks and build a separate index for each.
-
-If your computer has more than 3-4 GB of memory and you would like to exploit
-that fact to make index building faster, use a 64-bit version of the
-`bowtie2-build` binary.  The 32-bit version of the binary is restricted to using
-less than 4 GB of memory.  If a 64-bit pre-built binary does not yet exist for
-your platform on the sourceforge download site, you will need to build one from
-source.
+`bowtie2-build` can generate either [small or large indexes].  The wrapper
+will decide which based on the length of the input genome.  If the reference
+does not exceed 4 billion characters but a large index is preferred,  the user
+can specify `--large-index` to force `bowtie2-build` to build a large index
+instead.
 
 The Bowtie 2 index is based on the [FM Index] of Ferragina and Manzini, which in
 turn is based on the [Burrows-Wheeler] transform.  The algorithm used to build
@@ -1636,6 +1673,11 @@ The reference input files (specified as `<reference_in>`) are FASTA files
 The reference sequences are given on the command line.  I.e. `<reference_in>` is
 a comma-separated list of sequences rather than a list of FASTA files.
 
+    --large-index
+
+Force `bowtie2-build` to build a [large index], even if the reference is less
+than ~ 4 billion nucleotides long.
+
     -a/--noauto
 
 Disable the default behavior whereby `bowtie2-build` automatically selects
diff --git a/MANUAL.markdown b/MANUAL.markdown
index 0ddba4e..4341185 100644
--- a/MANUAL.markdown
+++ b/MANUAL.markdown
@@ -143,10 +143,10 @@ Obtaining Bowtie 2
 ==================
 
 Download Bowtie 2 sources and binaries from the [Download] section of the
-Sourceforge site.  Binaries are available for Intel architectures (`i386` and
-`x86_64`) running Linux, and Mac OS X.  A 32-bit version is available for
-Windows.  If you plan to compile Bowtie 2 yourself, make sure to get the source
-package, i.e., the filename that ends in "-source.zip".
+Sourceforge site.  Binaries are available for the Intel `x86_64` architecture
+running Linux, Mac OS X, and Windows.  If you plan to compile Bowtie 2 yourself,
+make sure to get the source package, i.e., the filename that ends in
+"-source.zip".
 
 Building from source
 --------------------
@@ -749,27 +749,35 @@ For datasets consisting of pairs, the summary might look like this:
 
 The indentation indicates how subtotals relate to totals.
 
-Wrapper
--------
+Wrapper scripts
+---------------
 
-The `bowtie2` executable is actually a Perl wrapper script that calls the
-compiled `bowtie2-align` binary.  It is recommended that you always run the
-`bowtie2` wrapper and not run `bowtie2-align` directly.
+The `bowtie2`, `bowtie2-build` and `bowtie2-inspect` executables are actually 
+wrapper scripts that call binary programs as appropriate.  The wrappers shield
+users from having to distinguish between "small" and "large" index formats,
+discussed briefly in the following section.  Also, the `bowtie2` wrapper
+provides some key functionality, like the ability to handle compressed inputs,
+and the functionality for [`--un`], [`--al`] and related options.
 
-Performance tuning
-------------------
+It is recommended that you always run the bowtie2 wrappers and not run the
+binaries directly.
 
-1.  Use 64-bit version if possible
+Small and large indexes
+-----------------------
 
-    The 64-bit version of Bowtie 2 is faster than the 32-bit version, owing to
-    its use of 64-bit arithmetic.  If possible, download the 64-bit binaries for
-    Bowtie 2 and run on a 64-bit computer.  If you are building Bowtie 2 from
-    sources, you may need to pass the `-m64` option to `g++` to compile the
-    64-bit version; you can do this by including `BITS=64` in the arguments to
-    the `make` command; e.g.: `make BITS=64 bowtie2`.  To determine whether your
-    version of bowtie is 64-bit or 32-bit, run `bowtie2 --version`.
+`bowtie2-build` can index reference genomes of any size.  For genomes less than
+about 4 billion nucleotides in length, `bowtie2-build` builds a "small" index
+using 32-bit numbers in various parts of the index.  When the genome is longer,
+`bowtie2-build` builds a "large" index using 64-bit numbers.  Small indexes are
+stored in files with the `.bt2` extension, and large indexes are stored in
+files with the `.bt2l` extension.  The user need not worry about whether a
+particular index is small or large; the wrapper scripts will automatically build
+and use the appropriate index.
 
-2.  If your computer has multiple processors/cores, use `-p`
+Performance tuning
+------------------
+
+1.  If your computer has multiple processors/cores, use `-p`
 
     The [`-p`] option causes Bowtie 2 to launch a specified number of parallel
     search threads.  Each thread runs on a different processor/core and all
@@ -777,6 +785,23 @@ Performance tuning
     approximately a multiple of the number of threads (though in practice,
     speedup is somewhat worse than linear).
 
+2.  If reporting many alignments per read, try reducing
+    `bowtie2-build --offrate`
+
+    If you are using [`-k`] or [`-a`] options and Bowtie 2 is reporting many
+    alignments per read, using an index with a denser SA sample can speed
+    things up considerably.  To do this, specify a smaller-than-default
+    [`-o`/`--offrate`](#bowtie2-build-options-o) value when running `bowtie2-build`.
+    A denser SA sample yields a larger index, but is also particularly
+    effective at speeding up alignment when many alignments are reported per
+    read.
+
+3.  If `bowtie2` "thrashes", try increasing `bowtie2-build --offrate`
+
+    If `bowtie2` runs very slowly on a relatively low-memory computer, try
+    setting [`-o`/`--offrate`] to a *larger* value when building the index.
+    This decreases the memory footprint of the index.
+
 Command Line
 ------------
 
@@ -2197,10 +2222,12 @@ alignment:
     </td>
     <td>
 
-    Alignment score for second-best alignment.  Can be negative.  Can be greater
-    than 0 in [`--local`] mode (but not in [`--end-to-end`] mode).  Only present
-    if the SAM record is for an aligned read and more than one alignment was
-    found for the read.
+    Alignment score for the best-scoring alignment found other than the
+	alignment reported.  Can be negative.  Can be greater than 0 in [`--local`]
+	mode (but not in [`--end-to-end`] mode).  Only present if the SAM record is
+	for an aligned read and more than one alignment was found for the read.
+	Note that, when the read is part of a concordantly-aligned pair, this score
+	could be greater than [`AS:i`].
 
     </td></tr>
     <tr><td id="bowtie2-build-opt-fields-ys">
@@ -2338,7 +2365,8 @@ The `bowtie2-build` indexer
 
 `bowtie2-build` builds a Bowtie index from a set of DNA sequences.
 `bowtie2-build` outputs a set of 6 files with suffixes `.1.bt2`, `.2.bt2`,
-`.3.bt2`, `.4.bt2`, `.rev.1.bt2`, and `.rev.2.bt2`.  These files together
+`.3.bt2`, `.4.bt2`, `.rev.1.bt2`, and `.rev.2.bt2`.  In the case of a large 
+index these suffixes will have a `bt2l` termination.  These files together
 constitute the index: they are all that is needed to align reads to that
 reference.  The original sequence FASTA files are no longer used by Bowtie 2
 once the index is built.
@@ -2361,19 +2389,11 @@ profitable trade-offs depending on the application.  They have been set to
 defaults that are reasonable for most cases according to our experiments.  See
 [Performance tuning] for details.
 
-Because `bowtie2-build` uses 32-bit pointers internally, it can handle up to a
-theoretical maximum of 2^32-1 (somewhat more than 4 billion) characters in an
-index, though, with other constraints, the actual ceiling is somewhat less than
-that.  If your reference exceeds 2^32-1 characters, `bowtie2-build` will print
-an error message and abort.  To resolve this, divide your reference sequences
-into smaller batches and/or chunks and build a separate index for each.
-
-If your computer has more than 3-4 GB of memory and you would like to exploit
-that fact to make index building faster, use a 64-bit version of the
-`bowtie2-build` binary.  The 32-bit version of the binary is restricted to using
-less than 4 GB of memory.  If a 64-bit pre-built binary does not yet exist for
-your platform on the sourceforge download site, you will need to build one from
-source.
+`bowtie2-build` can generate either [small or large indexes](#small-and-large-indexes).  The wrapper
+will decide which based on the length of the input genome.  If the reference
+does not exceed 4 billion characters but a large index is preferred,  the user
+can specify [`--large-index`] to force `bowtie2-build` to build a large index
+instead.
 
 The Bowtie 2 index is based on the [FM Index] of Ferragina and Manzini, which in
 turn is based on the [Burrows-Wheeler] transform.  The algorithm used to build
@@ -2439,6 +2459,18 @@ The reference sequences are given on the command line.  I.e. `<reference_in>` is
 a comma-separated list of sequences rather than a list of FASTA files.
 
 </td></tr>
+</td></tr><tr><td id="bowtie2-build-options-large-index">
+
+[`--large-index`]: #bowtie2-build-options-large-index
+
+    --large-index
+
+</td><td>
+
+Force `bowtie2-build` to build a [large index](#small-and-large-indexes), even if the reference is less
+than ~ 4 billion nucleotides long.
+
+</td></tr>
 <tr><td id="bowtie2-build-options-a">
 
 [`-a`/`--noauto`]: #bowtie2-build-options-a
diff --git a/Makefile b/Makefile
index 931ff3c..abd4155 100644
--- a/Makefile
+++ b/Makefile
@@ -56,6 +56,12 @@ ifneq (,$(findstring Darwin,$(shell uname)))
 	MACOS = 1
 endif
 
+POPCNT_CAPABILITY ?= 1
+ifeq (1, $(POPCNT_CAPABILITY))
+    EXTRA_FLAGS += -DPOPCNT_CAPABILITY
+    INC += -I third_party
+endif
+
 MM_DEF = 
 
 ifeq (1,$(BOWTIE_MM))
@@ -111,16 +117,20 @@ SEARCH_CPPS = qual.cpp pat.cpp sam.cpp \
 			  aligner_driver.cpp
 SEARCH_CPPS_MAIN = $(SEARCH_CPPS) bowtie_main.cpp
 
+DP_CPPS = qual.cpp aligner_sw.cpp aligner_result.cpp ref_coord.cpp mask.cpp \
+          simple_func.cpp sse_util.cpp aligner_bt.cpp aligner_swsse.cpp \
+		  aligner_swsse_loc_i16.cpp aligner_swsse_ee_i16.cpp \
+		  aligner_swsse_loc_u8.cpp aligner_swsse_ee_u8.cpp scoring.cpp
+
 BUILD_CPPS = diff_sample.cpp
 BUILD_CPPS_MAIN = $(BUILD_CPPS) bowtie_build_main.cpp
 
 SEARCH_FRAGMENTS = $(wildcard search_*_phase*.c)
 VERSION = $(shell cat VERSION)
 
-# Convert BITS=?? to a -m flag
 BITS=32
 ifeq (x86_64,$(shell uname -m))
-BITS=64
+	BITS=64
 endif
 # msys will always be 32 bit so look at the cpu arch instead.
 ifneq (,$(findstring AMD64,$(PROCESSOR_ARCHITEW6432)))
@@ -128,33 +138,35 @@ ifneq (,$(findstring AMD64,$(PROCESSOR_ARCHITEW6432)))
 		BITS=64
 	endif
 endif
-BITS_FLAG =
-
 ifeq (32,$(BITS))
-	BITS_FLAG = -m32
+  $(error bowtie2 compilation requires a 64-bit platform )
 endif
 
-ifeq (64,$(BITS))
-	BITS_FLAG = -m64
-endif
-SSE_FLAG=-msse2
+SSE_FLAG=-msse2 
 
-DEBUG_FLAGS    = -O0 -g3 $(BITS_FLAG) $(SSE_FLAG)
+DEBUG_FLAGS    = -O0 -g3 -m64 $(SSE_FLAG)
 DEBUG_DEFS     = -DCOMPILER_OPTIONS="\"$(DEBUG_FLAGS) $(EXTRA_FLAGS)\""
-RELEASE_FLAGS  = -O3 $(BITS_FLAG) $(SSE_FLAG) -funroll-loops -g3
+RELEASE_FLAGS  = -O3 -m64 $(SSE_FLAG) -funroll-loops -g3
 RELEASE_DEFS   = -DCOMPILER_OPTIONS="\"$(RELEASE_FLAGS) $(EXTRA_FLAGS)\""
 NOASSERT_FLAGS = -DNDEBUG
 FILE_FLAGS     = -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE
 
-BOWTIE2_BIN_LIST =     bowtie2-build \
-                       bowtie2-align \
-                       bowtie2-inspect
-BOWTIE2_BIN_LIST_AUX = bowtie2-build-debug \
-                       bowtie2-align-debug \
-                       bowtie2-inspect-debug
+BOWTIE2_BIN_LIST =     bowtie2-build-s \
+                       bowtie2-build-l \
+                       bowtie2-align-s \
+                       bowtie2-align-l \
+                       bowtie2-inspect-s \
+                       bowtie2-inspect-l
+BOWTIE2_BIN_LIST_AUX = bowtie2-build-s-debug \
+                       bowtie2-build-l-debug \
+                       bowtie2-align-s-debug \
+                       bowtie2-align-l-debug \
+                       bowtie2-inspect-s-debug \
+                       bowtie2-inspect-l-debug
 
 GENERAL_LIST = $(wildcard scripts/*.sh) \
                $(wildcard scripts/*.pl) \
+               $(wildcard third_party/*) \
                doc/manual.html \
                doc/README \
                doc/style.css \
@@ -164,6 +176,8 @@ GENERAL_LIST = $(wildcard scripts/*.sh) \
 			   example/reference/lambda_virus.fa \
                $(PTHREAD_PKG) \
 			   bowtie2 \
+			   bowtie2-build \
+			   bowtie2-inspect \
                AUTHORS \
                LICENSE \
                NEWS \
@@ -172,6 +186,10 @@ GENERAL_LIST = $(wildcard scripts/*.sh) \
                TUTORIAL \
                VERSION
 
+ifeq (1,$(WINDOWS))
+	BOWTIE2_BIN_LIST := $(BOWTIE2_BIN_LIST) bowtie2.bat bowtie2-build.bat bowtie2-inspect.bat 
+endif
+
 # This is helpful on Windows under MinGW/MSYS, where Make might go for
 # the Windows FIND tool instead.
 FIND=$(shell which find)
@@ -192,9 +210,9 @@ all: $(BOWTIE2_BIN_LIST)
 
 allall: $(BOWTIE2_BIN_LIST) $(BOWTIE2_BIN_LIST_AUX)
 
-both: bowtie2 bowtie2-build
+both: bowtie2-align-s bowtie2-build-s bowtie2-align-l bowtie2-build-l
 
-both-debug: bowtie2-align-debug bowtie2-build-debug
+both-debug: bowtie2-align-s-debug bowtie2-build-s-debug bowtie2-align-l-debug bowtie2-build-l-debug
 
 DEFS=-fno-strict-aliasing \
      -DBOWTIE2_VERSION="\"`cat VERSION`\"" \
@@ -210,15 +228,23 @@ DEFS=-fno-strict-aliasing \
 # bowtie2-build targets
 #
 
-bowtie2-build: bt2_build.cpp $(SHARED_CPPS) $(HEADERS)
+bowtie2-build-s: bt2_build.cpp $(SHARED_CPPS) $(HEADERS)
 	$(CXX) $(RELEASE_FLAGS) $(RELEASE_DEFS) $(EXTRA_FLAGS) \
 		$(DEFS) -DBOWTIE2 $(NOASSERT_FLAGS) -Wall \
 		$(INC) \
 		-o $@ $< \
 		$(SHARED_CPPS) $(BUILD_CPPS_MAIN) \
 		$(LIBS) $(BUILD_LIBS)
-        
-bowtie2-build-debug: bt2_build.cpp $(SHARED_CPPS) $(HEADERS)
+
+bowtie2-build-l: bt2_build.cpp $(SHARED_CPPS) $(HEADERS)
+	$(CXX) $(RELEASE_FLAGS) $(RELEASE_DEFS) $(EXTRA_FLAGS) \
+		$(DEFS) -DBOWTIE2 -DBOWTIE_64BIT_INDEX $(NOASSERT_FLAGS) -Wall \
+		$(INC) \
+		-o $@ $< \
+		$(SHARED_CPPS) $(BUILD_CPPS_MAIN) \
+		$(LIBS) $(BUILD_LIBS)
+
+bowtie2-build-s-debug: bt2_build.cpp $(SHARED_CPPS) $(HEADERS)
 	$(CXX) $(DEBUG_FLAGS) $(DEBUG_DEFS) $(EXTRA_FLAGS) \
 		$(DEFS) -DBOWTIE2 -Wall \
 		$(INC) \
@@ -226,11 +252,19 @@ bowtie2-build-debug: bt2_build.cpp $(SHARED_CPPS) $(HEADERS)
 		$(SHARED_CPPS) $(BUILD_CPPS_MAIN) \
 		$(LIBS) $(BUILD_LIBS)
 
+bowtie2-build-l-debug: bt2_build.cpp $(SHARED_CPPS) $(HEADERS)
+	$(CXX) $(DEBUG_FLAGS) $(DEBUG_DEFS) $(EXTRA_FLAGS) \
+		$(DEFS) -DBOWTIE2 -DBOWTIE_64BIT_INDEX -Wall \
+		$(INC) \
+		-o $@ $< \
+		$(SHARED_CPPS) $(BUILD_CPPS_MAIN) \
+		$(LIBS) $(BUILD_LIBS)
+
 #
-# bowtie targets
+# bowtie2-align targets
 #
 
-bowtie2-align: bt2_search.cpp $(SEARCH_CPPS) $(SHARED_CPPS) $(HEADERS) $(SEARCH_FRAGMENTS)
+bowtie2-align-s: bt2_search.cpp $(SEARCH_CPPS) $(SHARED_CPPS) $(HEADERS) $(SEARCH_FRAGMENTS)
 	$(CXX) $(RELEASE_FLAGS) $(RELEASE_DEFS) $(EXTRA_FLAGS) \
 		$(DEFS) -DBOWTIE2 $(NOASSERT_FLAGS) -Wall \
 		$(INC) \
@@ -238,7 +272,15 @@ bowtie2-align: bt2_search.cpp $(SEARCH_CPPS) $(SHARED_CPPS) $(HEADERS) $(SEARCH_
 		$(SHARED_CPPS) $(SEARCH_CPPS_MAIN) \
 		$(LIBS) $(SEARCH_LIBS)
 
-bowtie2-align-debug: bt2_search.cpp $(SEARCH_CPPS) $(SHARED_CPPS) $(HEADERS) $(SEARCH_FRAGMENTS)
+bowtie2-align-l: bt2_search.cpp $(SEARCH_CPPS) $(SHARED_CPPS) $(HEADERS) $(SEARCH_FRAGMENTS)
+	$(CXX) $(RELEASE_FLAGS) $(RELEASE_DEFS) $(EXTRA_FLAGS) \
+		$(DEFS) -DBOWTIE2 -DBOWTIE_64BIT_INDEX $(NOASSERT_FLAGS) -Wall \
+		$(INC) \
+		-o $@ $< \
+		$(SHARED_CPPS) $(SEARCH_CPPS_MAIN) \
+		$(LIBS) $(SEARCH_LIBS)
+
+bowtie2-align-s-debug: bt2_search.cpp $(SEARCH_CPPS) $(SHARED_CPPS) $(HEADERS) $(SEARCH_FRAGMENTS)
 	$(CXX) $(DEBUG_FLAGS) \
 		$(DEBUG_DEFS) $(EXTRA_FLAGS) \
 		$(DEFS) -DBOWTIE2 -Wall \
@@ -247,11 +289,20 @@ bowtie2-align-debug: bt2_search.cpp $(SEARCH_CPPS) $(SHARED_CPPS) $(HEADERS) $(S
 		$(SHARED_CPPS) $(SEARCH_CPPS_MAIN) \
 		$(LIBS) $(SEARCH_LIBS)
 
+bowtie2-align-l-debug: bt2_search.cpp $(SEARCH_CPPS) $(SHARED_CPPS) $(HEADERS) $(SEARCH_FRAGMENTS)
+	$(CXX) $(DEBUG_FLAGS) \
+		$(DEBUG_DEFS) $(EXTRA_FLAGS) \
+		$(DEFS) -DBOWTIE2 -DBOWTIE_64BIT_INDEX -Wall \
+		$(INC) \
+		-o $@ $< \
+		$(SHARED_CPPS) $(SEARCH_CPPS_MAIN) \
+		$(LIBS) $(SEARCH_LIBS)
+
 #
 # bowtie2-inspect targets
 #
 
-bowtie2-inspect: bt2_inspect.cpp $(HEADERS) $(SHARED_CPPS)
+bowtie2-inspect-s: bt2_inspect.cpp $(HEADERS) $(SHARED_CPPS)
 	$(CXX) $(RELEASE_FLAGS) \
 		$(RELEASE_DEFS) $(EXTRA_FLAGS) \
 		$(DEFS) -DBOWTIE2 -DBOWTIE_INSPECT_MAIN -Wall \
@@ -260,7 +311,16 @@ bowtie2-inspect: bt2_inspect.cpp $(HEADERS) $(SHARED_CPPS)
 		$(SHARED_CPPS) \
 		$(LIBS) $(INSPECT_LIBS)
 
-bowtie2-inspect-debug: bt2_inspect.cpp $(HEADERS) $(SHARED_CPPS) 
+bowtie2-inspect-l: bt2_inspect.cpp $(HEADERS) $(SHARED_CPPS)
+	$(CXX) $(RELEASE_FLAGS) \
+		$(RELEASE_DEFS) $(EXTRA_FLAGS) \
+		$(DEFS) -DBOWTIE2 -DBOWTIE_INSPECT_MAIN  -DBOWTIE_64BIT_INDEX -Wall \
+		$(INC) -I . \
+		-o $@ $< \
+		$(SHARED_CPPS) \
+		$(LIBS) $(INSPECT_LIBS)
+
+bowtie2-inspect-s-debug: bt2_inspect.cpp $(HEADERS) $(SHARED_CPPS) 
 	$(CXX) $(DEBUG_FLAGS) \
 		$(DEBUG_DEFS) $(EXTRA_FLAGS) \
 		$(DEFS) -DBOWTIE2 -DBOWTIE_INSPECT_MAIN -Wall \
@@ -269,6 +329,49 @@ bowtie2-inspect-debug: bt2_inspect.cpp $(HEADERS) $(SHARED_CPPS)
 		$(SHARED_CPPS) \
 		$(LIBS) $(INSPECT_LIBS)
 
+bowtie2-inspect-l-debug: bt2_inspect.cpp $(HEADERS) $(SHARED_CPPS)
+	$(CXX) $(DEBUG_FLAGS) \
+		$(DEBUG_DEFS) $(EXTRA_FLAGS) \
+		$(DEFS) -DBOWTIE2 -DBOWTIE_64BIT_INDEX -DBOWTIE_INSPECT_MAIN -Wall \
+		$(INC) -I . \
+		-o $@ $< \
+		$(SHARED_CPPS) \
+		$(LIBS) $(INSPECT_LIBS)
+
+#
+# bowtie2-dp targets
+#
+
+bowtie2-dp: bt2_dp.cpp $(HEADERS) $(SHARED_CPPS) $(DP_CPPS)
+	$(CXX) $(RELEASE_FLAGS) \
+		$(RELEASE_DEFS) $(EXTRA_FLAGS) $(NOASSERT_FLAGS) \
+		$(DEFS) -DBOWTIE2 -DBOWTIE_DP_MAIN -Wall \
+		$(INC) -I . \
+		-o $@ $< \
+		$(DP_CPPS) $(SHARED_CPPS) \
+		$(LIBS) $(SEARCH_LIBS)
+
+bowtie2-dp-debug: bt2_dp.cpp $(HEADERS) $(SHARED_CPPS) $(DP_CPPS)
+	$(CXX) $(DEBUG_FLAGS) \
+		$(DEBUG_DEFS) $(EXTRA_FLAGS) \
+		$(DEFS) -DBOWTIE2 -DBOWTIE_DP_MAIN -Wall \
+		$(INC) -I . \
+		-o $@ $< \
+		$(DP_CPPS) $(SHARED_CPPS) \
+		$(LIBS) $(SEARCH_LIBS)
+
+bowtie2.bat:
+	echo "@echo off" > bowtie2.bat
+	echo "perl %~dp0/bowtie2 %*" >> bowtie2.bat
+
+bowtie2-build.bat:
+	echo "@echo off" > bowtie2-build.bat
+	echo "python %~dp0/bowtie2-build %*" >> bowtie2-build.bat
+
+bowtie2-inspect.bat:
+	echo "@echo off" > bowtie2-inspect.bat
+	echo "python %~dp0/bowtie2-inspect %*" >> bowtie2-inspect.bat
+
 .PHONY: bowtie2-src
 bowtie2-src: $(SRC_PKG_LIST)
 	chmod a+x scripts/*.sh scripts/*.pl
@@ -287,15 +390,15 @@ bowtie2-bin: $(BIN_PKG_LIST) $(BOWTIE2_BIN_LIST) $(BOWTIE2_BIN_LIST_AUX)
 	rm -rf .bin.tmp
 	mkdir .bin.tmp
 	mkdir .bin.tmp/bowtie2-$(VERSION)
-	if [ -f bowtie.exe ] ; then \
+	if [ -f bowtie2-align-s.exe ] ; then \
 		zip tmp.zip $(BIN_PKG_LIST) $(addsuffix .exe,$(BOWTIE2_BIN_LIST) $(BOWTIE2_BIN_LIST_AUX)) ; \
 	else \
 		zip tmp.zip $(BIN_PKG_LIST) $(BOWTIE2_BIN_LIST) $(BOWTIE2_BIN_LIST_AUX) ; \
 	fi
 	mv tmp.zip .bin.tmp/bowtie2-$(VERSION)
 	cd .bin.tmp/bowtie2-$(VERSION) ; unzip tmp.zip ; rm -f tmp.zip
-	cd .bin.tmp ; zip -r bowtie2-$(VERSION)-$(BITS).zip bowtie2-$(VERSION)
-	cp .bin.tmp/bowtie2-$(VERSION)-$(BITS).zip .
+	cd .bin.tmp ; zip -r bowtie2-$(VERSION).zip bowtie2-$(VERSION)
+	cp .bin.tmp/bowtie2-$(VERSION).zip .
 	rm -rf .bin.tmp
 
 bowtie2-seeds-debug: aligner_seed.cpp ccnt_lut.cpp alphabet.cpp aligner_seed.h bt2_idx.cpp bt2_io.cpp
diff --git a/NEWS b/NEWS
index 0f1fe64..a6e65dc 100644
--- a/NEWS
+++ b/NEWS
@@ -3,7 +3,7 @@ Bowtie 2 NEWS
 
 Bowtie 2 is now available for download from the project website,
 http://bowtie-bio.sf.net/bowtie2.  2.0.0-beta1 is the first version released to
-the public and 2.0.7 is the latest version.  Bowtie 2 is licensed under
+the public and 2.2.0 is the latest version.  Bowtie 2 is licensed under
 the GPLv3 license.  See `LICENSE' file for details.
 
 Reporting Issues
@@ -16,6 +16,31 @@ Please report any issues using the Sourceforge bug tracker:
 Version Release History
 =======================
 
+Version 2.2.0 - February 10, 2014
+   * Improved index querying efficiency using "population count" instructions
+     available since SSE4.2.
+   * Added support for large and small indexes, removing 4-billion-nucleotide
+     barrier.  Bowtie 2 can now be used with reference genomes of any size.
+   * Fixed bug that could cause bowtie2-build to crash when reference length
+     is close to 4 billion.
+   * Fixed issue in bowtie2-inspect that caused -e mode not to output
+     nucleotides properly.
+   * Added a CL: string to the @PG SAM header to preserve information about
+     the aligner binary and parameters.
+   * No longer releasing 32-bit binaries.  Simplified manual and Makefile
+     accordingly.
+   * Credits to the Intel(r) enabling team for performance optimizations
+     included in this release.  Thank you!
+   * Phased out CygWin support.
+   * Added the .bat generation for Windows.
+   * Fixed issue with very large one sequence reference.
+   * Fixed some issues with rare chars in fasta files.
+   * Fixed wrappers so bowtie can now be used with symlinks.
+
+Bowtie 2 on GitHub - February 4, 2014
+   * Bowtie 2 source now lives in a public GitHub repository:
+     https://github.com/BenLangmead/bowtie2.
+
 Version 2.1.0 - February 21, 2013
    * Improved multithreading support so that Bowtie 2 now uses native Windows
      threads when compiled on Windows and uses a faster mutex.  Threading
diff --git a/VERSION b/VERSION
index 7ec1d6d..ccbccc3 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.1.0
+2.2.0
diff --git a/aligner_cache.cpp b/aligner_cache.cpp
index 02c0fee..3930356 100644
--- a/aligner_cache.cpp
+++ b/aligner_cache.cpp
@@ -54,10 +54,10 @@ bool SAVal::repOk(const AlignmentCache& ac) const {
 bool AlignmentCache::addOnTheFly(
 	QVal& qv,         // qval that points to the range of reference substrings
 	const SAKey& sak, // the key holding the reference substring
-	uint32_t topf,    // top range elt in BWT index
-	uint32_t botf,    // bottom range elt in BWT index
-	uint32_t topb,    // top range elt in BWT' index
-	uint32_t botb,    // bottom range elt in BWT' index
+	TIndexOffU topf,    // top range elt in BWT index
+	TIndexOffU botf,    // bottom range elt in BWT index
+	TIndexOffU topb,    // top range elt in BWT' index
+	TIndexOffU botb,    // bottom range elt in BWT' index
 	bool getLock)
 {
     ThreadSafe ts(lockPtr(), shared_ && getLock);
@@ -85,14 +85,14 @@ bool AlignmentCache::addOnTheFly(
 	}
 	assert(s->key.repOk());
 	if(added) {
-		s->payload.i = (uint32_t)salist_.size();
+		s->payload.i = (TIndexOffU)salist_.size();
 		s->payload.len = botf - topf;
 		s->payload.topf = topf;
 		s->payload.topb = topb;
 		for(size_t j = 0; j < (botf-topf); j++) {
-			if(!salist_.add(pool(), 0xffffffff)) {
+			if(!salist_.add(pool(), OFF_MASK)) {
 				// Change the payload's len field
-				s->payload.len = (uint32_t)j;
+				s->payload.len = (TIndexOffU)j;
 				return false; // Exhausted pool memory
 			}
 		}
@@ -214,7 +214,7 @@ static void aligner_cache_tests() {
 	}
 	// Add all of the 4-mers in several different random orders
 	RandomSource rand;
-	for(uint32_t runs = 0; runs < 100; runs++) {
+	for(unsigned runs = 0; runs < 100; runs++) {
 		rb.clear();
 		p.clear();
 		assert_eq(0, rb.size());
diff --git a/aligner_cache.h b/aligner_cache.h
index 9900508..c5c15d7 100644
--- a/aligner_cache.h
+++ b/aligner_cache.h
@@ -61,10 +61,11 @@
 #include "threading.h"
 #include "mem_ids.h"
 #include "simple_func.h"
+#include "btypes.h"
 
 #define CACHE_PAGE_SZ (16 * 1024)
 
-typedef PListSlice<uint32_t, CACHE_PAGE_SZ> TSlice;
+typedef PListSlice<TIndexOffU, CACHE_PAGE_SZ> TSlice;
 
 /**
  * Key for the query multimap: the read substring and its length.
@@ -194,13 +195,13 @@ public:
 	/**
 	 * Return the offset of the first reference substring in the qlist.
 	 */
-	uint32_t offset() const { return i_; }
+	TIndexOffU offset() const { return i_; }
 
 	/**
 	 * Return the number of reference substrings associated with a read
 	 * substring.
 	 */
-	uint32_t numRanges() const {
+	TIndexOffU numRanges() const {
 		assert(valid());
 		return rangen_;
 	}
@@ -209,7 +210,7 @@ public:
 	 * Return the number of elements associated with all associated
 	 * reference substrings.
 	 */
-	uint32_t numElts() const {
+	TIndexOffU numElts() const {
 		assert(valid());
 		return eltn_;
 	}
@@ -226,26 +227,26 @@ public:
 	/**
 	 * Return true iff the QVal is valid.
 	 */
-	bool valid() const { return rangen_ != 0xffffffff; }
+	bool valid() const { return rangen_ != OFF_MASK; }
 	
 	/**
 	 * Reset to invalid state.
 	 */
 	void reset() {
-		i_ = 0; rangen_ = eltn_ = 0xffffffff;
+		i_ = 0; rangen_ = eltn_ = OFF_MASK;
 	}
 	
 	/**
 	 * Initialize Qval.
 	 */
-	void init(uint32_t i, uint32_t ranges, uint32_t elts) {
+	void init(TIndexOffU i, TIndexOffU ranges, TIndexOffU elts) {
 		i_ = i; rangen_ = ranges; eltn_ = elts;
 	}
 	
 	/**
 	 * Tally another range with given number of elements.
 	 */
-	void addRange(uint32_t numElts) {
+	void addRange(TIndexOffU numElts) {
 		rangen_++;
 		eltn_ += numElts;
 	}
@@ -260,9 +261,9 @@ public:
 
 protected:
 
-	uint32_t i_;      // idx of first elt in qlist
-	uint32_t rangen_; // # ranges (= # associated reference substrings)
-	uint32_t eltn_;   // # elements (total)
+	TIndexOffU i_;      // idx of first elt in qlist
+	TIndexOffU rangen_; // # ranges (= # associated reference substrings)
+	TIndexOffU eltn_;   // # elements (total)
 };
 
 /**
@@ -278,12 +279,12 @@ typedef QKey SAKey;
  */
 struct SAVal {
 
-	SAVal() : topf(), topb(), i(), len(0xffffffff) { }
+	SAVal() : topf(), topb(), i(), len(OFF_MASK) { }
 
 	/**
 	 * Return true iff the SAVal is valid.
 	 */
-	bool valid() { return len != 0xffffffff; }
+	bool valid() { return len != OFF_MASK; }
 
 #ifndef NDEBUG
 	/**
@@ -297,10 +298,10 @@ struct SAVal {
 	 * Initialize the SAVal.
 	 */
 	void init(
-		uint32_t tf,
-		uint32_t tb,
-		uint32_t ii,
-		uint32_t ln)
+		TIndexOffU tf,
+		TIndexOffU tb,
+		TIndexOffU ii,
+		TIndexOffU ln)
 	{
 		topf = tf;
 		topb = tb;
@@ -308,10 +309,10 @@ struct SAVal {
 		len = ln;
 	}
 
-	uint32_t topf;  // top in BWT
-	uint32_t topb;  // top in BWT'
-	uint32_t i;     // idx of first elt in salist
-	uint32_t len;   // length of range
+	TIndexOffU topf;  // top in BWT
+	TIndexOffU topb;  // top in BWT'
+	TIndexOffU i;     // idx of first elt in salist
+	TIndexOffU len;   // length of range
 };
 
 /**
@@ -326,11 +327,11 @@ public:
 
 	SATuple() { reset(); };
 
-	SATuple(SAKey k, uint32_t tf, uint32_t tb, TSlice o) {
+	SATuple(SAKey k, TIndexOffU tf, TIndexOffU tb, TSlice o) {
 		init(k, tf, tb, o);
 	}
 	
-	void init(SAKey k, uint32_t tf, uint32_t tb, TSlice o) {
+	void init(SAKey k, TIndexOffU tf, TIndexOffU tb, TSlice o) {
 		key = k; topf = tf; topb = tb; offs = o;
 	}
 
@@ -338,10 +339,10 @@ public:
 	 * Initialize this SATuple from a subrange of the SATuple 'src'.
 	 */
 	void init(const SATuple& src, size_t first, size_t last) {
-		assert_neq(0xffffffff, src.topb);
+		assert_neq(OFF_MASK, src.topb);
 		key = src.key;
-		topf = src.topf + (uint32_t)first;
-		topb = 0xffffffff; // unknown!
+		topf = (TIndexOffU)(src.topf + first);
+		topb = OFF_MASK; // unknown!
 		offs.init(src.offs, first, last);
 	}
 	
@@ -385,7 +386,7 @@ public:
 		return key == o.key && topf == o.topf && topb == o.topb && offs == o.offs;
 	}
 
-	void reset() { topf = topb = 0xffffffff; offs.reset(); }
+	void reset() { topf = topb = OFF_MASK; offs.reset(); }
 	
 	/**
 	 * Set the length to be at most the original length.
@@ -403,8 +404,8 @@ public:
 
 	// bot/length of SA range equals offs.size()
 	SAKey    key;  // sequence key
-	uint32_t topf;  // top in BWT index
-	uint32_t topb;  // top in BWT' index
+	TIndexOffU topf;  // top in BWT index
+	TIndexOffU topb;  // top in BWT' index
 	TSlice   offs; // offsets
 };
 
@@ -430,7 +431,7 @@ class AlignmentCache {
 	typedef RedBlackNode<SAKey, SAVal> SANode;
 
 	typedef PList<SAKey, CACHE_PAGE_SZ> TQList;
-	typedef PList<uint32_t, CACHE_PAGE_SZ> TSAList;
+	typedef PList<TIndexOffU, CACHE_PAGE_SZ> TSAList;
 
 public:
 
@@ -534,10 +535,10 @@ public:
 	bool addOnTheFly(
 		QVal& qv,         // qval that points to the range of reference substrings
 		const SAKey& sak, // the key holding the reference substring
-		uint32_t topf,    // top range elt in BWT index
-		uint32_t botf,    // bottom range elt in BWT index
-		uint32_t topb,    // top range elt in BWT' index
-		uint32_t botb,    // bottom range elt in BWT' index
+		TIndexOffU topf,    // top range elt in BWT index
+		TIndexOffU botf,    // bottom range elt in BWT index
+		TIndexOffU topb,    // top range elt in BWT' index
+		TIndexOffU botb,    // bottom range elt in BWT' index
 		bool getLock = true);
 
 	/**
@@ -826,10 +827,10 @@ public:
 	 */
 	bool addOnTheFly(
 		const BTDnaString& rfseq, // reference sequence close to read seq
-		uint32_t topf,            // top in BWT index
-		uint32_t botf,            // bot in BWT index
-		uint32_t topb,            // top in BWT' index
-		uint32_t botb,            // bot in BWT' index
+		TIndexOffU topf,            // top in BWT index
+		TIndexOffU botf,            // bot in BWT index
+		TIndexOffU topb,            // top in BWT' index
+		TIndexOffU botb,            // bot in BWT' index
 		bool getLock = true)      // true -> lock is not held by caller
 	{
 		
diff --git a/aligner_driver.cpp b/aligner_driver.cpp
index b468780..03e69a7 100644
--- a/aligner_driver.cpp
+++ b/aligner_driver.cpp
@@ -19,7 +19,7 @@
 
 #include "aligner_driver.h"
 
-void AlignerDriverRootSelector::select(
+void PrioritizedRootSelector::select(
 	const Read& q,
 	const Read* qo,
 	bool nofw,
@@ -27,6 +27,178 @@ void AlignerDriverRootSelector::select(
 	EList<DescentConfig>& confs,
 	EList<DescentRoot>& roots)
 {
+	assert_gt(landing_, 0);
+	// To specify a search root, we must specify an offset from the 5' end,
+	// whether it is left-to-right, and whether it searches over the read or
+	// its reverse complement.
+	
+	// Note that it's not very sensible to pick a search root going
+	// left-to-right but where its offset puts it very close to the
+	// right-hand-side of the read.  I.e. that root will "bounce" almost
+	// immediately and go in the other direction.
+	
+	// How to pick these roots?  One idea is to simply lay down roots every N
+	// positions along the read and its reverse-complement.  We would do this
+	// both for left-pointing and right-pointing roots.
+	
+	
+	// Another way is to consider every possible root, then rank them according
+	// to how optimistic we are that picking that root will be productive.
+	// Things that make us more optimistic are:
+	//
+	// 1. First several read characters to align are high quality
+	// 2. First several read characters to align are free of Ns
+	// 3. First several read characters do not form a simple repeat
+	// 4. First several k-mers are well represented both in other reads and in
+	//    the reference genome
+	// 5. Characters, k-mers just before the root are "bad"
+	// 6. Root is flush with one end of the read or the other
+	
+	// Go left-to-right along the forward and reverse-complement reads,
+	// compiling info about the nucleotides in the landing zone of each
+	// potential R2L root.
+	const int nPenalty = 150;
+	const int endBonus = 150;
+	const size_t qlen = q.length();
+	// Calculate interval length
+	int interval = rootIval_.f<int>((double)qlen);
+	size_t sizeTarget = qlen - landing_ + 1;
+	sizeTarget = (size_t)(ceilf((sizeTarget / (float)interval)));
+	sizeTarget *= 4;
+	// Set up initial score arrays
+	for(int i = 0; i < 2; i++) {
+		bool fw = (i == 0);
+		scoresOrig_[i].resize(qlen);
+		scores_[i].resize(qlen);
+		for(size_t j = 0; j < qlen; j++) {
+			size_t off5p = fw ? j : (qlen - j - 1);
+			int c = q.getc(off5p, fw);
+			int sc = q.getq(off5p) - ((c > 3) ? nPenalty : 0);
+			scoresOrig_[i][j] = scores_[i][j] = sc;
+		}
+	}
+	rootHeap_.clear();
+	for(int fwi = 0; fwi < 2; fwi++) {
+		bool fw = (fwi == 0);
+		if((fw && nofw) || (!fw && norc)) {
+			continue;
+		}
+		int pri = 0;
+		size_t revi = qlen;
+		for(size_t i = 0; i < qlen; i++) {
+			revi--;
+			pri += scoresOrig_[fwi][i];
+			if(i >= landing_) {
+				pri -= scoresOrig_[fwi][i - landing_];
+			}
+			if(i >= landing_-1 && scoresOrig_[fwi][i] > 0) {
+				rootHeap_.insert(DescentRoot(
+					fw ? i : revi, // offset from 5' end
+					false,         // left-to-right?
+					fw,            // fw?
+					landing_,      // landing length
+					qlen,          // query length
+					pri + ((revi == 0) ? endBonus : 0))); // root priority
+				// Give priority boost for being flush with one end or the
+				// other
+			}
+		}
+		pri = 0;
+		size_t i = qlen - revi;
+		for(size_t revi = 0; revi < qlen; revi++) {
+			i--;
+			pri += scoresOrig_[fwi][i];
+			if(revi >= landing_) {
+				pri -= scoresOrig_[fwi][i + landing_];
+			}
+			if(revi >= landing_-1 && scoresOrig_[fwi][i] > 0) {
+				rootHeap_.insert(DescentRoot(
+					fw ? i : revi, // offset from 5' end
+					true,          // left-to-right?
+					fw,            // fw?
+					landing_,      // landing length
+					qlen,          // query length
+					pri + ((i == 0) ? endBonus : 0))); // root priority
+				// Give priority boost for being flush with one end or the
+				// other
+			}
+		}
+	}
+	// Now that all the roots are in a heap, we select them one-by-one.
+	// Each time we select a root beyond the first, we check to see if an
+	// already-selected root's landing area overlaps.  If so, we take away
+	// any benefit associated with the bases/qualities in the landing area
+	// and then push it back onto the heap if that changes its priority.
+	while(roots.size() < sizeTarget) {
+		if(rootHeap_.empty()) {
+			break;
+		}
+		DescentRoot r = rootHeap_.pop();
+		const size_t off = r.fw ? r.off5p : (qlen - r.off5p - 1);
+		int fwi = r.fw ? 0 : 1;
+		// Re-calculate priority
+		int pri = 0;
+		if(r.l2r) {
+			for(size_t i = 0; i < landing_; i++) {
+				pri += scores_[fwi][off + i];
+			}
+		} else {
+			for(size_t i = 0; i < landing_; i++) {
+				pri += scores_[fwi][off - i];
+			}
+		}
+		// Must take end bonus into account when re-calculating
+		if((r.l2r && (off == 0)) || (!r.l2r && (off == qlen - 1))) {
+			pri += endBonus;
+		}
+		if(pri == r.pri) {
+			// Update the positions in this root's landing area
+			if(r.l2r) {
+				for(size_t i = 0; i < landing_; i++) {
+					float frac = ((float)i / (float)landing_);
+					scores_[fwi][off + i] = (int)(scores_[fwi][off + i] * frac);
+				}
+			} else {
+				for(size_t i = 0; i < landing_; i++) {
+					float frac = ((float)i / (float)landing_);
+					scores_[fwi][off - i] = (int)(scores_[fwi][off - i] * frac);
+				}
+			}
+			confs.expand();
+			confs.back().cons.init(landing_, consExp_);
+			roots.push_back(r);
+		} else {
+			// Re-insert the root, its priority now changed
+			assert_gt(roots.size(), 0);
+			r.pri = pri;
+			rootHeap_.insert(r);
+		}
+	}
+	assert(!roots.empty());
+	//std::cerr << roots.size() << ", " << ncandidates << std::endl;
+}
+
+void IntervalRootSelector::select(
+	const Read& q,
+	const Read* qo,
+	bool nofw,
+	bool norc,
+	EList<DescentConfig>& confs,
+	EList<DescentRoot>& roots)
+{
+	// To specify a search root, we must specify an offset from the 5' end,
+	// whether it is left-to-right, and whether it searches over the read or
+	// its reverse complement.
+	
+	// Note that it's not very sensible to pick a search root going
+	// left-to-right but where its offset puts it very close to the
+	// right-hand-side of the read.  I.e. that root will "bounce" almost
+	// immediately and go in the other direction.
+	
+	// How to pick these roots?  One idea is to simply lay down roots every N
+	// positions along the read and its reverse-complement.  That's what we do
+	// here.
+	
 	// Calculate interval length for both mates
 	int interval = rootIval_.f<int>((double)q.length());
 	if(qo != NULL) {
@@ -51,6 +223,7 @@ void AlignerDriverRootSelector::select(
 					i,          // offset from 5' end
 					true,       // left-to-right?
 					fw,         // fw?
+					1,          // landing
 					q.length(), // query length
 					pri);       // root priority
 				i += interval;
@@ -69,6 +242,7 @@ void AlignerDriverRootSelector::select(
 					q.length() - i - 1, // offset from 5' end
 					false,              // left-to-right?
 					fw,                 // fw?
+					1,          // landing
 					q.length(),         // query length
 					pri);               // root priority
 				i += interval;
@@ -76,6 +250,7 @@ void AlignerDriverRootSelector::select(
 			}
 		}
 	}
+	//std::cerr << roots.size() << std::endl;
 }
 
 /**
@@ -127,9 +302,9 @@ int AlignerDriver::go(
 		while(true) {
 			int ret = dr1_.advance(stop_, sc, ebwtFw, ebwtBw, met, prm);
 			if(ret == DESCENT_DRIVER_ALN) {
-				//cerr << iter << ". DESCENT_DRIVER_ALN" << endl;
+				cerr << iter << ". DESCENT_DRIVER_ALN" << endl;
 			} else if(ret == DESCENT_DRIVER_MEM) {
-				//cerr << iter << ". DESCENT_DRIVER_MEM" << endl;
+				cerr << iter << ". DESCENT_DRIVER_MEM" << endl;
 				break;
 			} else if(ret == DESCENT_DRIVER_STRATA) {
 				// DESCENT_DRIVER_STRATA is returned by DescentDriver.advance()
@@ -202,9 +377,10 @@ int AlignerDriver::go(
 				}
 				dr1_.sink().advanceStratum();
 			} else if(ret == DESCENT_DRIVER_BWOPS) {
-				//cerr << iter << ". DESCENT_DRIVER_BWOPS" << endl;
+				cerr << iter << ". DESCENT_DRIVER_BWOPS" << endl;
+				break;
 			} else if(ret == DESCENT_DRIVER_DONE) {
-				//cerr << iter << ". DESCENT_DRIVER_DONE" << endl;
+				cerr << iter << ". DESCENT_DRIVER_DONE" << endl;
 				break;
 			} else {
 				assert(false);
diff --git a/aligner_driver.h b/aligner_driver.h
index 4dd7345..588f6bf 100644
--- a/aligner_driver.h
+++ b/aligner_driver.h
@@ -85,11 +85,11 @@
  * out if the end of the read is less than 'landing' positions away, in the
  * direction of the search.
  */
-class AlignerDriverRootSelector : public DescentRootSelector {
+class IntervalRootSelector : public DescentRootSelector {
 
 public:
 
-	AlignerDriverRootSelector(
+	IntervalRootSelector(
 		double consExp,
 		const SimpleFunc& rootIval,
 		size_t landing)
@@ -99,7 +99,7 @@ public:
 		landing_ = landing;
 	}
 	
-	virtual ~AlignerDriverRootSelector() { }
+	virtual ~IntervalRootSelector() { }
 
 	virtual void select(
 		const Read& q,                 // read that we're selecting roots for
@@ -117,6 +117,46 @@ protected:
 };
 
 /**
+ * Concrete subclass of DescentRootSelector.  Prioritizes candidate roots by
+ * the base qualities and Ns in their landing zones.  A root is filtered
+ * out if the end of the read is less than 'landing' positions away, in the
+ * direction of the search.
+ */
+class PrioritizedRootSelector : public DescentRootSelector {
+
+public:
+
+	PrioritizedRootSelector(
+		double consExp,
+		const SimpleFunc& rootIval,
+		size_t landing)
+	{
+		consExp_ = consExp;
+		rootIval_ = rootIval;
+		landing_ = landing;
+	}
+	
+	virtual ~PrioritizedRootSelector() { }
+
+	virtual void select(
+		const Read& q,                 // read that we're selecting roots for
+		const Read* qo,                // opposite mate, if applicable
+		bool nofw,                     // don't add roots for fw read
+		bool norc,                     // don't add roots for rc read
+		EList<DescentConfig>& confs,   // put DescentConfigs here
+		EList<DescentRoot>& roots);    // put DescentRoot here
+
+protected:
+
+	double consExp_;
+	SimpleFunc rootIval_;
+	size_t landing_;
+	EHeap<DescentRoot> rootHeap_;
+	EList<int> scoresOrig_[2];
+	EList<int> scores_[2];
+};
+
+/**
  * Return values from extendSeeds and extendSeedsPaired.
  */
 enum {
@@ -135,6 +175,9 @@ enum {
  * implementations in Bowtie 2.  The DescentDriver is used to find some very
  * high-scoring alignments, but is additionally used to rank partial alignments
  * so that they can be extended using dynamic programming.
+ *
+ * It is also the glue between the DescentDrivers and the DescentRootSelector
+ * concrete subclasses that decide where to put the search roots.
  */
 class AlignerDriver {
 
@@ -142,20 +185,35 @@ public:
 
 	AlignerDriver(
 		double consExp,
+		bool prioritizeRoots,
 		const SimpleFunc& rootIval,
 		size_t landing,
 		bool veryVerbose,
 		const SimpleFunc& totsz,
 		const SimpleFunc& totfmops) :
-		sel_(consExp, rootIval, landing),
 		alsel_(),
 		dr1_(veryVerbose),
 		dr2_(veryVerbose)
 	{
+		assert_gt(landing, 0);
 		totsz_ = totsz;
 		totfmops_ = totfmops;
+		if(prioritizeRoots) {
+			// Prioritize roots according to the quality info & Ns
+			sel_ = new PrioritizedRootSelector(consExp, rootIval, landing);
+		} else {
+			// Take a root every so many positions
+			sel_ = new IntervalRootSelector(consExp, rootIval, landing);
+		}
 	}
-	
+
+	/**
+	 * Destroy this AlignerDriver.
+	 */
+	virtual ~AlignerDriver() {
+		delete sel_;
+	}
+
 	/**
 	 * Initialize driver with respect to a new read or pair.
 	 */
@@ -167,16 +225,23 @@ public:
 		TAlScore maxpen,
 		const Read* q2)
 	{
-		dr1_.initRead(q1, nofw, norc, minsc, maxpen, q2, &sel_);
+		// Initialize search for mate 1.  This includes instantiating and
+		// prioritizing all the search roots.
+		dr1_.initRead(q1, nofw, norc, minsc, maxpen, q2, sel_);
 		red1_.init(q1.length());
 		paired_ = false;
 		if(q2 != NULL) {
-			dr2_.initRead(*q2, nofw, norc, minsc, maxpen, &q1, &sel_);
+			// Initialize search for mate 2.  This includes instantiating and
+			// prioritizing all the search roots.
+			dr2_.initRead(*q2, nofw, norc, minsc, maxpen, &q1, sel_);
 			red2_.init(q2->length());
 			paired_ = true;
 		} else {
 			dr2_.reset();
 		}
+		// Initialize stopping conditions.  We use two conditions:
+		// totsz: when memory footprint exceeds this many bytes
+		// totfmops: when we've exceeded this many FM Index ops
 		size_t totsz = totsz_.f<size_t>(q1.length());
 		size_t totfmops = totfmops_.f<size_t>(q1.length());
 		stop_.init(
@@ -215,10 +280,13 @@ public:
 		red1_.reset();
 		red2_.reset();
 	}
+	
+	const DescentDriver& dr1() { return dr1_; }
+	const DescentDriver& dr2() { return dr2_; }
 
 protected:
 
-	AlignerDriverRootSelector sel_;   // selects where roots should go
+	DescentRootSelector *sel_;        // selects where roots should go
 	DescentAlignmentSelector alsel_;  // one selector can deal with >1 drivers
 	DescentDriver dr1_;               // driver for mate 1/unpaired reads
 	DescentDriver dr2_;               // driver for paired-end reads
diff --git a/aligner_result.cpp b/aligner_result.cpp
index d554208..1072575 100644
--- a/aligner_result.cpp
+++ b/aligner_result.cpp
@@ -1055,12 +1055,15 @@ void AlnSetSumm::init(
 	bestPaired.invalidate(); secbestPaired.invalidate();
 	bool paired = (rs1 != NULL && rs2 != NULL);
 	szs[0] = szs[1] = 0;
+	// Set bestPaired and secbestPaired
 	if(paired) {
 		// Paired alignments
 		assert_eq(rs1->size(), rs2->size());
 		szs[0] = szs[1] = rs1->size();
 		assert_gt(szs[0], 0);
 		for(size_t i = 0; i < rs1->size(); i++) {
+			// Combine mate scores into a concordant alignment score by
+			// summing them
 			AlnScore sc = (*rs1)[i].score() + (*rs2)[i].score();
 			if(sc > bestPaired) {
 				secbestPaired = bestPaired;
@@ -1073,14 +1076,13 @@ void AlnSetSumm::init(
 			}
 		}
 	}
+	// Set best[] and secbest[]
 	for(int j = 0; j < 2; j++) {
 		const EList<AlnRes>* rs = (j == 0 ? rs1u : rs2u);
 		if(rs == NULL) {
 			continue;
 		}
-		assert(rs != NULL);
 		szs[j] = rs->size();
-		//assert_gt(szs[j], 0);
 		for(size_t i = 0; i < rs->size(); i++) {
 			AlnScore sc = (*rs)[i].score();
 			if(sc > best[j]) {
diff --git a/aligner_result.h b/aligner_result.h
index b91c419..38fd4e4 100644
--- a/aligner_result.h
+++ b/aligner_result.h
@@ -1835,25 +1835,26 @@ public:
 	bool     exhausted2()    const { return exhausted2_;    }
 	TRefId   orefid()        const { return orefid_;        }
 	TRefOff  orefoff()       const { return orefoff_;       }
+	AlnScore bestUnchosen1() const { return bestUnchosen1_; }
+	AlnScore bestUnchosen2() const { return bestUnchosen2_; }
+	AlnScore bestUnchosenC() const { return bestUnchosenC_; }
 
 	/**
-	 *
+	 * Return best unchosen alignment score for the specified mate.
 	 */
-	AlnScore best(bool mate1) const { return mate1 ? best1_ : best2_; }
+	AlnScore bestUnchosen(bool mate1) const {
+		return mate1 ? bestUnchosen1_ : bestUnchosen2_;
+	}
 
 	bool exhausted(bool mate1) const {
 		return mate1 ? exhausted1_ : exhausted2_;
 	}
-
+	
 	/**
-	 * Return the second-best score for the specified mate.  If the alignment
-	 * is paired and the specified mate aligns uniquely, return an invalid
-	 * second-best score.  This allows us to treat mates separately, so that
-	 * repetitive paired-end alignments don't trump potentially unique unpaired
-	 * alignments.
+	 * Return best alignment score for mate 1 or mate 2, depending on argument.
 	 */
-	AlnScore secbestMate(bool mate1) const {
-		return mate1 ? secbest1_ : secbest2_;
+	AlnScore best(bool mate1) const {
+		return mate1 ? best1_ : best2_;
 	}
 	
 	/**
@@ -1864,20 +1865,23 @@ public:
 	 * alignments.
 	 */
 	AlnScore secbest(bool mate1) const {
-		if(paired_) {
-			if(mate1) {
-				//if(!secbest1_.valid()) {
-					return secbest1_;
-				//}
-			} else {
-				//if(!secbest2_.valid()) {
-					return secbest2_;
-				//}
-			}
-			//return secbestPaired_;
-		} else {
-			return mate1 ? secbest1_ : secbest2_;
-		}
+		return mate1 ? secbest1_ : secbest2_;
+	}
+	
+	/**
+	 * Add information about unchosen alignments to the summary.  This is
+	 * helpful for concordant alignments; when calculating mapping quality,
+	 * we might like to know about how good the unchosen mate and pair
+	 * alignments were.
+	 */
+	void setUnchosen(
+		AlnScore bestUnchosen1,
+		AlnScore bestUnchosen2,
+		AlnScore bestUnchosenC)
+	{
+		bestUnchosen1_ = bestUnchosen1;
+		bestUnchosen2_ = bestUnchosen2;
+		bestUnchosenC_ = bestUnchosenC;
 	}
 	
 protected:
@@ -1895,6 +1899,10 @@ protected:
 	bool     exhausted2_;    // searched exhaustively for mate 2 alignments?
 	TRefId   orefid_;
 	TRefOff  orefoff_;
+
+	AlnScore bestUnchosen1_;
+	AlnScore bestUnchosen2_;
+	AlnScore bestUnchosenC_;
 };
 
 #endif
diff --git a/aligner_seed.cpp b/aligner_seed.cpp
index 5dd122e..f038848 100644
--- a/aligner_seed.cpp
+++ b/aligner_seed.cpp
@@ -244,7 +244,7 @@ Seed::zeroMmSeeds(int ln, EList<Seed>& pols, Constraint& oall) {
 	pols.back().type = SEED_TYPE_EXACT;
 	pols.back().zones[0] = Constraint::exact();
 	pols.back().zones[1] = Constraint::exact();
-	pols.back().zones[2] = Constraint::exact(); // not used
+	pols.back().zones[2] = Constraint::exact();
 	pols.back().overall = &oall;
 }
 
@@ -565,6 +565,8 @@ void SeedAligner::searchAllSeeds(
 
 	prm.nSdFmops += bwops_;
 	met.seedsearch += seedsearches;
+	met.nrange += sr.numRanges();
+	met.nelt += sr.numElts();
 	met.possearch += possearches;
 	met.intrahit += intrahits;
 	met.interhit += interhits;
@@ -580,22 +582,22 @@ bool SeedAligner::sanityPartial(
 	size_t dep,
 	size_t len,
 	bool do1mm,
-	uint32_t topfw,
-	uint32_t botfw,
-	uint32_t topbw,
-	uint32_t botbw)
+	TIndexOffU topfw,
+	TIndexOffU botfw,
+	TIndexOffU topbw,
+	TIndexOffU botbw)
 {
 	tmpdnastr_.clear();
 	for(size_t i = dep; i < len; i++) {
 		tmpdnastr_.append(seq[i]);
 	}
-	uint32_t top_fw = 0, bot_fw = 0;
+	TIndexOffU top_fw = 0, bot_fw = 0;
 	ebwtFw->contains(tmpdnastr_, &top_fw, &bot_fw);
 	assert_eq(top_fw, topfw);
 	assert_eq(bot_fw, botfw);
 	if(do1mm && ebwtBw != NULL) {
 		tmpdnastr_.reverse();
-		uint32_t top_bw = 0, bot_bw = 0;
+		TIndexOffU top_bw = 0, bot_bw = 0;
 		ebwtBw->contains(tmpdnastr_, &top_bw, &bot_bw);
 		assert_eq(top_bw, topbw);
 		assert_eq(bot_bw, botbw);
@@ -623,7 +625,7 @@ size_t SeedAligner::exactSweep(
 	SeedSearchMetrics& met)     // metrics
 {
 	assert_gt(mineMax, 0);
-	uint32_t top = 0, bot = 0;
+	TIndexOffU top = 0, bot = 0;
 	SideLocus tloc, bloc;
 	const size_t len = read.length();
 	size_t nelt = 0;
@@ -687,7 +689,7 @@ size_t SeedAligner::exactSweep(
 					} else {
 						bwops_++;
 						top = ebwt.mapLF1(top, tloc, c);
-						if(top == 0xffffffff) {
+						if(top == OFF_MASK) {
 							top = bot = 0;
 						} else {
 							bot = top+1;
@@ -779,13 +781,13 @@ bool SeedAligner::oneMmSearch(
 	assert_geq(halfFw, 1);
 	assert_geq(halfBw, 1);
 	SideLocus tloc, bloc;
-	uint32_t t[4], b[4];   // dest BW ranges for BWT
+	TIndexOffU t[4], b[4];   // dest BW ranges for BWT
 	t[0] = t[1] = t[2] = t[3] = 0;
 	b[0] = b[1] = b[2] = b[3] = 0;
-	uint32_t tp[4], bp[4]; // dest BW ranges for BWT'
+	TIndexOffU tp[4], bp[4]; // dest BW ranges for BWT'
 	tp[0] = tp[1] = tp[2] = tp[3] = 0;
 	bp[0] = bp[1] = bp[2] = bp[3] = 0;
-	uint32_t top = 0, bot = 0, topp = 0, botp = 0;
+	TIndexOffU top = 0, bot = 0, topp = 0, botp = 0;
 	// Align fw read / rc read
 	bool results = false;
 	for(int fwi = 0; fwi < 2; fwi++) {
@@ -874,7 +876,7 @@ bool SeedAligner::oneMmSearch(
 					assert(!rep1mm || botp == topp+1);
 					bwops_++;
 					top = ebwt->mapLF1(top, tloc, rdc);
-					if(top == 0xffffffff) {
+					if(top == OFF_MASK) {
 						do_continue = true;
 						break;
 					}
@@ -937,15 +939,15 @@ bool SeedAligner::oneMmSearch(
 						}
 						// Potential mismatch - next, try
 						size_t depm = dep + 1;
-						uint32_t topm = t[j], botm = b[j];
-						uint32_t topmp = tp[j], botmp = bp[j];
+						TIndexOffU topm = t[j], botm = b[j];
+						TIndexOffU topmp = tp[j], botmp = bp[j];
 						assert_eq(botm - topm, botmp - topmp);
-						uint32_t tm[4], bm[4];   // dest BW ranges for BWT
+						TIndexOffU tm[4], bm[4];   // dest BW ranges for BWT
 						tm[0] = t[0]; tm[1] = t[1];
 						tm[2] = t[2]; tm[3] = t[3];
 						bm[0] = b[0]; bm[1] = t[1];
 						bm[2] = b[2]; bm[3] = t[3];
-						uint32_t tmp[4], bmp[4]; // dest BW ranges for BWT'
+						TIndexOffU tmp[4], bmp[4]; // dest BW ranges for BWT'
 						tmp[0] = tp[0]; tmp[1] = tp[1];
 						tmp[2] = tp[2]; tmp[3] = tp[3];
 						bmp[0] = bp[0]; bmp[1] = tp[1];
@@ -972,7 +974,7 @@ bool SeedAligner::oneMmSearch(
 								assert_eq(botmp, topmp+1);
 								bwops_++;
 								topm = ebwt->mapLF1(topm, tlocm, rdcm);
-								if(topm == 0xffffffff) {
+								if(topm == OFF_MASK) {
 									break;
 								}
 								botm = topm + 1;
@@ -1039,12 +1041,12 @@ bool SeedAligner::oneMmSearch(
 								for(size_t i = 0; i < len; i++) {
 									assert_lt((int)rf[i], 4);
 								}
-								ASSERT_ONLY(uint32_t toptmp = 0);
-								ASSERT_ONLY(uint32_t bottmp = 0);
+								ASSERT_ONLY(TIndexOffU toptmp = 0);
+								ASSERT_ONLY(TIndexOffU bottmp = 0);
 								assert(ebwtFw->contains(rf, &toptmp, &bottmp));
 #endif
-								uint32_t toprep = ebwtfw ? topm : topmp;
-								uint32_t botrep = ebwtfw ? botm : botmp;
+								TIndexOffU toprep = ebwtfw ? topm : topmp;
+								TIndexOffU botrep = ebwtfw ? botm : botmp;
 								assert_eq(toprep, toptmp);
 								assert_eq(botrep, bottmp);
 								hits.add1mmEe(toprep, botrep, &e, NULL, fw, score);
@@ -1110,10 +1112,10 @@ inline void
 SeedAligner::nextLocsBi(
 	SideLocus& tloc,            // top locus
 	SideLocus& bloc,            // bot locus
-	uint32_t topf,              // top in BWT
-	uint32_t botf,              // bot in BWT
-	uint32_t topb,              // top in BWT'
-	uint32_t botb,              // bot in BWT'
+	TIndexOffU topf,              // top in BWT
+	TIndexOffU botf,              // bot in BWT
+	TIndexOffU topb,              // top in BWT'
+	TIndexOffU botb,              // bot in BWT'
 	int step                    // step to get ready for
 #if 0
 	, const SABWOffTrack* prevOt, // previous tracker
@@ -1184,16 +1186,16 @@ SeedAligner::nextLocsBi(
  */
 bool
 SeedAligner::extendAndReportHit(
-	uint32_t topf,                     // top in BWT
-	uint32_t botf,                     // bot in BWT
-	uint32_t topb,                     // top in BWT'
-	uint32_t botb,                     // bot in BWT'
+	TIndexOffU topf,                     // top in BWT
+	TIndexOffU botf,                     // bot in BWT
+	TIndexOffU topb,                     // top in BWT'
+	TIndexOffU botb,                     // bot in BWT'
 	uint16_t len,                      // length of hit
 	DoublyLinkedList<Edit> *prevEdit)  // previous edit
 {
 	size_t nlex = 0, nrex = 0;
-	uint32_t t[4], b[4];
-	uint32_t tp[4], bp[4];
+	TIndexOffU t[4], b[4];
+	TIndexOffU tp[4], bp[4];
 	SideLocus tloc, bloc;
 	if(off_ > 0) {
 		const Ebwt *ebwt = ebwtFw_;
@@ -1201,7 +1203,7 @@ SeedAligner::extendAndReportHit(
 		// Extend left using forward index
 		const BTDnaString& seq = fw_ ? read_->patFw : read_->patRc;
 		// See what we get by extending 
-		uint32_t top = topf, bot = botf;
+		TIndexOffU top = topf, bot = botf;
 		t[0] = t[1] = t[2] = t[3] = 0;
 		b[0] = b[1] = b[2] = b[3] = 0;
 		tp[0] = tp[1] = tp[2] = tp[3] = topb;
@@ -1257,7 +1259,7 @@ SeedAligner::extendAndReportHit(
 		// Extend right using backward index
 		const BTDnaString& seq = fw_ ? read_->patFw : read_->patRc;
 		// See what we get by extending 
-		uint32_t top = topb, bot = botb;
+		TIndexOffU top = topb, bot = botb;
 		t[0] = t[1] = t[2] = t[3] = 0;
 		b[0] = b[1] = b[2] = b[3] = 0;
 		tp[0] = tp[1] = tp[2] = tp[3] = topb;
@@ -1315,10 +1317,10 @@ SeedAligner::extendAndReportHit(
  */
 bool
 SeedAligner::reportHit(
-	uint32_t topf,                     // top in BWT
-	uint32_t botf,                     // bot in BWT
-	uint32_t topb,                     // top in BWT'
-	uint32_t botb,                     // bot in BWT'
+	TIndexOffU topf,                     // top in BWT
+	TIndexOffU botf,                     // bot in BWT
+	TIndexOffU topb,                     // top in BWT'
+	TIndexOffU botb,                     // bot in BWT'
 	uint16_t len,                      // length of hit
 	DoublyLinkedList<Edit> *prevEdit)  // previous edit
 {
@@ -1351,7 +1353,7 @@ SeedAligner::reportHit(
 	// correspond to the reference sequence aligned to
 	{
 		BTDnaString rfr;
-		uint32_t tpf, btf, tpb, btb;
+		TIndexOffU tpf, btf, tpb, btb;
 		tpf = btf = tpb = btb = 0;
 		assert(ebwtFw_->contains(rf, &tpf, &btf));
 		if(ebwtBw_ != NULL) {
@@ -1379,10 +1381,10 @@ bool
 SeedAligner::searchSeedBi(
 	int step,             // depth into steps_[] array
 	int depth,            // recursion depth
-	uint32_t topf,        // top in BWT
-	uint32_t botf,        // bot in BWT
-	uint32_t topb,        // top in BWT'
-	uint32_t botb,        // bot in BWT'
+	TIndexOffU topf,        // top in BWT
+	TIndexOffU botf,        // bot in BWT
+	TIndexOffU topb,        // top in BWT'
+	TIndexOffU botb,        // bot in BWT'
 	SideLocus tloc,       // locus for top (perhaps unititialized)
 	SideLocus bloc,       // locus for bot (perhaps unititialized)
 	Constraint c0,        // constraints to enforce in seed zone 0
@@ -1421,7 +1423,7 @@ SeedAligner::searchSeedBi(
 	}
 #endif
 	int off;
-	uint32_t tp[4], bp[4]; // dest BW ranges for "prime" index
+	TIndexOffU tp[4], bp[4]; // dest BW ranges for "prime" index
 	if(step == 0) {
 		// Just starting
 		assert(prevEdit == NULL);
@@ -1484,9 +1486,9 @@ SeedAligner::searchSeedBi(
 	assert(botf - topf == 1 ||  bloc.valid());
 	assert(botf - topf > 1  || !bloc.valid());
 	assert_geq(step, 0);
-	uint32_t t[4], b[4]; // dest BW ranges
+	TIndexOffU t[4], b[4]; // dest BW ranges
 	Constraint* zones[3] = { &c0, &c1, &c2 };
-	ASSERT_ONLY(uint32_t lasttot = botf - topf);
+	ASSERT_ONLY(TIndexOffU lasttot = botf - topf);
 	for(int i = step; i < (int)s.steps.size(); i++) {
 		assert_gt(botf, topf);
 		assert(botf - topf == 1 ||  bloc.valid());
@@ -1510,14 +1512,14 @@ SeedAligner::searchSeedBi(
 			// we use a simpler query (see if(!bloc.valid()) blocks below)
 			bwops_++;
 			ebwt->mapBiLFEx(tloc, bloc, t, b, tp, bp);
-			ASSERT_ONLY(uint32_t tot = (b[0]-t[0])+(b[1]-t[1])+(b[2]-t[2])+(b[3]-t[3]));
-			ASSERT_ONLY(uint32_t totp = (bp[0]-tp[0])+(bp[1]-tp[1])+(bp[2]-tp[2])+(bp[3]-tp[3]));
+			ASSERT_ONLY(TIndexOffU tot = (b[0]-t[0])+(b[1]-t[1])+(b[2]-t[2])+(b[3]-t[3]));
+			ASSERT_ONLY(TIndexOffU totp = (bp[0]-tp[0])+(bp[1]-tp[1])+(bp[2]-tp[2])+(bp[3]-tp[3]));
 			assert_eq(tot, totp);
 			assert_leq(tot, lasttot);
 			ASSERT_ONLY(lasttot = tot);
 		}
-		uint32_t *tf = ltr ? tp : t, *tb = ltr ? t : tp;
-		uint32_t *bf = ltr ? bp : b, *bb = ltr ? b : bp;
+		TIndexOffU *tf = ltr ? tp : t, *tb = ltr ? t : tp;
+		TIndexOffU *bf = ltr ? bp : b, *bb = ltr ? b : bp;
 		off = abs(off)-1;
 		//
 		bool leaveZone = s.zones[i].first < 0;
@@ -1532,7 +1534,7 @@ SeedAligner::searchSeedBi(
 			bool bail = false;
 			if(!bloc.valid()) {
 				// Range delimited by tloc/bloc has size 1
-				uint32_t ntop = ltr ? topb : topf;
+				TIndexOffU ntop = ltr ? topb : topf;
 				bwops_++;
 				int cc = ebwt->mapLF1(ntop, tloc);
 				assert_range(-1, 3, cc);
@@ -1617,10 +1619,10 @@ SeedAligner::searchSeedBi(
 		if(!bloc.valid()) {
 			assert(ebwtBw_ == NULL || bp[c] == tp[c]+1);
 			// Range delimited by tloc/bloc has size 1
-			uint32_t top = ltr ? topb : topf;
+			TIndexOffU top = ltr ? topb : topf;
 			bwops_++;
 			t[c] = ebwt->mapLF1(top, tloc, c);
-			if(t[c] == 0xffffffff) {
+			if(t[c] == OFF_MASK) {
 				return true;
 			}
 			assert_geq(t[c], ebwt->fchr()[c]);
@@ -1721,8 +1723,8 @@ bool gReportOverhangs = true;
 extern void aligner_seed_tests();
 extern void aligner_random_seed_tests(
 	int num_tests,
-	uint32_t qslo,
-	uint32_t qshi,
+	TIndexOffU qslo,
+	TIndexOffU qshi,
 	bool color,
 	uint32_t seed);
 
diff --git a/aligner_seed.h b/aligner_seed.h
index f628a79..44965f3 100644
--- a/aligner_seed.h
+++ b/aligner_seed.h
@@ -38,6 +38,7 @@
 #include "scoring.h"
 #include "mem_ids.h"
 #include "simple_func.h"
+#include "btypes.h"
 
 /**
  * A constraint to apply to an alignment zone, or to an overall
@@ -490,8 +491,8 @@ struct EEHit {
 	}
 	
 	void init(
-		uint32_t top_,
-		uint32_t bot_,
+		TIndexOffU top_,
+		TIndexOffU bot_,
 		const Edit* e1_,
 		const Edit* e2_,
 		bool fw_,
@@ -566,7 +567,7 @@ struct EEHit {
 	/**
 	 * Return the size of the alignments SA range.s
 	 */
-	uint32_t size() const { return bot - top; }
+	TIndexOffU size() const { return bot - top; }
 	
 #ifndef NDEBUG
 	/**
@@ -584,8 +585,8 @@ struct EEHit {
 	}
 #endif
 	
-	uint32_t top;
-	uint32_t bot;
+	TIndexOffU top;
+	TIndexOffU bot;
 	Edit     e1;
 	Edit     e2;
 	bool     fw;
@@ -976,7 +977,7 @@ public:
 	 */
 	void rankSeedHits(RandomSource& rnd) {
 		while(rankOffs_.size() < nonzTot_) {
-			uint32_t minsz = 0xffffffff;
+			TIndexOffU minsz = MAX_U32;
 			uint32_t minidx = 0;
 			bool minfw = true;
 			// Rank seed-hit positions in ascending order by number of elements
@@ -1003,7 +1004,7 @@ public:
 					}
 				}
 			}
-			assert_neq(0xffffffff, minsz);
+			assert_neq(MAX_U32, minsz);
 			if(minfw) {
 				sortedFw_[minidx] = true;
 			} else {
@@ -1183,8 +1184,8 @@ public:
 	 * Add an end-to-end 1-mismatch alignment.
 	 */
 	void add1mmEe(
-		uint32_t top,
-		uint32_t bot,
+		TIndexOffU top,
+		TIndexOffU bot,
 		const Edit* e1,
 		const Edit* e2,
 		bool fw,
@@ -1199,8 +1200,8 @@ public:
 	 * Add an end-to-end exact alignment.
 	 */
 	void addExactEeFw(
-		uint32_t top,
-		uint32_t bot,
+		TIndexOffU top,
+		TIndexOffU bot,
 		const Edit* e1,
 		const Edit* e2,
 		bool fw,
@@ -1213,8 +1214,8 @@ public:
 	 * Add an end-to-end exact alignment.
 	 */
 	void addExactEeRc(
-		uint32_t top,
-		uint32_t bot,
+		TIndexOffU top,
+		TIndexOffU bot,
 		const Edit* e1,
 		const Edit* e2,
 		bool fw,
@@ -1340,6 +1341,8 @@ struct SeedSearchMetrics {
 	void merge(const SeedSearchMetrics& m, bool getLock = false) {
         ThreadSafe ts(&mutex_m, getLock);
 		seedsearch   += m.seedsearch;
+		nrange       += m.nrange;
+		nelt         += m.nelt;
 		possearch    += m.possearch;
 		intrahit     += m.intrahit;
 		interhit     += m.interhit;
@@ -1357,6 +1360,8 @@ struct SeedSearchMetrics {
 	 */
 	void reset() {
 		seedsearch =
+		nrange =
+		nelt =
 		possearch =
 		intrahit =
 		interhit =
@@ -1370,6 +1375,8 @@ struct SeedSearchMetrics {
 	}
 
 	uint64_t seedsearch;   // # times we executed strategy in InstantiatedSeed
+	uint64_t nrange;       // # ranges found
+	uint64_t nelt;         // # range elements found
 	uint64_t possearch;    // # offsets where aligner executed >= 1 strategy
 	uint64_t intrahit;     // # offsets where current-read cache gave answer
 	uint64_t interhit;     // # offsets where across-read cache gave answer
@@ -1449,10 +1456,10 @@ public:
 		size_t             dep,
 		size_t             len,
 		bool               do1mm,
-		uint32_t           topfw,
-		uint32_t           botfw,
-		uint32_t           topbw,
-		uint32_t           botbw);
+		TIndexOffU           topfw,
+		TIndexOffU           botfw,
+		TIndexOffU           topbw,
+		TIndexOffU           botbw);
 
 	/**
 	 * Do an exact-matching sweet to establish a lower bound on number of edits
@@ -1498,10 +1505,10 @@ protected:
 	 * calling reportHit().
 	 */
 	bool extendAndReportHit(
-		uint32_t topf,                     // top in BWT
-		uint32_t botf,                     // bot in BWT
-		uint32_t topb,                     // top in BWT'
-		uint32_t botb,                     // bot in BWT'
+		TIndexOffU topf,                     // top in BWT
+		TIndexOffU botf,                     // bot in BWT
+		TIndexOffU topb,                     // top in BWT'
+		TIndexOffU botb,                     // bot in BWT'
 		uint16_t len,                      // length of hit
 		DoublyLinkedList<Edit> *prevEdit); // previous edit
 
@@ -1510,10 +1517,10 @@ protected:
 	 * false if the hit could not be reported because of, e.g., cache exhaustion.
 	 */
 	bool reportHit(
-		uint32_t topf,         // top in BWT
-		uint32_t botf,         // bot in BWT
-		uint32_t topb,         // top in BWT'
-		uint32_t botb,         // bot in BWT'
+		TIndexOffU topf,         // top in BWT
+		TIndexOffU botf,         // bot in BWT
+		TIndexOffU topb,         // top in BWT'
+		TIndexOffU botb,         // bot in BWT'
 		uint16_t len,          // length of hit
 		DoublyLinkedList<Edit> *prevEdit);  // previous edit
 	
@@ -1528,10 +1535,10 @@ protected:
 	bool searchSeedBi(
 		int step,              // depth into steps_[] array
 		int depth,             // recursion depth
-		uint32_t topf,         // top in BWT
-		uint32_t botf,         // bot in BWT
-		uint32_t topb,         // top in BWT'
-		uint32_t botb,         // bot in BWT'
+		TIndexOffU topf,         // top in BWT
+		TIndexOffU botf,         // bot in BWT
+		TIndexOffU topb,         // top in BWT'
+		TIndexOffU botb,         // bot in BWT'
 		SideLocus tloc,        // locus for top (perhaps unititialized)
 		SideLocus bloc,        // locus for bot (perhaps unititialized)
 		Constraint c0,         // constraints to enforce in seed zone 0
@@ -1546,10 +1553,10 @@ protected:
 	inline void nextLocsBi(
 		SideLocus& tloc,            // top locus
 		SideLocus& bloc,            // bot locus
-		uint32_t topf,              // top in BWT
-		uint32_t botf,              // bot in BWT
-		uint32_t topb,              // top in BWT'
-		uint32_t botb,              // bot in BWT'
+		TIndexOffU topf,              // top in BWT
+		TIndexOffU botf,              // bot in BWT
+		TIndexOffU topb,              // top in BWT'
+		TIndexOffU botb,              // bot in BWT'
 		int step);                  // step to get ready for
 	
 	// Following are set in searchAllSeeds then used by searchSeed()
@@ -1589,8 +1596,8 @@ protected:
 }
 
 #define SANITY_CHECK_4TUP(t, b, tp, bp) { \
-	ASSERT_ONLY(uint32_t tot = (b[0]-t[0])+(b[1]-t[1])+(b[2]-t[2])+(b[3]-t[3])); \
-	ASSERT_ONLY(uint32_t totp = (bp[0]-tp[0])+(bp[1]-tp[1])+(bp[2]-tp[2])+(bp[3]-tp[3])); \
+	ASSERT_ONLY(TIndexOffU tot = (b[0]-t[0])+(b[1]-t[1])+(b[2]-t[2])+(b[3]-t[3])); \
+	ASSERT_ONLY(TIndexOffU totp = (bp[0]-tp[0])+(bp[1]-tp[1])+(bp[2]-tp[2])+(bp[3]-tp[3])); \
 	assert_eq(tot, totp); \
 }
 
diff --git a/aligner_seed2.cpp b/aligner_seed2.cpp
index 58415bc..2bf75a8 100644
--- a/aligner_seed2.cpp
+++ b/aligner_seed2.cpp
@@ -170,26 +170,24 @@ int DescentDriver::advance(
     // Advance until some stopping condition
     bool stop = heap_.empty();
     while(!stop) {
-		// Pop off the highest-priority descent.  Note that some outgoing edges
-		// might have since been explored, which could reduce the priority of
-		// the descent once we .
+		// Pop off the highest-priority descent.
         TDescentPair p = heap_.pop();
-		df_.alloc(); df_.pop();
+		df_.alloc(); df_.pop(); // Create new descent
         df_[p.second].followBestOutgoing(
-            q_,
-            ebwtFw,
-            ebwtBw,
-            sc,
+            q_,        // read
+            ebwtFw,    // forward index
+            ebwtBw,    // backward index
+            sc,        // scoring scheme
 			minsc_,    // minimum score
 			maxpen_,   // maximum penalty
 			re_,       // redundancy checker
             df_,       // Descent factory
             pf_,       // DescentPos factory
-            roots_,
-            confs_,
-            heap_,
-            alsink_,
-            met,
+            roots_,    // search roots
+            confs_,    // search root configurations
+            heap_,     // descent heap
+            alsink_,   // alignment sink
+            met,       // metrics
 			prm);      // per-read metrics
 		TAlScore best = std::numeric_limits<TAlScore>::max();
 		if(!heap_.empty()) {
@@ -220,10 +218,10 @@ bool DescentAlignmentSink::reportAlignment(
 	const Read& q,                  // query string
 	const Ebwt& ebwtFw,             // forward index
 	const Ebwt& ebwtBw,             // mirror index
-	TIndexOff topf,                 // SA range top in forward index
-	TIndexOff botf,                 // SA range bottom in forward index
-	TIndexOff topb,                 // SA range top in backward index
-	TIndexOff botb,                 // SA range bottom in backward index
+	TIndexOffU topf,                 // SA range top in forward index
+	TIndexOffU botf,                 // SA range bottom in forward index
+	TIndexOffU topb,                 // SA range top in backward index
+	TIndexOffU botb,                 // SA range bottom in backward index
 	TDescentId id,                  // id of leaf Descent
 	TRootId rid,                    // id of search root
 	const Edit& e,                  // final edit, if needed
@@ -242,8 +240,8 @@ bool DescentAlignmentSink::reportAlignment(
 	// Adjust al5pi and al5pf to take the final edit into account (if
 	// there is one)
 	// Check if this is redundant with a previous reported alignment
-	Triple<TIndexOff, TIndexOff, size_t> lhs(topf, botf, 0);
-	Triple<TIndexOff, TIndexOff, size_t> rhs(topb, botb, q.length()-1);
+	Triple<TIndexOffU, TIndexOffU, size_t> lhs(topf, botf, 0);
+	Triple<TIndexOffU, TIndexOffU, size_t> rhs(topb, botb, q.length()-1);
 	if(!lhs_.insert(lhs)) {
 		rhs_.insert(rhs);
 		return false; // Already there
@@ -275,8 +273,8 @@ bool DescentAlignmentSink::reportAlignment(
 			// Invert them back to how they were before
 			Edit::invertPoss(edits_, len, ei, en, true);
 		}
-		ASSERT_ONLY(uint32_t toptmp = 0);
-		ASSERT_ONLY(uint32_t bottmp = 0);
+		ASSERT_ONLY(TIndexOffU toptmp = 0);
+		ASSERT_ONLY(TIndexOffU bottmp = 0);
 		// Check that the edited string occurs in the reference
 		if(!ebwtFw.contains(rf, &toptmp, &bottmp)) {
 			std::cerr << rf << std::endl;
@@ -309,10 +307,10 @@ bool Descent::init(
 	TAlScore maxpen,                // maximum penalty
     TReadOff al5pi,                 // offset from 5' of 1st aligned char
     TReadOff al5pf,                 // offset from 5' of last aligned char
-    TIndexOff topf,                 // SA range top in FW index
-    TIndexOff botf,                 // SA range bottom in FW index
-    TIndexOff topb,                 // SA range top in BW index
-    TIndexOff botb,                 // SA range bottom in BW index
+    TIndexOffU topf,                 // SA range top in FW index
+    TIndexOffU botf,                 // SA range bottom in FW index
+    TIndexOffU topb,                 // SA range top in BW index
+    TIndexOffU botb,                 // SA range bottom in BW index
     bool l2r,                       // direction this descent will go in
     size_t descid,                  // my ID
     TDescentId parent,              // parent ID
@@ -356,7 +354,7 @@ bool Descent::init(
 		}
 	}
     bool branches = false, hitEnd = false, done = false;
-    TIndexOff topf_new = 0, botf_new = 0, topb_new = 0, botb_new = 0;
+    TIndexOffU topf_new = 0, botf_new = 0, topb_new = 0, botb_new = 0;
     off5p_i_ = 0;
 #ifndef NDEBUG
     size_t depth = al5pf_ - al5pi_ + 1;
@@ -460,7 +458,7 @@ bool Descent::init(
     lastRecalc_ = true;
 	gapadd_ = 0;
     bool branches = false, hitEnd = false, done = false;
-    TIndexOff topf_new = 0, botf_new = 0, topb_new = 0, botb_new = 0;
+    TIndexOffU topf_new = 0, botf_new = 0, topb_new = 0, botb_new = 0;
     off5p_i_ = 0;
     bool matchSucc = followMatches(
         q,
@@ -578,11 +576,11 @@ size_t Descent::recalcOutgoing(
 	TScore pen_rdg_ex = sc.readGapExtend(), pen_rfg_ex = sc.refGapExtend();
 	TScore pen_rdg_op = sc.readGapOpen(),   pen_rfg_op = sc.refGapOpen();
 	// Top and bot in the direction of the descent
-	TIndexOff top  = l2r_ ? topb_ : topf_;
-	TIndexOff bot  = l2r_ ? botb_ : botf_;
+	TIndexOffU top  = l2r_ ? topb_ : topf_;
+	TIndexOffU bot  = l2r_ ? botb_ : botf_;
 	// Top and bot in the opposite direction
-	TIndexOff topp = l2r_ ? topf_ : topb_;
-	TIndexOff botp = l2r_ ? botf_ : botb_;
+	TIndexOffU topp = l2r_ ? topf_ : topb_;
+	TIndexOffU botp = l2r_ ? botf_ : botb_;
 	assert_eq(botp - topp, bot - top);
 	DescentEdge edge;
 	size_t nout = 0;
@@ -601,10 +599,10 @@ size_t Descent::recalcOutgoing(
 		assert_geq(maxpend, pen_);    // can't have already exceeded max penalty
 		TScore diff = maxpend - pen_; // room we have left
 		// Get pointer to SA ranges in the direction of descent
-		const TIndexOff *t  = l2r_ ? pf[d].topb : pf[d].topf;
-		const TIndexOff *b  = l2r_ ? pf[d].botb : pf[d].botf;
-		const TIndexOff *tp = l2r_ ? pf[d].topf : pf[d].topb;
-		const TIndexOff *bp = l2r_ ? pf[d].botf : pf[d].botb;
+		const TIndexOffU *t  = l2r_ ? pf[d].topb : pf[d].topf;
+		const TIndexOffU *b  = l2r_ ? pf[d].botb : pf[d].botf;
+		const TIndexOffU *tp = l2r_ ? pf[d].topf : pf[d].topb;
+		const TIndexOffU *bp = l2r_ ? pf[d].botf : pf[d].botb;
 		assert_eq(pf[d].botf - pf[d].topf, pf[d].botb - pf[d].topb);
 		// What are the read char / quality?
 		std::pair<int, int> p = q.get(off5p, fw);
@@ -628,14 +626,14 @@ size_t Descent::recalcOutgoing(
 					if(!pf[d].flags.mmExplore(j)) {
 						continue; // Already been explored
 					}
-					TIndexOff topf = pf[d].topf[j], botf = pf[d].botf[j];
-					ASSERT_ONLY(TIndexOff topb = pf[d].topb[j], botb = pf[d].botb[j]);
+					TIndexOffU topf = pf[d].topf[j], botf = pf[d].botf[j];
+					ASSERT_ONLY(TIndexOffU topb = pf[d].topb[j], botb = pf[d].botb[j]);
 					if(re.contains(fw, l2r_, cur5pi, cur5pf, cur5pf - cur5pi + 1 + gapadd_, topf, botf, pen_ + pen_mm)) {
 						prm.nRedSkip++;
 						continue; // Redundant with a path already explored
 					}
 					prm.nRedFail++;
-					TIndexOff width = b[j] - t[j];
+					TIndexOffU width = b[j] - t[j];
 					Edit edit((uint32_t)off5p, (int)("ACGTN"[j]), (int)("ACGTN"[c]), EDIT_TYPE_MM);
 					DescentPriority pri(pen_ + pen_mm, depth, width, rootpri);
                     assert(topf != 0 || botf != 0);
@@ -684,8 +682,8 @@ size_t Descent::recalcOutgoing(
 							if(!pf[d].flags.rdgExplore(j)) {
 								continue; // Already been explored
 							}
-							TIndexOff topf = pf[d].topf[j], botf = pf[d].botf[j];
-							ASSERT_ONLY(TIndexOff topb = pf[d].topb[j], botb = pf[d].botb[j]);
+							TIndexOffU topf = pf[d].topf[j], botf = pf[d].botf[j];
+							ASSERT_ONLY(TIndexOffU topb = pf[d].topb[j], botb = pf[d].botb[j]);
 							assert(topf != 0 || botf != 0);
 							assert(topb != 0 || botb != 0);
 							if(re.contains(fw, l2r_, cur5pi_i, cur5pf_i, cur5pf - cur5pi + 1 + gapadd_, topf, botf, pen_ + pen_rdg_ex)) {
@@ -693,7 +691,7 @@ size_t Descent::recalcOutgoing(
 								continue; // Redundant with a path already explored
 							}
 							prm.nRedFail++;
-							TIndexOff width = b[j] - t[j];
+							TIndexOffU width = b[j] - t[j];
 							// off5p holds the offset from the 5' of the next
 							// character we were trying to align when we decided to
 							// introduce a read gap (before that character).  If we
@@ -721,15 +719,15 @@ size_t Descent::recalcOutgoing(
 						// Extension of a reference gap
 						rfex = true;
 						if(pf[d].flags.rfgExplore()) {
-                            TIndexOff topf = l2r_ ? topp : top;
-                            TIndexOff botf = l2r_ ? botp : bot;
-							ASSERT_ONLY(TIndexOff topb = l2r_ ? top : topp);
-							ASSERT_ONLY(TIndexOff botb = l2r_ ? bot : botp);
+                            TIndexOffU topf = l2r_ ? topp : top;
+                            TIndexOffU botf = l2r_ ? botp : bot;
+							ASSERT_ONLY(TIndexOffU topb = l2r_ ? top : topp);
+							ASSERT_ONLY(TIndexOffU botb = l2r_ ? bot : botp);
 							assert(topf != 0 || botf != 0);
 							assert(topb != 0 || botb != 0);
 							size_t nrefal = cur5pf - cur5pi + gapadd_;
 							if(!re.contains(fw, l2r_, cur5pi, cur5pf, nrefal, topf, botf, pen_ + pen_rfg_ex)) {
-								TIndexOff width = bot - top;
+								TIndexOffU width = bot - top;
 								Edit edit((uint32_t)off5p, '-', (int)("ACGTN"[c]), EDIT_TYPE_REF_GAP);
 								DescentPriority pri(pen_ + pen_rfg_ex, depth, width, rootpri);
 								assert(topf != 0 || botf != 0);
@@ -765,8 +763,8 @@ size_t Descent::recalcOutgoing(
 						if(!pf[d].flags.rdgExplore(j)) {
 							continue; // Already been explored
 						}
-						TIndexOff topf = pf[d].topf[j], botf = pf[d].botf[j];
-						ASSERT_ONLY(TIndexOff topb = pf[d].topb[j], botb = pf[d].botb[j]);
+						TIndexOffU topf = pf[d].topf[j], botf = pf[d].botf[j];
+						ASSERT_ONLY(TIndexOffU topb = pf[d].topb[j], botb = pf[d].botb[j]);
 						assert(topf != 0 || botf != 0);
 						assert(topb != 0 || botb != 0);
 						if(re.contains(fw, l2r_, cur5pi_i, cur5pf_i, cur5pf - cur5pi + 1 + gapadd_, topf, botf, pen_ + pen_rdg_op)) {
@@ -774,7 +772,7 @@ size_t Descent::recalcOutgoing(
 							continue; // Redundant with a path already explored
 						}
 						prm.nRedFail++;
-						TIndexOff width = b[j] - t[j];
+						TIndexOffU width = b[j] - t[j];
 						// off5p holds the offset from the 5' of the next
 						// character we were trying to align when we decided to
 						// introduce a read gap (before that character).  If we
@@ -799,15 +797,15 @@ size_t Descent::recalcOutgoing(
 				if(!allmatch && pen_rfg_op <= diff && !rfex) {
 					// Opening a new reference gap
                     if(pf[d].flags.rfgExplore()) {
-                        TIndexOff topf = l2r_ ? topp : top;
-                        TIndexOff botf = l2r_ ? botp : bot;
-						ASSERT_ONLY(TIndexOff topb = l2r_ ? top : topp);
-						ASSERT_ONLY(TIndexOff botb = l2r_ ? bot : botp);
+                        TIndexOffU topf = l2r_ ? topp : top;
+                        TIndexOffU botf = l2r_ ? botp : bot;
+						ASSERT_ONLY(TIndexOffU topb = l2r_ ? top : topp);
+						ASSERT_ONLY(TIndexOffU botb = l2r_ ? bot : botp);
 						assert(topf != 0 || botf != 0);
 						assert(topb != 0 || botb != 0);
 						size_t nrefal = cur5pf - cur5pi + gapadd_;
 						if(!re.contains(fw, l2r_, cur5pi, cur5pf, nrefal, topf, botf, pen_ + pen_rfg_op)) {
-							TIndexOff width = bot - top;
+							TIndexOffU width = bot - top;
 							Edit edit((uint32_t)off5p, '-', (int)("ACGTN"[c]), EDIT_TYPE_REF_GAP);
 							DescentPriority pri(pen_ + pen_rfg_op, depth, width, rootpri);
 							assert(topf != 0 || botf != 0);
@@ -980,10 +978,10 @@ void Descent::print(
  */
 bool Descent::bounce(
 	const Read& q,                  // query string
-    TIndexOff topf,                 // SA range top in fw index
-    TIndexOff botf,                 // SA range bottom in fw index
-    TIndexOff topb,                 // SA range top in bw index
-    TIndexOff botb,                 // SA range bottom in bw index
+    TIndexOffU topf,                 // SA range top in fw index
+    TIndexOffU botf,                 // SA range bottom in fw index
+    TIndexOffU topb,                 // SA range top in bw index
+    TIndexOffU botb,                 // SA range bottom in bw index
 	const Ebwt& ebwtFw,             // forward index
 	const Ebwt& ebwtBw,             // mirror index
 	const Scoring& sc,              // scoring scheme
@@ -1146,7 +1144,7 @@ void Descent::followBestOutgoing(
 		}
 		size_t dfsz = df.size();
 		size_t pfsz = pf.size();
-		TIndexOff topf, botf, topb, botb;
+		TIndexOffU topf, botf, topb, botb;
 		size_t d = posid_ + doff;
 		if(e.e.isRefGap()) {
 			d--; // might underflow
@@ -1257,10 +1255,10 @@ void Descent::nextLocsBi(
 	const Ebwt& ebwtBw, // mirror index
 	SideLocus& tloc,    // top locus
 	SideLocus& bloc,    // bot locus
-	TIndexOff topf,     // top in BWT
-	TIndexOff botf,     // bot in BWT
-	TIndexOff topb,     // top in BWT'
-	TIndexOff botb)     // bot in BWT'
+	TIndexOffU topf,     // top in BWT
+	TIndexOffU botf,     // bot in BWT
+	TIndexOffU topb,     // top in BWT'
+	TIndexOffU botb)     // bot in BWT'
 {
 	assert_gt(botf, 0);
 	// Which direction are we going in next?
@@ -1331,10 +1329,10 @@ bool Descent::followMatches(
     bool& hitEnd,              // out: true -> hit read end with non-empty range
     bool& done,                // out: true -> we made a full alignment
     TReadOff& off5p_i,         // out: initial 5' offset
-    TIndexOff& topf_bounce,    // out: top of SA range for fw idx for bounce
-    TIndexOff& botf_bounce,    // out: bot of SA range for fw idx for bounce
-    TIndexOff& topb_bounce,    // out: top of SA range for bw idx for bounce
-    TIndexOff& botb_bounce)    // out: bot of SA range for bw idx for bounce
+    TIndexOffU& topf_bounce,    // out: top of SA range for fw idx for bounce
+    TIndexOffU& botf_bounce,    // out: bot of SA range for fw idx for bounce
+    TIndexOffU& topb_bounce,    // out: top of SA range for bw idx for bounce
+    TIndexOffU& botb_bounce)    // out: bot of SA range for bw idx for bounce
 {
 	// TODO: make these full-fledged parameters
 	size_t nobranchDepth = 20;
@@ -1348,7 +1346,7 @@ bool Descent::followMatches(
 	}
 #endif
 	SideLocus tloc, bloc;
-	TIndexOff topf = topf_, botf = botf_, topb = topb_, botb = botb_;
+	TIndexOffU topf = topf_, botf = botf_, topb = topb_, botb = botb_;
     bool fw = rs[rid_].fw;
 	bool toward3p;
 	size_t off5p;
@@ -1601,9 +1599,9 @@ bool Descent::followMatches(
     assert(tloc.valid());
 	assert(botf - topf == 1 ||  bloc.valid());
 	assert(botf - topf > 1  || !bloc.valid());
-	TIndexOff t[4], b[4];   // dest BW ranges
-	TIndexOff tp[4], bp[4]; // dest BW ranges for "prime" index
-	ASSERT_ONLY(TIndexOff lasttot = botf - topf);
+	TIndexOffU t[4], b[4];   // dest BW ranges
+	TIndexOffU tp[4], bp[4]; // dest BW ranges for "prime" index
+	ASSERT_ONLY(TIndexOffU lasttot = botf - topf);
 	bool fail = false;
 	while(!fail && !hitEnd) {
         assert(!done);
@@ -1614,7 +1612,7 @@ bool Descent::followMatches(
 		assert(botf - topf == 1 ||  bloc.valid());
 		assert(botf - topf > 1  || !bloc.valid());
 		assert(tloc.valid());
-        TIndexOff width = botf - topf;
+        TIndexOffU width = botf - topf;
 		bool ltr = l2r_;
 		const Ebwt& ebwt = ltr ? ebwtBw : ebwtFw;
 		t[0] = t[1] = t[2] = t[3] = b[0] = b[1] = b[2] = b[3] = 0;
@@ -1639,8 +1637,8 @@ bool Descent::followMatches(
 			}
 			ebwt.mapBiLFEx(tloc, bloc, t, b, tp, bp);
 			// t, b, tp and bp now filled
-			ASSERT_ONLY(TIndexOff tot = (b[0]-t[0])+(b[1]-t[1])+(b[2]-t[2])+(b[3]-t[3]));
-			ASSERT_ONLY(TIndexOff totp = (bp[0]-tp[0])+(bp[1]-tp[1])+(bp[2]-tp[2])+(bp[3]-tp[3]));
+			ASSERT_ONLY(TIndexOffU tot = (b[0]-t[0])+(b[1]-t[1])+(b[2]-t[2])+(b[3]-t[3]));
+			ASSERT_ONLY(TIndexOffU totp = (bp[0]-tp[0])+(bp[1]-tp[1])+(bp[2]-tp[2])+(bp[3]-tp[3]));
 			assert_eq(tot, totp);
 			assert_leq(tot, lasttot);
 			ASSERT_ONLY(lasttot = tot);
@@ -1656,7 +1654,7 @@ bool Descent::followMatches(
 		} else {
 			tp[0] = tp[1] = tp[2] = tp[3] = bp[0] = bp[1] = bp[2] = bp[3] = 0;
 			// Range delimited by tloc/bloc has size 1
-			TIndexOff ntop = ltr ? topb : topf;
+			TIndexOffU ntop = ltr ? topb : topf;
 			met.bwops++;
 			met.bwops_1++;
 			prm.nSdFmops++;
@@ -1703,10 +1701,10 @@ bool Descent::followMatches(
         // case.
         
 		// Convert t, tp, b, bp info tf, bf, tb, bb
-		TIndexOff *tf = ltr ? tp : t;
-		TIndexOff *bf = ltr ? bp : b;
-		TIndexOff *tb = ltr ? t : tp;
-		TIndexOff *bb = ltr ? b : bp;
+		TIndexOffU *tf = ltr ? tp : t;
+		TIndexOffU *bf = ltr ? bp : b;
+		TIndexOffU *tb = ltr ? t : tp;
+		TIndexOffU *bb = ltr ? b : bp;
 		// Allocate DescentPos data structure.
 		if(firstPos) {
 			posid_ = pf.alloc();
@@ -1808,9 +1806,12 @@ bool Descent::followMatches(
 
 #include <string>
 #include "sstring.h"
+#include "aligner_driver.h"
 
 using namespace std;
 
+bool gReportOverhangs = true;
+
 /**
  * A way of feeding simply tests to the seed alignment infrastructure.
  */
@@ -1866,7 +1867,12 @@ int main(int argc, char **argv) {
 				seq.reverseComp();
 				qual.reverse();
 			}
-			dr.initRead(Read("test", seq.toZBuf(), qual.toZBuf()), -30, 30);
+			dr.initRead(
+				Read("test", seq.toZBuf(), qual.toZBuf()),
+				false, // nofw
+				false, // norc
+				-30,   // minsc
+				30);   // maxpen
 
 			// Set up the DescentConfig
 			DescentConfig conf;
@@ -1879,6 +1885,7 @@ int main(int argc, char **argv) {
 				(i == 0) ? 0 : (seq.length() - 1), // 5' offset into read of root
 				(i == 0) ? true : false,           // left-to-right?
 				rc == 0,   // forward?
+				1,         // landing
 				0.0f);   // root priority
 			
 			// Do the search
@@ -1902,7 +1909,12 @@ int main(int argc, char **argv) {
         // Set up the read
         BTDnaString seq ("GCTATATAGC", true);
         BTString    qual("ABCDEFGHIa");
-		dr.initRead(Read("test", seq.toZBuf(), qual.toZBuf()), -30, 30);
+		dr.initRead(
+			Read("test", seq.toZBuf(), qual.toZBuf()),
+			false, // nofw
+			false, // norc
+			-30,   // minsc
+			30);   // maxpen
         
         // Set up the DescentConfig
         DescentConfig conf;
@@ -1915,7 +1927,8 @@ int main(int argc, char **argv) {
             (i == 0) ? 0 : (seq.length() - 1), // 5' offset into read of root
             (i == 0) ? true : false,           // left-to-right?
             true,   // forward?
-            0.0f);   // root priority
+			1,      // landing
+            0.0f);  // root priority
         
         // Do the search
         Scoring sc = Scoring::base1();
@@ -1937,7 +1950,12 @@ int main(int argc, char **argv) {
         // Set up the read
         BTDnaString seq ("GCTATATAG", true);
         BTString    qual("ABCDEFGHI");
-		dr.initRead(Read("test", seq.toZBuf(), qual.toZBuf()), -30, 30);
+		dr.initRead(
+			Read("test", seq.toZBuf(), qual.toZBuf()),
+			false, // nofw
+			false, // norc
+			-30,   // minsc
+			30);   // maxpen
         
         // Set up the DescentConfig
         DescentConfig conf;
@@ -1950,6 +1968,7 @@ int main(int argc, char **argv) {
             (i == 0) ? 0 : (seq.length() - 1), // 5' offset into read of root
             (i == 0) ? true : false,           // left-to-right?
             true,   // forward?
+			1,      // landing
             0.0f);   // root priority
         
         // Do the search
@@ -1973,11 +1992,16 @@ int main(int argc, char **argv) {
 		//                012345678901234567890123456789
         BTDnaString seq ("GCTATATAGCGCGCTCGCATCATTTTGTGT", true);
         BTString    qual("ABCDEFGHIabcdefghiABCDEFGHIabc");
-		uint32_t top, bot;
+		TIndexOffU top, bot;
 		top = bot = 0;
 		bool ret = ebwts.first->contains("GCGCTCGCATCATTTTGTGT", &top, &bot);
 		cerr << ret << ", " << top << ", " << bot << endl;
-		dr.initRead(Read("test", seq.toZBuf(), qual.toZBuf()), -30, 30);
+		dr.initRead(
+			Read("test", seq.toZBuf(), qual.toZBuf()),
+			false, // nofw
+			false, // norc
+			-30,   // minsc
+			30);   // maxpen
         
         // Set up the DescentConfig
         DescentConfig conf;
@@ -1990,6 +2014,7 @@ int main(int argc, char **argv) {
             (i == 0) ? 10 : (seq.length() - 1 - 10), // 5' offset into read of root
             (i == 0) ? true : false,                 // left-to-right?
             true,   // forward?
+			1,      // landing
             0.0f);   // root priority
         
         // Do the search
@@ -2030,6 +2055,61 @@ int main(int argc, char **argv) {
     
     ebwts.first->loadIntoMemory (color, -1, true, true, true, true, false);
     ebwts.second->loadIntoMemory(color,  1, true, true, true, true, false);
+
+	// Test to see if the root selector works as we expect
+	{
+		BTDnaString seq ("GCTATATAGCGCGCTCGCATCATTTTGTGT", true);
+		BTString    qual("ABCDEFGHIabcdefghiABCDEFGHIabc");
+		//                012345678901234567890123456789
+		//                         abcdefghiA: 644
+		//                                    CDEFGHIabc : 454 + 100 = 554
+		DescentDriver dr;
+		PrioritizedRootSelector sel(
+			2.0,
+			SimpleFunc(SIMPLE_FUNC_CONST, 10.0, 10.0, 10.0, 10.0), // 6 roots
+			10);
+		// Set up the read
+		dr.initRead(
+			Read("test", seq.toZBuf(), qual.toZBuf()),
+			false, // nofw
+			false, // norc
+			-30,   // minsc
+			30,    // maxpen
+			NULL,  // opposite mate
+			&sel); // root selector
+		dr.printRoots(std::cerr);
+		assert_eq(12, dr.roots().size());
+		assert_eq(652, dr.roots()[0].pri);
+		assert_eq(652, dr.roots()[1].pri);
+	}
+
+	// Test to see if the root selector works as we expect; this time the
+	// string is longer (64 chars) and there are a couple Ns
+	{
+		BTDnaString seq ("NCTATATAGCGCGCTCGCATCNTTTTGTGTGCTATATAGCGCGCTCGCATCATTTTGTGTTTAT", true);
+		BTString    qual("ABCDEFGHIJKLMNOPabcdefghijklmnopABCDEFGHIJKLMNOPabcdefghijklmnop");
+		//                0123456789012345678901234567890123456789012345678901234567890123
+		//                                 bcdefghijklmnop: 1080 - 150     bcdefghijklmnop: 1080 + 150 = 1230
+		//                                                 = 930
+		DescentDriver dr;
+		PrioritizedRootSelector sel(
+			2.0,
+			SimpleFunc(SIMPLE_FUNC_CONST, 10.0, 10.0, 10.0, 10.0), // 6 * 4 = 24 roots
+			15);  // landing size
+		// Set up the read
+		dr.initRead(
+			Read("test", seq.toZBuf(), qual.toZBuf()),
+			false, // nofw
+			false, // norc
+			-30,   // minsc
+			30,    // maxpen
+			NULL,  // opposite mate
+			&sel); // root selector
+		dr.printRoots(std::cerr);
+		assert_eq(24, dr.roots().size());
+		assert_eq(1230, dr.roots()[0].pri);
+		assert_eq(1230, dr.roots()[1].pri);
+	}
 	
 	// Query is longer than ftab and matches exactly once.  One search root for
 	// forward read.
@@ -2047,7 +2127,12 @@ int main(int argc, char **argv) {
 				DescentDriver dr;
 				
 				// Set up the read
-				dr.initRead(Read("test", seq.toZBuf(), qual.toZBuf()), -30, 30);
+				dr.initRead(
+					Read("test", seq.toZBuf(), qual.toZBuf()),
+					false, // nofw
+					false, // norc
+					-30,   // minsc
+					30);   // maxpen
 				
 				// Set up the DescentConfig
 				DescentConfig conf;
@@ -2060,6 +2145,7 @@ int main(int argc, char **argv) {
 					j,      // 5' offset into read of root
 					i == 0, // left-to-right?
 					true,   // forward?
+					1,      // landing
 					0.0f);   // root priority
 				
 				// Do the search
@@ -2091,7 +2177,12 @@ int main(int argc, char **argv) {
 				DescentDriver dr;
 				
 				// Set up the read
-				dr.initRead(Read("test", seq.toZBuf(), qual.toZBuf()), -30, 30);
+				dr.initRead(
+					Read("test", seq.toZBuf(), qual.toZBuf()),
+					false, // nofw
+					false, // norc
+					-30,   // minsc
+					30);   // maxpen
 				
 				// Set up the DescentConfig
 				DescentConfig conf;
@@ -2104,12 +2195,14 @@ int main(int argc, char **argv) {
 					j,      // 5' offset into read of root
 					i == 0, // left-to-right?
 					true,   // forward?
+					1,      // landing
 					0.0f);   // root priority
 				dr.addRoot(
 					conf,   // DescentConfig
 					j,      // 5' offset into read of root
 					i == 0, // left-to-right?
 					false,  // forward?
+					1,      // landing
 					1.0f);   // root priority
 				
 				// Do the search
@@ -2170,7 +2263,12 @@ int main(int argc, char **argv) {
 					PerReadMetrics prm;
 					DescentDriver dr;
 					
-					dr.initRead(Read("test", seq.toZBuf(), qual.toZBuf()), -30, 30);
+					dr.initRead(
+						Read("test", seq.toZBuf(), qual.toZBuf()),
+						false, // nofw
+						false, // norc
+						-30,   // minsc
+						30);   // maxpen
 					
 					// Set up the DescentConfig
 					DescentConfig conf;
@@ -2184,6 +2282,7 @@ int main(int argc, char **argv) {
 						j,       // 5' offset into read of root
 						i == 0,  // left-to-right?
 						true,    // forward?
+						1,      // landing
 						0.0f);    // root priority
 					
 					// Do the search
@@ -2244,7 +2343,12 @@ int main(int argc, char **argv) {
 					PerReadMetrics prm;
 					DescentDriver dr;
 					
-					dr.initRead(Read("test", seq.toZBuf(), qual.toZBuf()), -30, 30);
+					dr.initRead(
+						Read("test", seq.toZBuf(), qual.toZBuf()),
+						false, // nofw
+						false, // norc
+						-30,   // minsc
+						30);   // maxpen
 					
 					// Set up the DescentConfig
 					DescentConfig conf;
@@ -2258,6 +2362,7 @@ int main(int argc, char **argv) {
 						j,      // 5' offset into read of root
 						i == 0, // left-to-right?
 						true,   // forward?
+						1,      // landing
 						0.0f);   // root priority
 					
 					// Do the search
@@ -2307,7 +2412,12 @@ int main(int argc, char **argv) {
 				PerReadMetrics prm;
 				DescentDriver dr;
 				
-				dr.initRead(Read("test", seq.toZBuf(), qual.toZBuf()), -30, 30);
+				dr.initRead(
+					Read("test", seq.toZBuf(), qual.toZBuf()),
+					false, // nofw
+					false, // norc
+					-30,   // minsc
+					30);   // maxpen
 				
 				// Set up the DescentConfig
 				DescentConfig conf;
@@ -2325,6 +2435,7 @@ int main(int argc, char **argv) {
 						j,      // 5' offset into read of root
 						ltr,    // left-to-right?
 						fw,     // forward?
+						1,      // landing
 						0.0f);   // root priority
 				}
 				
@@ -2362,7 +2473,12 @@ int main(int argc, char **argv) {
 				PerReadMetrics prm;
 				DescentDriver dr;
 				
-				dr.initRead(Read("test", seq.toZBuf(), qual.toZBuf()), -30, 30);
+				dr.initRead(
+					Read("test", seq.toZBuf(), qual.toZBuf()),
+					false, // nofw
+					false, // norc
+					-30,   // minsc
+					30);   // maxpen
 				
 				// Set up the DescentConfig
 				DescentConfig conf;
@@ -2381,6 +2497,7 @@ int main(int argc, char **argv) {
 						(TReadOff)j,        // 5' offset into read of root
 						ltr,      // left-to-right?
 						fw,       // forward?
+						1,        // landing
 						(float)((float)y * 1.0f)); // root priority
 					// Assume left-to-right
 					size_t beg = j;
@@ -2480,7 +2597,12 @@ int main(int argc, char **argv) {
 				
 				Read q("test", seq.toZBuf(), qual.toZBuf());
 				assert(q.repOk());
-				dr.initRead(q, -30, 30);
+				dr.initRead(
+					q,     // read
+					false, // nofw
+					false, // norc
+					-30,   // minsc
+					30);   // maxpen
 				
 				// Set up the DescentConfig
 				DescentConfig conf;
@@ -2494,6 +2616,7 @@ int main(int argc, char **argv) {
 					j,      // 5' offset into read of root
 					i == 0, // left-to-right?
 					k == 0, // forward?
+					1,      // landing
 					0.0f);  // root priority
 				
 				// Do the search
@@ -2559,7 +2682,12 @@ int main(int argc, char **argv) {
 				PerReadMetrics prm;
 				DescentDriver dr;
 				
-				dr.initRead(Read("test", seq.toZBuf(), qual.toZBuf()), -30, 30);
+				dr.initRead(
+					Read("test", seq.toZBuf(), qual.toZBuf()),
+					false, // nofw
+					false, // norc
+					-30,   // minsc
+					30);   // maxpen
 				
 				// Set up the DescentConfig
 				DescentConfig conf;
@@ -2573,6 +2701,7 @@ int main(int argc, char **argv) {
 					j,      // 5' offset into read of root
 					i == 0, // left-to-right?
 					k == 0, // forward?
+					1,      // landing
 					0.0f);  // root priority
 				
 				// Do the search
@@ -2630,7 +2759,12 @@ int main(int argc, char **argv) {
 				PerReadMetrics prm;
 				DescentDriver dr;
 				
-				dr.initRead(Read("test", seq.toZBuf(), qual.toZBuf()), -30, 30);
+				dr.initRead(
+					Read("test", seq.toZBuf(), qual.toZBuf()),
+					false, // nofw
+					false, // norc
+					-30,   // minsc
+					30);   // maxpen
 				
 				// Set up the DescentConfig
 				DescentConfig conf;
@@ -2644,6 +2778,7 @@ int main(int argc, char **argv) {
 					j,      // 5' offset into read of root
 					i == 0, // left-to-right?
 					true,   // forward?
+					1,      // landing
 					0.0f);  // root priority
 				
 				// Do the search
@@ -2707,7 +2842,12 @@ int main(int argc, char **argv) {
 				PerReadMetrics prm;
 				DescentDriver dr;
 				
-				dr.initRead(Read("test", seq.toZBuf(), qual.toZBuf()), -30, 30);
+				dr.initRead(
+					Read("test", seq.toZBuf(), qual.toZBuf()),
+					false, // nofw
+					false, // norc
+					-30,   // minsc
+					30);   // maxpen
 				
 				// Set up the DescentConfig
 				DescentConfig conf;
@@ -2721,6 +2861,7 @@ int main(int argc, char **argv) {
 					j,      // 5' offset into read of root
 					i == 0, // left-to-right?
 					true,   // forward?
+					1,      // landing
 					0.0f);  // root priority
 				
 				// Do the search
@@ -2785,7 +2926,12 @@ int main(int argc, char **argv) {
 				PerReadMetrics prm;
 				DescentDriver dr;
 				
-				dr.initRead(Read("test", seq.toZBuf(), qual.toZBuf()), -50, 50);
+				dr.initRead(
+					Read("test", seq.toZBuf(), qual.toZBuf()),
+					false, // nofw
+					false, // norc
+					-50,   // minsc
+					50);   // maxpen
 				
 				// Set up the DescentConfig
 				DescentConfig conf;
@@ -2799,6 +2945,7 @@ int main(int argc, char **argv) {
 					j,      // 5' offset into read of root
 					i == 0, // left-to-right?
 					true,   // forward?
+					1,      // landing
 					0.0f);  // root priority
 				
 				// Do the search
@@ -2900,7 +3047,12 @@ int main(int argc, char **argv) {
 				PerReadMetrics prm;
 				DescentDriver dr;
 				
-				dr.initRead(Read("test", seq.toZBuf(), qual.toZBuf()), -50, 50);
+				dr.initRead(
+					Read("test", seq.toZBuf(), qual.toZBuf()),
+					false, // nofw
+					false, // norc
+					-50,   // minsc
+					50);   // maxpen
 				
 				// Set up the DescentConfig
 				DescentConfig conf;
@@ -2914,6 +3066,7 @@ int main(int argc, char **argv) {
 					j,      // 5' offset into read of root
 					i == 0, // left-to-right?
 					true,   // forward?
+					1,      // landing
 					0.0f);  // root priority
 				
 				// Do the search
@@ -2982,7 +3135,12 @@ int main(int argc, char **argv) {
 				PerReadMetrics prm;
 				DescentDriver dr;
 				
-				dr.initRead(Read("test", seq.toZBuf(), qual.toZBuf()), -50, 50);
+				dr.initRead(
+					Read("test", seq.toZBuf(), qual.toZBuf()),
+					false, // nofw
+					false, // norc
+					-50,   // minsc
+					50);   // maxpen
 				
 				// Set up the DescentConfig
 				DescentConfig conf;
@@ -2996,6 +3154,7 @@ int main(int argc, char **argv) {
 					j,      // 5' offset into read of root
 					i == 0, // left-to-right?
 					true,   // forward?
+					1,      // landing
 					0.0f);  // root priority
 				
 				// Do the search
diff --git a/aligner_seed2.h b/aligner_seed2.h
index 0dd5027..e7c4e8c 100644
--- a/aligner_seed2.h
+++ b/aligner_seed2.h
@@ -107,8 +107,8 @@
 #include "read.h"
 #include "ds.h"
 #include "group_walk.h"
+#include "btypes.h"
 
-typedef uint32_t TIndexOff;
 typedef size_t   TReadOff;
 typedef int64_t  TScore;
 typedef float    TRootPri;
@@ -180,7 +180,7 @@ struct DescentPriority {
 	DescentPriority(
 		TScore pen_,
 		size_t depth_,
-		TIndexOff width_,
+		TIndexOffU width_,
 		float rootpri_)
 	{
 		pen = pen_;
@@ -192,7 +192,7 @@ struct DescentPriority {
 	/**
 	 * Initialize new DescentPriority.
 	 */
-	void init(TScore pen_, size_t depth_, TIndexOff width_, float rootpri_) {
+	void init(TScore pen_, size_t depth_, TIndexOffU width_, float rootpri_) {
 		pen = pen_;
 		depth = depth_;
 		width = width_;
@@ -264,7 +264,7 @@ struct DescentPriority {
 
 	TScore pen;      // total penalty accumulated so far
 	size_t depth;    // depth from root of descent
-	TIndexOff width; // width of the SA range
+	TIndexOffU width; // width of the SA range
 	float  rootpri;  // priority of the root
 };
 
@@ -397,8 +397,8 @@ struct DescentRedundancyKey {
 	DescentRedundancyKey(
 		TReadOff  al5pf_,
 		size_t    rflen_,
-		TIndexOff topf_,
-		TIndexOff botf_)
+		TIndexOffU topf_,
+		TIndexOffU botf_)
 	{
 		init(al5pf_, rflen_, topf_, botf_);
 	}
@@ -414,8 +414,8 @@ struct DescentRedundancyKey {
 	void init(
 		TReadOff  al5pf_,
 		size_t    rflen_,
-		TIndexOff topf_,
-		TIndexOff botf_)
+		TIndexOffU topf_,
+		TIndexOffU botf_)
 	{
 		al5pf = al5pf_;
 		rflen = rflen_;
@@ -439,8 +439,8 @@ struct DescentRedundancyKey {
 
 	TReadOff al5pf; // 3'-most aligned char, as offset from 5' end
 	size_t rflen;   // number of reference characters involved in alignment
-	TIndexOff topf; // top w/r/t forward index
-	TIndexOff botf; // bot w/r/t forward index
+	TIndexOffU topf; // top w/r/t forward index
+	TIndexOffU botf; // bot w/r/t forward index
 };
 
 /**
@@ -518,8 +518,8 @@ public:
 		TReadOff al5pi,
 		TReadOff al5pf,
 		size_t rflen,
-		TIndexOff topf,
-		TIndexOff botf,
+		TIndexOffU topf,
+		TIndexOffU botf,
 		TScore pen)
 	{
 		assert(inited_);
@@ -557,8 +557,8 @@ public:
 		TReadOff al5pi,
 		TReadOff al5pf,
 		size_t rflen,
-		TIndexOff topf,
-		TIndexOff botf,
+		TIndexOffU topf,
+		TIndexOffU botf,
 		TScore pen)
 	{
 		assert(inited_);
@@ -620,8 +620,34 @@ struct DescentRoot {
 
 	DescentRoot() { reset(); }
 
-	DescentRoot(size_t off5p_, bool l2r_, bool fw_, size_t len, float pri_) {
-		init(off5p_, l2r_, fw_, len, pri_);
+	DescentRoot(
+		size_t off5p_,
+		bool l2r_,
+		bool fw_,
+		size_t landing_,
+		size_t len,
+		float pri_)
+	{
+		init(off5p_, l2r_, fw_, landing_, len, pri_);
+	}
+	
+	/**
+	 * Initialize a new descent root.
+	 */
+	void init(
+		size_t off5p_,   // root origin offset, expressed as offset from 5' end
+		bool l2r_,       // true -> move in left-to-right direction
+		bool fw_,        // true -> work with forward read, false -> revcomp
+		size_t landing_, // length of the "landing" in front of the root
+		size_t len,      // exclusive upper bound for off5p_; checked, not stored
+		float pri_)      // priority of the root
+	{
+		off5p = off5p_;
+		l2r = l2r_;
+		fw = fw_;
+		landing = landing_;
+		pri = pri_;
+		assert_lt(off5p, len); // root offset must fall below len
 	}
 	
 	/**
@@ -639,19 +665,42 @@ struct DescentRoot {
 	}
 	
 	/**
-	 * Initialize a new descent root.
+	 * Determine if two DescentRoots are equal.
 	 */
-	void init(size_t off5p_, bool l2r_, bool fw_, size_t len, float pri_) {
-		off5p = off5p_;
-		l2r = l2r_;
-		fw = fw_;
-		pri = pri_;
-		assert_lt(off5p, len);
+	bool operator==(const DescentRoot& o) const {
+		return pri == o.pri && off5p == o.off5p && l2r == o.l2r &&
+		       fw == o.fw && landing == o.landing;
+	}
+	
+	/**
+	 * Determine the relative order of two DescentRoots.
+	 */
+	bool operator<(const DescentRoot& o) const {
+		if(pri > o.pri)         return true;  // higher priority sorts first
+		if(pri < o.pri)         return false;
+		if(off5p < o.off5p)     return true;  // tie: smaller 5' offset first
+		if(off5p > o.off5p)     return false;
+		if(fw != o.fw)          return fw;    // tie: forward before revcomp
+		if(l2r != o.l2r)        return l2r;   // tie: left-to-right first
+		if(landing < o.landing) return true;  // tie: smaller landing first
+		if(landing > o.landing) return false;
+		return false; // they're equal
 	}
 
+	/**
+	 * Return true iff this DescentRoot is either less than or equal to o.
+	 */
+	bool operator<=(const DescentRoot& o) const {
+		return (*this) < o || (*this) == o;
+	}
+
+	// Maybe add an array of bools indicating how the landing area of this
+	// root overlaps landing areas of already-chosen roots?
+
 	TReadOff off5p;   // root origin offset, expressed as offset from 5' end
 	bool     l2r;     // true -> move in left-to-right direction
 	bool     fw;      // true -> work with forward read, false -> revcomp
+	size_t   landing; // length of the "landing" in front of the root
 	float    pri;     // priority of seed
 };
 
@@ -810,10 +859,10 @@ struct DescentPos {
 	}
 #endif
 	
-	TIndexOff       topf[4]; // SA range top indexes in fw index
-	TIndexOff       botf[4]; // SA range bottom indexes (exclusive) in fw index
-	TIndexOff       topb[4]; // SA range top indexes in bw index
-	TIndexOff       botb[4]; // SA range bottom indexes (exclusive) in bw index
+	TIndexOffU       topf[4]; // SA range top indexes in fw index
+	TIndexOffU       botf[4]; // SA range bottom indexes (exclusive) in fw index
+	TIndexOffU       topb[4]; // SA range top indexes in bw index
+	TIndexOffU       botb[4]; // SA range bottom indexes (exclusive) in bw index
     char            c;       // read char that would yield match
 	DescentPosFlags flags;   // flags 
 };
@@ -834,10 +883,10 @@ struct DescentEdge {
 #ifndef NDEBUG
         ,
         size_t d_,
-		TIndexOff topf_,
-		TIndexOff botf_,
-		TIndexOff topb_,
-		TIndexOff botb_
+		TIndexOffU topf_,
+		TIndexOffU botf_,
+		TIndexOffU topb_,
+		TIndexOffU botb_
 #endif
         )
 	{
@@ -869,10 +918,10 @@ struct DescentEdge {
 #ifndef NDEBUG
         ,
         size_t d_,
-		TIndexOff topf_,
-		TIndexOff botf_,
-		TIndexOff topb_,
-		TIndexOff botb_
+		TIndexOffU topf_,
+		TIndexOffU botf_,
+		TIndexOffU topb_,
+		TIndexOffU botb_
 #endif
         )
 	{
@@ -927,7 +976,7 @@ struct DescentEdge {
     // This can be recreated by looking at the edit, the paren't descent's
     // len_, al5pi_, al5pf_.  I have it here so we can sanity check.
     size_t d;
-	TIndexOff topf, botf, topb, botb;
+	TIndexOffU topf, botf, topb, botb;
 #endif
 
 	Edit e;
@@ -1061,10 +1110,10 @@ public:
 		TAlScore maxpen,                // maximum penalty
 		TReadOff al5pi,                 // offset from 5' of 1st aligned char
 		TReadOff al5pf,                 // offset from 5' of last aligned char
-		TIndexOff topf,                 // SA range top in FW index
-		TIndexOff botf,                 // SA range bottom in FW index
-		TIndexOff topb,                 // SA range top in BW index
-		TIndexOff botb,                 // SA range bottom in BW index
+		TIndexOffU topf,                 // SA range top in FW index
+		TIndexOffU botf,                 // SA range bottom in FW index
+		TIndexOffU topb,                 // SA range top in BW index
+		TIndexOffU botb,                 // SA range bottom in BW index
 		bool l2r,                       // direction this descent will go in
 		size_t descid,                  // my ID
 		TDescentId parent,              // parent ID
@@ -1233,18 +1282,22 @@ public:
 		// Sort just the edits we just added
 		edits.sortPortion(ei, en);
 	}
+	
+	TIndexOffU topf() const { return topf_; } // incoming SA range top, fw index
+	TIndexOffU botf() const { return botf_; } // incoming SA range bot, fw index
 
 protected:
 
 	/**
-	 *
+	 * When the search reaches the edge of the read and needs to "bounce" and
+	 * extend in the other direction.
 	 */
     bool bounce(
         const Read& q,                  // query string
-        TIndexOff topf,                 // SA range top in fw index
-        TIndexOff botf,                 // SA range bottom in fw index
-        TIndexOff topb,                 // SA range top in bw index
-        TIndexOff botb,                 // SA range bottom in bw index
+        TIndexOffU topf,                 // SA range top in fw index
+        TIndexOffU botf,                 // SA range bottom in fw index
+        TIndexOffU topb,                 // SA range top in bw index
+        TIndexOffU botb,                 // SA range bottom in bw index
         const Ebwt& ebwtFw,             // forward index
         const Ebwt& ebwtBw,             // mirror index
         const Scoring& sc,              // scoring scheme
@@ -1269,10 +1322,10 @@ protected:
 		const Ebwt& ebwtBw, // mirror index
 		SideLocus& tloc,    // top locus
 		SideLocus& bloc,    // bot locus
-		uint32_t topf,      // top in BWT
-		uint32_t botf,      // bot in BWT
-		uint32_t topb,      // top in BWT'
-		uint32_t botb);     // bot in BWT'
+		TIndexOffU topf,      // top in BWT
+		TIndexOffU botf,      // bot in BWT
+		TIndexOffU topb,      // top in BWT'
+		TIndexOffU botb);     // bot in BWT'
 
 	/**
 	 * Advance this descent by following read matches as far as possible.
@@ -1295,10 +1348,10 @@ protected:
         bool& hitEnd,              // out: true -> hit read end with non-empty range
         bool& done,                // out: true -> we made a full alignment
         TReadOff& off5p_i,         // out: initial 5' offset
-        TIndexOff& topf_bounce,    // out: top of SA range for fw idx for bounce
-        TIndexOff& botf_bounce,    // out: bot of SA range for fw idx for bounce
-        TIndexOff& topb_bounce,    // out: top of SA range for bw idx for bounce
-        TIndexOff& botb_bounce);   // out: bot of SA range for bw idx for bounce
+        TIndexOffU& topf_bounce,    // out: top of SA range for fw idx for bounce
+        TIndexOffU& botf_bounce,    // out: bot of SA range for fw idx for bounce
+        TIndexOffU& topb_bounce,    // out: top of SA range for bw idx for bounce
+        TIndexOffU& botb_bounce);   // out: bot of SA range for bw idx for bounce
 
 	/**
 	 * Recalculate our summary of the outgoing edges from this descent.  When
@@ -1328,8 +1381,8 @@ protected:
 	int             gapadd_;      // net ref characters additional
     TReadOff        off5p_i_;     // offset we started out at for this descent
 
-	TIndexOff       topf_, botf_; // incoming SA range w/r/t forward index
-	TIndexOff       topb_, botb_; // incoming SA range w/r/t forward index
+	TIndexOffU       topf_, botf_; // incoming SA range w/r/t forward index
+	TIndexOffU       topb_, botb_; // incoming SA range w/r/t backward index
 
 	size_t          descid_;      // ID of this descent
 	TDescentId      parent_;      // ID of parent descent
@@ -1365,8 +1418,8 @@ struct DescentAlignment {
 	void init(
 		TScore pen_,
 		bool fw_,
-		TIndexOff topf_,
-		TIndexOff botf_,
+		TIndexOffU topf_,
+		TIndexOffU botf_,
 		size_t ei_,
 		size_t en_)
 	{
@@ -1400,12 +1453,12 @@ struct DescentAlignment {
 		return botf - topf;
 	}
 
-	TScore pen; // score
+	TScore pen; // penalty
 	
 	bool fw; // forward or revcomp aligned?
 
-	TIndexOff topf; // top in forward index
-	TIndexOff botf; // bot in forward index
+	TIndexOffU topf; // top in forward index
+	TIndexOffU botf; // bot in forward index
 
 	size_t ei; // First edit in DescentAlignmentSink::edits_ involved in aln
 	size_t en; // # edits in DescentAlignmentSink::edits_ involved in aln
@@ -1436,8 +1489,8 @@ struct DescentPartialResolvedAlignment {
 	void init(
 		TScore pen_,
 		bool fw_,
-		TIndexOff topf_,
-		TIndexOff botf_,
+		TIndexOffU topf_,
+		TIndexOffU botf_,
 		size_t ei_,
 		size_t en_,
 		const Coord& refcoord_)
@@ -1470,11 +1523,11 @@ struct DescentPartialResolvedAlignment {
 	
 	bool fw;        // forward or revcomp aligned?
 
-	TIndexOff topf; // top in forward index
-	TIndexOff botf; // bot in forward index
+	TIndexOffU topf; // top in forward index
+	TIndexOffU botf; // bot in forward index
 
-	size_t ei;      // First edit in DescentAlignmentSink::edits_ involved in aln
-	size_t en;      // # edits in DescentAlignmentSink::edits_ involved in aln
+	size_t ei;      // First edit in DescentPartialResolvedAlignmentSink::edits_ involved in aln
+	size_t en;      // # edits in DescentPartialResolvedAlignmentSink::edits_ involved in aln
 	
 	Coord refcoord; // reference coord of leftmost ref char involved
 };
@@ -1509,10 +1562,10 @@ public:
         const Read& q,           // query string
 		const Ebwt& ebwtFw,              // forward index
 		const Ebwt& ebwtBw,              // mirror index
-		TIndexOff topf,                  // SA range top in forward index
-		TIndexOff botf,                  // SA range bottom in forward index
-		TIndexOff topb,                  // SA range top in backward index
-		TIndexOff botb,                  // SA range bottom in backward index
+		TIndexOffU topf,                  // SA range top in forward index
+		TIndexOffU botf,                  // SA range bottom in forward index
+		TIndexOffU topb,                  // SA range top in backward index
+		TIndexOffU botb,                  // SA range bottom in backward index
         TDescentId id,                   // id of leaf Descent
 		TRootId rid,                     // id of search root
         const Edit& e,                   // final edit, if needed
@@ -1656,8 +1709,8 @@ protected:
 
 	EList<Edit> edits_;
 	EList<DescentAlignment> als_;
-	ESet<Triple<TIndexOff, TIndexOff, size_t> > lhs_;
-	ESet<Triple<TIndexOff, TIndexOff, size_t> > rhs_;
+	ESet<Triple<TIndexOffU, TIndexOffU, size_t> > lhs_;
+	ESet<Triple<TIndexOffU, TIndexOffU, size_t> > rhs_;
 	size_t nelt_;
 	TAlScore bestPen_;  // best (smallest) penalty among as-yet-unreported alns
 	TAlScore worstPen_; // worst (greatest) penalty among as-yet-unreported alns
@@ -1681,8 +1734,6 @@ public:
     void reset() {
 		edits_.clear();
 		als_.clear();
-		nelt_ = 0;
-		bestPen_ = worstPen_ = std::numeric_limits<TAlScore>::max();
     }
 
 	/**
@@ -1706,91 +1757,12 @@ public:
 	}
 	
 	/**
-	 * Return the number of SA ranges involved in hits.
-	 */
-	size_t nrange() const {
-		return als_.size();
-	}
-
-	/**
-	 * Return the number of SA elements involved in hits.
-	 */
-	size_t nelt() const {
-		return nelt_;
-	}
-	
-	/**
-	 * The caller provides 'i', which is an offset of a particular element in
-	 * one of the SA ranges in the current stratum.  This function returns, in
-	 * 'al' and 'off', information about the element in terms of the range it's
-	 * part of and its offset into that range.
-	 */
-	void elt(size_t i, DescentPartialResolvedAlignment& al, size_t& ri, size_t& off) const {
-		assert_lt(i, nelt());
-		for(size_t j = 0; j < als_.size(); j++) {
-			if(i < als_[j].size()) {
-				al = als_[j];
-				ri = j;
-				off = i;
-				return;
-			}
-			i -= als_[j].size();
-		}
-		assert(false);
-	}
-	
-	/**
 	 * Get a particular alignment.
 	 */
 	const DescentPartialResolvedAlignment& operator[](size_t i) const {
 		return als_[i];
 	}
-
-	/**
-	 * Return true iff (a) we found an alignment since the sink was initialized
-	 * or since the last time advanceStratum() was called, and (b) the penalty
-	 * associated with the current-best task on the heap ('best') is worse
-	 * (higher) than the penalty associated with the alignments found most
-	 * recently (worstPen_).
-	 */
-	bool stratumDone(TAlScore bestPen) const {
-		if(nelt_ > 0 && bestPen > worstPen_) {
-			return true;
-		}
-		return false;
-	}
 	
-	/**
-	 * The alignment consumer calls this to indicate that they are done with
-	 * all the alignments in the current best non-empty stratum.  We can
-	 * therefore mark all those alignments as "reported" and start collecting
-	 * results for the next stratum.
-	 */
-	void advanceStratum() {
-		assert_gt(nelt_, 0);
-		edits_.clear();
-		als_.clear();
-		nelt_ = 0;
-		bestPen_ = worstPen_ = std::numeric_limits<TAlScore>::max();
-	}
-	
-#ifndef NDEBUG
-	/**
-	 * Check that partial alignment sink is internally consistent.
-	 */
-	bool repOk() const {
-		assert_geq(nelt_, als_.size());
-		//for(size_t i = 1; i < als_.size(); i++) {
-		//	assert_geq(als_[i].pen, als_[i-1].pen);
-		//}
-		assert(bestPen_ == std::numeric_limits<TAlScore>::max() || worstPen_ >= bestPen_);
-		return true;
-	}
-#endif
-	
-	TAlScore bestPenalty() const { return bestPen_; }
-	TAlScore worstPenalty() const { return worstPen_; }
-
 	size_t editsSize() const { return edits_.size(); }
 	size_t alsSize() const { return als_.size(); }
 	
@@ -1800,9 +1772,6 @@ protected:
 
 	EList<Edit> edits_;
 	EList<DescentPartialResolvedAlignment> als_;
-	size_t nelt_;
-	TAlScore bestPen_;  // best (smallest) penalty among as-yet-unreported alns
-	TAlScore worstPen_; // worst (greatest) penalty among as-yet-unreported alns
 };
 
 /**
@@ -1895,7 +1864,7 @@ enum {
 class DescentDriver {
 public:
 
-	DescentDriver(bool veryVerbose) :
+	DescentDriver(bool veryVerbose = false) :
 		veryVerbose_(veryVerbose)
 	{
 		reset();
@@ -1911,17 +1880,24 @@ public:
 		bool norc,
 		TAlScore minsc,
 		TAlScore maxpen,
-		const Read* qu = NULL,
+		const Read* qmate = NULL,
 		DescentRootSelector *sel = NULL)
 	{
 		reset();
-		q_ = q;
-		minsc_ = minsc;
-		maxpen_ = maxpen;
+		q_ = q;           // copy the read itself
+		minsc_ = minsc;   // minimum score
+		maxpen_ = maxpen; // maximum penalty
 		if(sel != NULL) {
-			sel->select(q_, qu, nofw, norc, confs_, roots_);
+			sel->select(  // Select search roots
+				q_,       // in: read
+				qmate,    // in: opposite mate, if paired
+				nofw,     // in: true -> don't put roots on fw read
+				norc,     // in: true -> don't put roots on rc read
+				confs_,   // out: search configs for each root
+				roots_);  // out: roots
+			//printRoots(std::cerr);
 		}
-		re_.init(q.length());
+		re_.init(q.length()); // initialize redundancy checker
 	}
 	
 	/**
@@ -1934,6 +1910,7 @@ public:
         TReadOff off,
         bool l2r,
         bool fw,
+		size_t landing,
         float pri)
     {
         confs_.push_back(conf);
@@ -1943,7 +1920,7 @@ public:
 		} else if(!l2r && off == 0) {
 			l2r = !l2r;
 		}
-		roots_.push_back(DescentRoot(off, l2r, fw, q_.length(), pri));
+		roots_.push_back(DescentRoot(off, l2r, fw, landing, q_.length(), pri));
 	}
 	
 	/**
@@ -1955,6 +1932,36 @@ public:
 	}
 	
 	/**
+	 * Print ASCII picture of where we put the roots.
+	 */
+	void printRoots(std::ostream& os) const {
+		// The forward and revcomp pictures are built separately; each one
+		// starts with the read sequence and qualities for that strand
+		std::ostringstream fwstr, rcstr;
+		fwstr << q_.patFw << std::endl << q_.qual << std::endl;
+		rcstr << q_.patRc << std::endl << q_.qualRev << std::endl;
+		for(size_t i = 0; i < roots_.size(); i++) {
+			const DescentRoot& root = roots_[i];
+			// Choose the picture matching this root's strand
+			std::ostringstream& pic = root.fw ? fwstr : rcstr;
+			// off5p is w/r/t the 5' end, so mirror it for revcomp roots
+			size_t off = root.off5p;
+			if(!root.fw) {
+				off = q_.length() - root.off5p - 1;
+			}
+			// Indent so the arrow lines up under the root's read position
+			for(size_t j = 0; j < off; j++) {
+				pic << " ";
+			}
+			// Arrow gives the search direction, then "<root index>:<pri>"
+			pic << (root.l2r ? ">" : "<");
+			pic << " " << i << ":" << root.pri << "\n";
+		}
+		// Emit the forward picture first, then the revcomp picture
+		os << fwstr.str() << rcstr.str();
+	}
+	
+	/**
 	 * Clear the Descent driver so that we're ready to re-start seed alignment
 	 * for the current read.
 	 */
@@ -2079,6 +2086,19 @@ public:
 	TAlScore minScore() const {
 		return minsc_;
 	}
+	
+	const EList<DescentRoot>& roots() const { return roots_; } // read-only view
+	
+	/**
+	 * Called to pause the index-assisted search, and collect a set of partial
+	 * alignments to try using dynamic programming.
+	 *
+	 * The space explored so far is represented by the prioritized collection
+	 * of Descents in the heap.  Each Descent has one or more outgoing edges.
+	 * Each outgoing edge is a set of >=1 partial alignments we might try.
+	 */
+	void nextPartial() {
+	} // NOTE: body is empty in this revision, so calling it has no effect
 
 protected:
 
@@ -2097,7 +2117,7 @@ protected:
 	DescentRedundancyChecker re_; // redundancy checker
 	TAlScore             curPen_; // current penalty
 	bool veryVerbose_;            // print lots of partial alignments
-
+	
 	EList<Edit> tmpedit_;
 	BTDnaString tmprfdnastr_;
 };
@@ -2131,13 +2151,13 @@ public:
 			sink.nelt(), // # elements to choose from
 			true);       // without replacement
 		offs_.resize(sink.nelt());
-		offs_.fill(std::numeric_limits<TIndexOff>::max());
+		offs_.fill(std::numeric_limits<TIndexOffU>::max());
 		sas_.resize(sink.nrange());
 		gws_.resize(sink.nrange());
 		size_t ei = 0;
 		for(size_t i = 0; i < sas_.size(); i++) {
 			size_t en = sink[i].botf - sink[i].topf;
-			sas_[i].init(sink[i].topf, q.length(), EListSlice<TIndexOff, 16>(offs_, ei, en));
+			sas_[i].init(sink[i].topf, EListSlice<TIndexOffU, 16>(offs_, ei, en));
 			gws_[i].init(ebwtFw, ref, sas_[i], rnd, met);
 			ei += en;
 		}
@@ -2179,9 +2199,9 @@ public:
 		assert_lt(off, al.size());
 		Coord refcoord;
 		WalkResult wr;
-		uint32_t tidx = 0, toff = 0, tlen = 0;
+		TIndexOffU tidx = 0, toff = 0, tlen = 0;
 		gws_[rangei].advanceElement(
-			(uint32_t)off,
+			(TIndexOffU)off,
 			ebwtFw,       // forward Bowtie index for walking left
 			ref,          // bitpair-encoded reference
 			sas_[rangei], // SA range with offsets
@@ -2189,7 +2209,7 @@ public:
 			wr,           // put the result here
 			met,          // metrics
 			prm);         // per-read metrics
-		assert_neq(0xffffffff, wr.toff);
+		assert_neq(OFF_MASK, wr.toff);
 		bool straddled = false;
 		ebwtFw.joinedToTextOff(
 			wr.elt.len,
@@ -2199,7 +2219,7 @@ public:
 			tlen,
 			true,        // reject straddlers?
 			straddled);  // straddled?
-		if(tidx == 0xffffffff) {
+		if(tidx == OFF_MASK) {
 			// The seed hit straddled a reference boundary so the seed
 			// hit isn't valid
 			return false;
@@ -2276,9 +2296,9 @@ public:
 protected:
 
 	Random1toN rnd_;
-	EList<TIndexOff, 16> offs_;
-	EList<SARangeWithOffs<EListSlice<TIndexOff, 16> > > sas_;
-	EList<GroupWalk2S<EListSlice<TIndexOff, 16>, 16> > gws_;
+	EList<TIndexOffU, 16> offs_;
+	EList<SARangeWithOffs<EListSlice<TIndexOffU, 16> > > sas_;
+	EList<GroupWalk2S<EListSlice<TIndexOffU, 16>, 16> > gws_;
 	GroupWalkState gwstate_;
 };
 
@@ -2297,19 +2317,29 @@ protected:
  * delivering batches of results to the caller.
  *
  * How to prioritize partial alignments?  One idea is to use the same
- * penalty-based prioritization used in the heap.  This has pros: (a) maintains
- * the guarantee that we're visiting alignments in best-to-worst order in
- * end-to-end alignment mode, (b) the heap is already prioritized this way, so
- * it's easier for us to compile high-priority partial alignments.  But the con
- * is that it doesn't take depth into account, which could mean that we're
- * extending a lot of very short partial alignments first.
+ * penalty-based prioritization used in the heap.  This has pros: (a) we can
+ * visit the partial alignments in best-to-worst order w/r/t penalty, (b) the
+ * heap is already prioritized this way, so it's easier for us to compile
+ * high-priority partial alignments.  But the con is that it doesn't take depth
+ * into account, which could mean that we're extending a lot of very short
+ * partial alignments first.
  *
- * A problem we should keep in mind is that some 
+ * Some ranges will be large and others will be small.  It's a good idea to try
+ * all the elements in the small ranges, but it's also good not to ignore the
+ * large ranges.  One idea is to keep all the large ranges in one category and
+ * all the small ranges in another and alternate between the two.
  */
 class DescentPartialAlignmentSelector {
 
 public:
 
+	// Ranges bigger than this are considered "big" and put in their own
+	// category.
+	static const size_t BIG_RANGE = 5;
+
+	// Number of ranges to pull out of the heap in one go
+	static const size_t NRANGE_AT_A_TIME = 3;
+
 	DescentPartialAlignmentSelector() : gwstate_(GW_CAT) { reset(); }
 
 	/**
@@ -2319,6 +2349,8 @@ public:
 	void init(
 		const Read& q,                   // read
 		const EHeap<TDescentPair>& heap, // the heap w/ the partial alns
+        EFactory<Descent>& df,           // Descent factory
+        EFactory<DescentPos>& pf,        // DescentPos factory
 		TAlScore depthBonus,             // use depth when prioritizing
 		size_t nbatch,                   // # of alignments in a batch
 		const Ebwt& ebwtFw,              // forward Bowtie index for walk-left
@@ -2334,34 +2366,17 @@ public:
 				p.first.pen += depthBonus * p.first.depth;
 				heap_.insert(p);
 			}
-		} else {
-			heap_ = heap;
-		}
-#if 0
-		// We're going to sample from space of *alignments*, not ranges.  So
-		// when we extract a sample, we'll have to do a little extra work to
-		// convert it to a <range, offset> coordinate.
-		rnd_.init(
-			sink.nelt(), // # elements to choose from
-			true);       // without replacement
-		offs_.resize(sink.nelt());
-		offs_.fill(std::numeric_limits<TIndexOff>::max());
-		sas_.resize(sink.nrange());
-		gws_.resize(sink.nrange());
-		size_t ei = 0;
-		for(size_t i = 0; i < sas_.size(); i++) {
-			size_t en = sink[i].botf - sink[i].topf;
-			sas_[i].init(sink[i].topf, q.length(), EListSlice<TIndexOff, 16>(offs_, ei, en));
-			gws_[i].init(ebwtFw, ref, sas_[i], rnd, met);
-			ei += en;
-		}
-#endif
+		} else heap_ = heap;
+		assert(!heap_.empty());
+		nextRanges(df, pf, ebwtFw, ref, rnd, met);
+		assert(!rangeExhausted());
 	}
 	
 	/**
-	 *
+	 * Return true iff there are no more partial alignments to extend.
 	 */
-	void compileBatch() {
+	bool empty() const {
+		return heap_.empty();
 	}
 	
 	/**
@@ -2369,6 +2384,9 @@ public:
 	 */
 	void reset() {
 		heap_.clear();
+		offs_.clear();
+		sas_.clear();
+		gws_.clear();
 	}
 	
 	/**
@@ -2379,9 +2397,9 @@ public:
 	}
 	
 	/**
-	 * Get next alignment and convert it to an AlnRes.
+	 * Get next partial alignment and convert it to an AlnRes.
 	 */
-	bool next(
+	bool nextPartial(
 		const DescentDriver& dr,
 		const Ebwt& ebwtFw,          // forward Bowtie index for walking left
 		const BitPairReference& ref, // bitpair-encoded reference
@@ -2400,9 +2418,9 @@ public:
 		assert_lt(off, al.size());
 		Coord refcoord;
 		WalkResult wr;
-		uint32_t tidx = 0, toff = 0, tlen = 0;
+		TIndexOffU tidx = 0, toff = 0, tlen = 0;
 		gws_[rangei].advanceElement(
-			(uint32_t)off,
+			(TIndexOffU)off,
 			ebwtFw,       // forward Bowtie index for walking left
 			ref,          // bitpair-encoded reference
 			sas_[rangei], // SA range with offsets
@@ -2410,7 +2428,7 @@ public:
 			wr,           // put the result here
 			met,          // metrics
 			prm);         // per-read metrics
-		assert_neq(0xffffffff, wr.toff);
+		assert_neq(OFF_MASK, wr.toff);
 		bool straddled = false;
 		ebwtFw.joinedToTextOff(
 			wr.elt.len,
@@ -2420,12 +2438,11 @@ public:
 			tlen,
 			true,        // reject straddlers?
 			straddled);  // straddled?
-		if(tidx == 0xffffffff) {
+		if(tidx == OFF_MASK) {
 			// The seed hit straddled a reference boundary so the seed
 			// hit isn't valid
 			return false;
 		}
-		// Coordinate of the seed hit w/r/t the pasted reference string
 		refcoord.init(tidx, (int64_t)toff, dr.sink()[rangei].fw);
 		const EList<Edit>& edits = dr.sink().edits();
 		size_t ns = 0, ngap = 0, nrefn = 0;
@@ -2438,13 +2455,6 @@ public:
 	}
 	
 	/**
-	 * Return true iff all elements have been reported.
-	 */
-	bool done() const {
-		return rnd_.done();
-	}
-
-	/**
 	 * Return the total size occupued by the Descent driver and all its
 	 * constituent parts.
 	 */
@@ -2455,7 +2465,7 @@ public:
 			   sas_.totalSizeBytes() +
 			   gws_.totalSizeBytes();
 	}
-
+	
 	/**
 	 * Return the total capacity of the Descent driver and all its constituent
 	 * parts.
@@ -2469,15 +2479,49 @@ public:
 	}
 	
 protected:
+	
+	/**
+	 * Return true iff all elements in the current range have been extended.
+	 */
+	bool rangeExhausted() const {
+		return rnd_.left() == 0;
+	}
+	
+	/**
+	 * Pop the top entry off heap_ and set up offs_/rnd_/sas_/gws_ for its [topf, botf) range.
+	 */
+	void nextRanges(
+        EFactory<Descent>& df,           // Descent factory; indexed by the popped pair's .second
+        EFactory<DescentPos>& pf,        // DescentPos factory (not used in this body)
+		const Ebwt& ebwtFw,              // forward Bowtie index for walk-left
+		const BitPairReference& ref,     // bitpair-encoded reference
+		RandomSource& rnd,               // pseudo-randoms for sampling rows
+		WalkMetrics& met)                // metrics re: offset resolution
+	{
+		// Pop off the topmost heap entry; its Descent supplies the range bounds
+		assert(!heap_.empty());
+		TDescentPair p = heap_.pop();
+		TIndexOffU topf = df[p.second].topf(), botf = df[p.second].botf();
+		assert_gt(botf, topf);
+		offs_.resize(botf - topf); // one offset slot per element of the range
+		offs_.fill(std::numeric_limits<TIndexOffU>::max()); // presumably the "not yet resolved" sentinel -- TODO confirm
+		rnd_.init(botf - topf, true); // without replacement
+		sas_.resize(1); // only a single range is loaded at a time (index 0)
+		gws_.resize(1);
+		sas_[0].init(topf, EListSlice<TIndexOffU, 16>(offs_, 0, botf - topf));
+		gws_[0].init(ebwtFw, ref, sas_[0], rnd, met);
+	}
+	
+	DescentPartialResolvedAlignmentSink palsink_;
 
 	// This class's working heap.  This might simply be a copy of the original
 	// heap, or it might be re-prioritized in some way.
 	EHeap<TDescentPair> heap_;
 
 	Random1toN rnd_;
-	EList<TIndexOff, 16> offs_;
-	EList<SARangeWithOffs<EListSlice<TIndexOff, 16> > > sas_;
-	EList<GroupWalk2S<EListSlice<TIndexOff, 16>, 16> > gws_;
+	EList<TIndexOffU, 16> offs_;
+	EList<SARangeWithOffs<EListSlice<TIndexOffU, 16> > > sas_;
+	EList<GroupWalk2S<EListSlice<TIndexOffU, 16>, 16> > gws_;
 	GroupWalkState gwstate_;
 };
 
diff --git a/aligner_sw.cpp b/aligner_sw.cpp
index 1a8e77f..d3d3855 100644
--- a/aligner_sw.cpp
+++ b/aligner_sw.cpp
@@ -58,6 +58,13 @@ void SwAligner::initRead(
 	sseI16fwBuilt_ = false;  // built fw query profile, 16-bit score
 	sseI16rcBuilt_ = false;  // built rc query profile, 16-bit score
 #endif
+	if(dpLog_ != NULL) {
+		if(!firstRead_) {
+			(*dpLog_) << '\n';
+		}
+		(*dpLog_) << rdfw.toZBuf() << '\t' << qufw.toZBuf();
+	}
+	firstRead_ = false;
 }
 
 /**
@@ -121,6 +128,19 @@ void SwAligner::initRef(
 		&cper_,              // in: checkpointer
 		*sc_,                // in: scoring scheme
 		nceil_);             // in: N ceiling
+	// Record the reference sequence in the log
+	if(dpLog_ != NULL) {
+		(*dpLog_) << '\t';
+		(*dpLog_) << refidx_ << ',';
+		(*dpLog_) << reflen_ << ',';
+		(*dpLog_) << minsc_ << ',';
+		(*dpLog_) << (fw ? '+' : '-') << ',';
+		rect_->write(*dpLog_);
+		(*dpLog_) << ',';
+		for(TRefOff i = rfi_; i < rff_; i++) {
+			(*dpLog_) << mask2dna[(int)rf[i]];
+		}
+	}
 }
 	
 /**
@@ -177,7 +197,7 @@ void SwAligner::initRef(
 			if(rfii < 0 || (TRefOff)rfii >= reflen) {
 				rfbuf2.push_back(4);
 			} else {
-				rfbuf2.push_back(refs.getBase(refidx, (uint32_t)rfii));
+				rfbuf2.push_back(refs.getBase(refidx, (size_t)rfii));
 			}
 			rfii++;
 		}
@@ -251,13 +271,17 @@ void SwAligner::initRef(
 }
 
 /**
- * Given a read, an alignment orientation, a range of characters in a referece
- * sequence, and a bit-encoded version of the reference, set up and execute the
- * corresponding ungapped alignment problem.  There can only be one solution.
+ * Given a read, an alignment orientation, a range of characters in a
+ * reference sequence, and a bit-encoded version of the reference, set up
+ * and execute the corresponding ungapped alignment problem.  There can
+ * only be one solution.
  *
- * The caller has already narrowed down the relevant portion of the reference
- * using, e.g., the location of a seed hit, or the range of possible fragment
- * lengths if we're searching for the opposite mate in a pair.
+ * The caller has already narrowed down the relevant portion of the
+ * reference.
+ *
+ * Does not handle the case where we'd like to scan a large section of the
+ * reference for an ungapped alignment, e.g., if we're searching for the
+ * opposite mate after finding an alignment for the anchor mate.
  */
 int SwAligner::ungappedAlign(
 	const BTDnaString&      rd,     // read sequence (could be RC)
@@ -310,7 +334,7 @@ int SwAligner::ungappedAlign(
 			if(rfii < 0 || (size_t)rfii >= reflen) {
 				rfbuf2.push_back(4);
 			} else {
-				rfbuf2.push_back(refs.getBase(refidx, (uint32_t)rfii));
+				rfbuf2.push_back(refs.getBase(refidx, (size_t)rfii));
 			}
 			rfii++;
 		}
@@ -472,7 +496,6 @@ int SwAligner::ungappedAlign(
  * last time init() was called.
  */
 bool SwAligner::align(
-	RandomSource& rnd, // source of pseudo-randoms
 	TAlScore& best)    // best alignment score observed in DP matrix
 {
 	assert(initedRef() && initedRead());
@@ -654,6 +677,9 @@ bool SwAligner::align(
 	assert(repOk());
 	cural_ = 0;
 	if(best == MIN_I64 || best < minsc_) {
+		if(dpLog_ != NULL) {
+			(*dpLog_) << ",0,0";
+		}
 		return false;
 	}
 	if(!gathered) {
@@ -694,6 +720,9 @@ bool SwAligner::align(
 	if(!btncand_.empty()) {
 		btncand_.sort();
 	}
+	if(dpLog_ != NULL) {
+		(*dpLog_) << ",1," << best;
+	}
 	return !btncand_.empty();
 }
 
diff --git a/aligner_sw.h b/aligner_sw.h
index add5c87..7ce55af 100644
--- a/aligner_sw.h
+++ b/aligner_sw.h
@@ -204,7 +204,7 @@ class SwAligner {
 
 public:
 
-	explicit SwAligner() :
+	explicit SwAligner(std::ostream *dpLog, bool firstRead = true) :
 		sseU8fw_(DP_CAT),
 		sseU8rc_(DP_CAT),
 		sseI16fw_(DP_CAT),
@@ -228,7 +228,9 @@ public:
 		cperTri_(),
 		colstop_(0),
 		lastsolcol_(0),
-		cural_(0)
+		cural_(0),
+		dpLog_(dpLog),
+		firstRead_(firstRead)
 		ASSERT_ONLY(, cand_tmp_(DP_CAT))
 	{ }
 
@@ -300,9 +302,11 @@ public:
 	 * only be one solution.
 	 *
 	 * The caller has already narrowed down the relevant portion of the
-	 * reference using, e.g., the location of a seed hit, or the range of
-	 * possible fragment lengths if we're searching for the opposite mate in a
-	 * pair.
+	 * reference.
+	 *
+	 * Does not handle the case where we'd like to scan a large section of the
+	 * reference for an ungapped alignment, e.g., if we're searching for the
+	 * opposite mate after finding an alignment for the anchor mate.
 	 */
 	int ungappedAlign(
 		const BTDnaString&      rd,     // read sequence (could be RC)
@@ -319,7 +323,7 @@ public:
 	 * Align read 'rd' to reference using read & reference information given
 	 * last time init() was called.  Uses dynamic programming.
 	 */
-	bool align(RandomSource& rnd, TAlScore& best);
+	bool align(TAlScore& best);
 	
 	/**
 	 * Populate the given SwResult with information about the "next best"
@@ -569,6 +573,8 @@ protected:
 			return bter_.nextAlignment(maxiter, res, off, nrej, niter, rnd);
 		}
 	}
+	
+	
 
 	const BTDnaString  *rd_;     // read sequence
 	const BTString     *qu_;     // read qualities
@@ -640,6 +646,9 @@ protected:
 	uint64_t nbtfiltsc_; // # candidates filtered b/c score uninteresting
 	uint64_t nbtfiltdo_; // # candidates filtered b/c dominated by other cell
 	
+	std::ostream *dpLog_;
+	bool firstRead_;
+	
 	ASSERT_ONLY(SStringExpandable<uint32_t> tmp_destU32_);
 	ASSERT_ONLY(BTDnaString tmp_editstr_, tmp_refstr_);
 	ASSERT_ONLY(EList<DpBtCandidate> cand_tmp_);
diff --git a/aligner_sw_driver.cpp b/aligner_sw_driver.cpp
index 63965de..918b71d 100644
--- a/aligner_sw_driver.cpp
+++ b/aligner_sw_driver.cpp
@@ -98,8 +98,13 @@ bool SwDriver::eeSaTups(
 	if(tot > 0) {
 		bool fwFirst = true;
         // Pick fw / rc to go first in a weighted random fashion
-		uint32_t rn32 = rnd.nextU32();
-		uint32_t rn = rn32 % (uint32_t)tot;
+#ifdef BOWTIE_64BIT_INDEX
+		TIndexOffU rn64 = rnd.nextU64();
+		TIndexOffU rn = rn64 % (uint64_t)tot;
+#else
+		TIndexOffU rn32 = rnd.nextU32();
+		TIndexOffU rn = rn32 % (uint32_t)tot;
+#endif        
 		if(rn >= sh.exactFwEEHit().size()) {
 			fwFirst = false;
 		}
@@ -112,13 +117,17 @@ bool SwDriver::eeSaTups(
 			assert(hit.fw == fw);
 			if(hit.bot > hit.top) {
                 // Possibly adjust bot and width if we would have exceeded maxelt
-                uint32_t tops[2] = { hit.top, 0 };
-                uint32_t bots[2] = { hit.bot, 0 };
-                uint32_t width = hit.bot - hit.top;
+                TIndexOffU tops[2] = { hit.top, 0 };
+                TIndexOffU bots[2] = { hit.bot, 0 };
+                TIndexOffU width = hit.bot - hit.top;
                 if(nelt_out + width > maxelt) {
-                    uint32_t trim = (uint32_t)((nelt_out + width) - maxelt);
-                    uint32_t rn = rnd.nextU32() % width;
-                    uint32_t newwidth = width - trim;
+                    TIndexOffU trim = (TIndexOffU)((nelt_out + width) - maxelt);
+#ifdef BOWTIE_64BIT_INDEX
+                    TIndexOffU rn = rnd.nextU64() % width;
+#else
+                    TIndexOffU rn = rnd.nextU32() % width;
+#endif
+                    TIndexOffU newwidth = width - trim;
                     if(hit.top + rn + newwidth > hit.bot) {
                         // Two pieces
                         tops[0] = hit.top + rn;
@@ -138,8 +147,8 @@ bool SwDriver::eeSaTups(
                 }
                 for(int i = 0; i < 2 && !done; i++) {
                     if(bots[i] <= tops[i]) break;
-                    uint32_t width = bots[i] - tops[i];
-                    uint32_t top = tops[i];
+                    TIndexOffU width = bots[i] - tops[i];
+                    TIndexOffU top = tops[i];
                     // Clear list where resolved offsets are stored
                     swmSeed.exranges++;
                     swmSeed.exrows += width;
@@ -153,9 +162,9 @@ bool SwDriver::eeSaTups(
                         firstEe = false;
                     }
                     // We have to be careful not to allocate excessive amounts of memory here
-                    TSlice o(salistEe_, (uint32_t)salistEe_.size(), width);
-                    for(size_t i = 0; i < width; i++) {
-                        if(!salistEe_.add(pool_, 0xffffffff)) {
+                    TSlice o(salistEe_, (TIndexOffU)salistEe_.size(), width);
+                    for(TIndexOffU i = 0; i < width; i++) {
+                        if(!salistEe_.add(pool_, OFF_MASK)) {
                             swmSeed.exooms++;
                             return false;
                         }
@@ -163,7 +172,7 @@ bool SwDriver::eeSaTups(
                     assert(!done);
                     eehits_.push_back(hit);
                     satpos_.expand();
-                    satpos_.back().sat.init(SAKey(), top, 0xffffffff, o);
+                    satpos_.back().sat.init(SAKey(), top, OFF_MASK, o);
                     satpos_.back().sat.key.seq = MAX_U64;
                     satpos_.back().sat.key.len = (uint32_t)rd.length();
                     satpos_.back().pos.init(fw, 0, 0, (uint32_t)rd.length());
@@ -199,13 +208,17 @@ bool SwDriver::eeSaTups(
 			assert(hit.repOk(rd));
 			assert(!hit.empty());
             // Possibly adjust bot and width if we would have exceeded maxelt
-            uint32_t tops[2] = { hit.top, 0 };
-            uint32_t bots[2] = { hit.bot, 0 };
-            uint32_t width = hit.bot - hit.top;
+            TIndexOffU tops[2] = { hit.top, 0 };
+            TIndexOffU bots[2] = { hit.bot, 0 };
+            TIndexOffU width = hit.bot - hit.top;
             if(nelt_out + width > maxelt) {
-                uint32_t trim = (uint32_t)((nelt_out + width) - maxelt);
-                uint32_t rn = rnd.nextU32() % width;
-                uint32_t newwidth = width - trim;
+                TIndexOffU trim = (TIndexOffU)((nelt_out + width) - maxelt);
+#ifdef BOWTIE_64BIT_INDEX
+                TIndexOffU rn = rnd.nextU64() % width;
+#else
+                TIndexOffU rn = rnd.nextU32() % width; 
+#endif
+                TIndexOffU newwidth = width - trim;
                 if(hit.top + rn + newwidth > hit.bot) {
                     // Two pieces
                     tops[0] = hit.top + rn;
@@ -225,8 +238,8 @@ bool SwDriver::eeSaTups(
             }
             for(int i = 0; i < 2 && !done; i++) {
                 if(bots[i] <= tops[i]) break;
-                uint32_t width = bots[i] - tops[i];
-                uint32_t top = tops[i];
+                TIndexOffU width = bots[i] - tops[i];
+                TIndexOffU top = tops[i];
                 // Clear list where resolved offsets are stored
                 swmSeed.mm1ranges++;
                 swmSeed.mm1rows += width;
@@ -239,16 +252,16 @@ bool SwDriver::eeSaTups(
                     pool_.clear();
                     firstEe = false;
                 }
-                TSlice o(salistEe_, (uint32_t)salistEe_.size(), width);
+                TSlice o(salistEe_, (TIndexOffU)salistEe_.size(), width);
                 for(size_t i = 0; i < width; i++) {
-                    if(!salistEe_.add(pool_, 0xffffffff)) {
+                    if(!salistEe_.add(pool_, OFF_MASK)) {
                         swmSeed.mm1ooms++;
                         return false;
                     }
                 }
                 eehits_.push_back(hit);
                 satpos_.expand();
-                satpos_.back().sat.init(SAKey(), top, 0xffffffff, o);
+                satpos_.back().sat.init(SAKey(), top, OFF_MASK, o);
                 satpos_.back().sat.key.seq = MAX_U64;
                 satpos_.back().sat.key.len = (uint32_t)rd.length();
                 satpos_.back().pos.init(hit.fw, 0, 0, (uint32_t)rd.length());
@@ -287,10 +300,10 @@ void SwDriver::extend(
 	const Read& rd,       // read
 	const Ebwt& ebwtFw,   // Forward Bowtie index
 	const Ebwt* ebwtBw,   // Backward Bowtie index
-	uint32_t topf,        // top in fw index
-	uint32_t botf,        // bot in fw index
-	uint32_t topb,        // top in bw index
-	uint32_t botb,        // bot in bw index
+	TIndexOffU topf,        // top in fw index
+	TIndexOffU botf,        // bot in fw index
+	TIndexOffU topb,        // top in bw index
+	TIndexOffU botb,        // bot in bw index
 	bool fw,              // seed orientation
 	size_t off,           // seed offset from 5' end
 	size_t len,           // seed length
@@ -298,8 +311,8 @@ void SwDriver::extend(
 	size_t& nlex,         // # positions we can extend to left w/o edit
 	size_t& nrex)         // # positions we can extend to right w/o edit
 {
-	uint32_t t[4], b[4];
-	uint32_t tp[4], bp[4];
+	TIndexOffU t[4], b[4];
+	TIndexOffU tp[4], bp[4];
 	SideLocus tloc, bloc;
 	size_t rdlen = rd.length();
 	size_t lim = fw ? off : rdlen - len - off;
@@ -327,7 +340,7 @@ void SwDriver::extend(
 		// Extend left using forward index
 		const BTDnaString& seq = fw ? rd.patFw : rd.patRc;
 		// See what we get by extending 
-		uint32_t top = topf, bot = botf;
+		TIndexOffU top = topf, bot = botf;
 		t[0] = t[1] = t[2] = t[3] = 0;
 		b[0] = b[1] = b[2] = b[3] = 0;
 		tp[0] = tp[1] = tp[2] = tp[3] = topb;
@@ -393,7 +406,7 @@ void SwDriver::extend(
 		// Extend right using backward index
 		const BTDnaString& seq = fw ? rd.patFw : rd.patRc;
 		// See what we get by extending 
-		uint32_t top = topb, bot = botb;
+		TIndexOffU top = topb, bot = botb;
 		t[0] = t[1] = t[2] = t[3] = 0;
 		b[0] = b[1] = b[2] = b[3] = 0;
 		tp[0] = tp[1] = tp[2] = tp[3] = topf;
@@ -565,9 +578,9 @@ void SwDriver::prioritizeSATups(
 					ebwtFw,
 					ebwtBw,
 					satpos.back().sat.topf,
-					(uint32_t)(satpos.back().sat.topf + sz),
+					(TIndexOffU)(satpos.back().sat.topf + sz),
 					satpos.back().sat.topb,
-					(uint32_t)(satpos.back().sat.topb + sz),
+					(TIndexOffU)(satpos.back().sat.topb + sz),
 					fw,
 					rdoff,
 					seedlen,
@@ -688,7 +701,7 @@ void SwDriver::prioritizeSATups(
 		}
 		assert(!rands2_[ri].done());
 		// Choose an element from the range
-		uint32_t r = rands2_[ri].next(rnd);
+		size_t r = rands2_[ri].next(rnd);
 		if(rands2_[ri].done()) {
 			// Tell the row sampler this range is done
 			rowsamp_.finishedRange(ri - nsmall);
@@ -697,7 +710,7 @@ void SwDriver::prioritizeSATups(
 		SATuple sat;
 		TSlice o;
 		o.init(satpos2_[ri].sat.offs, r, r+1);
-		sat.init(satpos2_[ri].sat.key, satpos2_[ri].sat.topf + r, 0xffffffff, o);
+		sat.init(satpos2_[ri].sat.key, (TIndexOffU)(satpos2_[ri].sat.topf + r), OFF_MASK, o);
 		satpos_.expand();
 		satpos_.back().sat = sat;
 		satpos_.back().origSz = satpos2_[ri].origSz;
@@ -777,7 +790,6 @@ int SwDriver::extendSeeds(
 	bool& exhaustive)            // set to true iff we searched all seeds exhaustively
 {
 	bool all = msink->allHits();
-	typedef std::pair<uint32_t, uint32_t> U32Pair;
 
 	assert(!reportImmediately || msink != NULL);
 	assert(!reportImmediately || !msink->maxed());
@@ -910,20 +922,20 @@ int SwDriver::extendSeeds(
 				first = false;
 				// Resolve next element offset
 				WalkResult wr;
-				uint32_t elt = rands_[i].next(rnd);
+				size_t elt = rands_[i].next(rnd);
 				//cerr << "elt=" << elt << endl;
 				SARangeWithOffs<TSlice> sa;
 				sa.topf = satpos_[i].sat.topf;
 				sa.len = satpos_[i].sat.key.len;
 				sa.offs = satpos_[i].sat.offs;
-				gws_[i].advanceElement(elt, ebwtFw, ref, sa, gwstate_, wr, wlm, prm);
+				gws_[i].advanceElement((TIndexOffU)elt, ebwtFw, ref, sa, gwstate_, wr, wlm, prm);
 				eltsDone++;
 				if(!eeMode) {
 					assert_gt(neltLeft, 0);
 					neltLeft--;
 				}
-				assert_neq(0xffffffff, wr.toff);
-				uint32_t tidx = 0, toff = 0, tlen = 0;
+				assert_neq(OFF_MASK, wr.toff);
+				TIndexOffU tidx = 0, toff = 0, tlen = 0;
 				bool straddled = false;
 				ebwtFw.joinedToTextOff(
 					wr.elt.len,
@@ -933,7 +945,7 @@ int SwDriver::extendSeeds(
 					tlen,
 					eeMode,     // reject straddlers?
 					straddled); // did it straddle?
-				if(tidx == 0xffffffff) {
+				if(tidx == OFF_MASK) {
 					// The seed hit straddled a reference boundary so the seed hit
 					// isn't valid
 					continue;
@@ -1078,7 +1090,7 @@ int SwDriver::extendSeeds(
 				}
 				int64_t leftShift = refoff - rect.refl;
 				size_t nwindow = 0;
-				if(toff >= rect.refl) {
+				if((int64_t)toff >= rect.refl) {
 					nwindow = (size_t)(toff - rect.refl);
 				}
 				// NOTE: We might be taking off more than we should because the
@@ -1122,7 +1134,7 @@ int SwDriver::extendSeeds(
 					// Now fill the dynamic programming matrix and return true iff
 					// there is at least one valid alignment
 					TAlScore bestCell = std::numeric_limits<TAlScore>::min();
-					found = swa.align(rnd, bestCell);
+					found = swa.align(bestCell);
 					swmSeed.tallyGappedDp(readGaps, refGaps);
 					prm.nExDps++;
 					if(!found) {
@@ -1419,7 +1431,6 @@ int SwDriver::extendSeedsPaired(
 	bool& exhaustive)
 {
 	bool all = msink->allHits();
-	typedef std::pair<uint32_t, uint32_t> U32Pair;
 
 	assert(!reportImmediately || msink != NULL);
 	assert(!reportImmediately || !msink->maxed());
@@ -1611,17 +1622,17 @@ int SwDriver::extendSeedsPaired(
 				assert(!gws_[i].done());
 				// Resolve next element offset
 				WalkResult wr;
-				uint32_t elt = rands_[i].next(rnd);
+				size_t elt = rands_[i].next(rnd);
 				SARangeWithOffs<TSlice> sa;
 				sa.topf = satpos_[i].sat.topf;
 				sa.len = satpos_[i].sat.key.len;
 				sa.offs = satpos_[i].sat.offs;
-				gws_[i].advanceElement(elt, ebwtFw, ref, sa, gwstate_, wr, wlm, prm);
+				gws_[i].advanceElement((TIndexOffU)elt, ebwtFw, ref, sa, gwstate_, wr, wlm, prm);
 				eltsDone++;
 				assert_gt(neltLeft, 0);
 				neltLeft--;
-				assert_neq(0xffffffff, wr.toff);
-				uint32_t tidx = 0, toff = 0, tlen = 0;
+				assert_neq(OFF_MASK, wr.toff);
+				TIndexOffU tidx = 0, toff = 0, tlen = 0;
 				bool straddled = false;
 				ebwtFw.joinedToTextOff(
 					wr.elt.len,
@@ -1631,7 +1642,7 @@ int SwDriver::extendSeedsPaired(
 					tlen,
 					eeMode,       // reject straddlers?
 					straddled);   // straddled?
-				if(tidx == 0xffffffff) {
+				if(tidx == OFF_MASK) {
 					// The seed hit straddled a reference boundary so the seed hit
 					// isn't valid
 					continue;
@@ -1771,7 +1782,7 @@ int SwDriver::extendSeedsPaired(
 				}
 				int64_t leftShift = refoff - rect.refl;
 				size_t nwindow = 0;
-				if(toff >= rect.refl) {
+				if((int64_t)toff >= rect.refl) {
 					nwindow = (size_t)(toff - rect.refl);
 				}
 				// NOTE: We might be taking off more than we should because the
@@ -1815,7 +1826,7 @@ int SwDriver::extendSeedsPaired(
 					// Now fill the dynamic programming matrix and return true iff
 					// there is at least one valid alignment
 					TAlScore bestCell = std::numeric_limits<TAlScore>::min();
-					found = swa.align(rnd, bestCell);
+					found = swa.align(bestCell);
 					swmSeed.tallyGappedDp(readGaps, refGaps);
 					prm.nExDps++;
 					prm.nDpFail++;    // failed until proven successful
@@ -2060,7 +2071,7 @@ int SwDriver::extendSeedsPaired(
 							// Now fill the dynamic programming matrix, return true
 							// iff there is at least one valid alignment
 							TAlScore bestCell = std::numeric_limits<TAlScore>::min();
-							foundMate = oswa.align(rnd, bestCell);
+							foundMate = oswa.align(bestCell);
 							prm.nMateDps++;
 							swmMate.tallyGappedDp(oreadGaps, orefGaps);
 							if(!foundMate) {
diff --git a/aligner_sw_driver.h b/aligner_sw_driver.h
index ad9438b..38ea5a9 100644
--- a/aligner_sw_driver.h
+++ b/aligner_sw_driver.h
@@ -293,7 +293,7 @@ struct ExtendRange {
 
 class SwDriver {
 
-	typedef PList<uint32_t, CACHE_PAGE_SZ> TSAList;
+	typedef PList<TIndexOffU, CACHE_PAGE_SZ> TSAList;
 
 public:
 
@@ -458,10 +458,10 @@ protected:
 		const Read& rd,       // read
 		const Ebwt& ebwtFw,   // Forward Bowtie index
 		const Ebwt* ebwtBw,   // Backward Bowtie index
-		uint32_t topf,        // top in fw index
-		uint32_t botf,        // bot in fw index
-		uint32_t topb,        // top in bw index
-		uint32_t botb,        // bot in bw index
+		TIndexOffU topf,        // top in fw index
+		TIndexOffU botf,        // bot in fw index
+		TIndexOffU topb,        // top in bw index
+		TIndexOffU botb,        // bot in bw index
 		bool fw,              // seed orientation
 		size_t off,           // seed offset from 5' end
 		size_t len,           // seed length
diff --git a/aln_sink.cpp b/aln_sink.cpp
index 13183fe..d3afd9f 100644
--- a/aln_sink.cpp
+++ b/aln_sink.cpp
@@ -653,7 +653,6 @@ void AlnSinkWrap::finishRead(
 	bool               lenfilt2,    // mate 2 length-filtered?
 	bool               qcfilt1,     // mate 1 qc-filtered?
 	bool               qcfilt2,     // mate 2 qc-filtered?
-	bool               sortByScore, // prioritize alignments by score
 	RandomSource&      rnd,         // pseudo-random generator
 	ReportingMetrics&  met,         // reporting metrics
 	const PerReadMetrics& prm,      // per-read metrics
@@ -715,15 +714,17 @@ void AlnSinkWrap::finishRead(
 			AlnSetSumm concordSumm(
 				rd1_, rd2_, &rs1_, &rs2_, &rs1u_, &rs2u_,
 				exhaust1, exhaust2, -1, -1);
-			// Possibly select a random subset
-			size_t off;
-			if(sortByScore) {
-				// Sort by score then pick from low to high
-				off = selectByScore(&rs1_, &rs2_, nconcord, select1_, rnd);
-			} else {
-				// Select subset randomly
-				off = selectAlnsToReport(rs1_, nconcord, select1_, rnd);
-			}
+			// Sort by score then pick from low to high
+			AlnScore bestUnchosen1, bestUnchosen2, bestUnchosenC;
+			size_t off = selectByScore(
+				&rs1_, &rs2_,
+				nconcord, select1_,
+				&rs1u_, &rs2u_,
+				bestUnchosen1, bestUnchosen2, bestUnchosenC,
+				rnd);
+			concordSumm.setUnchosen(bestUnchosen1, bestUnchosen2, bestUnchosenC);
+			assert(concordSumm.best(true).valid());
+			assert(concordSumm.best(false).valid());
 			assert_lt(off, rs1_.size());
 			const AlnRes *rs1 = &rs1_[off];
 			const AlnRes *rs2 = &rs2_[off];
@@ -784,7 +785,8 @@ void AlnSinkWrap::finishRead(
 				&flags2,
 				prm,
 				mapq_,
-				sc);
+				sc,
+				false);
 			if(pairMax) {
 				met.nconcord_rep++;
 			} else {
@@ -846,14 +848,13 @@ void AlnSinkWrap::finishRead(
 				assert(rs1_[i].isFraglenSet() == rs2_[i].isFraglenSet());
 				assert(!rs1_[i].isFraglenSet() || abs(rs1_[i].fragmentLength()) == abs(rs2_[i].fragmentLength()));
 			}
-			ASSERT_ONLY(size_t off);
-			if(sortByScore) {
-				// Sort by score then pick from low to high
-				ASSERT_ONLY(off =) selectByScore(&rs1_, &rs2_, ndiscord, select1_, rnd);
-			} else {
-				// Select subset randomly
-				ASSERT_ONLY(off =) selectAlnsToReport(rs1_, ndiscord, select1_, rnd);
-			}
+			AlnScore bestUnchosen1, bestUnchosen2, bestUnchosenC;
+			ASSERT_ONLY(size_t off =) selectByScore(
+				&rs1_, &rs2_,
+				ndiscord, select1_,
+				&rs1u_, &rs2u_,
+				bestUnchosen1, bestUnchosen2, bestUnchosenC,
+				rnd);
 			assert_eq(0, off);
 			assert(!select1_.empty());
 			g_.reportHits(
@@ -875,7 +876,8 @@ void AlnSinkWrap::finishRead(
 				&flags2,
 				prm,
 				mapq_,
-				sc);
+				sc,
+				false);
 			met.nconcord_0++;
 			met.ndiscord++;
 			init_ = false;
@@ -981,14 +983,10 @@ void AlnSinkWrap::finishRead(
 			summ1.init(
 				rd1_, NULL, NULL, NULL, &rs1u_, NULL,
 				exhaust1, exhaust2, -1, -1);
-			size_t off;
-			if(sortByScore) {
-				// Sort by score then pick from low to high
-				off = selectByScore(&rs1u_, NULL, nunpair1, select1_, rnd);
-			} else {
-				// Select subset randomly
-				off = selectAlnsToReport(rs1u_, nunpair1, select1_, rnd);
-			}
+			// Sort by score then pick from low to high
+			AlnScore tmp;
+			size_t off = selectByScore(
+				&rs1u_, NULL, nunpair1, select1_, NULL, NULL, tmp, tmp, tmp, rnd);
 			repRs1 = &rs1u_[off];
 		} else if(rd1_ != NULL) {
 			// Mate 1 failed to align - don't do anything yet.  First we want
@@ -1001,14 +999,10 @@ void AlnSinkWrap::finishRead(
 			summ2.init(
 				NULL, rd2_, NULL, NULL, NULL, &rs2u_,
 				exhaust1, exhaust2, -1, -1);
-			size_t off;
-			if(sortByScore) {
-				// Sort by score then pick from low to high
-				off = selectByScore(&rs2u_, NULL, nunpair2, select2_, rnd);
-			} else {
-				// Select subset randomly
-				off = selectAlnsToReport(rs2u_, nunpair2, select2_, rnd);
-			}
+			// Sort by score then pick from low to high
+			AlnScore tmp;
+			size_t off = selectByScore(
+				&rs2u_, NULL, nunpair2, select2_, NULL, NULL, tmp, tmp, tmp, rnd);
 			repRs2 = &rs2u_[off];
 		} else if(rd2_ != NULL) {
 			// Mate 2 failed to align - don't do anything yet.  First we want
@@ -1088,14 +1082,16 @@ void AlnSinkWrap::finishRead(
 				repRs2 != NULL ? &flags2 : NULL,
 				prm,
 				mapq_,
-				sc);
+				sc,
+				false);
 			assert_lt(select1_[0], rs1u_.size());
 			refid = rs1u_[select1_[0]].refid();
 			refoff = rs1u_[select1_[0]].refoff();
 		}
 		
 		// Now report mate 2
-		if(rep2 && !rep1) {
+		//if(rep2 && !rep1) {
+		if(rep2) {
 			SeedAlSumm ssm1, ssm2;
 			if(sr1 != NULL) sr1->toSeedAlSumm(ssm1);
 			if(sr2 != NULL) sr2->toSeedAlSumm(ssm2);
@@ -1119,7 +1115,8 @@ void AlnSinkWrap::finishRead(
 				repRs1 != NULL ? &flags1 : NULL,
 				prm,
 				mapq_,
-				sc);
+				sc,
+				false);
 			assert_lt(select2_[0], rs2u_.size());
 			refid = rs2u_[select2_[0]].refid();
 			refoff = rs2u_[select2_[0]].refoff();
@@ -1318,6 +1315,11 @@ size_t AlnSinkWrap::selectByScore(
 	const EList<AlnRes>* rs2,    // alignments to select from (mate 2, or NULL)
 	uint64_t             num,    // number of alignments to select
 	EList<size_t>&       select, // prioritized list to put results in
+	const EList<AlnRes>* rs1u,   // unpaired mate-1 alignments, scanned for bestUnchosen1 (may be NULL when rs2 is NULL)
+	const EList<AlnRes>* rs2u,   // unpaired mate-2 alignments, scanned for bestUnchosen2 (may be NULL when rs2 is NULL)
+	AlnScore&            bestUnchosen1,
+	AlnScore&            bestUnchosen2,
+	AlnScore&            bestUnchosenC,
 	RandomSource&        rnd)
 	const
 {
@@ -1325,25 +1327,31 @@ size_t AlnSinkWrap::selectByScore(
 	assert(repOk());
 	assert_gt(num, 0);
 	assert(rs1 != NULL);
+	
+	if(rs2 != NULL) {
+		assert(rs1u != NULL);
+		assert(rs2u != NULL);
+	}
+	
 	size_t sz = rs1->size(); // sz = # alignments found
 	assert_leq(num, sz);
 	if(sz < num) {
 		num = sz;
 	}
 	// num = # to select
-	if(sz < 1) {
+	if(sz == 0) {
 		return 0;
 	}
 	select.resize((size_t)num);
 	// Use 'selectBuf_' as a temporary list for sorting purposes
-	EList<std::pair<TAlScore, size_t> >& buf =
-		const_cast<EList<std::pair<TAlScore, size_t> >& >(selectBuf_);
+	EList<std::pair<AlnScore, size_t> >& buf =
+		const_cast<EList<std::pair<AlnScore, size_t> >& >(selectBuf_);
 	buf.resize(sz);
 	// Sort by score.  If reads are pairs, sort by sum of mate scores.
 	for(size_t i = 0; i < sz; i++) {
-		buf[i].first = (*rs1)[i].score().score();
+		buf[i].first = (*rs1)[i].score();
 		if(rs2 != NULL) {
-			buf[i].first += (*rs2)[i].score().score();
+			buf[i].first += (*rs2)[i].score();
 		}
 		buf[i].second = i; // original offset
 	}
@@ -1368,6 +1376,29 @@ size_t AlnSinkWrap::selectByScore(
 	}
 	
 	for(size_t i = 0; i < num; i++) { select[i] = buf[i].second; }
+	
+	if(rs2 != NULL) {
+		for(size_t i = 0; i < rs1u->size(); i++) {
+			if((*rs1u)[i].refcoord() == (*rs1)[select[0]].refcoord()) {
+				continue;
+			}
+			if((*rs1u)[i].score() > bestUnchosen1) {
+				bestUnchosen1 = (*rs1u)[i].score();
+			}
+		}
+		for(size_t i = 0; i < rs2u->size(); i++) {
+			if((*rs2u)[i].refcoord() == (*rs2)[select[0]].refcoord()) {
+				continue;
+			}
+			if((*rs2u)[i].score() > bestUnchosen2) {
+				bestUnchosen2 = (*rs2u)[i].score();
+			}
+		}
+		if(buf.size() > 1) {
+			bestUnchosenC = buf[1].first;
+		}
+	}
+	
 	// Returns index of the representative alignment, but in 'select' also
 	// returns the indexes of the next best selected alignments in order by
 	// score.
@@ -1838,6 +1869,7 @@ void AlnSinkSam::appendMate(
 			o,           // output buffer
 			true,        // first opt flag printed is first overall?
 			rd,          // read
+			rdo,         // opposite read
 			*rs,         // individual alignment result
 			staln,       // stacked alignment
 			flags,       // alignment flags
diff --git a/aln_sink.h b/aln_sink.h
index 3970bb9..4bdfd78 100644
--- a/aln_sink.h
+++ b/aln_sink.h
@@ -670,6 +670,7 @@ public:
 		const PerReadMetrics& prm,            // per-read metrics
 		const Mapq&           mapq,           // MAPQ generator
 		const Scoring&        sc,             // scoring scheme
+		bool                  reportBoth,
 		bool                  getLock = true) // true iff lock held by caller
 	{
 		// There are a few scenarios:
@@ -694,7 +695,7 @@ public:
 			flagscp2.setPrimary(true);
 		}
 		if(select2 != NULL) {
-			// Handle case 5
+			// Handle case 4
 			assert(rd1 != NULL); assert(flags1 != NULL);
 			assert(rd2 != NULL); assert(flags2 != NULL);
 			assert_gt(select1.size(), 0);
@@ -702,7 +703,7 @@ public:
 			AlnRes* r1pri = ((rs1 != NULL) ? &rs1->get(select1[0]) : NULL);
 			AlnRes* r2pri = ((rs2 != NULL) ? &rs2->get((*select2)[0]) : NULL);
 			append(o, staln, threadId, rd1, rd2, rdid, r1pri, r2pri, summ,
-			       ssm1, ssm2, flags1, flags2, prm, mapq, sc, true);
+			       ssm1, ssm2, flags1, flags2, prm, mapq, sc, false);
 			flagscp1.setPrimary(false);
 			flagscp2.setPrimary(false);
 			for(size_t i = 1; i < select1.size(); i++) {
@@ -710,13 +711,15 @@ public:
 				append(o, staln, threadId, rd1, rd2, rdid, r1, r2pri, summ,
 				       ssm1, ssm2, flags1, flags2, prm, mapq, sc, false);
 			}
-			for(size_t i = 1; i < select2->size(); i++) {
-				AlnRes* r2 = ((rs2 != NULL) ? &rs2->get((*select2)[i]) : NULL);
-				append(o, staln, threadId, rd2, rd1, rdid, r2, r1pri, summ,
-				       ssm2, ssm1, flags2, flags1, prm, mapq, sc, false);
+			if(reportBoth) {
+				for(size_t i = 1; i < select2->size(); i++) {
+					AlnRes* r2 = ((rs2 != NULL) ? &rs2->get((*select2)[i]) : NULL);
+					append(o, staln, threadId, rd2, rd1, rdid, r2, r1pri, summ,
+						   ssm2, ssm1, flags2, flags1, prm, mapq, sc, false);
+				}
 			}
 		} else {
-			// Handle cases 1-4
+			// Handle cases 1-3 and 5
 			for(size_t i = 0; i < select1.size(); i++) {
 				AlnRes* r1 = ((rs1 != NULL) ? &rs1->get(select1[i]) : NULL);
 				AlnRes* r2 = ((rs2 != NULL) ? &rs2->get(select1[i]) : NULL);
@@ -1029,7 +1032,6 @@ public:
 		bool               lenfilt2,    // mate 2 length-filtered?
 		bool               qcfilt1,     // mate 1 qc-filtered?
 		bool               qcfilt2,     // mate 2 qc-filtered?
-		bool               sortByScore, // prioritize alignments by score
 		RandomSource&      rnd,         // pseudo-random generator
 		ReportingMetrics&  met,         // reporting metrics
 		const PerReadMetrics& prm,      // per-read metrics
@@ -1236,6 +1238,11 @@ protected:
 		const EList<AlnRes>* rs2,    // alignments to select from (mate 2, or NULL)
 		uint64_t             num,    // number of alignments to select
 		EList<size_t>&       select, // prioritized list to put results in
+		const EList<AlnRes>* rs1u,   // unpaired mate-1 alignments, scanned for bestUnchosen1 (may be NULL when rs2 is NULL)
+		const EList<AlnRes>* rs2u,   // unpaired mate-2 alignments, scanned for bestUnchosen2 (may be NULL when rs2 is NULL)
+		AlnScore&            bestUnchosen1,
+		AlnScore&            bestUnchosen2,
+		AlnScore&            bestUnchosenC,
 		RandomSource&        rnd)
 		const;
 
@@ -1264,7 +1271,7 @@ protected:
 	EList<size_t>   select2_; // parallel to rs1_/rs2_ - which to report
 	ReportingState  st_;      // reporting state - what's left to do?
 	
-	EList<std::pair<TAlScore, size_t> > selectBuf_;
+	EList<std::pair<AlnScore, size_t> > selectBuf_;
 	BTString obuf_;
 	StackedAln staln_;
 };
diff --git a/binary_sa_search.h b/binary_sa_search.h
index 25ceea0..4bb6eb7 100644
--- a/binary_sa_search.h
+++ b/binary_sa_search.h
@@ -22,9 +22,11 @@
 
 #include <stdint.h>
 #include <iostream>
+#include <limits>
 #include "alphabet.h"
 #include "assert_helpers.h"
 #include "ds.h"
+#include "btypes.h"
 
 /**
  * Do a binary search using the suffix of 'host' beginning at offset
@@ -39,30 +41,34 @@
  * hand sides and using the min of the two as a way of skipping over
  * characters at the beginning of a new round.
  *
- * Returns 0xffffffff if the query suffix matches an element of sa.
+ * Returns std::numeric_limits<TIndexOffU>::max() if the query suffix matches an element of sa.
  */
 template<typename TStr, typename TSufElt> inline
-uint32_t binarySASearch(
+TIndexOffU binarySASearch(
 	const TStr& host,
-	uint32_t qry,
+	TIndexOffU qry,
 	const EList<TSufElt>& sa)
 {
-	uint32_t lLcp = 0, rLcp = 0; // greatest observed LCPs on left and right
-	uint32_t l = 0, r = (uint32_t)sa.size()+1; // binary-search window
-	uint32_t hostLen = (uint32_t)host.length();
+	TIndexOffU lLcp = 0, rLcp = 0; // greatest observed LCPs on left and right
+	TIndexOffU l = 0, r = (TIndexOffU)sa.size()+1; // binary-search window
+	TIndexOffU hostLen = (TIndexOffU)host.length();
 	while(true) {
 		assert_gt(r, l);
-		uint32_t m = (l+r) >> 1;
+		TIndexOffU m = (l+r) >> 1;
 		if(m == l) {
 			// Binary-search window has closed: we have an answer
-			if(m > 0 && sa[m-1] == qry) return 0xffffffff; // qry matches
+			if(m > 0 && sa[m-1] == qry) {
+				return std::numeric_limits<TIndexOffU>::max(); // qry matches
+			}
 			assert_leq(m, sa.size());
 			return m; // Return index of right-hand suffix
 		}
 		assert_gt(m, 0);
-		uint32_t suf = sa[m-1];
-		if(suf == qry) return 0xffffffff; // query matches an elt of sa
-		uint32_t lcp = min(lLcp, rLcp);
+		TIndexOffU suf = sa[m-1];
+		if(suf == qry) {
+			return std::numeric_limits<TIndexOffU>::max(); // query matches an elt of sa
+		}
+		TIndexOffU lcp = min(lLcp, rLcp);
 #ifndef NDEBUG
 		if(sstr_suf_upto_neq(host, qry, host, suf, lcp)) {
 			assert(0);
@@ -90,7 +96,7 @@ uint32_t binarySASearch(
 	}
 	// Shouldn't get here
 	assert(false);
-	return 0xffffffff;
+	return std::numeric_limits<TIndexOffU>::max();
 }
 
 #endif /*BINARY_SA_SEARCH_H_*/
diff --git a/blockwise_sa.h b/blockwise_sa.h
index d819d23..6402160 100644
--- a/blockwise_sa.h
+++ b/blockwise_sa.h
@@ -65,19 +65,19 @@ template<typename TStr>
 class BlockwiseSA {
 public:
 	BlockwiseSA(const TStr& __text,
-	            uint32_t __bucketSz,
+	            TIndexOffU __bucketSz,
 	            bool __sanityCheck = false,
 	            bool __passMemExc = false,
 	            bool __verbose = false,
 	            ostream& __logger = cout) :
 	_text(__text),
-	_bucketSz(max<uint32_t>(__bucketSz, 2u)),
+	_bucketSz(max<TIndexOffU>(__bucketSz, 2u)),
 	_sanityCheck(__sanityCheck),
 	_passMemExc(__passMemExc),
 	_verbose(__verbose),
 	_itrBucket(EBWTB_CAT),
-	_itrBucketPos(0xffffffff),
-	_itrPushedBackSuffix(0xffffffff),
+	_itrBucketPos(OFF_MASK),
+	_itrPushedBackSuffix(OFF_MASK),
 	_logger(__logger)
 	{ }
 
@@ -86,10 +86,10 @@ public:
 	/**
 	 * Get the next suffix; compute the next bucket if necessary.
 	 */
-	uint32_t nextSuffix() {
-		if(_itrPushedBackSuffix != 0xffffffff) {
-			uint32_t tmp = _itrPushedBackSuffix;
-			_itrPushedBackSuffix = 0xffffffff;
+	TIndexOffU nextSuffix() {
+		if(_itrPushedBackSuffix != OFF_MASK) {
+			TIndexOffU tmp = _itrPushedBackSuffix;
+			_itrPushedBackSuffix = OFF_MASK;
 			return tmp;
 		}
 		while(_itrBucketPos >= _itrBucket.size() ||
@@ -108,11 +108,11 @@ public:
 	 * Return true iff the next call to nextSuffix will succeed.
 	 */
 	bool hasMoreSuffixes() {
-		if(_itrPushedBackSuffix != 0xffffffff) return true;
+		if(_itrPushedBackSuffix != OFF_MASK) return true;
 		try {
 			_itrPushedBackSuffix = nextSuffix();
 		} catch(out_of_range& e) {
-			assert_eq(0xffffffff, _itrPushedBackSuffix);
+			assert_eq(OFF_MASK, _itrPushedBackSuffix);
 			return false;
 		}
 		return true;
@@ -124,8 +124,8 @@ public:
 	 */
 	void resetSuffixItr() {
 		_itrBucket.clear();
-		_itrBucketPos = 0xffffffff;
-		_itrPushedBackSuffix = 0xffffffff;
+		_itrBucketPos = OFF_MASK;
+		_itrPushedBackSuffix = OFF_MASK;
 		reset();
 		assert(suffixItrIsReset());
 	}
@@ -136,13 +136,13 @@ public:
 	 */
 	bool suffixItrIsReset() {
 		return _itrBucket.size()    == 0 &&
-		       _itrBucketPos        == 0xffffffff &&
-		       _itrPushedBackSuffix == 0xffffffff &&
+		       _itrBucketPos        == OFF_MASK &&
+		       _itrPushedBackSuffix == OFF_MASK &&
 		       isReset();
 	}
 
 	const TStr& text()  const { return _text; }
-	uint32_t bucketSz() const { return _bucketSz; }
+	TIndexOffU bucketSz() const { return _bucketSz; }
 	bool sanityCheck()  const { return _sanityCheck; }
 	bool verbose()      const { return _verbose; }
 	ostream& log()      const { return _logger; }
@@ -170,13 +170,13 @@ protected:
 	}
 
 	const TStr&      _text;        /// original string
-	const uint32_t   _bucketSz;    /// target maximum bucket size
+	const TIndexOffU   _bucketSz;    /// target maximum bucket size
 	const bool       _sanityCheck; /// whether to perform sanity checks
 	const bool       _passMemExc;  /// true -> pass on memory exceptions
 	const bool       _verbose;     /// be talkative
-	EList<uint32_t>  _itrBucket;   /// current bucket
-	uint32_t         _itrBucketPos;/// offset into current bucket
-	uint32_t         _itrPushedBackSuffix; /// temporary slot for lookahead
+	EList<TIndexOffU>  _itrBucket;   /// current bucket
+	TIndexOffU         _itrBucketPos;/// offset into current bucket
+	TIndexOffU         _itrPushedBackSuffix; /// temporary slot for lookahead
 	ostream&         _logger;      /// write log messages here
 };
 
@@ -188,7 +188,7 @@ template<typename TStr>
 class InorderBlockwiseSA : public BlockwiseSA<TStr> {
 public:
 	InorderBlockwiseSA(const TStr& __text,
-	                   uint32_t __bucketSz,
+	                   TIndexOffU __bucketSz,
 	                   bool __sanityCheck = false,
 	   	               bool __passMemExc = false,
 	                   bool __verbose = false,
@@ -207,7 +207,7 @@ public:
 	typedef DifferenceCoverSample<TStr> TDC;
 
 	KarkkainenBlockwiseSA(const TStr& __text,
-	                      uint32_t __bucketSz,
+	                      TIndexOffU __bucketSz,
 	                      uint32_t __dcV,
 	                      uint32_t __seed = 0,
 	      	              bool __sanityCheck = false,
@@ -226,12 +226,12 @@ public:
 	 * Throws bad_alloc if it's not going to fit in memory.  Returns
 	 * the approximate number of bytes the Cover takes at all times.
 	 */
-	static size_t simulateAllocs(const TStr& text, uint32_t bucketSz) {
+	static size_t simulateAllocs(const TStr& text, TIndexOffU bucketSz) {
 		size_t len = text.length();
 		// _sampleSuffs and _itrBucket are in memory at the peak
 		size_t bsz = bucketSz;
-		size_t sssz = len / max<uint32_t>(bucketSz-1, 1);
-		AutoArray<uint32_t> tmp(bsz + sssz + (1024 * 1024 /*out of caution*/), EBWT_CAT);
+		size_t sssz = len / max<TIndexOffU>(bucketSz-1, 1);
+		AutoArray<TIndexOffU> tmp(bsz + sssz + (1024 * 1024 /*out of caution*/), EBWT_CAT);
 		return bsz;
 	}
 
@@ -239,7 +239,7 @@ public:
 	virtual void nextBlock();
 
 	/// Defined in blockwise_sa.cpp
-	virtual void qsort(EList<uint32_t>& bucket);
+	virtual void qsort(EList<TIndexOffU>& bucket);
 
 	/// Return true iff more blocks are available
 	virtual bool hasMoreBlocks() const {
@@ -301,25 +301,25 @@ private:
 	 *
 	 * Defined in blockwise_sa.cpp
 	 */
-	inline bool tieBreakingLcp(uint32_t aOff,
-	                           uint32_t bOff,
-	                           uint32_t& lcp,
+	inline bool tieBreakingLcp(TIndexOffU aOff,
+	                           TIndexOffU bOff,
+	                           TIndexOffU& lcp,
 	                           bool& lcpIsSoft);
 
 	/**
 	 * Compare two suffixes using the difference-cover sample.
 	 */
-	inline bool suffixCmp(uint32_t cmp,
-	                      uint32_t i,
+	inline bool suffixCmp(TIndexOffU cmp,
+	                      TIndexOffU i,
 	                      int64_t& j,
 	                      int64_t& k,
 	                      bool& kSoft,
-	                      const EList<uint32_t>& z);
+	                      const EList<TIndexOffU>& z);
 
 	void buildSamples();
 
-	EList<uint32_t>  _sampleSuffs; /// sample suffixes
-	uint32_t         _cur;         /// offset to 1st elt of next block
+	EList<TIndexOffU>  _sampleSuffs; /// sample suffixes
+	TIndexOffU         _cur;         /// offset to 1st elt of next block
 	const uint32_t   _dcV;         /// difference-cover periodicity
 	PtrWrap<TDC>     _dc;          /// queryable difference-cover data
 	bool             _built;       /// whether samples/DC have been built
@@ -330,11 +330,11 @@ private:
  * Qsort the set of suffixes whose offsets are in 'bucket'.
  */
 template<typename TStr>
-inline void KarkkainenBlockwiseSA<TStr>::qsort(EList<uint32_t>& bucket) {
+inline void KarkkainenBlockwiseSA<TStr>::qsort(EList<TIndexOffU>& bucket) {
 	const TStr& t = this->text();
-	uint32_t *s = bucket.ptr();
+	TIndexOffU *s = bucket.ptr();
 	size_t slen = bucket.size();
-	uint32_t len = (uint32_t)t.length();
+	TIndexOffU len = (TIndexOffU)t.length();
 	if(_dc.get() != NULL) {
 		// Use the difference cover as a tie-breaker if we have it
 		VMSG_NL("  (Using difference cover)");
@@ -361,10 +361,10 @@ inline void KarkkainenBlockwiseSA<TStr>::qsort(EList<uint32_t>& bucket) {
  */
 template<>
 inline void KarkkainenBlockwiseSA<S2bDnaString>::qsort(
-	EList<uint32_t>& bucket)
+	EList<TIndexOffU>& bucket)
 {
 	const S2bDnaString& t = this->text();
-	uint32_t *s = bucket.ptr();
+	TIndexOffU *s = bucket.ptr();
 	size_t slen = bucket.size();
 	size_t len = t.length();
 	if(_dc.get() != NULL) {
@@ -392,11 +392,11 @@ inline void KarkkainenBlockwiseSA<S2bDnaString>::qsort(
 template<typename TStr>
 void KarkkainenBlockwiseSA<TStr>::buildSamples() {
 	const TStr& t = this->text();
-	uint32_t bsz = this->bucketSz()-1; // subtract 1 to leave room for sample
+	TIndexOffU bsz = this->bucketSz()-1; // subtract 1 to leave room for sample
 	size_t len = this->text().length();
 	// Prepare _sampleSuffs array
 	_sampleSuffs.clear();
-	uint32_t numSamples = (uint32_t)((len/bsz)+1)<<1; // ~len/bsz x 2
+	TIndexOffU numSamples = (TIndexOffU)((len/bsz)+1)<<1; // ~len/bsz x 2
 	assert_gt(numSamples, 0);
 	VMSG_NL("Reserving space for " << numSamples << " sample suffixes");
 	if(this->_passMemExc) {
@@ -404,7 +404,11 @@ void KarkkainenBlockwiseSA<TStr>::buildSamples() {
 		// Randomly generate samples.  Allow duplicates for now.
 		VMSG_NL("Generating random suffixes");
 		for(size_t i = 0; i < numSamples; i++) {
-			_sampleSuffs[i] = (uint32_t)(_randomSrc.nextU32() % len);
+#ifdef BOWTIE_64BIT_INDEX         
+			_sampleSuffs[i] = (TIndexOffU)(_randomSrc.nextU64() % len); 
+#else
+			_sampleSuffs[i] = (TIndexOffU)(_randomSrc.nextU32() % len); 
+#endif
 		}
 	} else {
 		try {
@@ -412,13 +416,17 @@ void KarkkainenBlockwiseSA<TStr>::buildSamples() {
 			// Randomly generate samples.  Allow duplicates for now.
 			VMSG_NL("Generating random suffixes");
 			for(size_t i = 0; i < numSamples; i++) {
-				_sampleSuffs[i] = (uint32_t)(_randomSrc.nextU32() % len);
+#ifdef BOWTIE_64BIT_INDEX
+				_sampleSuffs[i] = (TIndexOffU)(_randomSrc.nextU64() % len); 
+#else
+				_sampleSuffs[i] = (TIndexOffU)(_randomSrc.nextU32() % len); 
+#endif                
 			}
 		} catch(bad_alloc &e) {
 			if(this->_passMemExc) {
 				throw e; // rethrow immediately
 			} else {
-				cerr << "Could not allocate sample suffix container of " << (numSamples * 4) << " bytes." << endl
+				cerr << "Could not allocate sample suffix container of " << (numSamples * OFF_SIZE) << " bytes." << endl
 				     << "Please try using a smaller number of blocks by specifying a larger --bmax or" << endl
 				     << "a smaller --bmaxdivn" << endl;
 				throw 1;
@@ -455,16 +463,16 @@ void KarkkainenBlockwiseSA<TStr>::buildSamples() {
 	while(--limit >= 0) {
 		// Calculate bucket sizes by doing a binary search for each
 		// suffix and noting where it lands
-		uint32_t numBuckets = (uint32_t)_sampleSuffs.size()+1;
-		EList<uint32_t> bucketSzs(EBWTB_CAT); // holds computed bucket sizes
-		EList<uint32_t> bucketReps(EBWTB_CAT); // holds 1 member of each bucket (for splitting)
+		TIndexOffU numBuckets = (TIndexOffU)_sampleSuffs.size()+1;
+		EList<TIndexOffU> bucketSzs(EBWTB_CAT); // holds computed bucket sizes
+		EList<TIndexOffU> bucketReps(EBWTB_CAT); // holds 1 member of each bucket (for splitting)
 		try {
 			// Allocate and initialize containers for holding bucket
 			// sizes and representatives.
 			bucketSzs.resizeExact(numBuckets);
 			bucketReps.resizeExact(numBuckets);
 			bucketSzs.fillZero();
-			bucketReps.fill(0xffffffff);
+			bucketReps.fill(OFF_MASK);
 		} catch(bad_alloc &e) {
 			if(this->_passMemExc) {
 				throw e; // rethrow immediately
@@ -485,17 +493,17 @@ void KarkkainenBlockwiseSA<TStr>::buildSamples() {
 		{
 			VMSG_NL("  Binary sorting into buckets");
 			Timer timer(cout, "  Binary sorting into buckets time: ", this->verbose());
-			uint32_t lenDiv10 = (uint32_t)((len + 9) / 10);
-			for(uint32_t iten = 0, ten = 0; iten < len; iten += lenDiv10, ten++) {
-				uint32_t itenNext = iten + lenDiv10;
+			TIndexOffU lenDiv10 = (TIndexOffU)((len + 9) / 10);
+			for(TIndexOffU iten = 0, ten = 0; iten < len; iten += lenDiv10, ten++) {
+				TIndexOffU itenNext = iten + lenDiv10;
 				if(ten > 0) VMSG_NL("  " << (ten * 10) << "%");
-				for(uint32_t i = iten; i < itenNext && i < len; i++) {
-					uint32_t r = binarySASearch(t, i, _sampleSuffs);
-					if(r == 0xffffffff) continue; // r was one of the samples
+				for(TIndexOffU i = iten; i < itenNext && i < len; i++) {
+					TIndexOffU r = binarySASearch(t, i, _sampleSuffs);
+					if(r == std::numeric_limits<TIndexOffU>::max()) continue; // r was one of the samples
 					assert_lt(r, numBuckets);
 					bucketSzs[r]++;
 					assert_lt(bucketSzs[r], len);
-					if(bucketReps[r] == 0xffffffff ||
+					if(bucketReps[r] == OFF_MASK ||
 					   (_randomSrc.nextU32() & 100) == 0)
 					{
 						bucketReps[r] = i; // clobbers previous one, but that's OK
@@ -506,17 +514,17 @@ void KarkkainenBlockwiseSA<TStr>::buildSamples() {
 		}
 		// Check for large buckets and mergeable pairs of small buckets
 		// and split/merge as necessary
-		int added = 0;
-		int merged = 0;
+		TIndexOff added = 0;
+		TIndexOff merged = 0;
 		assert_eq(bucketSzs.size(), numBuckets);
 		assert_eq(bucketReps.size(), numBuckets);
 		{
 			Timer timer(cout, "  Splitting and merging time: ", this->verbose());
 			VMSG_NL("Splitting and merging");
-			for(int64_t i = 0; i < numBuckets; i++) {
-				uint32_t mergedSz = bsz + 1;
-				assert(bucketSzs[(size_t)i] == 0 || bucketReps[(size_t)i] != 0xffffffff);
-				if(i < (int64_t)numBuckets-1) {
+			for(TIndexOffU i = 0; i < numBuckets; i++) {
+				TIndexOffU mergedSz = bsz + 1;
+				assert(bucketSzs[(size_t)i] == 0 || bucketReps[(size_t)i] != OFF_MASK);
+				if(i < numBuckets-1) {
 					mergedSz = bucketSzs[(size_t)i] + bucketSzs[(size_t)i+1] + 1;
 				}
 				// Merge?
@@ -539,7 +547,7 @@ void KarkkainenBlockwiseSA<TStr>::buildSamples() {
 					// Add an additional sample from the bucketReps[]
 					// set accumulated in the binarySASearch loop; this
 					// effectively splits the bucket
-					_sampleSuffs.insert(bucketReps[(size_t)i], (uint32_t)(i + (added++)));
+					_sampleSuffs.insert(bucketReps[(size_t)i], (TIndexOffU)(i + (added++)));
 				}
 			}
 		}
@@ -569,8 +577,8 @@ void KarkkainenBlockwiseSA<TStr>::buildSamples() {
  * Do a simple LCP calculation on two strings.
  */
 template<typename T> inline
-static uint32_t suffixLcp(const T& t, uint32_t aOff, uint32_t bOff) {
-	uint32_t c = 0;
+static TIndexOffU suffixLcp(const T& t, TIndexOffU aOff, TIndexOffU bOff) {
+	TIndexOffU c = 0;
 	size_t len = t.length();
 	assert_leq(aOff, len);
 	assert_leq(bOff, len);
@@ -585,14 +593,14 @@ static uint32_t suffixLcp(const T& t, uint32_t aOff, uint32_t bOff) {
  * employed, lcpIsSoft will be set to true (otherwise, false).
  */
 template<typename TStr> inline
-bool KarkkainenBlockwiseSA<TStr>::tieBreakingLcp(uint32_t aOff,
-                                                 uint32_t bOff,
-                                                 uint32_t& lcp,
+bool KarkkainenBlockwiseSA<TStr>::tieBreakingLcp(TIndexOffU aOff,
+                                                 TIndexOffU bOff,
+                                                 TIndexOffU& lcp,
                                                  bool& lcpIsSoft)
 {
 	const TStr& t = this->text();
-	uint32_t c = 0;
-	uint32_t tlen = (uint32_t)t.length();
+	TIndexOffU c = 0;
+	TIndexOffU tlen = (TIndexOffU)t.length();
 	assert_leq(aOff, tlen);
 	assert_leq(bOff, tlen);
 	assert(_dc.get() != NULL);
@@ -626,14 +634,14 @@ bool KarkkainenBlockwiseSA<TStr>::tieBreakingLcp(uint32_t aOff,
  * filled in then calculate it from scratch.
  */
 template<typename T>
-static uint32_t lookupSuffixZ(
+static TIndexOffU lookupSuffixZ(
 	const T& t,
-	uint32_t zOff,
-	uint32_t off,
-	const EList<uint32_t>& z)
+	TIndexOffU zOff,
+	TIndexOffU off,
+	const EList<TIndexOffU>& z)
 {
 	if(zOff < z.size()) {
-		uint32_t ret = z[zOff];
+		TIndexOffU ret = z[zOff];
 		assert_eq(ret, suffixLcp(t, off + zOff, off));
 		return ret;
 	}
@@ -647,18 +655,18 @@ static uint32_t lookupSuffixZ(
  */
 template<typename TStr> inline
 bool KarkkainenBlockwiseSA<TStr>::suffixCmp(
-	uint32_t cmp,
-	uint32_t i,
+	TIndexOffU cmp,
+	TIndexOffU i,
 	int64_t& j,
 	int64_t& k,
 	bool& kSoft,
-	const EList<uint32_t>& z)
+	const EList<TIndexOffU>& z)
 {
 	const TStr& t = this->text();
-	uint32_t len = (uint32_t)t.length();
+	TIndexOffU len = (TIndexOffU)t.length();
 	// i is not covered by any previous match
-	uint32_t l;
-	if(i > k) {
+	TIndexOffU l;
+	if((int64_t)i > k) {
 		k = i; // so that i + lHi == kHi
 		l = 0; // erase any previous l
 		kSoft = false;
@@ -667,7 +675,7 @@ bool KarkkainenBlockwiseSA<TStr>::suffixCmp(
 	// i is covered by a previous match
 	else /* i <= k */ {
 		assert_gt((int64_t)i, j);
-		uint32_t zIdx = (uint32_t)(i-j);
+		TIndexOffU zIdx = (TIndexOffU)(i-j);
 		assert_leq(zIdx, len-cmp);
 		if(zIdx < _dcV || _dc.get() == NULL) {
 			// Go as far as the Z-box says
@@ -697,9 +705,9 @@ bool KarkkainenBlockwiseSA<TStr>::suffixCmp(
 
 	// Z box extends exactly as far as previous match (or there
 	// is neither a Z box nor a previous match)
-	if(i + l == k) {
+	if((int64_t)(i + l) == k) {
 		// Extend
-		while(l < len-cmp && k < len && t[(size_t)(cmp+l)] == t[(size_t)k]) {
+		while(l < len-cmp && k < (int64_t)len && t[(size_t)(cmp+l)] == t[(size_t)k]) {
 			k++; l++;
 		}
 		j = i; // update furthest-extending LHS
@@ -707,11 +715,11 @@ bool KarkkainenBlockwiseSA<TStr>::suffixCmp(
 		assert_eq(l, suffixLcp(t, i, cmp));
 	}
 	// Z box extends further than previous match
-	else if(i + l > k) {
-		l = (uint32_t)(k - i); // point to just after previous match
+	else if((int64_t)(i + l) > k) {
+		l = (TIndexOffU)(k - i); // point to just after previous match
 		j = i; // update furthest-extending LHS
 		if(kSoft) {
-			while(l < len-cmp && k < len && t[(size_t)(cmp+l)] == t[(size_t)k]) {
+			while(l < len-cmp && k < (int64_t)len && t[(size_t)(cmp+l)] == t[(size_t)k]) {
 				k++; l++;
 			}
 			kSoft = false;
@@ -767,16 +775,16 @@ bool KarkkainenBlockwiseSA<TStr>::suffixCmp(
  */
 template<typename TStr>
 void KarkkainenBlockwiseSA<TStr>::nextBlock() {
-	EList<uint32_t>& bucket = this->_itrBucket;
+	EList<TIndexOffU>& bucket = this->_itrBucket;
 	VMSG_NL("Getting block " << (_cur+1) << " of " << _sampleSuffs.size()+1);
 	assert(_built);
 	assert_gt(_dcV, 3);
 	assert_leq(_cur, _sampleSuffs.size());
 	const TStr& t = this->text();
-	uint32_t len = (uint32_t)t.length();
+	TIndexOffU len = (TIndexOffU)t.length();
 	// Set up the bucket
 	bucket.clear();
-	uint32_t lo = 0xffffffff, hi = 0xffffffff;
+	TIndexOffU lo = OFF_MASK, hi = OFF_MASK;
 	if(_sampleSuffs.size() == 0) {
 		// Special case: if _sampleSuffs is 0, then multikey-quicksort
 		// everything
@@ -787,7 +795,7 @@ void KarkkainenBlockwiseSA<TStr>::nextBlock() {
 				bucket.reserveExact(len+1);
 			}
 			bucket.resize(len);
-			for(uint32_t i = 0; i < len; i++) {
+			for(TIndexOffU i = 0; i < len; i++) {
 				bucket[i] = i;
 			}
 		} catch(bad_alloc &e) {
@@ -821,7 +829,7 @@ void KarkkainenBlockwiseSA<TStr>::nextBlock() {
 		// Select upper and lower bounds from _sampleSuffs[] and
 		// calculate the Z array up to the difference-cover periodicity
 		// for both.  Be careful about first/last buckets.
-		EList<uint32_t> zLo(EBWTB_CAT), zHi(EBWTB_CAT);
+		EList<TIndexOffU> zLo(EBWTB_CAT), zHi(EBWTB_CAT);
 		assert_geq(_cur, 0);
 		assert_leq(_cur, _sampleSuffs.size());
 		bool first = (_cur == 0);
@@ -874,18 +882,18 @@ void KarkkainenBlockwiseSA<TStr>::nextBlock() {
 		{
 			Timer timer(cout, "  Block accumulator loop time: ", this->verbose());
 			VMSG_NL("  Entering block accumulator loop:");
-			uint32_t lenDiv10 = (len + 9) / 10;
-			for(uint32_t iten = 0, ten = 0; iten < len; iten += lenDiv10, ten++) {
-			uint32_t itenNext = iten + lenDiv10;
+			TIndexOffU lenDiv10 = (len + 9) / 10;
+			for(TIndexOffU iten = 0, ten = 0; iten < len; iten += lenDiv10, ten++) {
+			TIndexOffU itenNext = iten + lenDiv10;
 			if(ten > 0) VMSG_NL("  " << (ten * 10) << "%");
-			for(uint32_t i = iten; i < itenNext && i < len; i++) {
-				assert_lt(jLo, i); assert_lt(jHi, i);
+			for(TIndexOffU i = iten; i < itenNext && i < len; i++) {
+				assert_lt(jLo, (TIndexOff)i); assert_lt(jHi, (TIndexOff)i);
 				// Advance the upper-bound comparison by one character
 				if(i == hi || i == lo) continue; // equal to one of the bookends
-				if(hi != 0xffffffff && !suffixCmp(hi, i, jHi, kHi, kHiSoft, zHi)) {
+				if(hi != OFF_MASK && !suffixCmp(hi, i, jHi, kHi, kHiSoft, zHi)) {
 					continue; // not in the bucket
 				}
-				if(lo != 0xffffffff && suffixCmp(lo, i, jLo, kLo, kLoSoft, zLo)) {
+				if(lo != OFF_MASK && suffixCmp(lo, i, jLo, kLo, kLoSoft, zLo)) {
 					continue; // not in the bucket
 				}
 				// In the bucket! - add it
@@ -893,7 +901,7 @@ void KarkkainenBlockwiseSA<TStr>::nextBlock() {
 				try {
 					bucket.push_back(i);
 				} catch(bad_alloc &e) {
-					cerr << "Could not append element to block of " << ((bucket.size()) * 4) << " bytes" << endl;
+					cerr << "Could not append element to block of " << ((bucket.size()) * OFF_SIZE) << " bytes" << endl;
 					if(this->_passMemExc) {
 						throw e; // rethrow immediately
 					} else {
@@ -917,7 +925,7 @@ void KarkkainenBlockwiseSA<TStr>::nextBlock() {
 		VMSG_NL("  Sorting block of length " << bucket.size());
 		this->qsort(bucket);
 	}
-	if(hi != 0xffffffff) {
+	if(hi != OFF_MASK) {
 		// Not the final bucket; throw in the sample on the RHS
 		bucket.push_back(hi);
 	} else {
diff --git a/bowtie2 b/bowtie2
index 7e0c828..808f7e9 100755
--- a/bowtie2
+++ b/bowtie2
@@ -30,21 +30,33 @@
 
 use strict;
 use warnings;
-use Getopt::Long;
+use Getopt::Long qw(GetOptionsFromArray GetOptions);
 use File::Spec;
 use POSIX;
 
-my ($vol,$script_path,$prog) 
-                = File::Spec->splitpath(File::Spec->rel2abs( __FILE__ ));
+
+my ($vol,$script_path,$prog);
+$prog = File::Spec->rel2abs( __FILE__ );
+
+while (-f $prog && -l $prog){
+    $prog = File::Spec->rel2abs(readlink($prog));   
+}
+
+($vol,$script_path,$prog) 
+                = File::Spec->splitpath($prog);
 my $os_is_nix   = ($^O eq "linux") || ($^O eq "darwin");
-my $align_bin   = $os_is_nix ? 'bowtie2-align' : 'bowtie2-align.exe'; 
+my $align_bin_s = $os_is_nix ? 'bowtie2-align-s' : 'bowtie2-align-s.exe'; 
 my $build_bin   = $os_is_nix ? 'bowtie2-build' : 'bowtie2-build.exe';               
-my $align_prog  = File::Spec->catpath($vol,$script_path,$align_bin);
-my $build_prog  = File::Spec->catpath($vol,$script_path,$build_bin);
+my $align_bin_l = $os_is_nix ? 'bowtie2-align-l' : 'bowtie2-align-l.exe'; 
+my $align_prog_s= File::Spec->catpath($vol,$script_path,$align_bin_s);
+my $align_prog_l= File::Spec->catpath($vol,$script_path,$align_bin_l);
+my $align_prog  = $align_prog_s;
+my $idx_ext_l     = 'bt2l'; 
+my $idx_ext_s     = 'bt2'; 
+my $idx_ext       = $idx_ext_s; 
 my %signo       = ();
 my @signame     = ();
 
-
 {
 	# Get signal info
 	use Config;
@@ -57,14 +69,14 @@ my @signame     = ();
 }
 
 (-x "$align_prog") ||
-	die "Error: Expected bowtie2 to be in same directory with bowtie2-align:\n$script_path";
+	Fail("Expected bowtie2 to be in same directory with bowtie2-align:\n$script_path\n");
 
 # Get description of arguments from Bowtie 2 so that we can distinguish Bowtie
 # 2 args from wrapper args
 sub getBt2Desc($) {
 	my $d = shift;
 	my $cmd = "$align_prog --wrapper basic-0 --arg-desc";
-	open(my $fh, "$cmd |") || die "Failed to run command '$cmd'";
+	open(my $fh, "$cmd |") || Fail("Failed to run command '$cmd'\n");
 	while(readline $fh) {
 		chomp;
 		next if /^\s*$/;
@@ -72,7 +84,7 @@ sub getBt2Desc($) {
 		$d->{$ts[0]} = $ts[1];
 	}
 	close($fh);
-	$? == 0 || die;
+	$? == 0 || Fail("Description of arguments failed!\n");
 }
 
 my %desc = ();
@@ -106,7 +118,7 @@ my %read_fns = ();
 my %read_compress = ();
 my $cap_out = undef;       # Filename for passthrough
 my $no_unal = 0;
-
+my $large_idx = 0;
 # Remove whitespace
 for my $i (0..$#bt2_args) {
 	$bt2_args[$i]=~ s/^\s+//; $bt2_args[$i] =~ s/\s+$//;
@@ -132,7 +144,7 @@ for(my $i = 0; $i < scalar(@bt2_args); $i++) {
 			for my $a (@args) { push @bt2w_args, ("-U", $a); }
 		} else {
 			# Argument is in the next token
-			$i < scalar(@bt2_args)-1 || die;
+			$i < scalar(@bt2_args)-1 || Fail("Argument expected in next token!\n");
 			$i++;
 			my @args = split(/,/, $bt2_args[$i]);
 			for my $a (@args) { push @bt2w_args, ("-U", $a); }
@@ -149,7 +161,7 @@ for(my $i = 0; $i < scalar(@bt2_args); $i++) {
 			for my $a (@args) { push @bt2w_args, ("-$mate", $a); }
 		} else {
 			# Argument is in the next token
-			$i < scalar(@bt2_args)-1 || die;
+			$i < scalar(@bt2_args)-1 || Fail("Argument expected in next token!\n");
 			$i++;
 			my @args = split(/,/, $bt2_args[$i]);
 			for my $a (@args) { push @bt2w_args, ("-$mate", $a); }
@@ -164,13 +176,17 @@ for(my $i = 0; $i < scalar(@bt2_args); $i++) {
 		$no_unal = 1;
 		$bt2_args[$i] = undef;
 	}
+	if($arg eq "--large-index") {
+		$large_idx = 1;
+		$bt2_args[$i] = undef;
+	}
 	for my $rarg ("un-conc", "al-conc", "un", "al") {
 		if($arg =~ /^--${rarg}$/ || $arg =~ /^--${rarg}-gz$/ || $arg =~ /^--${rarg}-bz2$/) {
 			$bt2_args[$i] = undef;
 			if(scalar(@args) > 1 && $args[1] ne "") {
 				$read_fns{$rarg} = $args[1];
 			} else {
-				$i < scalar(@bt2_args)-1 || die "Error: --${rarg}* option takes an argument";
+				$i < scalar(@bt2_args)-1 || Fail("--${rarg}* option takes an argument.\n");
 				$read_fns{$rarg} = $bt2_args[$i+1];
 				$bt2_args[$i+1] = undef;
 			}
@@ -193,7 +209,7 @@ if(scalar(keys %read_fns) > 0 || $no_unal) {
 		next unless defined($bt2_args[$i]);
 		my $arg = $bt2_args[$i];
 		if($arg eq "-S" || $arg eq "--output") {
-			$i < scalar(@bt2_args)-1 || die "Error: -S/--output takes an argument";
+			$i < scalar(@bt2_args)-1 || Fail("-S/--output takes an argument.\n");
 			$cap_out = $bt2_args[$i+1];
 			$bt2_args[$i] = undef;
 			$bt2_args[$i+1] = undef;
@@ -215,39 +231,50 @@ my $no_pipes = 0;
 my $keep = 0;
 my $verbose = 0;
 my $readpipe = undef;
+my $log_fName = undef;
+my $help = 0;
+
+my @bt2w_args_cp = (@bt2w_args>0) ? @bt2w_args : @bt2_args;
+Getopt::Long::Configure("pass_through","no_ignore_case");
 
-my @bt2w_args_cp = @bt2w_args;
-@ARGV = @bt2w_args;
-GetOptions(
+GetOptionsFromArray(
+    \@bt2w_args_cp,
 	"1=s"                           => \@mate1s,
 	"2=s"                           => \@mate2s,
-	"reads=s"                       => \@unps,
-	"U=s"                           => \@unps,
+	"reads|U=s"                     => \@unps,
 	"temp-directory=s"              => \$temp_dir,
 	"bam"                           => \$bam_out,
 	"no-named-pipes"                => \$no_pipes,
 	"ref-string|reference-string=s" => \$ref_str,
 	"keep"                          => \$keep,
-	"verbose"                       => \$verbose
-) || die "Bad option";
+	"verbose"                       => \$verbose,
+	"log-file=s"                    => \$log_fName,
+	"help|h"                        => \$help
+);
 
-if($verbose) {
-	print STDERR "Before arg handling:\n";
-	print STDERR "  Wrapper args:\n[ @bt2w_args_cp ]\n";
-	print STDERR "  Binary args:\n[ @bt2_args ]\n";
+
+my $old_stderr;
+
+if ($log_fName) {
+    open($old_stderr, ">&STDERR") or Fail("Cannot dup STDERR!\n");
+    open(STDERR, ">", $log_fName) or Fail("Cannot redirect to log file $log_fName.\n");
 }
 
+Info("Before arg handling:\n");
+Info("  Wrapper args:\n[ @bt2w_args ]\n");
+Info("  Binary args:\n[ @bt2_args ]\n");
+
 sub cat_file($$) {
 	my ($ifn, $ofh) = @_;
 	my $ifh = undef;
 	if($ifn =~ /\.gz$/) {
 		open($ifh, "gzip -dc $ifn |") ||
-			die "Error: could not open gzipped read file: $ifn";
+			 Fail("Could not open gzipped read file: $ifn \n");
 	} elsif($ifn =~ /\.bz2/) {
 		open($ifh, "bzip2 -dc $ifn |") ||
-			die "Error: could not open bzip2ed read file: $ifn";
+			Fail("Could not open bzip2ed read file: $ifn \n");
 	} else {
-		open($ifh, $ifn) || die "Error: could not open read file: $ifn";
+		open($ifh, $ifn) || Fail("Could not open read file: $ifn \n");
 	}
 	while(readline $ifh) { print {$ofh} $_; }
 	close($ifh);
@@ -263,6 +290,47 @@ sub wrapInput($$$) {
 	return 0;
 }
 
+# Print an informational message to STDERR, but only in --verbose mode.
+sub Info {
+    if ($verbose) {
+        print STDERR "(INFO): " ,@_;
+    }
+}
+
+# Print an error message to STDERR.  With a single argument the text is
+# printed verbatim; extra arguments make the first one a printf format.
+sub Error {
+    my ($fmt, @args) = @_;
+    $fmt = "(ERR): ".$fmt;
+    if (@args) {
+        printf STDERR $fmt, @args;
+    } else {
+        # Messages often interpolate file names or commands; printing them
+        # verbatim keeps a stray '%' from being read as a format directive.
+        print STDERR $fmt;
+    }
+}
+
+# Report an error through Error() and then die.
+sub Fail {
+    Error(@_);
+    die("Exiting now ...\n");
+}
+
+# Return the token that follows the index option in the given argument
+# list, or an empty string when there is none.  The option looked for is
+# '--index' when a reference string was supplied and '-x' otherwise.
+sub Extract_IndexName_From {
+    my $index_opt = $ref_str ? '--index' : '-x';
+    for (my $i=0; $i+1<@_; $i++) {
+        if (defined($_[$i]) && $_[$i] eq $index_opt){
+            return $_[$i+1];
+        }
+    }
+    Info("Cannot find any index option (--reference-string, --ref-string or -x) in the given command line.\n");
+    return '';
+}
+
 if(wrapInput(\@unps, \@mate1s, \@mate2s)) {
 	if(scalar(@mate2s) > 0) {
 		#
@@ -270,20 +324,20 @@ if(wrapInput(\@unps, \@mate1s, \@mate2s)) {
 		#
 		# Put reads into temporary files or fork off processes to feed named pipes
 		scalar(@mate2s) == scalar(@mate1s) ||
-			die "Different number of files specified with --reads/-1 as with -2";
+			Fail("Different number of files specified with --reads/-1 as with -2\n");
 		# Make a named pipe for delivering mate #1s
 		my $m1fn = "$temp_dir/$$.inpipe1";
 		push @to_delete, $m1fn;
 		push @bt2_args, "-1 $m1fn";
 		# Create named pipe 1 for writing
 		if(!$no_pipes) {
-			mkfifo($m1fn, 0700) || die "Error: mkfifo($m1fn) failed.";
+			mkfifo($m1fn, 0700) || Fail("mkfifo($m1fn) failed.\n");
 		}
 		my $pid = 0;
 		$pid = fork() unless $no_pipes;
 		if($pid == 0) {
 			# Open named pipe 1 for writing
-			open(my $ofh, ">$m1fn") || die "Can't open '$m1fn' for writing";
+			open(my $ofh, ">$m1fn") || Fail("Can't open '$m1fn' for writing\n");
 			for my $ifn (@mate1s) { cat_file($ifn, $ofh); }
 			close($ofh);
 			exit 0 unless $no_pipes;
@@ -294,13 +348,13 @@ if(wrapInput(\@unps, \@mate1s, \@mate2s)) {
 		push @bt2_args, "-2 $m2fn";
 		# Create named pipe 2 for writing
 		if(!$no_pipes) {
-			mkfifo($m2fn, 0700) || die "Error: mkfifo($m2fn) failed.";
+			mkfifo($m2fn, 0700) || Fail("mkfifo($m2fn) failed.\n");
 		}
 		$pid = 0;
 		$pid = fork() unless $no_pipes;
 		if($pid == 0) {
 			# Open named pipe 2 for writing
-			open(my $ofh, ">$m2fn") || die "Can't open '$m2fn' for writing";
+			open(my $ofh, ">$m2fn") || Fail("Can't open '$m2fn' for writing.\n");
 			for my $ifn (@mate2s) { cat_file($ifn, $ofh); }
 			close($ofh);
 			exit 0 unless $no_pipes;
@@ -316,13 +370,13 @@ if(wrapInput(\@unps, \@mate1s, \@mate2s)) {
 		push @bt2_args, "-U $ufn";
 		# Create named pipe 2 for writing
 		if(!$no_pipes) {
-			mkfifo($ufn, 0700) || die "Error: mkfifo($ufn) failed.";
+			mkfifo($ufn, 0700) || Fail("mkfifo($ufn) failed.\n");
 		}
 		my $pid = 0;
 		$pid = fork() unless $no_pipes;
 		if($pid == 0) {
 			# Open named pipe 2 for writing
-			open(my $ofh, ">$ufn") || die "Can't open '$ufn' for writing";
+			open(my $ofh, ">$ufn") || Fail("Can't open '$ufn' for writing.\n");
 			for my $ifn (@unps) { cat_file($ifn, $ofh); }
 			close($ofh);
 			exit 0 unless $no_pipes;
@@ -342,20 +396,43 @@ if(wrapInput(\@unps, \@mate1s, \@mate2s)) {
 if(defined($ref_str)) {
 	my $ofn = "$temp_dir/$$.ref_str.fa";
 	open(my $ofh, ">$ofn") ||
-		die "Error: could not open temporary fasta file '$ofn' for writing";
+		Fail("could not open temporary fasta file '$ofn' for writing.\n");
 	print {$ofh} ">1\n$ref_str\n";
 	close($ofh);
 	push @to_delete, $ofn;
 	system("$build_bin $ofn $ofn") == 0 ||
-		die "Error: bowtie2-build returned non-0 exit level";
+		Fail("bowtie2-build returned non-0 exit level.\n");
 	push @bt2_args, ("--index", "$ofn");
-	push @to_delete, ("$ofn.1.bt2", "$ofn.2.bt2", "$ofn.3.bt2", "$ofn.4.bt2",
-	                  "$ofn.rev.1.bt2", "$ofn.rev.2.bt2");
+	push @to_delete, ("$ofn.1.".$idx_ext, "$ofn.2.".$idx_ext, 
+	                  "$ofn.3.".$idx_ext, "$ofn.4.".$idx_ext,
+	                  "$ofn.rev.1.".$idx_ext, "$ofn.rev.2.".$idx_ext);
 }
 
-if($verbose) {
-	print STDERR "After arg handling:\n";
-	print STDERR "  Binary args:\n[ @bt2_args ]\n";
+Info("After arg handling:\n");
+Info("  Binary args:\n[ @bt2_args ]\n");
+
+my $index_name = Extract_IndexName_From(@bt2_args);
+
+if ($large_idx) {
+    Info("Using a large index enforced by user.\n");
+    $align_prog  = $align_prog_l;
+    $idx_ext     = $idx_ext_l;
+    if (not -f $index_name.".1.".$idx_ext_l) {
+        Fail("Cannot find the large index ${index_name}.1.${idx_ext_l}\n");
+    }
+    Info("Using large index (${index_name}.1.${idx_ext_l}).\n");
+}
+else {
+    if ((-f $index_name.".1.".$idx_ext_l) && 
+        (not -f $index_name.".1.".$idx_ext_s)) {
+        Info("Cannot find a small index but a large one seems to be present.\n");
+        Info("Switching to using the large index (${index_name}.1.${idx_ext_l}).\n");
+        $align_prog  = $align_prog_l;
+        $idx_ext     = $idx_ext_l;
+    }
+    else {
+        Info("Using the small index (${index_name}.1.${idx_ext_s}).\n")
+    }
 }
 
 my $debug_str = ($debug ? "-debug" : "");
@@ -366,17 +443,17 @@ my $cmd = "$align_prog$debug_str --wrapper basic-0 ".join(" ", @bt2_args);
 # Possibly add read input on an anonymous pipe
 $cmd = "$readpipe $cmd" if defined($readpipe);
 
-print STDERR "$cmd\n" if $verbose;
+Info("$cmd\n");
 my $ret;
 if(defined($cap_out)) {
 	# Open Bowtie 2 pipe
-	open(BT, "$cmd |") || die "Error: Could not open Bowtie 2 pipe: '$cmd |'";
+	open(BT, "$cmd |") || Fail("Could not open Bowtie 2 pipe: '$cmd |'\n");
 	# Open output pipe
 	my $ofh = *STDOUT;
 	my @fhs_to_close = ();
 	if($cap_out ne "-") {
 		open($ofh, ">$cap_out") ||
-			die "Error: Could not open output file '$cap_out' for writing";
+			Fail("Could not open output file '$cap_out' for writing.\n");
 	}
 	my %read_fhs = ();
 	for my $i ("al", "un", "al-conc", "un-conc") {
@@ -393,21 +470,21 @@ if(defined($cap_out)) {
 					$fn1 .= ".1";
 					$fn2 .= ".2";
 				}
-				$fn1 ne $fn2 || die "$fn1\n$fn2\n";
+				$fn1 ne $fn2 || Fail("$fn1\n$fn2\n");
 				my ($redir1, $redir2) = (">$fn1", ">$fn2");
 				$redir1 = "| gzip -c $redir1"  if $read_compress{$i} eq "gzip";
 				$redir1 = "| bzip2 -c $redir1" if $read_compress{$i} eq "bzip2";
 				$redir2 = "| gzip -c $redir2"  if $read_compress{$i} eq "gzip";
 				$redir2 = "| bzip2 -c $redir2" if $read_compress{$i} eq "bzip2";
-				open($read_fhs{$i}{1}, $redir1) || die "Error: Could not open --$i mate-1 output file '$fn1'";
-				open($read_fhs{$i}{2}, $redir2) || die "Error: Could not open --$i mate-2 output file '$fn2'";
+				open($read_fhs{$i}{1}, $redir1) || Fail("Could not open --$i mate-1 output file '$fn1'\n");
+				open($read_fhs{$i}{2}, $redir2) || Fail("Could not open --$i mate-2 output file '$fn2'\n");
 				push @fhs_to_close, $read_fhs{$i}{1};
 				push @fhs_to_close, $read_fhs{$i}{2};
 			} else {
 				my $redir = ">$read_fns{$i}";
 				$redir = "| gzip -c $redir"  if $read_compress{$i} eq "gzip";
 				$redir = "| bzip2 -c $redir" if $read_compress{$i} eq "bzip2";
-				open($read_fhs{$i}, $redir) || die "Error: Could not open --$i output file '$read_fns{$i}'";
+				open($read_fhs{$i}, $redir) || Fail("Could not open --$i output file '$read_fns{$i}'\n");
 				push @fhs_to_close, $read_fhs{$i};
 			}
 		}
@@ -472,16 +549,16 @@ if(defined($cap_out)) {
 if(!$keep) { for(@to_delete) { unlink($_); } }
 
 if ($ret == -1) {
-    print STDERR "Failed to execute bowtie2-align: $!\n";
+    Error("Failed to execute bowtie2-align: $!\n");
 	exit 1;
 } elsif ($ret & 127) {
 	my $signm = "(unknown)";
 	$signm = $signame[$ret & 127] if defined($signame[$ret & 127]);
 	my $ad = "";
 	$ad = "(core dumped)" if (($ret & 128) != 0);
-    printf STDERR "bowtie2-align died with signal %d (%s) $ad\n", ($ret & 127), $signm;
+    Error("bowtie2-align died with signal %d (%s) $ad\n", ($ret & 127), $signm);
 	exit 1;
 } elsif($ret != 0) {
-	printf STDERR "bowtie2-align exited with value %d\n", ($ret >> 8);
+	Error("bowtie2-align exited with value %d\n", ($ret >> 8));
 }
 exit ($ret >> 8);
diff --git a/bowtie2-build b/bowtie2-build
new file mode 100755
index 0000000..d448e2b
--- /dev/null
+++ b/bowtie2-build
@@ -0,0 +1,112 @@
+#!/usr/bin/env python
+
+"""
+ Copyright 2014, Ben Langmead <langmea at cs.jhu.edu>
+
+ This file is part of Bowtie 2.
+
+ Bowtie 2 is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ Bowtie 2 is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+
+import os
+import sys
+import inspect
+import logging
+
+
+def build_args():
+    """
+    Parse the wrapper arguments.
+
+    Scans sys.argv for the options handled by the wrapper itself
+    (--large-index, --debug, --verbose) and strips them from the argument list.
+
+    Returns an (options, program arguments) tuple: a dict keyed by each wrapper
+    option that was present, and a copy of sys.argv without those options.
+    """
+
+    parsed_args = {}
+    to_remove = []
+    argv = sys.argv[:]
+    for i, arg in enumerate(argv):
+        if arg == '--large-index':
+            parsed_args[arg] = ""
+            to_remove.append(i)
+        elif arg == '--debug':
+            parsed_args[arg] = ""
+            to_remove.append(i)
+        elif arg == '--verbose':
+            parsed_args[arg] = ""
+            to_remove.append(i)
+
+    # Delete from the back so the remaining indices stay valid.
+    for i in reversed(to_remove):
+        del argv[i]
+
+    return parsed_args, argv
+
+
+def main():
+    """
+    Pick the small or the large index builder and replace this process with it.
+
+    The large builder is used when --large-index is given or when the
+    reference files add up to more than small_index_max_size bytes; every
+    other argument is handed on unchanged after '--wrapper basic-0'.
+    """
+    logging.basicConfig(level=logging.ERROR,
+                        format='%(levelname)s: %(message)s'
+                        )
+    delta               = 200
+    small_index_max_size= 4 * 1024**3 - delta
+    build_bin_name      = "bowtie2-build"
+    build_bin_s         = "bowtie2-build-s"
+    build_bin_l         = "bowtie2-build-l"
+    curr_script         = os.path.realpath(inspect.getsourcefile(main))
+    ex_path             = os.path.dirname(curr_script)
+    build_bin_spec      = os.path.join(ex_path,build_bin_s)
+
+    options, argv = build_args()
+
+    if '--verbose' in options:
+        logging.getLogger().setLevel(logging.INFO)
+
+    if '--debug' in options:
+        build_bin_spec += '-debug'
+        build_bin_l += '-debug'
+
+    if '--large-index' in options:
+        build_bin_spec = os.path.join(ex_path,build_bin_l)
+    elif len(argv) >= 3:
+        # argv[0] is this script, so the reference list (second to last
+        # argument) is only present when two more arguments follow it.
+        ref_fnames = argv[-2]
+        tot_size = 0
+        for fn in ref_fnames.split(','):
+            if os.path.exists(fn):
+                statinfo = os.stat(fn)
+                tot_size += statinfo.st_size
+        if tot_size > small_index_max_size:
+            build_bin_spec = os.path.join(ex_path,build_bin_l)
+
+    # The binary sees the wrapper's name as argv[0] plus '--wrapper basic-0'.
+    argv[0] = build_bin_name
+    argv.insert(1, 'basic-0')
+    argv.insert(1, '--wrapper')
+    logging.info('Command: %s %s' % (build_bin_spec, ' '.join(argv[1:])))
+    os.execv(build_bin_spec, argv)
+
+if __name__ == "__main__":
+    main()
diff --git a/bowtie2-inspect b/bowtie2-inspect
new file mode 100755
index 0000000..62a1c5f
--- /dev/null
+++ b/bowtie2-inspect
@@ -0,0 +1,78 @@
+#!/usr/bin/env python
+
+"""
+ Copyright 2014, Ben Langmead <langmea at cs.jhu.edu>
+
+ This file is part of Bowtie 2.
+
+ Bowtie 2 is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ Bowtie 2 is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+
+import os
+import imp
+import inspect
+import logging
+
+
+def main():
+    """
+    Pick the small or the large index inspector and replace this process with it.
+
+    The large inspector is used when --large-index is given, or when only a
+    large (.1.bt2l) index file exists for the index base name taken from the
+    last argument.
+    """
+    logging.basicConfig(level=logging.ERROR,
+                        format='%(levelname)s: %(message)s'
+                        )
+    inspect_bin_name      = "bowtie2-inspect"
+    inspect_bin_s         = "bowtie2-inspect-s"
+    inspect_bin_l         = "bowtie2-inspect-l"
+    idx_ext_l             = '.1.bt2l'
+    idx_ext_s             = '.1.bt2'
+    curr_script           = os.path.realpath(inspect.getsourcefile(main))
+    ex_path               = os.path.dirname(curr_script)
+    inspect_bin_spec      = os.path.join(ex_path,inspect_bin_s)
+    # Reuse the wrapper-option parser of the sibling bowtie2-build script.
+    bld                   = imp.load_source('bowtie2-build',os.path.join(ex_path,'bowtie2-build'))
+    options,arguments     = bld.build_args()
+
+    if '--verbose' in options:
+        logging.getLogger().setLevel(logging.INFO)
+
+    if '--debug' in options:
+        inspect_bin_spec += '-debug'
+        inspect_bin_l += '-debug'
+
+    if '--large-index' in options:
+        inspect_bin_spec = os.path.join(ex_path,inspect_bin_l)
+    elif len(arguments) >= 2:
+        # arguments[0] is this script, so an index base name is only present
+        # when at least one more argument follows it.
+        idx_basename = arguments[-1]
+        large_idx_exists = os.path.exists(idx_basename + idx_ext_l)
+        small_idx_exists = os.path.exists(idx_basename + idx_ext_s)
+        if large_idx_exists and not small_idx_exists:
+            inspect_bin_spec = os.path.join(ex_path,inspect_bin_l)
+
+    arguments[0] = inspect_bin_name
+    arguments.insert(1, 'basic-0')
+    arguments.insert(1, '--wrapper')
+    logging.info('Command: %s %s' %  (inspect_bin_spec,' '.join(arguments[1:])))
+    os.execv(inspect_bin_spec, arguments)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bt2_build.cpp b/bt2_build.cpp
index cca4246..8bcbee0 100644
--- a/bt2_build.cpp
+++ b/bt2_build.cpp
@@ -42,8 +42,8 @@
 int verbose;
 static int sanityCheck;
 static int format;
-static uint32_t bmax;
-static uint32_t bmaxMultSqrt;
+static TIndexOffU bmax;
+static TIndexOffU bmaxMultSqrt;
 static uint32_t bmaxDivN;
 static int dcv;
 static int noDc;
@@ -56,19 +56,22 @@ static int32_t linesPerSide;
 static int32_t offRate;
 static int32_t ftabChars;
 static int  bigEndian;
-static bool nsToAs;
+static bool nsToAs;    // convert Ns to As
+static bool doSaFile;  // make a file with just the suffix array in it
+static bool doBwtFile; // make a file with just the BWT string in it
 static bool autoMem;
 static bool packed;
 static bool writeRef;
 static bool justRef;
 static bool reverseEach;
+static string wrapper;
 
 static void resetOptions() {
 	verbose      = true;  // be talkative (default)
 	sanityCheck  = 0;     // do slow sanity checks
 	format       = FASTA; // input sequence format
-	bmax         = 0xffffffff; // max blockwise SA bucket size
-	bmaxMultSqrt = 0xffffffff; // same, as multplier of sqrt(n)
+	bmax         = OFF_MASK; // max blockwise SA bucket size
+	bmaxMultSqrt = OFF_MASK; // same, as multiplier of sqrt(n)
 	bmaxDivN     = 4;          // same, as divisor of n
 	dcv          = 1024;  // bwise SA difference-cover sample sz
 	noDc         = 0;     // disable difference-cover sample
@@ -76,17 +79,20 @@ static void resetOptions() {
 	seed         = 0;     // srandom seed
 	showVersion  = 0;     // just print version and quit?
 	//   Ebwt parameters
-	lineRate     = 6;  // a "line" is 64 bytes
+	lineRate     = Ebwt::default_lineRate; // a "line" is 64 or 128 bytes
 	linesPerSide = 1;  // 1 64-byte line on a side
 	offRate      = 4;  // sample 1 out of 16 SA elts
 	ftabChars    = 10; // 10 chars in initial lookup table
 	bigEndian    = 0;  // little endian
 	nsToAs       = false; // convert reference Ns to As prior to indexing
+	doSaFile     = false; // make a file with just the suffix array in it
+	doBwtFile    = false; // make a file with just the BWT string in it
 	autoMem      = true;  // automatically adjust memory usage parameters
 	packed       = false; //
-	writeRef     = true;  // write compact reference to .3.bt2/.4.bt2
+	writeRef     = true;  // write compact reference to .3.gEbwt_ext/.4.gEbwt_ext
 	justRef      = false; // *just* write compact reference, don't index
 	reverseEach  = false;
+	wrapper.clear();
 }
 
 // Argument constants for getopts
@@ -100,7 +106,9 @@ enum {
 	ARG_PMAP,
 	ARG_NTOA,
 	ARG_USAGE,
-	ARG_REVERSE_EACH
+	ARG_REVERSE_EACH,
+	ARG_SA,
+	ARG_WRAPPER
 };
 
 /**
@@ -108,22 +116,39 @@ enum {
  */
 static void printUsage(ostream& out) {
 	out << "Bowtie 2 version " << string(BOWTIE2_VERSION).c_str() << " by Ben Langmead (langmea at cs.jhu.edu, www.cs.jhu.edu/~langmea)" << endl;
-	out << "Usage: bowtie2-build [options]* <reference_in> <bt2_index_base>" << endl
+	
+#ifdef BOWTIE_64BIT_INDEX
+	string tool_name = "bowtie2-build-l";
+#else
+	string tool_name = "bowtie2-build-s";
+#endif
+	if(wrapper == "basic-0") {
+		tool_name = "bowtie2-build";
+	}
+	
+	//               1         2         3         4         5         6         7         8
+	//      12345678901234567890123456789012345678901234567890123456789012345678901234567890
+	out << "Usage: " << tool_name << " [options]* <reference_in> <bt2_index_base>" << endl
 	    << "    reference_in            comma-separated list of files with ref sequences" << endl
-	    << "    bt2_index_base          write .bt2 data to files with this dir/basename" << endl
+	    << "    bt2_index_base          write " + gEbwt_ext + " data to files with this dir/basename" << endl
 		<< "*** Bowtie 2 indexes work only with v2 (not v1).  Likewise for v1 indexes. ***" << endl
 	    << "Options:" << endl
 	    << "    -f                      reference files are Fasta (default)" << endl
-	    << "    -c                      reference sequences given on cmd line (as <seq_in>)" << endl
-	    << "    -a/--noauto             disable automatic -p/--bmax/--dcv memory-fitting" << endl
-	    << "    -p/--packed             use packed strings internally; slower, uses less mem" << endl
+	    << "    -c                      reference sequences given on cmd line (as" << endl
+		<< "                            <reference_in>)" << endl;
+	if(wrapper == "basic-0") {
+	out << "    --large-index           force generated index to be 'large', even if ref" << endl
+		<< "                            has fewer than 4 billion nucleotides" << endl;
+	}
+	out << "    -a/--noauto             disable automatic -p/--bmax/--dcv memory-fitting" << endl
+	    << "    -p/--packed             use packed strings internally; slower, less memory" << endl
 	    << "    --bmax <int>            max bucket sz for blockwise suffix-array builder" << endl
 	    << "    --bmaxdivn <int>        max bucket sz as divisor of ref len (default: 4)" << endl
 	    << "    --dcv <int>             diff-cover period for blockwise (default: 1024)" << endl
 	    << "    --nodc                  disable diff-cover (algorithm becomes quadratic)" << endl
-	    << "    -r/--noref              don't build .3/.4.bt2 (packed reference) portion" << endl
-	    << "    -3/--justref            just build .3/.4.bt2 (packed reference) portion" << endl
-	    << "    -o/--offrate <int>      SA is sampled every 2^offRate BWT chars (default: 5)" << endl
+	    << "    -r/--noref              don't build .3/.4 index files" << endl
+	    << "    -3/--justref            just build .3/.4 index files" << endl
+	    << "    -o/--offrate <int>      SA is sampled every 2^<int> BWT chars (default: 5)" << endl
 	    << "    -t/--ftabchars <int>    # of chars consumed in initial lookup (default: 10)" << endl
 	    //<< "    --ntoa                  convert Ns in reference to As" << endl
 	    //<< "    --big --little          endianness (default: little, this host: "
@@ -134,6 +159,13 @@ static void printUsage(ostream& out) {
 	    << "    --usage                 print this usage message" << endl
 	    << "    --version               print version information and quit" << endl
 	    ;
+	if(wrapper.empty()) {
+		cerr << endl
+		     << "*** Warning ***" << endl
+			 << "'" << tool_name << "' was run directly.  It is recommended "
+			 << "that you run the wrapper script 'bowtie2-build' instead."
+			 << endl << endl;
+	}
 }
 
 static const char *short_options = "qraph?nscfl:i:o:t:h:3C";
@@ -163,8 +195,10 @@ static struct option long_options[] = {
 	{(char*)"justref",      no_argument,       0,            '3'},
 	{(char*)"noref",        no_argument,       0,            'r'},
 	{(char*)"color",        no_argument,       0,            'C'},
+	{(char*)"sa",           no_argument,       0,            ARG_SA},
 	{(char*)"reverse-each", no_argument,       0,            ARG_REVERSE_EACH},
 	{(char*)"usage",        no_argument,       0,            ARG_USAGE},
+	{(char*)"wrapper",      required_argument, 0,            ARG_WRAPPER},
 	{(char*)0, 0, 0, 0} // terminator
 };
 
@@ -174,7 +208,7 @@ static struct option long_options[] = {
  * exit with an error and a usage message.
  */
 template<typename T>
-static int parseNumber(T lower, const char *errmsg) {
+static T parseNumber(T lower, const char *errmsg) {
 	char *endPtr= NULL;
 	T t = (T)strtoll(optarg, &endPtr, 10);
 	if (endPtr != NULL) {
@@ -194,14 +228,18 @@ static int parseNumber(T lower, const char *errmsg) {
 /**
  * Read command-line arguments
  */
-static void parseOptions(int argc, const char **argv) {
+static bool parseOptions(int argc, const char **argv) {
 	int option_index = 0;
 	int next_option;
+	bool abort = false;
 	do {
 		next_option = getopt_long(
 			argc, const_cast<char**>(argv),
 			short_options, long_options, &option_index);
 		switch (next_option) {
+			case ARG_WRAPPER:
+				wrapper = optarg;
+				break;
 			case 'f': format = FASTA; break;
 			case 'c': format = CMDLINE; break;
 			case 'p': packed = true; break;
@@ -231,22 +269,22 @@ static void parseOptions(int argc, const char **argv) {
 			case 'h':
 			case ARG_USAGE:
 				printUsage(cout);
-				throw 0;
+				abort = true;
 				break;
 			case ARG_BMAX:
-				bmax = parseNumber<uint32_t>(1, "--bmax arg must be at least 1");
-				bmaxMultSqrt = 0xffffffff; // don't use multSqrt
+				bmax = parseNumber<TIndexOffU>(1, "--bmax arg must be at least 1");
+				bmaxMultSqrt = OFF_MASK; // don't use multSqrt
 				bmaxDivN = 0xffffffff;     // don't use multSqrt
 				break;
 			case ARG_BMAX_MULT:
-				bmaxMultSqrt = parseNumber<uint32_t>(1, "--bmaxmultsqrt arg must be at least 1");
-				bmax = 0xffffffff;     // don't use bmax
+				bmaxMultSqrt = parseNumber<TIndexOffU>(1, "--bmaxmultsqrt arg must be at least 1");
+				bmax = OFF_MASK;     // don't use bmax
 				bmaxDivN = 0xffffffff; // don't use multSqrt
 				break;
 			case ARG_BMAX_DIV:
 				bmaxDivN = parseNumber<uint32_t>(1, "--bmaxdivn arg must be at least 1");
-				bmax = 0xffffffff;         // don't use bmax
-				bmaxMultSqrt = 0xffffffff; // don't use multSqrt
+				bmax = OFF_MASK;         // don't use bmax
+				bmaxMultSqrt = OFF_MASK; // don't use multSqrt
 				break;
 			case ARG_DCV:
 				dcv = parseNumber<int>(3, "--dcv arg must be at least 3");
@@ -257,6 +295,9 @@ static void parseOptions(int argc, const char **argv) {
 			case ARG_REVERSE_EACH:
 				reverseEach = true;
 				break;
+			case ARG_SA:
+				doSaFile = true;
+				break;
 			case ARG_NTOA: nsToAs = true; break;
 			case 'a': autoMem = false; break;
 			case 'q': verbose = false; break;
@@ -278,6 +319,7 @@ static void parseOptions(int argc, const char **argv) {
 		     << "extremely slow performance and memory exhaustion.  Perhaps you meant to specify" << endl
 		     << "a small --bmaxdivn?" << endl;
 	}
+	return abort;
 }
 
 EList<string> filesWritten;
@@ -331,7 +373,7 @@ static void driver(
 	} else {
 		// Adapt sequence files to ifstreams
 		for(size_t i = 0; i < infiles.size(); i++) {
-			FILE *f = fopen(infiles[i].c_str(), "r");
+			FILE *f = fopen(infiles[i].c_str(), "rb");
 			if (f == NULL) {
 				cerr << "Error: could not open "<< infiles[i].c_str() << endl;
 				throw 1;
@@ -353,6 +395,13 @@ static void driver(
 		cerr << "Warning: All fasta inputs were empty" << endl;
 		throw 1;
 	}
+	if(!reverse) {
+#ifdef BOWTIE_64BIT_INDEX
+		cerr << "Building a LARGE index" << endl;
+#else
+		cerr << "Building a SMALL index" << endl;
+#endif
+	}
 	// Vector for the ordered list of "records" comprising the input
 	// sequences.  A record represents a stretch of unambiguous
 	// characters in one of the input sequences.
@@ -362,8 +411,8 @@ static void driver(
 		if(verbose) cout << "Reading reference sizes" << endl;
 		Timer _t(cout, "  Time reading reference sizes: ", verbose);
 		if(!reverse && (writeRef || justRef)) {
-			filesWritten.push_back(outfile + ".3.bt2");
-			filesWritten.push_back(outfile + ".4.bt2");
+			filesWritten.push_back(outfile + ".3." + gEbwt_ext);
+			filesWritten.push_back(outfile + ".4." + gEbwt_ext);
 			sztot = BitPairReference::szsFromFasta(is, outfile, bigEndian, refparams, szs, sanityCheck);
 		} else {
 			sztot = BitPairReference::szsFromFasta(is, string(), bigEndian, refparams, szs, sanityCheck);
@@ -374,8 +423,8 @@ static void driver(
 	assert_gt(sztot.second, 0);
 	assert_gt(szs.size(), 0);
 	// Construct index from input strings and parameters
-	filesWritten.push_back(outfile + ".1.bt2");
-	filesWritten.push_back(outfile + ".2.bt2");
+	filesWritten.push_back(outfile + ".1." + gEbwt_ext);
+	filesWritten.push_back(outfile + ".2." + gEbwt_ext);
 	Ebwt ebwt(
 		TStr(),
 		packed,
@@ -393,10 +442,12 @@ static void driver(
 		noDc? 0 : dcv,// difference-cover period
 		is,           // list of input streams
 		szs,          // list of reference sizes
-		(uint32_t)sztot.first,  // total size of all unambiguous ref chars
+		(TIndexOffU)sztot.first,  // total size of all unambiguous ref chars
 		refparams,    // reference read-in parameters
 		seed,         // pseudo-random number generator seed
 		-1,           // override offRate
+		doSaFile,     // make a file with just the suffix array in it
+		doBwtFile,    // make a file with just the BWT string in it
 		verbose,      // be talkative
 		autoMem,      // pass exceptions up to the toplevel so that we can adjust memory settings automatically
 		sanityCheck); // verify results and internal consistency
@@ -425,7 +476,7 @@ static void driver(
 			SString<char> joinedss = Ebwt::join<SString<char> >(
 				is,          // list of input streams
 				szs,         // list of reference sizes
-				(uint32_t)sztot.first, // total size of all unambiguous ref chars
+				(TIndexOffU)sztot.first, // total size of all unambiguous ref chars
 				refparams,   // reference read-in parameters
 				seed);       // pseudo-random number generator seed
 			if(refparams.reverse == REF_READ_REVERSE) {
@@ -460,7 +511,9 @@ int bowtie_build(int argc, const char **argv) {
 		string infile;
 		EList<string> infiles(MISC_CAT);
 
-		parseOptions(argc, argv);
+		if(parseOptions(argc, argv)) {
+			return 0;
+		}
 		argv0 = argv[0];
 		if(showVersion) {
 			cout << argv0 << " version " << string(BOWTIE2_VERSION).c_str() << endl;
@@ -509,19 +562,19 @@ int bowtie_build(int argc, const char **argv) {
 		// Optionally summarize
 		if(verbose) {
 			cout << "Settings:" << endl
-				 << "  Output files: \"" << outfile.c_str() << ".*.bt2\"" << endl
+				 << "  Output files: \"" << outfile.c_str() << ".*." + gEbwt_ext + "\"" << endl
 				 << "  Line rate: " << lineRate << " (line is " << (1<<lineRate) << " bytes)" << endl
 				 << "  Lines per side: " << linesPerSide << " (side is " << ((1<<lineRate)*linesPerSide) << " bytes)" << endl
 				 << "  Offset rate: " << offRate << " (one in " << (1<<offRate) << ")" << endl
 				 << "  FTable chars: " << ftabChars << endl
 				 << "  Strings: " << (packed? "packed" : "unpacked") << endl
 				 ;
-			if(bmax == 0xffffffff) {
+			if(bmax == OFF_MASK) {
 				cout << "  Max bucket size: default" << endl;
 			} else {
 				cout << "  Max bucket size: " << bmax << endl;
 			}
-			if(bmaxMultSqrt == 0xffffffff) {
+			if(bmaxMultSqrt == OFF_MASK) {
 				cout << "  Max bucket size, sqrt multiplier: default" << endl;
 			} else {
 				cout << "  Max bucket size, sqrt multiplier: " << bmaxMultSqrt << endl;
diff --git a/bt2_dp.cpp b/bt2_dp.cpp
new file mode 100644
index 0000000..565518d
--- /dev/null
+++ b/bt2_dp.cpp
@@ -0,0 +1,788 @@
+/*
+ * Copyright 2013, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <getopt.h>
+#include "assert_helpers.h"
+#include "ds.h"
+#include "simple_func.h"
+#include "aligner_seed_policy.h"
+#include "scoring.h"
+#include "opts.h"
+#include "aligner_sw.h"
+
+using namespace std;
+
+int gVerbose;               // be talkative
+int gQuiet;                 // print nothing but the alignments
+static int sanityCheck;     // enable expensive sanity checks
+static int seed;            // srandom() seed
+static bool showVersion;    // just print version and quit?
+static uint32_t qUpto;      // max # of queries to read
+static int nthreads;        // number of pthreads operating concurrently
+static bool useSpinlock;    // false -> don't use spinlocks even if they're #defined
+static uint32_t skipReads;  // # reads/read pairs to skip
+int gGapBarrier;            // # diags on top/bot only to be entered diagonally
+static int bonusMatchType;  // how to reward matches
+static int bonusMatch;      // constant reward if bonusMatchType=constant
+static int penMmcType;      // how to penalize mismatches
+static int penMmcMax;       // max mm penalty
+static int penMmcMin;       // min mm penalty
+static int penNType;        // how to penalize Ns in the read
+static int penN;            // constant if N penalty is a constant
+static bool penNCatPair;    // concatenate mates before N filtering?
+static bool localAlign;     // do local alignment in DP steps
+static int   penRdGapConst;   // constant cost of extending a gap in the read
+static int   penRfGapConst;   // constant cost of extending a gap in the reference
+static int   penRdGapLinear;  // coeff of linear term for cost of gap extension in read
+static int   penRfGapLinear;  // coeff of linear term for cost of gap extension in ref
+static SimpleFunc scoreMin;   // minimum valid score as function of read len
+static SimpleFunc nCeil;      // max # Ns allowed as function of read len
+static SimpleFunc msIval;     // interval between seeds as function of read len
+static bool enable8;          // use 8-bit SSE where possible?
+static size_t cminlen;        // longer reads use checkpointing
+static size_t cpow2;          // checkpoint interval log2
+static bool doTri;            // do triangular mini-fills?
+static bool ignoreQuals;      // all mms incur same penalty, regardless of qual
+static EList<string> queries; // list of query files
+static string outfile;        // write output to this file
+
+static void resetOptions() { // put every file-level option back to its default
+	queries.clear();             // no query files yet
+	outfile.clear();             // no output file yet
+	gVerbose    = 0;
+	gQuiet      = false;
+	sanityCheck = 0;             // expensive sanity checks off
+	showVersion = false;         // don't just print version and quit
+	seed        = 0;             // srandom() seed
+	nthreads    = 1;             // number of pthreads operating concurrently
+	useSpinlock = true;          // false -> don't use spinlocks
+	skipReads   = 0;             // # reads/read pairs to skip
+	qUpto       = 0xffffffff;    // max # of queries to read
+	localAlign  = false;         // do local alignment in DP steps?
+	ignoreQuals = false;         // all mms incur same penalty, regardless of qual?
+	gGapBarrier = 4;             // disallow gaps within this many chars of either end of alignment
+	bonusMatchType = DEFAULT_MATCH_BONUS_TYPE;
+	bonusMatch     = DEFAULT_MATCH_BONUS;
+	penMmcType     = DEFAULT_MM_PENALTY_TYPE;
+	penMmcMax      = DEFAULT_MM_PENALTY_MAX;
+	penMmcMin      = DEFAULT_MM_PENALTY_MIN;
+	penNType       = DEFAULT_N_PENALTY_TYPE;
+	penN           = DEFAULT_N_PENALTY;
+	penNCatPair    = DEFAULT_N_CAT_PAIR;      // concatenate mates before N filtering?
+	penRdGapConst  = DEFAULT_READ_GAP_CONST;
+	penRfGapConst  = DEFAULT_REF_GAP_CONST;
+	penRdGapLinear = DEFAULT_READ_GAP_LINEAR;
+	penRfGapLinear = DEFAULT_REF_GAP_LINEAR;
+	scoreMin.init(SIMPLE_FUNC_LINEAR, DEFAULT_MIN_CONST, DEFAULT_MIN_LINEAR);
+	nCeil.init(SIMPLE_FUNC_LINEAR, 0.0f, std::numeric_limits<double>::max(), 2.0f, 0.1f);
+	msIval.init(SIMPLE_FUNC_LINEAR, 1.0f, std::numeric_limits<double>::max(), DEFAULT_IVAL_B, DEFAULT_IVAL_A);
+	enable8 = true;              // use 8-bit SSE where possible?
+	cminlen = 2000;              // longer reads use checkpointing
+	cpow2   = 4;                 // checkpoint interval log2
+	doTri   = false;             // do triangular mini-fills?
+}
+
+static const char *short_options = "s:u:hp:P:S:U:"; // must list every short option parseOption() handles; 's' and 'U' take arguments
+static struct option long_options[] = { // long name -> code passed to parseOption(); last entry terminates the table
+	{(char*)"verbose",          no_argument,       0, ARG_VERBOSE},
+	{(char*)"quiet",            no_argument,       0, ARG_QUIET},
+	{(char*)"sanity",           no_argument,       0, ARG_SANITY},
+	{(char*)"skip",             required_argument, 0, 's'},
+	{(char*)"qupto",            required_argument, 0, 'u'},
+	{(char*)"upto",             required_argument, 0, 'u'},
+	{(char*)"version",          no_argument,       0, ARG_VERSION},
+	{(char*)"help",             no_argument,       0, 'h'},
+	{(char*)"threads",          required_argument, 0, 'p'},
+	{(char*)"usage",            no_argument,       0, ARG_USAGE},
+	{(char*)"gbar",             required_argument, 0, ARG_GAP_BAR},
+	{(char*)"policy",           required_argument, 0, ARG_ALIGN_POLICY},
+	{(char*)"454",              no_argument,       0, ARG_NOISY_HPOLY},
+	{(char*)"ion-torrent",      no_argument,       0, ARG_NOISY_HPOLY},
+	{(char*)"local",            no_argument,       0, ARG_LOCAL},
+	{(char*)"end-to-end",       no_argument,       0, ARG_END_TO_END},
+	{(char*)"sse8",             no_argument,       0, ARG_SSE8},
+	{(char*)"no-sse8",          no_argument,       0, ARG_SSE8_NO},
+	{(char*)"ma",               required_argument, 0, ARG_SCORE_MA},
+	{(char*)"mp",               required_argument, 0, ARG_SCORE_MMP},
+	{(char*)"np",               required_argument, 0, ARG_SCORE_NP},
+	{(char*)"rdg",              required_argument, 0, ARG_SCORE_RDG},
+	{(char*)"rfg",              required_argument, 0, ARG_SCORE_RFG},
+	{(char*)"score-min",        required_argument, 0, ARG_SCORE_MIN},
+	{(char*)"min-score",        required_argument, 0, ARG_SCORE_MIN},
+	{(char*)"n-ceil",           required_argument, 0, ARG_N_CEIL},
+	{(char*)"ignore-quals",     no_argument,       0, ARG_IGNORE_QUALS},
+	{(char*)"output",           required_argument, 0, 'S'},
+	{(char*)"cp-min",           required_argument, 0, ARG_CP_MIN},
+	{(char*)"cp-ival",          required_argument, 0, ARG_CP_IVAL},
+	{(char*)"tri",              no_argument,       0, ARG_TRI},
+	{(char*)0, 0, 0, 0} // terminator
+};
+
+/**
+ * Write the command-line help text for this tool to 'out'.
+ */
+static void printUsage(ostream& out) {
+	const char *toolName = "bowtie2-dp";
+	out << "Bowtie 2 dynamic programming engine, by Ben Langmead (langmea at cs.jhu.edu, www.cs.jhu.edu/~langmea)" << endl;
+	// One statement per help section; the emitted text is a single stream
+	out << "Usage: " << endl
+	    << "  " << toolName << " [options]* <in> <out>" << endl
+	    << endl
+	    << "  <in>           File with DP input problems (default: stdin)" << endl
+	    << "  <out>          File with DP output solutions (default: stdout)" << endl
+	    << endl;
+	out << "Options (defaults in parentheses):" << endl
+	    << endl;
+	out << " Input:" << endl
+	    << "  -s/--skip <int>    skip the first <int> problems in the input (none)" << endl
+	    << "  -u/--upto <int>    stop after first <int> problems (no limit)" << endl
+	    << endl;
+	out << " Alignment:" << endl
+	    << "  --n-ceil <func>    func for max # non-A/C/G/Ts permitted in aln (L,0,0.15)" << endl
+	    << "  --gbar <int>       disallow gaps within <int> nucs of read extremes (4)" << endl
+	    << "  --ignore-quals     treat all quality values as 30 on Phred scale (off)" << endl
+	    << endl
+	    << "  --end-to-end       entire read must align; no clipping (on)" << endl
+	    << "   OR" << endl
+	    << "  --local            local alignment; ends might be soft clipped (off)" << endl
+	    << endl;
+	out << " Scoring:" << endl
+	    << "  --ma <int>         match bonus (0 for --end-to-end, 2 for --local) " << endl
+	    << "  --mp <int>         max penalty for mismatch; lower qual = lower penalty (6)" << endl
+	    << "  --np <int>         penalty for non-A/C/G/Ts in read/ref (1)" << endl
+	    << "  --rdg <int>,<int>  read gap open, extend penalties (5,3)" << endl
+	    << "  --rfg <int>,<int>  reference gap open, extend penalties (5,3)" << endl
+	    << "  --score-min <func> min acceptable alignment score w/r/t read length" << endl
+	    << "                     (G,20,8 for local, L,-0.6,-0.6 for end-to-end)" << endl
+	    << "  --quiet            print nothing to stderr except serious errors" << endl
+	    << endl;
+	out << " Performance:" << endl
+	    << "  -p/--threads <int> number of alignment threads to launch (1)" << endl
+	    << endl;
+	out << " Other:" << endl
+	    << "  --version          print version information and quit" << endl
+	    << "  -h/--help          print this usage message" << endl;
+}
+
+/**
+ * Parse a base-10 integer out of 'arg' and enforce lower <= value <= upper.
+ * If 'arg' is missing, is not a number in its entirety, or is out of range,
+ * print 'errmsg' followed by the usage message to stderr and throw 1.
+ */
+static int parseInt(int lower, int upper, const char *errmsg, const char *arg) {
+	bool ok = false;
+	long l = 0;
+	if(arg != NULL) {
+		char *endPtr = NULL;
+		l = strtol(arg, &endPtr, 10);
+		// strtol() always stores into endPtr, so a NULL test accepts anything:
+		// require that digits were consumed and that nothing follows them
+		ok = (endPtr != arg && *endPtr == '\0');
+	}
+	if(!ok || l < lower || l > upper) {
+		cerr << errmsg << endl;
+		printUsage(cerr);
+		throw 1;
+	}
+	return (int32_t)l;
+}
+
+/** Overload of parseInt() that enforces only the lower bound. */
+static int parseInt(int lower, const char *errmsg, const char *arg) {
+	// Delegate to the 4-argument form with the largest representable int
+	const int noUpperBound = std::numeric_limits<int>::max();
+	return parseInt(lower, noUpperBound, errmsg, arg);
+}
+
+/**
+ * Parse a T out of the C string 's'; returns T() when nothing can be read.
+ */
+template<typename T>
+T parse(const char *s) {
+	T tmp = T(); // extraction can leave tmp untouched (e.g. empty input)
+	std::stringstream ss(s);
+	ss >> tmp;
+	return tmp;
+}
+
+// Map a function-type token ("C"/"Constant", "L"/"Linear", "S"/"Sqrt",
+// "G"/"Log") to its SIMPLE_FUNC_* code; report and throw 1 on anything else.
+static int parseFuncType(const std::string& otype) {
+	const bool isConst  = (otype == "C" || otype == "Constant");
+	const bool isLinear = (otype == "L" || otype == "Linear");
+	const bool isSqrt   = (otype == "S" || otype == "Sqrt");
+	const bool isLog    = (otype == "G" || otype == "Log");
+	if(isConst)  return SIMPLE_FUNC_CONST;
+	if(isLinear) return SIMPLE_FUNC_LINEAR;
+	if(isSqrt)   return SIMPLE_FUNC_SQRT;
+	if(isLog)    return SIMPLE_FUNC_LOG;
+	std::cerr << "Error: Bad function type '" << otype.c_str()
+	          << "'.  Should be C (constant), L (linear), "
+	          << "S (square root) or G (natural log)." << std::endl;
+	throw 1;
+}
+
+#define PARSE_FUNC(fv) { /* fill SimpleFunc 'fv' from the caller's token list 'args' */ \
+	if(args.size() >= 1) { /* token 0: function type, see parseFuncType() */ \
+		fv.setType(parseFuncType(args[0])); \
+	} \
+	if(args.size() >= 2) { /* token 1: constant term */ \
+		double co; \
+		istringstream tmpss(args[1]); \
+		tmpss >> co; \
+		fv.setConst(co); \
+	} \
+	if(args.size() >= 3) { /* token 2: coefficient */ \
+		double ce; \
+		istringstream tmpss(args[2]); \
+		tmpss >> ce; \
+		fv.setCoeff(ce); \
+	} \
+	if(args.size() >= 4) { /* token 3: minimum */ \
+		double mn; \
+		istringstream tmpss(args[3]); \
+		tmpss >> mn; \
+		fv.setMin(mn); \
+	} \
+	if(args.size() >= 5) { /* token 4: maximum; must not overwrite the minimum */ \
+		double mx; \
+		istringstream tmpss(args[4]); \
+		tmpss >> mx; \
+		fv.setMax(mx); \
+	} \
+}
+
+/**
+ * Apply a single command-line option to the file-level option variables.
+ * next_option is the code returned by getopt_long() and arg is the option's
+ * argument (optarg), used only by options that take one.  Throws 0 after
+ * printing help; throws 1 after reporting a malformed or unknown option.
+ */
+static void parseOption(int next_option, const char *arg) {
+	switch (next_option) {
+		case 's':
+			skipReads = (uint32_t)parseInt(0, "-s arg must be positive", arg);
+			break;
+		case ARG_GAP_BAR:
+			gGapBarrier = parseInt(1, "--gbar must be no less than 1", arg);
+			break;
+		case 'u':
+			qUpto = (uint32_t)parseInt(1, "-u/--qupto arg must be at least 1", arg);
+			break;
+		case 'p':
+			nthreads = parseInt(1, "-p/--threads arg must be at least 1", arg);
+			break;
+		case 'h': printUsage(cout); throw 0; break;
+		case ARG_USAGE: printUsage(cout); throw 0; break;
+		case ARG_VERBOSE: gVerbose = 1; break;
+		case ARG_QUIET: gQuiet = true; break;
+		case ARG_SANITY: sanityCheck = true; break;
+		case ARG_CP_MIN:
+			cminlen = parse<size_t>(arg);
+			break;
+		case ARG_CP_IVAL:
+			cpow2 = parse<size_t>(arg);
+			break;
+		case ARG_TRI:
+			doTri = true;
+			break;
+		case ARG_LOCAL: localAlign = true; break;
+		case ARG_END_TO_END: localAlign = false; break;
+		case ARG_SSE8: enable8 = true; break;
+		case ARG_SSE8_NO: enable8 = false; break;
+		case ARG_IGNORE_QUALS: ignoreQuals = true; break;
+		case ARG_N_CEIL: {
+			// Split argument by comma
+			EList<string> args;
+			tokenize(arg, ",", args);
+			if(args.size() > 3) {
+				cerr << "Error: expected 3 or fewer comma-separated "
+					 << "arguments to --n-ceil option, got "
+					 << args.size() << endl;
+				throw 1;
+			}
+			if(args.size() == 0) {
+				cerr << "Error: expected at least one argument to --n-ceil option" << endl;
+				throw 1;
+			}
+			PARSE_FUNC(nCeil);
+			break;
+		}
+		case ARG_SCORE_MA: {
+			// Split argument by comma
+			EList<string> args;
+			tokenize(arg, ",", args);
+			if(args.size() != 1) {
+				cerr << "Error parsing --ma; RHS must have 1 token" << endl;
+				assert(false); throw 1;
+			}
+			istringstream tmpss(args[0]);
+			tmpss >> bonusMatch;
+			break;
+		}
+		case ARG_SCORE_MMP: {
+			// Split argument by comma
+			EList<string> args;
+			tokenize(arg, ",", args);
+			// An empty token list must be rejected before args[0] is read
+			if(args.size() == 0 || args.size() > 3) {
+				cerr << "Error parsing --mmp "
+				     << "; RHS must have between 1 and 3 tokens" << endl;
+				assert(false); throw 1;
+			}
+			if(args[0][0] == 'C') {
+				string tmp = args[0].substr(1);
+				// Parse constant penalty
+				istringstream tmpss(tmp);
+				tmpss >> penMmcMax;
+				penMmcMin = penMmcMax;
+				penMmcType = COST_MODEL_CONSTANT;
+			} else if(args[0][0] == 'Q') {
+				if(args.size() >= 2) {
+					string tmp = args[1];
+					istringstream tmpss(tmp);
+					tmpss >> penMmcMax;
+				} else {
+					penMmcMax = DEFAULT_MM_PENALTY_MAX;
+				}
+				if(args.size() >= 3) {
+					string tmp = args[2];
+					istringstream tmpss(tmp);
+					tmpss >> penMmcMin;
+				} else {
+					penMmcMin = DEFAULT_MM_PENALTY_MIN;
+				}
+				if(penMmcMin > penMmcMax) {
+					cerr << "Error: Maximum mismatch penalty (" << penMmcMax
+					     << ") is less than minimum penalty (" << penMmcMin
+						 << ")" << endl;
+					throw 1;
+				}
+				// Set type to =quality
+				penMmcType = COST_MODEL_QUAL;
+			} else if(args[0][0] == 'R') {
+				// Set type to=Maq-quality
+				penMmcType = COST_MODEL_ROUNDED_QUAL;
+			} else {
+				cerr << "Error parsing --mmp "
+				     << "; RHS must start with C, Q or R" << endl;
+				assert(false); throw 1;
+			}
+			break;
+		}
+		case ARG_SCORE_NP: {
+			// Split argument by comma
+			EList<string> args;
+			tokenize(arg, ",", args);
+			if(args.size() != 1) {
+				cerr << "Error parsing --np "
+				     << "; RHS must have 1 token" << endl;
+				assert(false); throw 1;
+			}
+			if(args[0][0] == 'C') {
+				string tmp = args[0].substr(1);
+				// Parse constant penalty
+				istringstream tmpss(tmp);
+				tmpss >> penN;
+				penNType = COST_MODEL_CONSTANT;
+			} else if(args[0][0] == 'Q') {
+				// Set type to =quality
+				penNType = COST_MODEL_QUAL;
+			} else if(args[0][0] == 'R') {
+				// Set type to=Maq-quality
+				penNType = COST_MODEL_ROUNDED_QUAL;
+			} else {
+				cerr << "Error parsing --np "
+				     << "; RHS must start with C, Q or R" << endl;
+				assert(false); throw 1;
+			}
+			break; // must not fall through into the --rdg handler
+		}
+		case ARG_SCORE_RDG: {
+			EList<string> args;
+			tokenize(arg, ",", args);
+			if(args.size() >= 1) {
+				istringstream tmpss(args[0]);
+				tmpss >> penRdGapConst;
+			} else {
+				penRdGapConst = DEFAULT_READ_GAP_CONST;
+			}
+			if(args.size() >= 2) {
+				istringstream tmpss(args[1]);
+				tmpss >> penRdGapLinear;
+			} else {
+				penRdGapLinear = DEFAULT_READ_GAP_LINEAR;
+			}
+			break; // must not fall through into the --rfg handler
+		}
+		case ARG_SCORE_RFG: {
+			EList<string> args;
+			tokenize(arg, ",", args);
+			if(args.size() >= 1) {
+				istringstream tmpss(args[0]);
+				tmpss >> penRfGapConst;
+			} else {
+				penRfGapConst = DEFAULT_REF_GAP_CONST;
+			}
+			if(args.size() >= 2) {
+				istringstream tmpss(args[1]);
+				tmpss >> penRfGapLinear;
+			} else {
+				penRfGapLinear = DEFAULT_REF_GAP_LINEAR;
+			}
+			break; // must not fall through into the --score-min handler
+		}
+		case ARG_SCORE_MIN: {
+			EList<string> args;
+			tokenize(arg, ",", args);
+			if(args.size() > 3 || args.size() == 0) {
+				cerr << "Error: expected between 1 and 3 comma-separated "
+					 << "arguments to --score-min option, got "
+					 << args.size() << endl;
+				throw 1;
+			}
+			PARSE_FUNC(scoreMin);
+			break;
+		}
+		case 'S': outfile = arg; break;
+		case 'U': {
+			EList<string> args;
+			tokenize(arg, ",", args);
+			for(size_t i = 0; i < args.size(); i++) {
+				queries.push_back(args[i]);
+			}
+			break;
+		}
+		case ARG_VERSION: showVersion = 1; break;
+		default:
+			printUsage(cerr);
+			throw 1;
+	}
+}
+
+/**
+ * Feed every option that getopt_long() finds in argv to parseOption(),
+ * then reconcile settings that depend on one another.
+ */
+static void parseOptions(int argc, const char **argv) {
+	int longIdx = 0;
+	char **mutableArgv = const_cast<char**>(argv);
+	for(;;) {
+		const int opt = getopt_long(
+			argc, mutableArgv, short_options, long_options, &longIdx);
+		if(opt == EOF) {
+			break;
+		}
+		parseOption(opt, optarg);
+	}
+	// rdids are all shifted up by skipReads, so when -s and -u are both
+	// used the -u limit is shifted by the same amount; qUpto is left alone
+	// when the unsigned sum would not be larger (wrap-around or no skip)
+	const uint32_t shiftedUpto = qUpto + skipReads;
+	if(shiftedUpto > qUpto) {
+		qUpto = shiftedUpto;
+	}
+	if(gGapBarrier < 1) {
+		cerr << "Warning: --gbar was set less than 1 (=" << gGapBarrier
+		     << "); setting to 1 instead" << endl;
+		gGapBarrier = 1;
+	}
+#ifndef NDEBUG
+	if(!gQuiet) {
+		cerr << "Warning: Running in debug mode.  Please use debug mode only "
+			 << "for diagnosing errors, and not for typical use of Bowtie 2."
+			 << endl;
+	}
+#endif
+}
+
+struct DpProblem { // one logged DP problem; fields are filled in by DpLogReader::nextRead()
+	void reset() { // clears only the reference string; the other fields keep their values
+		ref.clear();
+	}
+
+	TRefId   refidx; // ref idx
+	TRefOff  reflen; // ref length
+	TAlScore minsc; // minimum score
+	BTString ref; // reference string
+	bool     fw; // read orientation: true when the log says '+'
+	DPRect   rect; // DP rectangle
+	bool     aligned; // whether the DP alignment was successful
+	TAlScore score; // alignment score
+};
+
+/**
+ * Reads DP problems from a log file.  Each non-empty line holds a read
+ * sequence, its qualities and one or more comma-separated problem records,
+ * all separated by tabs (see nextRead()).
+ */
+class DpLogReader {
+public:
+	DpLogReader() { }
+	~DpLogReader() { reset(); }
+
+	/** Close any file that is open, then open 'fn' for reading. */
+	void init(const string& fn) {
+		reset();
+		fn_ = fn;
+		ih_.open(fn_.c_str());
+		// static ios_base member: affects the standard streams, not ih_
+		ih_.sync_with_stdio(false);
+	}
+
+	/** Close the underlying file if it is open. */
+	void reset() {
+		if(ih_.is_open()) {
+			ih_.close();
+		}
+	}
+
+	/**
+	 * Parse the next non-empty line: install the read into 'seq'/'qual' and
+	 * append one DpProblem per remaining tab-separated field to 'refs'.
+	 * Returns false once the input is exhausted.
+	 */
+	bool nextRead(
+		BTDnaString& seq,
+		BTString& qual,
+		EList<DpProblem>& refs)
+	{
+		if(done()) {
+			return false;
+		}
+		ln_.clear();
+		getline(ih_, ln_);
+		while(ln_.empty() && ih_.good()) {
+			getline(ih_, ln_);
+		}
+		if(ln_.empty() && !ih_.good()) {
+			return false;
+		}
+		EList<string> buf;
+		tokenize(ln_, '\t', buf);
+		assert_gt(buf.size(), 2);
+		seq.install(buf[0].c_str(), true);
+		qual = buf[1];
+		for(size_t i = 2; i < buf.size(); i++) {
+			refs.expand();
+			DpProblem& p = refs.back();
+			istringstream is(buf[i]);
+			char comma = 0, tmp = 0;
+			// ref idx
+			is >> p.refidx; is >> comma; assert_eq(',', comma);
+			// ref length
+			is >> p.reflen; is >> comma; assert_eq(',', comma);
+			// minimum score
+			is >> p.minsc; is >> comma; assert_eq(',', comma);
+			// read orientation
+			is >> tmp;
+			assert(tmp == '-' || tmp == '+');
+			p.fw = (tmp == '+');
+			is >> comma; assert_eq(',', comma);
+			// DP rectangle
+			is >> p.rect.refl; is >> comma; assert_eq(',', comma);
+			is >> p.rect.refr; is >> comma; assert_eq(',', comma);
+			is >> p.rect.refl_pretrim; is >> comma; assert_eq(',', comma);
+			is >> p.rect.refr_pretrim; is >> comma; assert_eq(',', comma);
+			is >> p.rect.triml; is >> comma; assert_eq(',', comma);
+			is >> p.rect.trimr; is >> comma; assert_eq(',', comma);
+			is >> p.rect.corel; is >> comma; assert_eq(',', comma);
+			is >> p.rect.corer; is >> comma; assert_eq(',', comma);
+			is >> p.rect.maxgap; is >> comma; assert_eq(',', comma);
+			// reference string: everything up to the next comma.  Also stop
+			// when extraction fails, so that a record lacking the terminating
+			// comma cannot loop forever appending to 'ref'.
+			string ref;
+			char c;
+			while((is >> c) && c != ',') {
+				ref.push_back(c);
+			}
+			p.ref.install(ref.c_str());
+			// translate each character through asc2dnamask (15 becomes 16)
+			for(size_t k = 0; k < ref.length(); k++) {
+				int m = asc2dnamask[(int)p.ref[k]];
+				if(m == 15) {
+					m = 16; // N
+				}
+				p.ref.set(m, k);
+			}
+			// whether the DP alignment was successful
+			int aligned = 0;
+			is >> aligned;
+			p.aligned = (aligned == 1);
+			// alignment score
+			is >> comma; assert_eq(',', comma);
+			is >> p.score;
+		}
+		return true;
+	}
+
+	/** True once the stream is unreadable (EOF or error, including a failed open). */
+	bool done() const {
+		return !ih_.good();
+	}
+
+protected:
+	string   fn_; // file name
+	ifstream ih_; // file handle
+	string   ln_; // line buffer
+};
+
+int main(int argc, const char **argv) { // re-run logged DP problems through SwAligner and report throughput
+	try {
+		// Reset all global state, including getopt state
+		opterr = optind = 1;
+		resetOptions();
+		parseOptions(argc, argv);
+		if(showVersion) {
+			if(sizeof(void*) == 4) {
+				cout << "32-bit" << endl;
+			} else if(sizeof(void*) == 8) {
+				cout << "64-bit" << endl;
+			} else {
+				cout << "Neither 32- nor 64-bit: sizeof(void*) = " << sizeof(void*) << endl;
+			}
+			cout << "Sizeof {int, long, long long, void*, size_t, off_t}: {"
+				 << sizeof(int)
+				 << ", " << sizeof(long) << ", " << sizeof(long long)
+				 << ", " << sizeof(void *) << ", " << sizeof(size_t)
+				 << ", " << sizeof(off_t) << "}" << endl;
+			return 0;
+		}
+		while(optind < argc) {
+			queries.push_back(argv[optind++]);
+		}
+		{
+			// Optionally summarize
+			if(gVerbose) {
+				cout << "DP inputs:" << endl;
+				for(size_t i = 0; i < queries.size(); i++) {
+					cout << "  " << queries[i].c_str() << endl;
+				}
+				cout << "Output file: \"" << outfile.c_str() << "\"" << endl;
+				cout << "Sanity checking: " << (sanityCheck? "enabled":"disabled") << endl;
+			#ifdef NDEBUG
+				cout << "Assertions: disabled" << endl;
+			#else
+				cout << "Assertions: enabled" << endl;
+			#endif
+			}
+		}
+		// Build the aligner, log reader and scoring scheme from the parsed options
+		SwAligner sw(NULL);
+		DpLogReader logrd;
+		Scoring sc(
+			bonusMatch,     // constant reward for match
+			penMmcType,     // how to penalize mismatches
+			penMmcMax,      // max mm penalty
+			penMmcMin,      // min mm penalty
+			scoreMin,       // min score as function of read len
+			nCeil,          // max # Ns as function of read len
+			penNType,       // how to penalize Ns in the read
+			penN,           // constant if N penalty is a constant
+			penNCatPair,    // whether to concat mates before N filtering
+			penRdGapConst,  // constant coeff for read gap cost
+			penRfGapConst,  // constant coeff for ref gap cost
+			penRdGapLinear, // linear coeff for read gap cost
+			penRfGapLinear, // linear coeff for ref gap cost
+			gGapBarrier);   // # rows at top/bot only entered diagonally
+		RandomSource rnd(seed);
+		{
+			Timer tim(std::cerr, "Alignment ", true);
+			BTDnaString seq, seqrc;
+			BTString qual, qualrc;
+			EList<DpProblem> probs;
+			size_t qid = 0;
+			size_t totnuc = 0, totcup = 0;
+			for(size_t i = 0; i < queries.size(); i++) {
+				logrd.init(queries[i]);
+				while(logrd.nextRead(seq, qual, probs)) {
+					totnuc += seq.length();
+					seqrc = seq;
+					seqrc.reverseComp();
+					qualrc = qual;
+					qualrc.reverse();
+					//cerr << "Initing read with " << probs.size() << " problems" << endl;
+					sw.initRead(seq, seqrc, qual, qualrc, 0, seq.length(), sc);
+					// 'extend' is handed unchanged to every initRef() call below
+					bool extend = true;
+					for(size_t j = 0; j < probs.size(); j++) {
+						sw.initRef(
+							probs[j].fw,
+							probs[j].refidx,
+							probs[j].rect,
+							const_cast<char *>(probs[j].ref.toZBuf()),
+							0,
+							probs[j].ref.length(),
+							probs[j].reflen,
+							sc,
+							probs[j].minsc,
+							enable8,
+							cminlen,
+							cpow2,
+							doTri,
+							extend);
+						// Now fill the dynamic programming matrix and return true iff
+						// there is at least one valid alignment
+						TAlScore bestCell = std::numeric_limits<TAlScore>::min();
+						ASSERT_ONLY(bool aligned =) sw.align(bestCell);
+						assert(aligned == probs[j].aligned);
+						assert(!aligned || bestCell == probs[j].score);
+						totcup += (seq.length() * probs[j].ref.length()); // read length x reference length
+					}
+					seq.clear();  seqrc.clear();
+					qual.clear(); qualrc.clear();
+					probs.clear();
+					qid++;
+				}
+			}
+			size_t el = (size_t)tim.elapsed(); // cast to an integer; the rates below stay 0.0 when it is 0
+			double cups = 0.0;
+			double totnucps = 0.0;
+			double readps = 0.0;
+			if(el > 0) {
+				cups = totcup / (double)el;
+				totnucps = totnuc / (double)el;
+				readps = qid / (double)el;
+			}
+			cerr << qid << " reads" << endl;
+			cerr << std::setprecision(4) << "  " << readps << " reads per second" << endl;
+			cerr << totnuc << " nucleotides" << endl;
+			cerr << std::setprecision(4) << "  " << totnucps << " nucleotides per second" << endl;
+			cerr << totcup << " cell updates" << endl;
+			cerr << std::setprecision(4) << "  " << cups << " cell updates per second (CUPS)" << endl;
+		}
+		return 0;
+	} catch(std::exception& e) {
+		cerr << "Error: Encountered exception: '" << e.what() << "'" << endl;
+		cerr << "Command: ";
+		for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
+		cerr << endl;
+		return 1;
+	} catch(int e) { // the option-parsing code throws ints: 0 after printing help, 1 on a bad option
+		if(e != 0) {
+			cerr << "Error: Encountered internal Bowtie 2 exception (#" << e << ")" << endl;
+			cerr << "Command: ";
+			for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
+			cerr << endl;
+		}
+		return e;
+	}
+}
+
diff --git a/bt2_idx.cpp b/bt2_idx.cpp
index 284d910..f6b44a9 100644
--- a/bt2_idx.cpp
+++ b/bt2_idx.cpp
@@ -26,6 +26,18 @@
 
 using namespace std;
 
+#ifdef BOWTIE_64BIT_INDEX
+
+const std::string gEbwt_ext("bt2l");
+
+#else
+
+const std::string gEbwt_ext("bt2");
+
+#endif  // BOWTIE_64BIT_INDEX
+
+string gLastIOErrMsg;
+
 ///////////////////////////////////////////////////////////////////////
 //
 // Functions for searching Ebwts
@@ -40,32 +52,32 @@ using namespace std;
  * sorted list of reference fragment ranges t
  */
 void Ebwt::joinedToTextOff(
-	uint32_t qlen,
-	uint32_t off,
-	uint32_t& tidx,
-    uint32_t& textoff,
-    uint32_t& tlen,
+	TIndexOffU qlen, 
+	TIndexOffU off,
+	TIndexOffU& tidx,
+	TIndexOffU& textoff,
+	TIndexOffU& tlen,
 	bool rejectStraddle,
 	bool& straddled) const
 {
 	assert(rstarts() != NULL); // must have loaded rstarts
-	uint32_t top = 0;
-	uint32_t bot = _nFrag; // 1 greater than largest addressable element
-	uint32_t elt = 0xffffffff;
+	TIndexOffU top = 0;
+	TIndexOffU bot = _nFrag; // 1 greater than largest addressable element
+	TIndexOffU elt = OFF_MASK;
 	// Begin binary search
 	while(true) {
-		ASSERT_ONLY(uint32_t oldelt = elt);
+		ASSERT_ONLY(TIndexOffU oldelt = elt);
 		elt = top + ((bot - top) >> 1);
 		assert_neq(oldelt, elt); // must have made progress
-		uint32_t lower = rstarts()[elt*3];
-		uint32_t upper;
+		TIndexOffU lower = rstarts()[elt*3];
+		TIndexOffU upper;
 		if(elt == _nFrag-1) {
 			upper = _eh._len;
 		} else {
 			upper = rstarts()[((elt+1)*3)];
 		}
 		assert_gt(upper, lower);
-		uint32_t fraglen = upper - lower;
+		TIndexOffU fraglen = upper - lower;
 		if(lower <= off) {
 			if(upper > off) { // not last element, but it's within
 				// off is in this range; check if it falls off
@@ -73,7 +85,7 @@ void Ebwt::joinedToTextOff(
 					straddled = true;
 					if(rejectStraddle) {
 						// it falls off; signal no-go and return
-						tidx = 0xffffffff;
+						tidx = OFF_MASK;
 						assert_lt(elt, _nFrag-1);
 						return;
 					}
@@ -86,7 +98,7 @@ void Ebwt::joinedToTextOff(
 				// it doesn't fall off; now calculate textoff.
 				// Initially it's the number of characters that precede
 				// the alignment in the fragment
-				uint32_t fragoff = off - rstarts()[(elt*3)];
+				TIndexOffU fragoff = off - rstarts()[(elt*3)];
 				if(!this->fw_) {
 					fragoff = fraglen - fragoff - 1;
 					fragoff -= (qlen-1);
@@ -113,17 +125,17 @@ void Ebwt::joinedToTextOff(
 
 /**
  * Walk 'steps' steps to the left and return the row arrived at.  If we
- * walk through the dollar sign, return 0xffffffff.
+ * walk through the dollar sign, return max value.
  */
-uint32_t Ebwt::walkLeft(uint32_t row, uint32_t steps) const {
+TIndexOffU Ebwt::walkLeft(TIndexOffU row, TIndexOffU steps) const {
 	assert(offs() != NULL);
-	assert_neq(0xffffffff, row);
+	assert_neq(OFF_MASK, row);
 	SideLocus l;
 	if(steps > 0) l.initFromRow(row, _eh, ebwt());
 	while(steps > 0) {
-		if(row == _zOff) return 0xffffffff;
-		uint32_t newrow = this->mapLF(l ASSERT_ONLY(, false));
-		assert_neq(0xffffffff, newrow);
+		if(row == _zOff) return OFF_MASK;
+		TIndexOffU newrow = this->mapLF(l ASSERT_ONLY(, false));
+		assert_neq(OFF_MASK, newrow);
 		assert_neq(newrow, row);
 		row = newrow;
 		steps--;
@@ -135,18 +147,18 @@ uint32_t Ebwt::walkLeft(uint32_t row, uint32_t steps) const {
 /**
  * Resolve the reference offset of the BW element 'elt'.
  */
-uint32_t Ebwt::getOffset(uint32_t row) const {
+TIndexOffU Ebwt::getOffset(TIndexOffU row) const {
 	assert(offs() != NULL);
-	assert_neq(0xffffffff, row);
+	assert_neq(OFF_MASK, row);
 	if(row == _zOff) return 0;
 	if((row & _eh._offMask) == row) return this->offs()[row >> _eh._offRate];
-	int jumps = 0;
+	TIndexOffU jumps = 0;
 	SideLocus l;
 	l.initFromRow(row, _eh, ebwt());
 	while(true) {
-		uint32_t newrow = this->mapLF(l ASSERT_ONLY(, false));
+		TIndexOffU newrow = this->mapLF(l ASSERT_ONLY(, false));
 		jumps++;
-		assert_neq(0xffffffff, newrow);
+		assert_neq(OFF_MASK, newrow);
 		assert_neq(newrow, row);
 		row = newrow;
 		if(row == _zOff) {
@@ -163,13 +175,13 @@ uint32_t Ebwt::getOffset(uint32_t row) const {
  * the offset returned is at the right-hand side of the forward
  * reference substring involved in the hit.
  */
-uint32_t Ebwt::getOffset(
-	uint32_t elt,
+TIndexOffU Ebwt::getOffset(
+	TIndexOffU elt,
 	bool fw,
-	uint32_t hitlen) const
+	TIndexOffU hitlen) const
 {
-	uint32_t off = getOffset(elt);
-	assert_neq(0xffffffff, off);
+	TIndexOffU off = getOffset(elt);
+	assert_neq(OFF_MASK, off);
 	if(!fw) {
 		assert_lt(off, _eh._len);
 		off = _eh._len - off - 1;
@@ -187,8 +199,8 @@ uint32_t Ebwt::getOffset(
  */
 bool Ebwt::contains(
 	const BTDnaString& str,
-	uint32_t *otop,
-	uint32_t *obot) const
+	TIndexOffU *otop,
+	TIndexOffU *obot) const
 {
 	assert(isInMemory());
 	SideLocus tloc, bloc;
@@ -198,7 +210,7 @@ bool Ebwt::contains(
 	}
 	int c = str[str.length()-1];
 	assert_range(0, 4, c);
-	uint32_t top = 0, bot = 0;
+	TIndexOffU top = 0, bot = 0;
 	if(c < 4) {
 		top = fchr()[c];
 		bot = fchr()[c+1];
@@ -219,15 +231,15 @@ bool Ebwt::contains(
 	assert_geq(bot, top);
 	tloc.initFromRow(top, eh(), ebwt());
 	bloc.initFromRow(bot, eh(), ebwt());
-	ASSERT_ONLY(uint32_t lastDiff = bot - top);
-	for(int i = (int)str.length()-2; i >= 0; i--) {
+	ASSERT_ONLY(TIndexOffU lastDiff = bot - top);
+	for(TIndexOff i = (TIndexOff)str.length()-2; i >= 0; i--) {
 		c = str[i];
 		assert_range(0, 4, c);
 		if(c <= 3) {
 			top = mapLF(tloc, c);
 			bot = mapLF(bloc, c);
 		} else {
-			size_t sz = bot - top;
+			TIndexOffU sz = bot - top;
 			int c1 = mapLF1(top, tloc ASSERT_ONLY(, false));
 			bot = mapLF(bloc, c1);
 			assert_leq(bot - top, sz);
@@ -266,14 +278,14 @@ string adjustEbwtBase(const string& cmdline,
 	string str = ebwtFileBase;
 	ifstream in;
 	if(verbose) cout << "Trying " << str.c_str() << endl;
-	in.open((str + ".1.bt2").c_str(), ios_base::in | ios::binary);
+	in.open((str + ".1." + gEbwt_ext).c_str(), ios_base::in | ios::binary);
 	if(!in.is_open()) {
 		if(verbose) cout << "  didn't work" << endl;
 		in.close();
 		if(getenv("BOWTIE2_INDEXES") != NULL) {
 			str = string(getenv("BOWTIE2_INDEXES")) + "/" + ebwtFileBase;
 			if(verbose) cout << "Trying " << str.c_str() << endl;
-			in.open((str + ".1.bt2").c_str(), ios_base::in | ios::binary);
+			in.open((str + ".1." + gEbwt_ext).c_str(), ios_base::in | ios::binary);
 			if(!in.is_open()) {
 				if(verbose) cout << "  didn't work" << endl;
 				in.close();
diff --git a/bt2_idx.h b/bt2_idx.h
index c5221dc..93330e8 100644
--- a/bt2_idx.h
+++ b/bt2_idx.h
@@ -53,14 +53,24 @@
 #include "ds.h"
 #include "random_source.h"
 #include "mem_ids.h"
+#include "btypes.h"
 
-using namespace std;
+#ifdef POPCNT_CAPABILITY 
+    #include "processor_support.h" 
+#endif 
 
-#define EBWT_EXT ".bt2"
+using namespace std;
 
 // From ccnt_lut.cpp, automatically generated by gen_lookup_tables.pl
 extern uint8_t cCntLUT_4[4][4][256];
 
+static const uint64_t c_table[4] = {
+	0xffffffffffffffff,
+	0xaaaaaaaaaaaaaaaa,
+	0x5555555555555555,
+	0x0000000000000000
+};
+
 #ifndef VMSG_NL
 #define VMSG_NL(...) \
 if(this->verbose()) { \
@@ -100,7 +110,7 @@ public:
 	EbwtParams() { }
 
 	EbwtParams(
-		uint32_t len,
+		TIndexOffU len,
 		int32_t lineRate,
 		int32_t offRate,
 		int32_t ftabChars,
@@ -116,7 +126,7 @@ public:
 	}
 
 	void init(
-		uint32_t len,
+		TIndexOffU len,
 		int32_t lineRate,
 		int32_t offRate,
 		int32_t ftabChars,
@@ -132,17 +142,17 @@ public:
 		_lineRate = lineRate;
 		_origOffRate = offRate;
 		_offRate = offRate;
-		_offMask = 0xffffffff << _offRate;
+		_offMask = OFF_MASK << _offRate;
 		_ftabChars = ftabChars;
 		_eftabLen = _ftabChars*2;
-		_eftabSz = _eftabLen*4;
+		_eftabSz = _eftabLen*OFF_SIZE;
 		_ftabLen = (1 << (_ftabChars*2))+1;
-		_ftabSz = _ftabLen*4;
+		_ftabSz = _ftabLen*OFF_SIZE;
 		_offsLen = (_bwtLen + (1 << _offRate) - 1) >> _offRate;
-		_offsSz = _offsLen*4;
+		_offsSz = (uint64_t)_offsLen*OFF_SIZE;
 		_lineSz = 1 << _lineRate;
 		_sideSz = _lineSz * 1 /* lines per side */;
-		_sideBwtSz = _sideSz - 16;
+		_sideBwtSz = _sideSz - OFF_SIZE*4;
 		_sideBwtLen = _sideBwtSz*4;
 		_numSides = (_bwtSz+(_sideBwtSz)-1)/(_sideBwtSz);
 		_numLines = _numSides * 1 /* lines per side */;
@@ -151,30 +161,30 @@ public:
 		assert(repOk());
 	}
 
-	uint32_t len() const           { return _len; }
-	uint32_t lenNucs() const       { return _len + (_color ? 1 : 0); }
-	uint32_t bwtLen() const        { return _bwtLen; }
-	uint32_t sz() const            { return _sz; }
-	uint32_t bwtSz() const         { return _bwtSz; }
-	int32_t  lineRate() const      { return _lineRate; }
-	int32_t  origOffRate() const   { return _origOffRate; }
-	int32_t  offRate() const       { return _offRate; }
-	uint32_t offMask() const       { return _offMask; }
-	int32_t  ftabChars() const     { return _ftabChars; }
-	uint32_t eftabLen() const      { return _eftabLen; }
-	uint32_t eftabSz() const       { return _eftabSz; }
-	uint32_t ftabLen() const       { return _ftabLen; }
-	uint32_t ftabSz() const        { return _ftabSz; }
-	uint32_t offsLen() const       { return _offsLen; }
-	uint32_t offsSz() const        { return _offsSz; }
-	uint32_t lineSz() const        { return _lineSz; }
-	uint32_t sideSz() const        { return _sideSz; }
-	uint32_t sideBwtSz() const     { return _sideBwtSz; }
-	uint32_t sideBwtLen() const    { return _sideBwtLen; }
-	uint32_t numSides() const      { return _numSides; }
-	uint32_t numLines() const      { return _numLines; }
-	uint32_t ebwtTotLen() const    { return _ebwtTotLen; }
-	uint32_t ebwtTotSz() const     { return _ebwtTotSz; }
+	TIndexOffU len() const           { return _len; }
+	TIndexOffU lenNucs() const       { return _len + (_color ? 1 : 0); }
+	TIndexOffU bwtLen() const        { return _bwtLen; }
+	TIndexOffU sz() const            { return _sz; }
+	TIndexOffU bwtSz() const         { return _bwtSz; }
+	int32_t   lineRate() const      { return _lineRate; }
+	int32_t   origOffRate() const   { return _origOffRate; }
+	int32_t   offRate() const       { return _offRate; }
+	TIndexOffU offMask() const       { return _offMask; }
+	int32_t   ftabChars() const     { return _ftabChars; }
+	int32_t eftabLen() const      { return _eftabLen; } 
+	int32_t eftabSz() const       { return _eftabSz; } 
+	TIndexOffU ftabLen() const       { return _ftabLen; }
+	TIndexOffU ftabSz() const        { return _ftabSz; }
+	TIndexOffU offsLen() const       { return _offsLen; }
+	uint64_t offsSz() const        { return _offsSz; }
+	int32_t lineSz() const        { return _lineSz; } 
+	int32_t sideSz() const        { return _sideSz; } 
+	int32_t sideBwtSz() const     { return _sideBwtSz; } 
+	int32_t sideBwtLen() const    { return _sideBwtLen; } 
+	TIndexOffU numSides() const      { return _numSides; }
+	TIndexOffU numLines() const      { return _numLines; }
+	TIndexOffU ebwtTotLen() const    { return _ebwtTotLen; }
+	TIndexOffU ebwtTotSz() const     { return _ebwtTotSz; }
 	bool color() const             { return _color; }
 	bool entireReverse() const     { return _entireReverse; }
 
@@ -184,9 +194,9 @@ public:
 	 */
 	void setOffRate(int __offRate) {
 		_offRate = __offRate;
-		_offMask = 0xffffffff << _offRate;
+		_offMask = OFF_MASK << _offRate;
 		_offsLen = (_bwtLen + (1 << _offRate) - 1) >> _offRate;
-		_offsSz = _offsLen*4;
+		_offsSz = (uint64_t)_offsLen * OFF_SIZE;
 	}
 
 #ifndef NDEBUG
@@ -197,10 +207,7 @@ public:
 		assert_geq(_offRate, 0);
 		assert_leq(_ftabChars, 16);
 		assert_geq(_ftabChars, 1);
-		// Only 6 supported for now, due to hardcoded constants in
-		// SideLocus.
-		assert_eq(6, _lineRate);
-		//assert_lt(_lineRate, 32);
+		assert_lt(_lineRate, 32);
 		assert_lt(_ftabChars, 32);
 		assert_eq(0, _ebwtTotSz % _lineSz);
 		return true;
@@ -238,29 +245,29 @@ public:
 		    << "    reverse: "      << _entireReverse << endl;
 	}
 
-	uint32_t _len;
-	uint32_t _bwtLen;
-	uint32_t _sz;
-	uint32_t _bwtSz;
+	TIndexOffU _len;
+	TIndexOffU _bwtLen;
+	TIndexOffU _sz;
+	TIndexOffU _bwtSz;
 	int32_t  _lineRate;
 	int32_t  _origOffRate;
 	int32_t  _offRate;
-	uint32_t _offMask;
+	TIndexOffU _offMask;
 	int32_t  _ftabChars;
 	uint32_t _eftabLen;
 	uint32_t _eftabSz;
-	uint32_t _ftabLen;
-	uint32_t _ftabSz;
-	uint32_t _offsLen;
-	uint32_t _offsSz;
-	uint32_t _lineSz;
-	uint32_t _sideSz;
-	uint32_t _sideBwtSz;
-	uint32_t _sideBwtLen;
-	uint32_t _numSides;
-	uint32_t _numLines;
-	uint32_t _ebwtTotLen;
-	uint32_t _ebwtTotSz;
+	TIndexOffU _ftabLen; 
+	TIndexOffU _ftabSz; 
+	TIndexOffU _offsLen;
+	uint64_t _offsSz;
+	uint32_t _lineSz; 
+	uint32_t _sideSz; 
+	uint32_t _sideBwtSz; 
+	uint32_t _sideBwtLen; 
+	TIndexOffU _numSides;
+	TIndexOffU _numLines;
+	TIndexOffU _ebwtTotLen;
+	TIndexOffU _ebwtTotSz;
 	bool     _color;
 	bool     _entireReverse;
 };
@@ -302,7 +309,7 @@ struct SideLocus {
 	/**
 	 * Construct from row and other relevant information about the Ebwt.
 	 */
-	SideLocus(uint32_t row, const EbwtParams& ep, const uint8_t* ebwt) {
+	SideLocus(TIndexOffU row, const EbwtParams& ep, const uint8_t* ebwt) {
 		initFromRow(row, ep, ebwt);
 	}
 
@@ -311,20 +318,20 @@ struct SideLocus {
 	 * from one call to initFromRow to possibly avoid a second call.
 	 */
 	static void initFromTopBot(
-		uint32_t top,
-		uint32_t bot,
+		TIndexOffU top,
+		TIndexOffU bot,
 		const EbwtParams& ep,
 		const uint8_t* ebwt,
 		SideLocus& ltop,
 		SideLocus& lbot)
 	{
-		const uint32_t sideBwtLen = ep._sideBwtLen;
+		const TIndexOffU sideBwtLen = ep._sideBwtLen;
 		assert_gt(bot, top);
 		ltop.initFromRow(top, ep, ebwt);
-		uint32_t spread = bot - top;
+		TIndexOffU spread = bot - top;
 		// Many cache misses on the following lines
 		if(ltop._charOff + spread < sideBwtLen) {
-			lbot._charOff = ltop._charOff + spread;
+			lbot._charOff = (uint32_t)(ltop._charOff + spread);
 			lbot._sideNum = ltop._sideNum;
 			lbot._sideByteOff = ltop._sideByteOff;
 			lbot._by = lbot._charOff >> 2;
@@ -339,13 +346,13 @@ struct SideLocus {
 	 * Calculate SideLocus based on a row and other relevant
 	 * information about the shape of the Ebwt.
 	 */
-	void initFromRow(uint32_t row, const EbwtParams& ep, const uint8_t* ebwt) {
-		const uint32_t sideSz     = ep._sideSz;
+	void initFromRow(TIndexOffU row, const EbwtParams& ep, const uint8_t* ebwt) {
+		const int32_t sideSz     = ep._sideSz;
 		// Side length is hard-coded for now; this allows the compiler
 		// to do clever things to accelerate / and %.
-		_sideNum                  = row / 192;
+		_sideNum                  = row / (48*OFF_SIZE);
 		assert_lt(_sideNum, ep._numSides);
-		_charOff                  = row % 192;
+		_charOff                  = row % (48*OFF_SIZE);
 		_sideByteOff              = _sideNum * sideSz;
 		assert_leq(row, ep._len);
 		assert_leq(_sideByteOff + sideSz, ep._ebwtTotSz);
@@ -381,8 +388,8 @@ struct SideLocus {
 	/**
 	 * Convert locus to BW row it corresponds to.
 	 */
-	uint32_t toBWRow() const {
-		return _sideNum * 192 + _charOff;
+	TIndexOffU toBWRow() const {
+		return _sideNum * 48*OFF_SIZE + _charOff;
 	}
 	
 #ifndef NDEBUG
@@ -391,7 +398,7 @@ struct SideLocus {
 	 * with the (provided) EbwtParams.
 	 */
 	bool repOk(const EbwtParams& ep) const {
-		ASSERT_ONLY(uint32_t row = _sideNum * 192 + _charOff);
+		ASSERT_ONLY(TIndexOffU row = _sideNum * 48*OFF_SIZE + _charOff);
 		assert_leq(row, ep._len);
 		assert_range(-1, 3, _bp);
 		assert_range(0, (int)ep._sideBwtSz, _by);
@@ -411,13 +418,15 @@ struct SideLocus {
 		return ebwt + _sideByteOff;
 	}
 
-	uint32_t _sideByteOff; // offset of top side within ebwt[]
-	uint32_t _sideNum;     // index of side
-	uint32_t _charOff;     // character offset within side
-	int32_t _by;           // byte within side (not adjusted for bw sides)
+	TIndexOffU _sideByteOff; // offset of top side within ebwt[]
+	TIndexOffU _sideNum;     // index of side
+	uint32_t _charOff;      // character offset within side
+	int32_t _by;            // byte within side (not adjusted for bw sides)
 	int32_t _bp;            // bitpair within byte (not adjusted for bw sides)
 };
-
+#ifdef POPCNT_CAPABILITY   // wrap pop64 in a struct so countInU64<Operation> can select it
+struct USE_POPCNT_GENERIC {
+#endif
 // Use this standard bit-bashing population count
 inline static int pop64(uint64_t x) {
 	// Lots of cache misses on following lines (>10K)
@@ -429,38 +438,39 @@ inline static int pop64(uint64_t x) {
 	x = x + (x >> 32);
 	return (int)(x & 0x3Fllu);
 }
+#ifdef POPCNT_CAPABILITY  // close struct USE_POPCNT_GENERIC
+};
+#endif
+
+#ifdef POPCNT_CAPABILITY
+    struct USE_POPCNT_INSTRUCTION {
+        inline static int pop64(uint64_t x) {
+            int64_t count;
+            asm ("popcntq %[x],%[count]\n": [count] "=&r" (count): [x] "r" (x));
+            return count;
+        }
+    };
+#endif
 
 /**
  * Tricky-bit-bashing bitpair counting for given two-bit value (0-3)
  * within a 64-bit argument.
  */
+#ifdef POPCNT_CAPABILITY
+template<typename Operation>
+#endif
 inline static int countInU64(int c, uint64_t dw) {
-	uint64_t dwA  = dw &  0xAAAAAAAAAAAAAAAAllu;
-	uint64_t dwNA = dw & ~0xAAAAAAAAAAAAAAAAllu;
-	uint64_t tmp;
-	switch(c) {
-		case 0:
-			tmp = (dwA >> 1) | dwNA;
-			break;
-		case 1:
-			tmp = ~(dwA >> 1) & dwNA;
-			break;
-		case 2:
-			tmp = (dwA >> 1) & ~dwNA;
-			break;
-		case 3:
-			tmp = (dwA >> 1) & dwNA;
-			break;
-		default:
-			throw;
-	}
-	tmp = pop64(tmp); // Gets 7.62% in profile
-	if(c == 0) {
-		tmp = 32 - tmp;
-	}
-	assert_leq(tmp, 32);
-	assert_geq(tmp, 0);
-	return (int)tmp;
+	uint64_t c0 = c_table[c];
+	uint64_t x0 = dw ^ c0;
+	uint64_t x1 = (x0 >> 1);
+	uint64_t x2 = x1 & (0x5555555555555555);
+	uint64_t x3 = x0 & x2;
+#ifdef POPCNT_CAPABILITY
+    uint64_t tmp = Operation().pop64(x3);
+#else
+    uint64_t tmp = pop64(x3);
+#endif
+	return (int) tmp;
 }
 
 // Forward declarations for Ebwt class
@@ -486,8 +496,8 @@ public:
 	    fw_(fw), \
 	    _in1(MM_FILE_INIT), \
 	    _in2(MM_FILE_INIT), \
-	    _zOff(0xffffffff), \
-	    _zEbwtByteOff(0xffffffff), \
+	    _zOff(OFF_MASK), \
+	    _zEbwtByteOff(OFF_MASK), \
 	    _zEbwtBpOff(-1), \
 	    _nPat(0), \
 	    _nFrag(0), \
@@ -525,11 +535,15 @@ public:
 	     Ebwt_INITS
 	{
 		assert(!useMm || !useShmem);
+#ifdef POPCNT_CAPABILITY 
+        ProcessorSupport ps; 
+        _usePOPCNTinstruction = ps.POPCNTenabled(); 
+#endif 
 		packed_ = false;
 		_useMm = useMm;
 		useShmem_ = useShmem;
-		_in1Str = in + ".1.bt2";
-		_in2Str = in + ".2.bt2";
+		_in1Str = in + ".1." + gEbwt_ext;
+		_in2Str = in + ".2." + gEbwt_ext;
 		readIntoMemory(
 			color,       // expect index to be colorspace?
 			fw ? -1 : needEntireReverse, // need REF_READ_REVERSE
@@ -569,16 +583,18 @@ public:
 		const string& file,   // base filename for EBWT files
 		bool fw,
 		bool useBlockwise,
-		uint32_t bmax,
-		uint32_t bmaxSqrtMult,
-		uint32_t bmaxDivN,
+		TIndexOffU bmax,
+		TIndexOffU bmaxSqrtMult,
+		TIndexOffU bmaxDivN,
 		int dcv,
 		EList<FileBuf*>& is,
 		EList<RefRecord>& szs,
-		uint32_t sztot,
+		TIndexOffU sztot,
 		const RefReadInParams& refparams,
 		uint32_t seed,
 		int32_t overrideOffRate = -1,
+		bool doSaFile = false,
+		bool doBwtFile = false,
 		bool verbose = false,
 		bool passMemExc = false,
 		bool sanityCheck = false) :
@@ -591,8 +607,12 @@ public:
 			color,
 			refparams.reverse == REF_READ_REVERSE)
 	{
-		_in1Str = file + ".1.bt2";
-		_in2Str = file + ".2.bt2";
+#ifdef POPCNT_CAPABILITY 
+        ProcessorSupport ps; 
+        _usePOPCNTinstruction = ps.POPCNTenabled(); 
+#endif 
+		_in1Str = file + ".1." + gEbwt_ext;
+		_in2Str = file + ".2." + gEbwt_ext;
 		packed_ = packed;
 		// Open output files
 		ofstream fout1(_in1Str.c_str(), ios::binary);
@@ -609,7 +629,28 @@ public:
 			     << "Bowtie." << endl;
 			throw 1;
 		}
-		// Build
+		_inSaStr = file + ".sa";
+		_inBwtStr = file + ".bwt";
+		ofstream *saOut = NULL, *bwtOut = NULL;
+		if(doSaFile) {
+			saOut = new ofstream(_inSaStr.c_str(), ios::binary);
+			if(!saOut->good()) {
+				cerr << "Could not open suffix-array file for writing: \"" << _inSaStr.c_str() << "\"" << endl
+			         << "Please make sure the directory exists and that permissions allow writing by" << endl
+			         << "Bowtie." << endl;
+				throw 1;
+			}
+		}
+		if(doBwtFile) {
+			bwtOut = new ofstream(_inBwtStr.c_str(), ios::binary);
+			if(!bwtOut->good()) {
+				cerr << "Could not open suffix-array file for writing: \"" << _inBwtStr.c_str() << "\"" << endl
+			         << "Please make sure the directory exists and that permissions allow writing by" << endl
+			         << "Bowtie." << endl;
+				throw 1;
+			}
+		}
+		// Build SA(T) and BWT(T) block by block
 		initFromVector<TStr>(
 			is,
 		    szs,
@@ -617,6 +658,8 @@ public:
 		    refparams,
 		    fout1,
 		    fout2,
+			saOut,
+			bwtOut,
 		    useBlockwise,
 		    bmax,
 		    bmaxSqrtMult,
@@ -626,6 +669,7 @@ public:
 		    verbose);
 		// Close output files
 		fout1.flush();
+		
 		int64_t tellpSz1 = (int64_t)fout1.tellp();
 		VMSG_NL("Wrote " << fout1.tellp() << " bytes to primary EBWT file: " << _in1Str.c_str());
 		fout1.close();
@@ -636,6 +680,7 @@ public:
 			     << " but is actually " << fileSize(_in1Str.c_str()) << "." << endl;
 		}
 		fout2.flush();
+		
 		int64_t tellpSz2 = (int64_t)fout2.tellp();
 		VMSG_NL("Wrote " << fout2.tellp() << " bytes to secondary EBWT file: " << _in2Str.c_str());
 		fout2.close();
@@ -644,10 +689,36 @@ public:
 			cerr << "Index is corrupt: File size for " << _in2Str.c_str() << " should have been " << tellpSz2
 			     << " but is actually " << fileSize(_in2Str.c_str()) << "." << endl;
 		}
+		
+		if(saOut != NULL) {
+			// Check on suffix array output file size
+			int64_t tellpSzSa = (int64_t)saOut->tellp();
+			VMSG_NL("Wrote " << tellpSzSa << " bytes to suffix-array file: " << _inSaStr.c_str());
+			saOut->close();
+			if(tellpSzSa > fileSize(_inSaStr.c_str())) {
+				err = true;
+				cerr << "Index is corrupt: File size for " << _inSaStr.c_str() << " should have been " << tellpSzSa
+					 << " but is actually " << fileSize(_inSaStr.c_str()) << "." << endl;
+			}
+		}
+
+		if(bwtOut != NULL) {
+			// Check on BWT output file size
+			int64_t tellpSzBwt = (int64_t)bwtOut->tellp();
+			VMSG_NL("Wrote " << tellpSzBwt << " bytes to BWT file: " << _inBwtStr.c_str());
+			bwtOut->close();
+			if(tellpSzBwt > fileSize(_inBwtStr.c_str())) {
+				err = true;
+				cerr << "Index is corrupt: File size for " << _inBwtStr.c_str() << " should have been " << tellpSzBwt
+					 << " but is actually " << fileSize(_inBwtStr.c_str()) << "." << endl;
+			}
+		}
+		
 		if(err) {
 			cerr << "Please check if there is a problem with the disk or if disk is full." << endl;
 			throw 1;
 		}
+		
 		// Reopen as input streams
 		VMSG_NL("Re-opening _in1 and _in2 as input streams");
 		if(_sanity) {
@@ -688,9 +759,9 @@ public:
 		int32_t ftabChars,
 		const string& file,
 		bool useBlockwise,
-		uint32_t bmax,
-		uint32_t bmaxSqrtMult,
-		uint32_t bmaxDivN,
+		TIndexOffU bmax,
+		TIndexOffU bmaxSqrtMult,
+		TIndexOffU bmaxDivN,
 		int dcv,
 		uint32_t seed,
 		bool verbose,
@@ -737,21 +808,21 @@ public:
 		int32_t ftabChars,
 		const string& file,
 		bool useBlockwise,
-		uint32_t bmax,
-		uint32_t bmaxSqrtMult,
-		uint32_t bmaxDivN,
+		TIndexOffU bmax,
+		TIndexOffU bmaxSqrtMult,
+		TIndexOffU bmaxDivN,
 		int dcv,
 		uint32_t seed,
 		bool verbose,
 		bool autoMem,
 		bool sanity)
 	{
-        assert(!strs.empty());
+		assert(!strs.empty());
 		EList<FileBuf*> is(EBWT_CAT);
 		RefReadInParams refparams(color, REF_READ_FORWARD, false, false);
 		// Adapt sequence strings to stringstreams open for input
 		auto_ptr<stringstream> ss(new stringstream());
-		for(size_t i = 0; i < strs.size(); i++) {
+		for(TIndexOffU i = 0; i < strs.size(); i++) {
 			(*ss) << ">" << i << endl << strs[i] << endl;
 		}
 		auto_ptr<FileBuf> fb(new FileBuf(ss.get()));
@@ -764,7 +835,7 @@ public:
 		// sequences.  A record represents a stretch of unambiguous
 		// characters in one of the input sequences.
 		EList<RefRecord> szs(EBWT_CAT);
-		std::pair<uint32_t, uint32_t> sztot;
+		std::pair<TIndexOffU, TIndexOffU> sztot;
 		sztot = BitPairReference::szsFromFasta(is, file, bigEndian, refparams, szs, sanity);
 		// Construct Ebwt from input strings and parameters
 		Ebwt *ebwtFw = new Ebwt(
@@ -843,22 +914,24 @@ public:
 	template <typename TStr>
 	void initFromVector(EList<FileBuf*>& is,
 	                    EList<RefRecord>& szs,
-	                    uint32_t sztot,
+	                    TIndexOffU sztot,
 	                    const RefReadInParams& refparams,
 	                    ofstream& out1,
 	                    ofstream& out2,
+	                    ofstream* saOut,
+	                    ofstream* bwtOut,
 	                    bool useBlockwise,
-	                    uint32_t bmax,
-	                    uint32_t bmaxSqrtMult,
-	                    uint32_t bmaxDivN,
+	                    TIndexOffU bmax,
+	                    TIndexOffU bmaxSqrtMult,
+	                    TIndexOffU bmaxDivN,
 	                    int dcv,
 	                    uint32_t seed,
-						bool verbose)
+	                    bool verbose)
 	{
 		// Compose text strings into single string
 		VMSG_NL("Calculating joined length");
 		TStr s; // holds the entire joined reference after call to joinToDisk
-		uint32_t jlen;
+		TIndexOffU jlen;
 		jlen = joinedLen(szs);
 		assert_geq(jlen, sztot);
 		VMSG_NL("Writing header");
@@ -915,19 +988,19 @@ public:
 		}
 		// Succesfully obtained joined reference string
 		assert_geq(s.length(), jlen);
-		if(bmax != 0xffffffff) {
+		if(bmax != OFF_MASK) {
 			VMSG_NL("bmax according to bmax setting: " << bmax);
 		}
-		else if(bmaxSqrtMult != 0xffffffff) {
+		else if(bmaxSqrtMult != OFF_MASK) {
 			bmax *= bmaxSqrtMult;
 			VMSG_NL("bmax according to bmaxSqrtMult setting: " << bmax);
 		}
-		else if(bmaxDivN != 0xffffffff) {
-			bmax = max<uint32_t>(jlen / bmaxDivN, 1);
+		else if(bmaxDivN != OFF_MASK) {
+			bmax = max<TIndexOffU>(jlen / bmaxDivN, 1);
 			VMSG_NL("bmax according to bmaxDivN setting: " << bmax);
 		}
 		else {
-			bmax = (uint32_t)sqrt(s.length());
+			bmax = (TIndexOffU)sqrt(s.length());
 			VMSG_NL("bmax defaulted to: " << bmax);
 		}
 		int iter = 0;
@@ -976,15 +1049,15 @@ public:
 					// we would have thrown one eventually as part of
 					// constructing the DifferenceCoverSample
 					dcv <<= 1;
-					size_t sz = DifferenceCoverSample<TStr>::simulateAllocs(s, dcv >> 1);
+					TIndexOffU sz = (TIndexOffU)DifferenceCoverSample<TStr>::simulateAllocs(s, dcv >> 1);
 					AutoArray<uint8_t> tmp(sz, EBWT_CAT);
 					dcv >>= 1;
 					// Likewise with the KarkkainenBlockwiseSA
-					sz = KarkkainenBlockwiseSA<TStr>::simulateAllocs(s, bmax);
+					sz = (TIndexOffU)KarkkainenBlockwiseSA<TStr>::simulateAllocs(s, bmax);
 					AutoArray<uint8_t> tmp2(sz, EBWT_CAT);
 					// Now throw in the 'ftab' and 'isaSample' structures
 					// that we'll eventually allocate in buildToDisk
-					AutoArray<uint32_t> ftab(_eh._ftabLen * 2, EBWT_CAT);
+					AutoArray<TIndexOffU> ftab(_eh._ftabLen * 2, EBWT_CAT);
 					AutoArray<uint8_t> side(_eh._sideSz, EBWT_CAT);
 					// Grab another 20 MB out of caution
 					AutoArray<uint32_t> extra(20*1024*1024, EBWT_CAT);
@@ -1001,9 +1074,18 @@ public:
 				assert(bsa.suffixItrIsReset());
 				assert_eq(bsa.size(), s.length()+1);
 				VMSG_NL("Converting suffix-array elements to index image");
-				buildToDisk(bsa, s, out1, out2);
+				buildToDisk(bsa, s, out1, out2, saOut, bwtOut);
 				out1.flush(); out2.flush();
-				if(out1.fail() || out2.fail()) {
+				bool failed = out1.fail() || out2.fail();
+				if(saOut != NULL) {
+					saOut->flush();
+					failed = failed || saOut->fail();
+				}
+				if(bwtOut != NULL) {
+					bwtOut->flush();
+					failed = failed || bwtOut->fail();
+				}
+				if(failed) {
 					cerr << "An error occurred writing the index to disk.  Please check if the disk is full." << endl;
 					throw 1;
 				}
@@ -1022,7 +1104,7 @@ public:
 		assert(repOk());
 		// Now write reference sequence names on the end
 		assert_eq(this->_refnames.size(), this->_nPat);
-		for(size_t i = 0; i < this->_refnames.size(); i++) {
+		for(TIndexOffU i = 0; i < this->_refnames.size(); i++) {
 			out1 << this->_refnames[i].c_str() << endl;
 		}
 		out1 << '\0';
@@ -1040,8 +1122,8 @@ public:
 	 * fragments correspond to input sequences - it just cares about
 	 * the lengths of the fragments.
 	 */
-	uint32_t joinedLen(EList<RefRecord>& szs) {
-		uint32_t ret = 0;
+	TIndexOffU joinedLen(EList<RefRecord>& szs) {
+		TIndexOffU ret = 0;
 		for(unsigned int i = 0; i < szs.size(); i++) {
 			ret += szs[i].len;
 		}
@@ -1069,30 +1151,33 @@ public:
 
 	/// Accessors
 	inline const EbwtParams& eh() const     { return _eh; }
-	uint32_t    zOff() const         { return _zOff; }
-	uint32_t    zEbwtByteOff() const { return _zEbwtByteOff; }
-	int         zEbwtBpOff() const   { return _zEbwtBpOff; }
-	uint32_t    nPat() const         { return _nPat; }
-	uint32_t    nFrag() const        { return _nFrag; }
-	inline uint32_t*   fchr()              { return _fchr.get(); }
-	inline uint32_t*   ftab()              { return _ftab.get(); }
-	inline uint32_t*   eftab()             { return _eftab.get(); }
-	inline uint32_t*   offs()              { return _offs.get(); }
-	inline uint32_t*   plen()              { return _plen.get(); }
-	inline uint32_t*   rstarts()           { return _rstarts.get(); }
+	TIndexOffU    zOff() const         { return _zOff; }
+	TIndexOffU    zEbwtByteOff() const { return _zEbwtByteOff; }
+	TIndexOff         zEbwtBpOff() const   { return _zEbwtBpOff; } 
+	TIndexOffU    nPat() const         { return _nPat; }
+	TIndexOffU    nFrag() const        { return _nFrag; }
+	inline TIndexOffU*   fchr()              { return _fchr.get(); }
+	inline TIndexOffU*   ftab()              { return _ftab.get(); }
+	inline TIndexOffU*   eftab()             { return _eftab.get(); }
+	inline TIndexOffU*   offs()              { return _offs.get(); }
+	inline TIndexOffU*   plen()              { return _plen.get(); }
+	inline TIndexOffU*   rstarts()           { return _rstarts.get(); }
 	inline uint8_t*    ebwt()              { return _ebwt.get(); }
-	inline const uint32_t* fchr() const    { return _fchr.get(); }
-	inline const uint32_t* ftab() const    { return _ftab.get(); }
-	inline const uint32_t* eftab() const   { return _eftab.get(); }
-	inline const uint32_t* offs() const    { return _offs.get(); }
-	inline const uint32_t* plen() const    { return _plen.get(); }
-	inline const uint32_t* rstarts() const { return _rstarts.get(); }
+	inline const TIndexOffU* fchr() const    { return _fchr.get(); }
+	inline const TIndexOffU* ftab() const    { return _ftab.get(); }
+	inline const TIndexOffU* eftab() const   { return _eftab.get(); }
+	inline const TIndexOffU* offs() const    { return _offs.get(); }
+	inline const TIndexOffU* plen() const    { return _plen.get(); }
+	inline const TIndexOffU* rstarts() const { return _rstarts.get(); }
 	inline const uint8_t*  ebwt() const    { return _ebwt.get(); }
 	bool        toBe() const         { return _toBigEndian; }
 	bool        verbose() const      { return _verbose; }
 	bool        sanityCheck() const  { return _sanity; }
 	EList<string>& refnames()        { return _refnames; }
 	bool        fw() const           { return fw_; }
+#ifdef POPCNT_CAPABILITY 
+    bool _usePOPCNTinstruction; 
+#endif 
 
 	/**
 	 * Returns true iff the index contains the given string (exactly).  The
@@ -1101,8 +1186,8 @@ public:
 	 */
 	bool contains(
 		const BTDnaString& str,
-		uint32_t *top = NULL,
-		uint32_t *bot = NULL) const;
+		TIndexOffU *top = NULL,
+		TIndexOffU *bot = NULL) const;
 
 	/**
 	 * Returns true iff the index contains the given string (exactly).  The
@@ -1111,8 +1196,8 @@ public:
 	 */
 	bool contains(
 		const char *str,
-		uint32_t *top = NULL,
-		uint32_t *bot = NULL) const
+		TIndexOffU *top = NULL,
+		TIndexOffU *bot = NULL) const
 	{
 		return contains(BTDnaString(str, true), top, bot);
 	}
@@ -1129,7 +1214,7 @@ public:
 			assert(fchr() != NULL);
 			//assert(_offs != NULL);
 			//assert(_rstarts != NULL);
-			assert_neq(_zEbwtByteOff, 0xffffffff);
+			assert_neq(_zEbwtByteOff, OFF_MASK);
 			assert_neq(_zEbwtBpOff, -1);
 			return true;
 		} else {
@@ -1138,7 +1223,7 @@ public:
 			assert(fchr() == NULL);
 			assert(offs() == NULL);
 			assert(rstarts() == NULL);
-			assert_eq(_zEbwtByteOff, 0xffffffff);
+			assert_eq(_zEbwtByteOff, OFF_MASK);
 			assert_eq(_zEbwtBpOff, -1);
 			return false;
 		}
@@ -1189,7 +1274,7 @@ public:
 		// Keep plen; it's small and the client may want to seq it
 		// even when the others are evicted.
 		//_plen  = NULL;
-		_zEbwtByteOff = 0xffffffff;
+		_zEbwtByteOff = OFF_MASK;
 		_zEbwtBpOff = -1;
 	}
 
@@ -1198,7 +1283,7 @@ public:
 	 * length equal to the index's 'ftabChars' into an int that can be
 	 * used to index into the ftab array.
 	 */
-	uint32_t ftabSeqToInt(
+	TIndexOffU ftabSeqToInt(
 		const BTDnaString& seq,
 		size_t off,
 		bool rev) const
@@ -1206,7 +1291,7 @@ public:
 		int fc = _eh._ftabChars;
 		size_t lo = off, hi = lo + fc;
 		assert_leq(hi, seq.length());
-		uint32_t ftabOff = 0;
+		TIndexOffU ftabOff = 0;
 		for(int i = 0; i < fc; i++) {
 			bool fwex = fw();
 			if(rev) fwex = !fwex;
@@ -1215,7 +1300,7 @@ public:
 			// means right-to-left order; for BWT' it's left-to-right.
 			int c = (fwex ? seq[lo + i] : seq[hi - i - 1]);
 			if(c > 3) {
-				return std::numeric_limits<uint32_t>::max();
+				return std::numeric_limits<TIndexOffU>::max();
 			}
 			assert_range(0, 3, c);
 			ftabOff <<= 2;
@@ -1227,13 +1312,13 @@ public:
 	/**
 	 * Non-static facade for static function ftabHi.
 	 */
-	uint32_t ftabHi(uint32_t i) const {
+	TIndexOffU ftabHi(TIndexOffU i) const {
 		return Ebwt::ftabHi(
 			ftab(),
 			eftab(),
 			_eh._len,
 			_eh._ftabLen,
-		    _eh._eftabLen,
+			_eh._eftabLen,
 			i);
 	}
 
@@ -1246,19 +1331,19 @@ public:
 	 * It's a static member because it's convenient to ask this
 	 * question before the Ebwt is fully initialized.
 	 */
-	static uint32_t ftabHi(
-		const uint32_t *ftab,
-		const uint32_t *eftab,
-		uint32_t len,
-		uint32_t ftabLen,
-		uint32_t eftabLen,
-		uint32_t i)
+	static TIndexOffU ftabHi(
+		const TIndexOffU *ftab,
+		const TIndexOffU *eftab,
+		TIndexOffU len,
+		TIndexOffU ftabLen,
+		TIndexOffU eftabLen,
+		TIndexOffU i)
 	{
 		assert_lt(i, ftabLen);
 		if(ftab[i] <= len) {
 			return ftab[i];
 		} else {
-			uint32_t efIdx = ftab[i] ^ 0xffffffff;
+			TIndexOffU efIdx = ftab[i] ^ OFF_MASK;
 			assert_lt(efIdx*2+1, eftabLen);
 			return eftab[efIdx*2+1];
 		}
@@ -1267,27 +1352,27 @@ public:
 	/**
 	 * Non-static facade for static function ftabLo.
 	 */
-	uint32_t ftabLo(uint32_t i) const {
+	TIndexOffU ftabLo(TIndexOffU i) const {
 		return Ebwt::ftabLo(
 			ftab(),
 			eftab(),
 			_eh._len,
 			_eh._ftabLen,
-		    _eh._eftabLen,
+			_eh._eftabLen,
 			i);
 	}
 	
 	/**
 	 * Get low bound of ftab range.
 	 */
-	uint32_t ftabLo(const BTDnaString& seq, size_t off) const {
+	TIndexOffU ftabLo(const BTDnaString& seq, size_t off) const {
 		return ftabLo(ftabSeqToInt(seq, off, false));
 	}
 
 	/**
 	 * Get high bound of ftab range.
 	 */
-	uint32_t ftabHi(const BTDnaString& seq, size_t off) const {
+	TIndexOffU ftabHi(const BTDnaString& seq, size_t off) const {
 		return ftabHi(ftabSeqToInt(seq, off, false));
 	}
 	
@@ -1303,11 +1388,11 @@ public:
 		const BTDnaString& seq, // sequence to extract from
 		size_t off,             // offset into seq to begin extracting
 		bool rev,               // reverse while extracting
-		uint32_t& top,
-		uint32_t& bot) const
+		TIndexOffU& top,
+		TIndexOffU& bot) const
 	{
-		uint32_t fi = ftabSeqToInt(seq, off, rev);
-		if(fi == std::numeric_limits<uint32_t>::max()) {
+		TIndexOffU fi = ftabSeqToInt(seq, off, rev);
+		if(fi == std::numeric_limits<TIndexOffU>::max()) {
 			return false;
 		}
 		top = ftabHi(fi);
@@ -1325,19 +1410,19 @@ public:
 	 * It's a static member because it's convenient to ask this
 	 * question before the Ebwt is fully initialized.
 	 */
-	static uint32_t ftabLo(
-		const uint32_t *ftab,
-		const uint32_t *eftab,
-		uint32_t len,
-		uint32_t ftabLen,
-		uint32_t eftabLen,
-		uint32_t i)
+	static TIndexOffU ftabLo(
+		const TIndexOffU *ftab,
+		const TIndexOffU *eftab,
+		TIndexOffU len,
+		TIndexOffU ftabLen,
+		TIndexOffU eftabLen,
+		TIndexOffU i)
 	{
 		assert_lt(i, ftabLen);
 		if(ftab[i] <= len) {
 			return ftab[i];
 		} else {
-			uint32_t efIdx = ftab[i] ^ 0xffffffff;
+			TIndexOffU efIdx = ftab[i] ^ OFF_MASK;
 			assert_lt(efIdx*2+1, eftabLen);
 			return eftab[efIdx*2];
 		}
@@ -1346,20 +1431,20 @@ public:
 	/**
 	 * Try to resolve the reference offset of the BW element 'elt'.  If
 	 * it can be resolved immediately, return the reference offset.  If
-	 * it cannot be resolved immediately, return 0xffffffff.
+	 * it cannot be resolved immediately, return max value.
 	 */
-	uint32_t tryOffset(uint32_t elt) const {
+	TIndexOffU tryOffset(TIndexOffU elt) const {
 		assert(offs() != NULL);
 		if(elt == _zOff) return 0;
 		if((elt & _eh._offMask) == elt) {
-			uint32_t eltOff = elt >> _eh._offRate;
+			TIndexOffU eltOff = elt >> _eh._offRate;
 			assert_lt(eltOff, _eh._offsLen);
-			uint32_t off = offs()[eltOff];
-			assert_neq(0xffffffff, off);
+			TIndexOffU off = offs()[eltOff];
+			assert_neq(OFF_MASK, off);
 			return off;
 		} else {
 			// Try looking at zoff
-			return 0xffffffff;
+			return OFF_MASK;
 		}
 	}
 
@@ -1368,13 +1453,13 @@ public:
 	 * that the offset returned is at the right-hand side of the
 	 * forward reference substring involved in the hit.
 	 */
-	uint32_t tryOffset(
-		uint32_t elt,
+	TIndexOffU tryOffset(
+		TIndexOffU elt,
 		bool fw,
-		uint32_t hitlen) const
+		TIndexOffU hitlen) const
 	{
-		uint32_t off = tryOffset(elt);
-		if(off != 0xffffffff && !fw) {
+		TIndexOffU off = tryOffset(elt);
+		if(off != OFF_MASK && !fw) {
 			assert_lt(off, _eh._len);
 			off = _eh._len - off - 1;
 			assert_geq(off, hitlen-1);
@@ -1387,22 +1472,22 @@ public:
 	/**
 	 * Walk 'steps' steps to the left and return the row arrived at.
 	 */
-	uint32_t walkLeft(uint32_t row, uint32_t steps) const;
+	TIndexOffU walkLeft(TIndexOffU row, TIndexOffU steps) const;
 
 	/**
 	 * Resolve the reference offset of the BW element 'elt'.
 	 */
-	uint32_t getOffset(uint32_t row) const;
+	TIndexOffU getOffset(TIndexOffU row) const;
 
 	/**
 	 * Resolve the reference offset of the BW element 'elt' such that
 	 * the offset returned is at the right-hand side of the forward
 	 * reference substring involved in the hit.
 	 */
-	uint32_t getOffset(
-		uint32_t elt,
+	TIndexOffU getOffset(
+		TIndexOffU elt,
 		bool fw,
-		uint32_t hitlen) const;
+		TIndexOffU hitlen) const;
 
 	/**
 	 * When using read() to create an Ebwt, we have to set a couple of
@@ -1412,9 +1497,9 @@ public:
 	 * _zEbwtBpOff from _zOff.
 	 */
 	void postReadInit(EbwtParams& eh) {
-		uint32_t sideNum     = _zOff / eh._sideBwtLen;
-		uint32_t sideCharOff = _zOff % eh._sideBwtLen;
-		uint32_t sideByteOff = sideNum * eh._sideSz;
+		TIndexOffU sideNum     = _zOff / eh._sideBwtLen;
+		TIndexOffU sideCharOff = _zOff % eh._sideBwtLen;
+		TIndexOffU sideByteOff = sideNum * eh._sideSz;
 		_zEbwtByteOff = sideCharOff >> 2;
 		assert_lt(_zEbwtByteOff, eh._sideBwtSz);
 		_zEbwtBpOff = sideCharOff & 3;
@@ -1492,9 +1577,9 @@ public:
 
 	// Building
 	template <typename TStr> static TStr join(EList<TStr>& l, uint32_t seed);
-	template <typename TStr> static TStr join(EList<FileBuf*>& l, EList<RefRecord>& szs, uint32_t sztot, const RefReadInParams& refparams, uint32_t seed);
-	template <typename TStr> void joinToDisk(EList<FileBuf*>& l, EList<RefRecord>& szs, uint32_t sztot, const RefReadInParams& refparams, TStr& ret, ostream& out1, ostream& out2);
-	template <typename TStr> void buildToDisk(InorderBlockwiseSA<TStr>& sa, const TStr& s, ostream& out1, ostream& out2);
+	template <typename TStr> static TStr join(EList<FileBuf*>& l, EList<RefRecord>& szs, TIndexOffU sztot, const RefReadInParams& refparams, uint32_t seed);
+	template <typename TStr> void joinToDisk(EList<FileBuf*>& l, EList<RefRecord>& szs, TIndexOffU sztot, const RefReadInParams& refparams, TStr& ret, ostream& out1, ostream& out2);
+	template <typename TStr> void buildToDisk(InorderBlockwiseSA<TStr>& sa, const TStr& s, ostream& out1, ostream& out2, ostream* saOut, ostream* bwtOut);
 
 	// I/O
 	void readIntoMemory(int color, int needEntireRev, bool loadSASamp, bool loadFtab, bool loadRstarts, bool justHeader, EbwtParams *params, bool mmSweep, bool loadNames, bool startVerbose);
@@ -1502,13 +1587,13 @@ public:
 	void writeFromMemory(bool justHeader, const string& out1, const string& out2) const;
 
 	// Sanity checking
-	void sanityCheckUpToSide(int upToSide) const;
+	void sanityCheckUpToSide(TIndexOff upToSide) const;
 	void sanityCheckAll(int reverse) const;
 	void restore(SString<char>& s) const;
 	void checkOrigs(const EList<SString<char> >& os, bool color, bool mirror) const;
 
 	// Searching and reporting
-	void joinedToTextOff(uint32_t qlen, uint32_t off, uint32_t& tidx, uint32_t& textoff, uint32_t& tlen, bool rejectStraddle, bool& straddled) const;
+	void joinedToTextOff(TIndexOffU qlen, TIndexOffU off, TIndexOffU& tidx, TIndexOffU& textoff, TIndexOffU& tlen, bool rejectStraddle, bool& straddled) const;
 
 #define WITHIN_BWT_LEN(x) \
 	assert_leq(x[0], this->_eh._sideBwtLen); \
@@ -1538,12 +1623,12 @@ public:
 	 * XXXXXXXXXXXXXXXX [A] [C] [G] [T]
 	 * --------48------ -4- -4- -4- -4-  (numbers in bytes)
 	 */
-	inline uint32_t countBt2Side(const SideLocus& l, int c) const {
+	inline TIndexOffU countBt2Side(const SideLocus& l, int c) const {
 		assert_range(0, 3, c);
 		assert_range(0, (int)this->_eh._sideBwtSz-1, (int)l._by);
 		assert_range(0, 3, (int)l._bp);
 		const uint8_t *side = l.side(this->ebwt());
-		uint32_t cCnt = countUpTo(l, c);
+		TIndexOffU cCnt = countUpTo(l, c);
 		assert_leq(cCnt, l.toBWRow());
 		assert_leq(cCnt, this->_eh._sideBwtLen);
 		if(c == 0 && l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) {
@@ -1555,10 +1640,10 @@ public:
 				cCnt--; // Adjust for '$' looking like an 'A'
 			}
 		}
-		uint32_t ret;
+		TIndexOffU ret;
 		// Now factor in the occ[] count at the side break
 		const uint8_t *acgt8 = side + _eh._sideBwtSz;
-		const uint32_t *acgt = reinterpret_cast<const uint32_t*>(acgt8);
+		const TIndexOffU *acgt = reinterpret_cast<const TIndexOffU*>(acgt8);
 		assert_leq(acgt[0], this->_eh._numSides * this->_eh._sideBwtLen); // b/c it's used as padding
 		assert_leq(acgt[1], this->_eh._len);
 		assert_leq(acgt[2], this->_eh._len);
@@ -1586,9 +1671,9 @@ public:
 	 */
 	inline void countBt2SideRange(
 		SideLocus& l,        // top locus
-		uint32_t num,        // number of elts in range to tall
-		uint32_t* cntsUpto,  // A/C/G/T counts up to top
-		uint32_t* cntsIn,    // A/C/G/T counts within range
+		TIndexOffU num,        // number of elts in range to tall // @double-check
+		TIndexOffU* cntsUpto, // A/C/G/T counts up to top
+		TIndexOffU* cntsIn,   // A/C/G/T counts within range
 		EList<bool> *masks) const // masks indicating which range elts = A/C/G/T
 	{
 		assert_gt(num, 0);
@@ -1608,7 +1693,7 @@ public:
 			}
 		}
 		// Now factor in the occ[] count at the side break
-		const uint32_t *acgt = reinterpret_cast<const uint32_t*>(side + _eh._sideBwtSz);
+		const TIndexOffU *acgt = reinterpret_cast<const TIndexOffU*>(side + _eh._sideBwtSz);
 		assert_leq(acgt[0], this->fchr()[1] + this->_eh.sideBwtLen());
 		assert_leq(acgt[1], this->fchr()[2]-this->fchr()[1]);
 		assert_leq(acgt[2], this->fchr()[3]-this->fchr()[2]);
@@ -1629,7 +1714,7 @@ public:
 		WITHIN_FCHR_DOLLARA(cntsIn);
 		// 'cntsUpto' is complete now.
 		// Walk forward until we've tallied the entire 'In' range
-		uint32_t nm = 0;
+		TIndexOffU nm = 0;
 		// Rest of this side
 		nm += countBt2SideRange2(l, true, num - nm, cntsIn, masks, nm);
 		assert_eq(nm, cntsIn[0] + cntsIn[1] + cntsIn[2] + cntsIn[3]);
@@ -1667,7 +1752,7 @@ public:
 	 *         Side ptr (result from SideLocus.side())
 	 *
 	 */
-	inline void countBt2SideEx(const SideLocus& l, uint32_t* arrs) const {
+	inline void countBt2SideEx(const SideLocus& l, TIndexOffU* arrs) const {
 		assert_range(0, (int)this->_eh._sideBwtSz-1, (int)l._by);
 		assert_range(0, 3, (int)l._bp);
 		countUpToEx(l, arrs);
@@ -1684,8 +1769,8 @@ public:
 		WITHIN_BWT_LEN(arrs);
 		// Now factor in the occ[] count at the side break
 		const uint8_t *side = l.side(this->ebwt());
-		const uint8_t *acgt16 = side + this->_eh._sideSz - 16;
-		const uint32_t *acgt = reinterpret_cast<const uint32_t*>(acgt16);
+		const uint8_t *acgt16 = side + this->_eh._sideSz - OFF_SIZE*4;
+		const TIndexOffU *acgt = reinterpret_cast<const TIndexOffU*>(acgt16);
 		assert_leq(acgt[0], this->fchr()[1] + this->_eh.sideBwtLen());
 		assert_leq(acgt[1], this->fchr()[2]-this->fchr()[1]);
 		assert_leq(acgt[2], this->fchr()[3]-this->fchr()[2]);
@@ -1710,16 +1795,29 @@ public:
 	 *
 	 * Function gets 11.09% in profile
 	 */
-	inline uint32_t countUpTo(const SideLocus& l, int c) const {
+	inline TIndexOffU countUpTo(const SideLocus& l, int c) const { // @double-check
 		// Count occurrences of c in each 64-bit (using bit trickery);
 		// Someday countInU64() and pop() functions should be
 		// vectorized/SSE-ized in case that helps.
-		uint32_t cCnt = 0;
+		TIndexOffU cCnt = 0;
 		const uint8_t *side = l.side(this->ebwt());
 		int i = 0;
-		for(; i + 7 < l._by; i += 8) {
-			cCnt += countInU64(c, *(uint64_t*)&side[i]);
-		}
+#ifdef POPCNT_CAPABILITY
+        if ( _usePOPCNTinstruction) {
+            for(; i + 7 < l._by; i += 8) {
+                cCnt += countInU64<USE_POPCNT_INSTRUCTION>(c, *(uint64_t*)&side[i]);
+            }
+        } 
+        else {
+            for(; i + 7 < l._by; i += 8) {
+                cCnt += countInU64<USE_POPCNT_GENERIC>(c, *(uint64_t*)&side[i]);
+            }
+        }
+#else
+        for(; i + 7 < l._by; i += 8) {
+            cCnt += countInU64(c, *(uint64_t*)&side[i]);
+        }
+#endif		
 		// Count occurences of c in the rest of the side (using LUT)
 		for(; i < l._by; i++) {
 			cCnt += cCntLUT_4[0][c][side[i]];
@@ -1737,14 +1835,57 @@ public:
 	 *
 	 * Function gets 2.32% in profile
 	 */
-	inline static void countInU64Ex(uint64_t dw, uint32_t* arrs) {
-		// Cache misses here (~9K)
-		uint64_t dwA  = dw &  0xAAAAAAAAAAAAAAAAllu;
-		uint64_t dwNA = dw & ~0xAAAAAAAAAAAAAAAAllu;
-		arrs[0] += (32 - pop64((dwA >> 1) | dwNA));
-		arrs[1] += pop64(~(dwA >> 1) & dwNA);
-		arrs[2] += pop64((dwA >> 1) & ~dwNA);
-		arrs[3] += pop64((dwA >> 1) & dwNA);
+#ifdef POPCNT_CAPABILITY
+template<typename Operation>
+#endif
+	inline static void countInU64Ex(uint64_t dw, TIndexOffU* arrs) {
+		uint64_t c0 = c_table[0];
+		uint64_t x0 = dw ^ c0;
+		uint64_t x1 = (x0 >> 1);
+		uint64_t x2 = x1 & (0x5555555555555555llu);
+		uint64_t x3 = x0 & x2;
+#ifdef POPCNT_CAPABILITY
+		uint64_t tmp = Operation().pop64(x3);
+#else
+		uint64_t tmp = pop64(x3);
+#endif
+		arrs[0] += (uint32_t) tmp;
+
+		c0 = c_table[1];
+		x0 = dw ^ c0;
+		x1 = (x0 >> 1);
+		x2 = x1 & (0x5555555555555555llu);
+		x3 = x0 & x2;
+#ifdef POPCNT_CAPABILITY
+		tmp = Operation().pop64(x3);
+#else
+		tmp = pop64(x3);
+#endif
+		arrs[1] += (uint32_t) tmp;
+
+		c0 = c_table[2];
+		x0 = dw ^ c0;
+		x1 = (x0 >> 1);
+		x2 = x1 & (0x5555555555555555llu);
+		x3 = x0 & x2;
+#ifdef POPCNT_CAPABILITY
+		tmp = Operation().pop64(x3);
+#else
+		tmp = pop64(x3);
+#endif
+		arrs[2] += (uint32_t) tmp;
+	
+		c0 = c_table[3];
+		x0 = dw ^ c0;
+		x1 = (x0 >> 1);
+		x2 = x1 & (0x5555555555555555llu);
+		x3 = x0 & x2;
+#ifdef POPCNT_CAPABILITY
+		tmp = Operation().pop64(x3);
+#else
+		tmp = pop64(x3);
+#endif
+		arrs[3] += (uint32_t) tmp;
 	}
 
 	/**
@@ -1752,7 +1893,7 @@ public:
 	 * given side up to (but not including) the given byte/bitpair (by/bp).
 	 * Count for 'a' goes in arrs[0], 'c' in arrs[1], etc.
 	 */
-	inline void countUpToEx(const SideLocus& l, uint32_t* arrs) const {
+	inline void countUpToEx(const SideLocus& l, TIndexOffU* arrs) const {
 		int i = 0;
 		// Count occurrences of each nucleotide in each 64-bit word using
 		// bit trickery; note: this seems does not seem to lend a
@@ -1762,9 +1903,23 @@ public:
 		// does not change noticeably. Someday the countInU64() and pop()
 		// functions should be vectorized/SSE-ized in case that helps.
 		const uint8_t *side = l.side(this->ebwt());
+
+#ifdef POPCNT_CAPABILITY
+		if (_usePOPCNTinstruction) {
+			for(; i+7 < l._by; i += 8) {
+				countInU64Ex<USE_POPCNT_INSTRUCTION>(*(uint64_t*)&side[i], arrs);
+			}
+		}
+		else {
+			for(; i+7 < l._by; i += 8) {
+				countInU64Ex<USE_POPCNT_GENERIC>(*(uint64_t*)&side[i], arrs);
+			}
+		}
+#else 
 		for(; i+7 < l._by; i += 8) {
 			countInU64Ex(*(uint64_t*)&side[i], arrs);
 		}
+#endif
 		// Count occurences of nucleotides in the rest of the side (using LUT)
 		// Many cache misses on following lines (~20K)
 		for(; i < l._by; i++) {
@@ -1789,7 +1944,7 @@ public:
 	 */
 	inline void mapLFEx(
 		const SideLocus& l,
-		uint32_t *arrs
+		TIndexOffU *arrs
 		ASSERT_ONLY(, bool overrideSanity = false)
 		) const
 	{
@@ -1815,10 +1970,10 @@ public:
 	 * those loci.
 	 */
 	inline void mapLFEx(
-		uint32_t top,
-		uint32_t bot,
-		uint32_t *tops,
-		uint32_t *bots
+		TIndexOffU top,
+		TIndexOffU bot,
+		TIndexOffU *tops,
+		TIndexOffU *bots
 		ASSERT_ONLY(, bool overrideSanity = false)
 		) const
 	{
@@ -1834,8 +1989,8 @@ public:
 	inline void mapLFEx(
 		const SideLocus& ltop,
 		const SideLocus& lbot,
-		uint32_t *tops,
-		uint32_t *bots
+		TIndexOffU *tops,
+		TIndexOffU *bots
 		ASSERT_ONLY(, bool overrideSanity = false)
 		) const
 	{
@@ -1874,20 +2029,20 @@ public:
 	 *
 	 * Must fill in masks
 	 */
-	inline uint32_t countBt2SideRange2(
+	inline TIndexOffU countBt2SideRange2( // @double-check
 		const SideLocus& l,
 		bool startAtLocus,
-		uint32_t num,
-		uint32_t* arrs,
+		TIndexOffU num,
+		TIndexOffU* arrs,
 		EList<bool> *masks,
-		uint32_t maskOff) const
+		TIndexOffU maskOff) const
 	{
 		assert(!masks[0].empty());
 		assert_eq(masks[0].size(), masks[1].size());
 		assert_eq(masks[0].size(), masks[2].size());
 		assert_eq(masks[0].size(), masks[3].size());
-		ASSERT_ONLY(uint32_t myarrs[4] = {0, 0, 0, 0});
-		uint32_t nm = 0; // number of nucleotides tallied so far
+		ASSERT_ONLY(TIndexOffU myarrs[4] = {0, 0, 0, 0});
+		TIndexOffU nm = 0; // number of nucleotides tallied so far
 		int iby = 0;      // initial byte offset
 		int ibp = 0;      // initial base-pair offset
 		if(startAtLocus) {
@@ -1925,10 +2080,10 @@ public:
 #ifndef NDEBUG
 		if(_sanity) {
 			// Make sure results match up with a call to mapLFEx.
-			uint32_t tops[4] = {0, 0, 0, 0};
-			uint32_t bots[4] = {0, 0, 0, 0};
-			uint32_t top = l.toBWRow();
-			uint32_t bot = top + nm;
+			TIndexOffU tops[4] = {0, 0, 0, 0};
+			TIndexOffU bots[4] = {0, 0, 0, 0};
+			TIndexOffU top = l.toBWRow();
+			TIndexOffU bot = top + nm;
 			mapLFEx(top, bot, tops, bots, false);
 			assert(myarrs[0] == (bots[0] - tops[0]) || myarrs[0] == (bots[0] - tops[0])+1);
 			assert_eq(myarrs[1], bots[1] - tops[1]);
@@ -1954,7 +2109,7 @@ public:
 	 * BWT transform).  Note that the 'L' in the name of the function
 	 * stands for 'last', as in the literature.
 	 */
-	inline int rowL(uint32_t i) const {
+	inline int rowL(TIndexOffU i) const {
 		// Extract and return appropriate bit-pair
 		SideLocus l;
 		l.initFromRow(i, _eh, ebwt());
@@ -1968,9 +2123,9 @@ public:
 	inline void mapLFRange(
 		SideLocus& ltop,
 		SideLocus& lbot,
-		uint32_t num,        // Number of elts
-		uint32_t* cntsUpto,  // A/C/G/T counts up to top
-		uint32_t* cntsIn,    // A/C/G/T counts within range
+		TIndexOffU num,        // Number of elts
+		TIndexOffU* cntsUpto,  // A/C/G/T counts up to top
+		TIndexOffU* cntsIn,    // A/C/G/T counts within range
 		EList<bool> *masks
 		ASSERT_ONLY(, bool overrideSanity = false)
 		) const
@@ -1989,8 +2144,8 @@ public:
 			// Make sure results match up with individual calls to mapLF;
 			// be sure to override sanity-checking in the callee, or we'll
 			// have infinite recursion
-			uint32_t tops[4] = {0, 0, 0, 0};
-			uint32_t bots[4] = {0, 0, 0, 0};
+			TIndexOffU tops[4] = {0, 0, 0, 0};
+			TIndexOffU bots[4] = {0, 0, 0, 0};
 			assert(ltop.repOk(this->eh()));
 			assert(lbot.repOk(this->eh()));
 			mapLFEx(ltop, lbot, tops, bots, false);
@@ -2010,13 +2165,13 @@ public:
 	/**
 	 * Given row i, return the row that the LF mapping maps i to.
 	 */
-	inline uint32_t mapLF(
+	inline TIndexOffU mapLF(
 		const SideLocus& l
 		ASSERT_ONLY(, bool overrideSanity = false)
 		) const
 	{
-		ASSERT_ONLY(uint32_t srcrow = l.toBWRow());
-		uint32_t ret;
+		ASSERT_ONLY(TIndexOffU srcrow = l.toBWRow());
+		TIndexOffU ret;
 		assert(l.side(this->ebwt()) != NULL);
 		int c = rowL(l);
 		assert_lt(c, 4);
@@ -2029,7 +2184,7 @@ public:
 			// Make sure results match up with results from mapLFEx;
 			// be sure to override sanity-checking in the callee, or we'll
 			// have infinite recursion
-			uint32_t arrs[] = { 0, 0, 0, 0 };
+			TIndexOffU arrs[] = { 0, 0, 0, 0 };
 			mapLFEx(l, arrs, true);
 			assert_eq(arrs[c], ret);
 		}
@@ -2041,12 +2196,12 @@ public:
 	 * Given row i and character c, return the row that the LF mapping maps
 	 * i to on character c.
 	 */
-	inline uint32_t mapLF(
+	inline TIndexOffU mapLF(
 		const SideLocus& l, int c
 		ASSERT_ONLY(, bool overrideSanity = false)
 		) const
 	{
-		uint32_t ret;
+		TIndexOffU ret;
 		assert_lt(c, 4);
 		assert_geq(c, 0);
 		ret = countBt2Side(l, c);
@@ -2056,7 +2211,7 @@ public:
 			// Make sure results match up with results from mapLFEx;
 			// be sure to override sanity-checking in the callee, or we'll
 			// have infinite recursion
-			uint32_t arrs[] = { 0, 0, 0, 0 };
+			TIndexOffU arrs[] = { 0, 0, 0, 0 };
 			mapLFEx(l, arrs, true);
 			assert_eq(arrs[c], ret);
 		}
@@ -2072,10 +2227,10 @@ public:
 	inline void mapBiLFEx(
 		const SideLocus& ltop,
 		const SideLocus& lbot,
-		uint32_t *tops,
-		uint32_t *bots,
-		uint32_t *topsP, // topsP[0] = top
-		uint32_t *botsP
+		TIndexOffU *tops,
+		TIndexOffU *bots,
+		TIndexOffU *topsP, // topsP[0] = top
+		TIndexOffU *botsP
 		ASSERT_ONLY(, bool overrideSanity = false)
 		) const
 	{
@@ -2115,17 +2270,17 @@ public:
 	/**
 	 * Given row and its locus information, proceed on the given character
 	 * and return the next row, or all-fs if we can't proceed on that
-	 * character.  Returns 0xffffffff if this row ends in $.
+	 * character.  Returns max value if this row ends in $.
 	 */
-	inline uint32_t mapLF1(
-		uint32_t row,       // starting row
+	inline TIndexOffU mapLF1(
+		TIndexOffU row,       // starting row
 		const SideLocus& l, // locus for starting row
 		int c               // character to proceed on
 		ASSERT_ONLY(, bool overrideSanity = false)
 		) const
 	{
-		if(rowL(l) != c || row == _zOff) return 0xffffffff;
-		uint32_t ret;
+		if(rowL(l) != c || row == _zOff) return OFF_MASK;
+		TIndexOffU ret;
 		assert_lt(c, 4);
 		assert_geq(c, 0);
 		ret = countBt2Side(l, c);
@@ -2135,7 +2290,7 @@ public:
 			// Make sure results match up with results from mapLFEx;
 			// be sure to override sanity-checking in the callee, or we'll
 			// have infinite recursion
-			uint32_t arrs[] = { 0, 0, 0, 0 };
+			TIndexOffU arrs[] = { 0, 0, 0, 0 };
 			mapLFEx(l, arrs, true);
 			assert_eq(arrs[c], ret);
 		}
@@ -2149,7 +2304,7 @@ public:
 	 * return the character that was in the final column.
 	 */
 	inline int mapLF1(
-		uint32_t& row,      // starting row
+		TIndexOffU& row,      // starting row
 		const SideLocus& l  // locus for starting row
 		ASSERT_ONLY(, bool overrideSanity = false)
 		) const
@@ -2164,7 +2319,7 @@ public:
 			// Make sure results match up with results from mapLFEx;
 			// be sure to override sanity-checking in the callee, or we'll
 			// have infinite recursion
-			uint32_t arrs[] = { 0, 0, 0, 0 };
+			TIndexOffU arrs[] = { 0, 0, 0, 0 };
 			mapLFEx(l, arrs, true);
 			assert_eq(arrs[c], row);
 		}
@@ -2216,24 +2371,26 @@ public:
 	MM_FILE    _in2;    // input fd for secondary index file
 	string     _in1Str; // filename for primary index file
 	string     _in2Str; // filename for secondary index file
-	uint32_t   _zOff;
-	uint32_t   _zEbwtByteOff;
-	int        _zEbwtBpOff;
-	uint32_t   _nPat;  /// number of reference texts
-	uint32_t   _nFrag; /// number of fragments
-	APtrWrap<uint32_t> _plen;
-	APtrWrap<uint32_t> _rstarts; // starting offset of fragments / text indexes
+	string     _inSaStr;  // filename for suffix-array file
+	string     _inBwtStr; // filename for BWT file
+	TIndexOffU  _zOff;
+	TIndexOffU  _zEbwtByteOff;
+	TIndexOff   _zEbwtBpOff; 
+	TIndexOffU  _nPat;  /// number of reference texts
+	TIndexOffU  _nFrag; /// number of fragments
+	APtrWrap<TIndexOffU> _plen;
+	APtrWrap<TIndexOffU> _rstarts; // starting offset of fragments / text indexes
 	// _fchr, _ftab and _eftab are expected to be relatively small
 	// (usually < 1MB, perhaps a few MB if _fchr is particularly large
 	// - like, say, 11).  For this reason, we don't bother with writing
 	// them to disk through separate output streams; we
-	APtrWrap<uint32_t> _fchr;
-	APtrWrap<uint32_t> _ftab;
-	APtrWrap<uint32_t> _eftab; // "extended" entries for _ftab
+	APtrWrap<TIndexOffU> _fchr;
+	APtrWrap<TIndexOffU> _ftab;
+	APtrWrap<TIndexOffU> _eftab; // "extended" entries for _ftab
 	// _offs may be extremely large.  E.g. for DNA w/ offRate=4 (one
 	// offset every 16 rows), the total size of _offs is the same as
 	// the total size of the input sequence
-	APtrWrap<uint32_t> _offs;
+	APtrWrap<TIndexOffU> _offs;
 	// _ebwt is the Extended Burrows-Wheeler Transform itself, and thus
 	// is at least as large as the input sequence.
 	APtrWrap<uint8_t> _ebwt;
@@ -2245,14 +2402,18 @@ public:
 	EbwtParams _eh;
 	bool packed_;
 
-	static const uint32_t default_bmax = 0xffffffff;
-	static const uint32_t default_bmaxMultSqrt = 0xffffffff;
-	static const uint32_t default_bmaxDivN = 4;
+	static const TIndexOffU default_bmax = OFF_MASK;
+	static const TIndexOffU default_bmaxMultSqrt = OFF_MASK;
+	static const TIndexOffU default_bmaxDivN = 4;
 	static const int      default_dcv = 1024;
 	static const bool     default_noDc = false;
 	static const bool     default_useBlockwise = true;
 	static const uint32_t default_seed = 0;
+#ifdef BOWTIE_64BIT_INDEX
+	static const int      default_lineRate = 7;
+#else
 	static const int      default_lineRate = 6;
+#endif
 	static const int      default_offRate = 5;
 	static const int      default_offRatePlus = 0;
 	static const int      default_ftabChars = 10;
@@ -2278,7 +2439,7 @@ private:
  * Read reference names from an input stream 'in' for an Ebwt primary
  * file and store them in 'refnames'.
  */
-void readEbwtRefnames(istream& in, EList<string>& refnames);
+void readEbwtRefnames(FILE* fin, EList<string>& refnames);
 
 /**
  * Read reference names from the index with basename 'in' and store
@@ -2317,12 +2478,12 @@ TStr Ebwt::join(EList<TStr>& l, uint32_t seed) {
 	RandomSource rand; // reproducible given same seed
 	rand.init(seed);
 	TStr ret;
-	size_t guessLen = 0;
-	for(size_t i = 0; i < l.size(); i++) {
+	TIndexOffU guessLen = 0;
+	for(TIndexOffU i = 0; i < l.size(); i++) {
 		guessLen += length(l[i]);
 	}
 	ret.resize(guessLen);
-	size_t off = 0;
+	TIndexOffU off = 0;
 	for(size_t i = 0; i < l.size(); i++) {
 		TStr& s = l[i];
 		assert_gt(s.length(), 0);
@@ -2344,7 +2505,7 @@ TStr Ebwt::join(EList<TStr>& l, uint32_t seed) {
 template<typename TStr>
 TStr Ebwt::join(EList<FileBuf*>& l,
                 EList<RefRecord>& szs,
-                uint32_t sztot,
+                TIndexOffU sztot,
                 const RefReadInParams& refparams,
                 uint32_t seed)
 {
@@ -2352,18 +2513,18 @@ TStr Ebwt::join(EList<FileBuf*>& l,
 	rand.init(seed);
 	RefReadInParams rpcp = refparams;
 	TStr ret;
-	size_t guessLen = sztot;
+	TIndexOffU guessLen = sztot;
 	ret.resize(guessLen);
-	ASSERT_ONLY(size_t szsi = 0);
-	size_t dstoff = 0;
-	for(size_t i = 0; i < l.size(); i++) {
+	ASSERT_ONLY(TIndexOffU szsi = 0);
+	TIndexOffU dstoff = 0;
+	for(TIndexOffU i = 0; i < l.size(); i++) {
 		// For each sequence we can pull out of istream l[i]...
 		assert(!l[i]->eof());
 		bool first = true;
 		while(!l[i]->eof()) {
 			RefRecord rec = fastaRefReadAppend(*l[i], first, ret, dstoff, rpcp);
 			first = false;
-			size_t bases = rec.len;
+			TIndexOffU bases = rec.len;
 			assert_eq(rec.off, szs[szsi].off);
 			assert_eq(rec.len, szs[szsi].len);
 			assert_eq(rec.first, szs[szsi].first);
@@ -2389,7 +2550,7 @@ template<typename TStr>
 void Ebwt::joinToDisk(
 	EList<FileBuf*>& l,
 	EList<RefRecord>& szs,
-	uint32_t sztot,
+	TIndexOffU sztot,
 	const RefReadInParams& refparams,
 	TStr& ret,
 	ostream& out1,
@@ -2405,28 +2566,28 @@ void Ebwt::joinToDisk(
 	// fragments.
 	this->_nPat = 0;
 	this->_nFrag = 0;
-	for(size_t i = 0; i < szs.size(); i++) {
+	for(TIndexOffU i = 0; i < szs.size(); i++) {
 		if(szs[i].len > 0) this->_nFrag++;
 		if(szs[i].first && szs[i].len > 0) this->_nPat++;
 	}
 	assert_gt(this->_nPat, 0);
 	assert_geq(this->_nFrag, this->_nPat);
 	_rstarts.reset();
-	writeU32(out1, this->_nPat, this->toBe());
+	writeU<TIndexOffU>(out1, this->_nPat, this->toBe());
 	// Allocate plen[]
 	try {
-		this->_plen.init(new uint32_t[this->_nPat], this->_nPat);
+		this->_plen.init(new TIndexOffU[this->_nPat], this->_nPat);
 	} catch(bad_alloc& e) {
 		cerr << "Out of memory allocating plen[] in Ebwt::join()"
 		     << " at " << __FILE__ << ":" << __LINE__ << endl;
 		throw e;
 	}
 	// For each pattern, set plen
-	int npat = -1;
-	for(size_t i = 0; i < szs.size(); i++) {
+	TIndexOff npat = -1;
+	for(TIndexOffU i = 0; i < szs.size(); i++) {
 		if(szs[i].first && szs[i].len > 0) {
 			if(npat >= 0) {
-				writeU32(out1, this->plen()[npat], this->toBe());
+				writeU<TIndexOffU>(out1, this->plen()[npat], this->toBe());
 			}
 			npat++;
 			this->plen()[npat] = (szs[i].len + szs[i].off);
@@ -2434,19 +2595,19 @@ void Ebwt::joinToDisk(
 			this->plen()[npat] += (szs[i].len + szs[i].off);
 		}
 	}
-	assert_eq((uint32_t)npat, this->_nPat-1);
-	writeU32(out1, this->plen()[npat], this->toBe());
+	assert_eq((TIndexOffU)npat, this->_nPat-1);
+	writeU<TIndexOffU>(out1, this->plen()[npat], this->toBe());
 	// Write the number of fragments
-	writeU32(out1, this->_nFrag, this->toBe());
-	size_t seqsRead = 0;
-	ASSERT_ONLY(uint32_t szsi = 0);
-	ASSERT_ONLY(uint32_t entsWritten = 0);
-	size_t dstoff = 0;
+	writeU<TIndexOffU>(out1, this->_nFrag, this->toBe());
+	TIndexOffU seqsRead = 0;
+	ASSERT_ONLY(TIndexOffU szsi = 0);
+	ASSERT_ONLY(TIndexOffU entsWritten = 0);
+	TIndexOffU dstoff = 0;
 	// For each filebuf
 	for(unsigned int i = 0; i < l.size(); i++) {
 		assert(!l[i]->eof());
 		bool first = true;
-		uint32_t patoff = 0;
+		TIndexOffU patoff = 0;
 		// For each *fragment* (not necessary an entire sequence) we
 		// can pull out of istream l[i]...
 		while(!l[i]->eof()) {
@@ -2456,7 +2617,7 @@ void Ebwt::joinToDisk(
 			RefRecord rec = fastaRefReadAppend(
 				*l[i], first, ret, dstoff, rpcp, &_refnames.back());
 			first = false;
-			size_t bases = rec.len;
+			TIndexOffU bases = rec.len;
 			if(rec.first && rec.len > 0) {
 				if(_refnames.back().length() == 0) {
 					// If name was empty, replace with an index
@@ -2490,7 +2651,7 @@ void Ebwt::joinToDisk(
 			//writeU32(out1, oldRetLen, this->toBe()); // offset from beginning of joined string
 			//writeU32(out1, seq,       this->toBe()); // sequence id
 			//writeU32(out1, patoff,    this->toBe()); // offset into sequence
-			patoff += (uint32_t)bases;
+			patoff += bases;
 		}
 		assert_gt(szsi, 0);
 		l[i]->reset();
@@ -2533,7 +2694,9 @@ void Ebwt::buildToDisk(
 	InorderBlockwiseSA<TStr>& sa,
 	const TStr& s,
 	ostream& out1,
-	ostream& out2)
+	ostream& out2,
+	ostream* saOut,
+	ostream* bwtOut)
 {
 	const EbwtParams& eh = this->_eh;
 
@@ -2543,17 +2706,17 @@ void Ebwt::buildToDisk(
 	assert_gt(eh._lineRate, 3);
 	assert(sa.suffixItrIsReset());
 
-	uint32_t  len = eh._len;
-	uint32_t  ftabLen = eh._ftabLen;
-	uint32_t  sideSz = eh._sideSz;
-	uint32_t  ebwtTotSz = eh._ebwtTotSz;
-	uint32_t  fchr[] = {0, 0, 0, 0, 0};
-	EList<uint32_t> ftab(EBWT_CAT);
-	uint32_t  zOff = 0xffffffff;
+	TIndexOffU len = eh._len;
+	TIndexOffU ftabLen = eh._ftabLen;
+	TIndexOffU sideSz = eh._sideSz;
+	TIndexOffU ebwtTotSz = eh._ebwtTotSz;
+	TIndexOffU fchr[] = {0, 0, 0, 0, 0};
+	EList<TIndexOffU> ftab(EBWT_CAT);
+	TIndexOffU zOff = OFF_MASK;
 
 	// Save # of occurrences of each character as we walk along the bwt
-	uint32_t occ[4] = {0, 0, 0, 0};
-	uint32_t occSave[4] = {0, 0, 0, 0};
+	TIndexOffU occ[4] = {0, 0, 0, 0};
+	TIndexOffU occSave[4] = {0, 0, 0, 0};
 
 	// Record rows that should "absorb" adjacent rows in the ftab.
 	// The absorbed rows represent suffixes shorter than the ftabChars
@@ -2595,24 +2758,38 @@ void Ebwt::buildToDisk(
 
 	// Points to the base offset within ebwt for the side currently
 	// being written
-	uint32_t side = 0;
+	TIndexOffU side = 0;
 
 	// Whether we're assembling a forward or a reverse bucket
 	bool fw;
-	int sideCur = 0;
+	TIndexOff sideCur = 0;
 	fw = true;
 
 	// Have we skipped the '$' in the last column yet?
 	ASSERT_ONLY(bool dollarSkipped = false);
 
-	uint32_t si = 0;   // string offset (chars)
-	ASSERT_ONLY(uint32_t lastSufInt = 0);
+	TIndexOffU si = 0;   // string offset (chars)
+	ASSERT_ONLY(TIndexOffU lastSufInt = 0);
 	ASSERT_ONLY(bool inSA = true); // true iff saI still points inside suffix
 	                               // array (as opposed to the padding at the
 	                               // end)
 	// Iterate over packed bwt bytes
 	VMSG_NL("Entering Ebwt loop");
-	ASSERT_ONLY(uint32_t beforeEbwtOff = (uint32_t)out1.tellp());
+	ASSERT_ONLY(TIndexOffU beforeEbwtOff = (TIndexOffU)out1.tellp()); // @double-check - pos_type, std::streampos 
+	
+	// First integer in the suffix-array output file is the length of the
+	// array, including $
+	if(saOut != NULL) {
+		// Write length word
+		writeU<TIndexOffU>(*saOut, len+1, this->toBe());
+	}
+	
+	// First integer in the BWT output file is the length of BWT(T), including $
+	if(bwtOut != NULL) {
+		// Write length word
+		writeU<TIndexOffU>(*bwtOut, len+1, this->toBe());
+	}
+	
 	while(side < ebwtTotSz) {
 		// Sanity-check our cursor into the side buffer
 		assert_geq(sideCur, 0);
@@ -2631,7 +2808,14 @@ void Ebwt::buildToDisk(
 			bool count = true;
 			if(si <= len) {
 				// Still in the SA; extract the bwtChar
-				uint32_t saElt = sa.nextSuffix();
+				TIndexOffU saElt = sa.nextSuffix();
+				// Write it to the optional suffix-array output file
+				if(saOut != NULL) {
+					writeU<TIndexOffU>(*saOut, saElt, this->toBe());
+				}
+				// TODO: what exactly to write to the BWT output file?  How to
+				// represent $?  How to pack nucleotides into bytes/words?
+				
 				// (that might have triggered sa to calc next suf block)
 				if(saElt == 0) {
 					// Don't add the '$' in the last column to the BWT
@@ -2649,16 +2833,16 @@ void Ebwt::buildToDisk(
 					fchr[bwtChar]++;
 				}
 				// Update ftab
-				if((len-saElt) >= (uint32_t)eh._ftabChars) {
+				if((len-saElt) >= (TIndexOffU)eh._ftabChars) {
 					// Turn the first ftabChars characters of the
 					// suffix into an integer index into ftab.  The
 					// leftmost (lowest index) character of the suffix
 					// goes in the most significant bit pair if the
 					// integer.
-					uint32_t sufInt = 0;
+					TIndexOffU sufInt = 0;
 					for(int i = 0; i < eh._ftabChars; i++) {
 						sufInt <<= 2;
-						assert_lt((uint32_t)i, len-saElt);
+						assert_lt((TIndexOffU)i, len-saElt);
 						sufInt |= (unsigned char)(s[saElt+i]);
 					}
 					// Assert that this prefix-of-suffix is greater
@@ -2689,7 +2873,7 @@ void Ebwt::buildToDisk(
 					assert_lt((si >> eh._offRate), eh._offsLen);
 					// Write offsets directly to the secondary output
 					// stream, thereby avoiding keeping them in memory
-					writeU32(out2, saElt, this->toBe());
+					writeU<TIndexOffU>(out2, saElt, this->toBe());
 				}
 			} else {
 				// Strayed off the end of the SA, now we're just
@@ -2738,14 +2922,21 @@ void Ebwt::buildToDisk(
 		sideCur++;
 		if(sideCur == (int)eh._sideBwtSz) {
 			sideCur = 0;
-			uint32_t *u32side = reinterpret_cast<uint32_t*>(ebwtSide.ptr());
+			TIndexOffU *cpptr = reinterpret_cast<TIndexOffU*>(ebwtSide.ptr());
 			// Write 'A', 'C', 'G' and 'T' tallies
 			side += sideSz;
 			assert_leq(side, eh._ebwtTotSz);
-			u32side[(sideSz >> 2)-4] = endianizeU32(occSave[0], this->toBe());
-			u32side[(sideSz >> 2)-3] = endianizeU32(occSave[1], this->toBe());
-			u32side[(sideSz >> 2)-2] = endianizeU32(occSave[2], this->toBe());
-			u32side[(sideSz >> 2)-1] = endianizeU32(occSave[3], this->toBe());
+#ifdef BOWTIE_64BIT_INDEX
+			cpptr[(sideSz >> 3)-4] = endianizeU<TIndexOffU>(occSave[0], this->toBe());
+			cpptr[(sideSz >> 3)-3] = endianizeU<TIndexOffU>(occSave[1], this->toBe());
+			cpptr[(sideSz >> 3)-2] = endianizeU<TIndexOffU>(occSave[2], this->toBe());
+			cpptr[(sideSz >> 3)-1] = endianizeU<TIndexOffU>(occSave[3], this->toBe());
+#else
+			cpptr[(sideSz >> 2)-4] = endianizeU<TIndexOffU>(occSave[0], this->toBe());
+			cpptr[(sideSz >> 2)-3] = endianizeU<TIndexOffU>(occSave[1], this->toBe());
+			cpptr[(sideSz >> 2)-2] = endianizeU<TIndexOffU>(occSave[2], this->toBe());
+			cpptr[(sideSz >> 2)-1] = endianizeU<TIndexOffU>(occSave[3], this->toBe());
+#endif
 			occSave[0] = occ[0];
 			occSave[1] = occ[1];
 			occSave[2] = occ[2];
@@ -2755,7 +2946,7 @@ void Ebwt::buildToDisk(
 		}
 	}
 	VMSG_NL("Exited Ebwt loop");
-	assert_neq(zOff, 0xffffffff);
+	assert_neq(zOff, OFF_MASK);
 	if(absorbCnt > 0) {
 		// Absorb any trailing, as-yet-unabsorbed short suffixes into
 		// the last element of ftab
@@ -2764,13 +2955,13 @@ void Ebwt::buildToDisk(
 	// Assert that our loop counter got incremented right to the end
 	assert_eq(side, eh._ebwtTotSz);
 	// Assert that we wrote the expected amount to out1
-	assert_eq(((uint32_t)out1.tellp() - beforeEbwtOff), eh._ebwtTotSz);
+	assert_eq(((TIndexOffU)out1.tellp() - beforeEbwtOff), eh._ebwtTotSz); // @double-check - pos_type
 	// assert that the last thing we did was write a forward bucket
 
 	//
 	// Write zOff to primary stream
 	//
-	writeU32(out1, zOff, this->toBe());
+	writeU<TIndexOffU>(out1, zOff, this->toBe());
 
 	//
 	// Finish building fchr
@@ -2791,21 +2982,21 @@ void Ebwt::buildToDisk(
 	}
 	// Write fchr to primary file
 	for(int i = 0; i < 5; i++) {
-		writeU32(out1, fchr[i], this->toBe());
+		writeU<TIndexOffU>(out1, fchr[i], this->toBe());
 	}
 
 	//
 	// Finish building ftab and build eftab
 	//
 	// Prefix sum on ftable
-	uint32_t eftabLen = 0;
+	TIndexOffU eftabLen = 0;
 	assert_eq(0, absorbFtab[0]);
-	for(uint32_t i = 1; i < ftabLen; i++) {
+	for(TIndexOffU i = 1; i < ftabLen; i++) {
 		if(absorbFtab[i] > 0) eftabLen += 2;
 	}
-	assert_leq(eftabLen, (uint32_t)eh._ftabChars*2);
+	assert_leq(eftabLen, (TIndexOffU)eh._ftabChars*2);
 	eftabLen = eh._ftabChars*2;
-	EList<uint32_t> eftab(EBWT_CAT);
+	EList<TIndexOffU> eftab(EBWT_CAT);
 	try {
 		eftab.resize(eftabLen);
 		eftab.fillZero();
@@ -2815,16 +3006,16 @@ void Ebwt::buildToDisk(
 		     << __LINE__ << endl;
 		throw e;
 	}
-	uint32_t eftabCur = 0;
-	for(uint32_t i = 1; i < ftabLen; i++) {
-		uint32_t lo = ftab[i] + Ebwt::ftabHi(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, i-1);
+	TIndexOffU eftabCur = 0;
+	for(TIndexOffU i = 1; i < ftabLen; i++) {
+		TIndexOffU lo = ftab[i] + Ebwt::ftabHi(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, i-1);
 		if(absorbFtab[i] > 0) {
 			// Skip a number of short pattern indicated by absorbFtab[i]
-			uint32_t hi = lo + absorbFtab[i];
+			TIndexOffU hi = lo + absorbFtab[i];
 			assert_lt(eftabCur*2+1, eftabLen);
 			eftab[eftabCur*2] = lo;
 			eftab[eftabCur*2+1] = hi;
-			ftab[i] = (eftabCur++) ^ 0xffffffff; // insert pointer into eftab
+			ftab[i] = (eftabCur++) ^ OFF_MASK; // insert pointer into eftab
 			assert_eq(lo, Ebwt::ftabLo(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, i));
 			assert_eq(hi, Ebwt::ftabHi(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, i));
 		} else {
@@ -2833,12 +3024,12 @@ void Ebwt::buildToDisk(
 	}
 	assert_eq(Ebwt::ftabHi(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, ftabLen-1), len+1);
 	// Write ftab to primary file
-	for(uint32_t i = 0; i < ftabLen; i++) {
-		writeU32(out1, ftab[i], this->toBe());
+	for(TIndexOffU i = 0; i < ftabLen; i++) {
+		writeU<TIndexOffU>(out1, ftab[i], this->toBe());
 	}
 	// Write eftab to primary file
-	for(uint32_t i = 0; i < eftabLen; i++) {
-		writeU32(out1, eftab[i], this->toBe());
+	for(TIndexOffU i = 0; i < eftabLen; i++) {
+		writeU<TIndexOffU>(out1, eftab[i], this->toBe());
 	}
 
 	// Note: if you'd like to sanity-check the Ebwt, you'll have to
@@ -2858,4 +3049,28 @@ string adjustEbwtBase(const string& cmdline,
 					  const string& ebwtFileBase,
 					  bool verbose);
 
+
+extern string gLastIOErrMsg;
+
+/* Checks whether a call to read() failed: true when ret < 0 (errno is recorded in gLastIOErrMsg) or when ret == 0 although count > 0 bytes were requested (unexpected end of file).  fdesc is unused. */
+inline bool is_read_err(int fdesc, ssize_t ret, size_t count){
+	// Zero progress with bytes still wanted is reported too: a caller that retries until all requested bytes have arrived would otherwise spin forever on a truncated file.
+	if (ret > 0 || (ret == 0 && count == 0)) return false;
+	std::stringstream sstm;
+	if (ret < 0) sstm << "ERRNO: " << errno << " ERR Msg:" << strerror(errno) << std::endl;
+	else sstm << "Unexpected end of file" << std::endl;
+	gLastIOErrMsg = sstm.str();
+	return true;
+}
+
+/* Checks whether a call to fread() failed or not. */
+inline bool is_fread_err(FILE* file_hd, size_t ret, size_t count){
+	if (ferror(file_hd)) {
+		gLastIOErrMsg = "Error Reading File!";
+		return true;
+	}
+	return false;
+}
+
+
 #endif /*EBWT_H_*/
diff --git a/bt2_inspect.cpp b/bt2_inspect.cpp
index 92302c8..9094aa0 100644
--- a/bt2_inspect.cpp
+++ b/bt2_inspect.cpp
@@ -38,11 +38,12 @@ static int names_only   = 0;  // just print the sequence names in the index
 static int summarize_only = 0; // just print summary of index and quit
 static int across       = 60; // number of characters across in FASTA output
 static bool refFromEbwt = false; // true -> when printing reference, decode it from Ebwt instead of reading it from BitPairReference
-
+static string wrapper;
 static const char *short_options = "vhnsea:";
 
 enum {
 	ARG_VERSION = 256,
+	ARG_WRAPPER,
 	ARG_USAGE,
 };
 
@@ -55,6 +56,7 @@ static struct option long_options[] = {
 	{(char*)"help",     no_argument,        0, 'h'},
 	{(char*)"across",   required_argument,  0, 'a'},
 	{(char*)"ebwt-ref", no_argument,        0, 'e'},
+	{(char*)"wrapper",  required_argument,  0, ARG_WRAPPER},
 	{(char*)0, 0, 0, 0} // terminator
 };
 
@@ -65,21 +67,32 @@ static void printUsage(ostream& out) {
 	out << "Bowtie 2 version " << string(BOWTIE2_VERSION).c_str() << " by Ben Langmead (langmea at cs.jhu.edu, www.cs.jhu.edu/~langmea)" << endl;
 	out
 	<< "Usage: bowtie2-inspect [options]* <bt2_base>" << endl
-	<< "  <bt2_base>         bt2 filename minus trailing .1.bt2/.2.bt2" << endl
+	<< "  <bt2_base>         bt2 filename minus trailing .1." + gEbwt_ext + "/.2." + gEbwt_ext << endl
 	<< endl
 	<< "  By default, prints FASTA records of the indexed nucleotide sequences to" << endl
 	<< "  standard out.  With -n, just prints names.  With -s, just prints a summary of" << endl
 	<< "  the index parameters and sequences.  With -e, preserves colors if applicable." << endl
 	<< endl
-	<< "Options:" << endl
-	<< "  -a/--across <int>  Number of characters across in FASTA output (default: 60)" << endl
+	<< "Options:" << endl;
+	if(wrapper == "basic-0") {
+		out << "  --large-index      force inspection of the 'large' index, even if a" << endl
+			<< "                     'small' one is present." << endl;
+	}
+	out << "  -a/--across <int>  Number of characters across in FASTA output (default: 60)" << endl
 	<< "  -n/--names         Print reference sequence names only" << endl
 	<< "  -s/--summary       Print summary incl. ref names, lengths, index properties" << endl
-	<< "  -e/--bt2-ref      Reconstruct reference from .bt2 (slow, preserves colors)" << endl
+	<< "  -e/--bt2-ref      Reconstruct reference from ." + gEbwt_ext + " (slow, preserves colors)" << endl
 	<< "  -v/--verbose       Verbose output (for debugging)" << endl
 	<< "  -h/--help          print detailed description of tool and its options" << endl
 	<< "  --help             print this usage message" << endl
 	;
+	if(wrapper.empty()) {
+		cerr << endl
+		     << "*** Warning ***" << endl
+			 << "'bowtie2-inspect' was run directly.  It is recommended "
+			 << "to use the wrapper script instead."
+			 << endl << endl;
+	}
 }
 
 /**
@@ -114,6 +127,9 @@ static void parseOptions(int argc, char **argv) {
 	do {
 		next_option = getopt_long(argc, argv, short_options, long_options, &option_index);
 		switch (next_option) {
+			case ARG_WRAPPER:
+				wrapper = optarg;
+				break;
 			case ARG_USAGE:
 			case 'h':
 				printUsage(cout);
@@ -200,7 +216,7 @@ static void print_ref_sequences(
 	ostream& fout,
 	bool color,
 	const EList<string>& refnames,
-	const uint32_t* plen,
+	const TIndexOffU* plen,
 	const string& adjustedEbwtFileBase)
 {
 	BitPairReference ref(
@@ -238,25 +254,25 @@ static void print_index_sequences(ostream& fout, Ebwt& ebwt)
 	TStr cat_ref;
 	ebwt.restore(cat_ref);
 
-	uint32_t curr_ref = 0xffffffff;
+	TIndexOffU curr_ref = OFF_MASK;
 	string curr_ref_seq = "";
-	uint32_t curr_ref_len = 0xffffffff;
-	uint32_t last_text_off = 0;
+	TIndexOffU curr_ref_len = OFF_MASK;
+	TIndexOffU last_text_off = 0;
 	size_t orig_len = cat_ref.length();
-	uint32_t tlen = 0xffffffff;
+	TIndexOffU tlen = OFF_MASK;
 	bool first = true;
 	for(size_t i = 0; i < orig_len; i++) {
-		uint32_t tidx = 0xffffffff;
-		uint32_t textoff = 0xffffffff;
-		tlen = 0xffffffff;
+		TIndexOffU tidx = OFF_MASK;
+		TIndexOffU textoff = OFF_MASK;
+		tlen = OFF_MASK;
 		bool straddled = false;
-		ebwt.joinedToTextOff(1 /* qlen */, (uint32_t)i, tidx, textoff, tlen, true, straddled);
+		ebwt.joinedToTextOff(1 /* qlen */, (TIndexOffU)i, tidx, textoff, tlen, true, straddled);
 
-		if (tidx != 0xffffffff && textoff < tlen)
+		if (tidx != OFF_MASK && textoff < tlen)
 		{
 			if (curr_ref != tidx)
 			{
-				if (curr_ref != 0xffffffff)
+				if (curr_ref != OFF_MASK)
 				{
 					// Add trailing gaps, if any exist
 					if(curr_ref_seq.length() < curr_ref_len) {
@@ -271,12 +287,12 @@ static void print_index_sequences(ostream& fout, Ebwt& ebwt)
 				first = true;
 			}
 
-			uint32_t textoff_adj = textoff;
+			TIndexOffU textoff_adj = textoff;
 			if(first && textoff > 0) textoff_adj++;
 			if (textoff_adj - last_text_off > 1)
 				curr_ref_seq += string(textoff_adj - last_text_off - 1, 'N');
 
-			curr_ref_seq.push_back(cat_ref[i]);
+			curr_ref_seq.push_back("ACGT"[int(cat_ref[i])]);
 			last_text_off = textoff;
 			first = false;
 		}
diff --git a/bt2_io.cpp b/bt2_io.cpp
index e2bff23..d8332aa 100644
--- a/bt2_io.cpp
+++ b/bt2_io.cpp
@@ -23,6 +23,7 @@
 #include <fstream>
 #include <stdlib.h>
 #include "bt2_idx.h"
+#include <iomanip>
 
 using namespace std;
 
@@ -149,15 +150,15 @@ void Ebwt::readIntoMemory(
 	}
 	
 	// Read endianness hints from both streams
-	size_t bytesRead = 0;
+	uint64_t bytesRead = 0;
 	switchEndian = false;
-	uint32_t one = readU32(_in1, switchEndian); // 1st word of primary stream
+	uint32_t one = readU<uint32_t>(_in1, switchEndian); // 1st word of primary stream
 	bytesRead += 4;
 	if(loadSASamp) {
 #ifndef NDEBUG
-		assert_eq(one, readU32(_in2, switchEndian)); // should match!
+		assert_eq(one, readU<uint32_t>(_in2, switchEndian)); // should match!
 #else
-		readU32(_in2, switchEndian);
+		readU<uint32_t>(_in2, switchEndian);
 #endif
 	}
 	if(one != 1) {
@@ -176,22 +177,22 @@ void Ebwt::readIntoMemory(
 	}
 	
 	// Reads header entries one by one from primary stream
-	uint32_t len          = readU32(_in1, switchEndian);
+	TIndexOffU len          = readU<TIndexOffU>(_in1, switchEndian);
+	bytesRead += OFF_SIZE;
+	int32_t  lineRate     = readI<int32_t>(_in1, switchEndian);
 	bytesRead += 4;
-	int32_t  lineRate     = readI32(_in1, switchEndian);
+	/*int32_t  linesPerSide =*/ readI<int32_t>(_in1, switchEndian);
 	bytesRead += 4;
-	/*int32_t  linesPerSide =*/ readI32(_in1, switchEndian);
-	bytesRead += 4;
-	int32_t  offRate      = readI32(_in1, switchEndian);
+	int32_t  offRate      = readI<int32_t>(_in1, switchEndian);
 	bytesRead += 4;
 	// TODO: add isaRate to the actual file format (right now, the
 	// user has to tell us whether there's an ISA sample and what the
 	// sampling rate is.
-	int32_t  ftabChars    = readI32(_in1, switchEndian);
+	int32_t  ftabChars    = readI<int32_t>(_in1, switchEndian);
 	bytesRead += 4;
 	// chunkRate was deprecated in an earlier version of Bowtie; now
 	// we use it to hold flags.
-	int32_t flags = readI32(_in1, switchEndian);
+	int32_t flags = readI<int32_t>(_in1, switchEndian);
 	bool entireRev = false;
 	if(flags < 0 && (((-flags) & EBWT_COLOR) != 0)) {
 		if(color != -1 && !color) {
@@ -235,15 +236,16 @@ void Ebwt::readIntoMemory(
 	}
 	
 	// Set up overridden suffix-array-sample parameters
-	uint32_t offsLen = eh->_offsLen;
-	uint32_t offRateDiff = 0;
-	uint32_t offsLenSampled = offsLen;
+	TIndexOffU offsLen = eh->_offsLen;
+	uint64_t offsSz = eh->_offsSz;
+	TIndexOffU offRateDiff = 0;
+	TIndexOffU offsLenSampled = offsLen;
 	if(_overrideOffRate > offRate) {
 		offRateDiff = _overrideOffRate - offRate;
 	}
 	if(offRateDiff > 0) {
 		offsLenSampled >>= offRateDiff;
-		if((offsLen & ~(0xffffffff << offRateDiff)) != 0) {
+		if((offsLen & ~(OFF_MASK << offRateDiff)) != 0) {
 			offsLenSampled++;
 		}
 	}
@@ -257,15 +259,15 @@ void Ebwt::readIntoMemory(
 	}
 	
 	// Read nPat from primary stream
-	this->_nPat = readI32(_in1, switchEndian);
-	bytesRead += 4;
+	this->_nPat = readI<TIndexOffU>(_in1, switchEndian);
+	bytesRead += OFF_SIZE;
 	_plen.reset();
 	// Read plen from primary stream
 	if(_useMm) {
 #ifdef BOWTIE_MM
-		_plen.init((uint32_t*)(mmFile[0] + bytesRead), _nPat, false);
-		bytesRead += _nPat*4;
-		MM_SEEK(_in1, _nPat*4, SEEK_CUR);
+		_plen.init((TIndexOffU*)(mmFile[0] + bytesRead), _nPat, false);
+		bytesRead += _nPat*OFF_SIZE;
+		MM_SEEK(_in1, _nPat*OFF_SIZE, SEEK_CUR);
 #endif
 	} else {
 		try {
@@ -273,15 +275,15 @@ void Ebwt::readIntoMemory(
 				cerr << "Reading plen (" << this->_nPat << "): ";
 				logTime(cerr);
 			}
-			_plen.init(new uint32_t[_nPat], _nPat, true);
+			_plen.init(new TIndexOffU[_nPat], _nPat, true);
 			if(switchEndian) {
-				for(uint32_t i = 0; i < this->_nPat; i++) {
-					plen()[i] = readU32(_in1, switchEndian);
+				for(TIndexOffU i = 0; i < this->_nPat; i++) {
+					plen()[i] = readU<TIndexOffU>(_in1, switchEndian);
 				}
 			} else {
-				MM_READ_RET r = MM_READ(_in1, (void*)(plen()), _nPat*4);
-				if(r != (MM_READ_RET)(_nPat*4)) {
-					cerr << "Error reading _plen[] array: " << r << ", " << _nPat*4 << endl;
+				MM_READ_RET r = MM_READ(_in1, (void*)(plen()), _nPat*OFF_SIZE);
+				if(r != (MM_READ_RET)(_nPat*OFF_SIZE)) {
+					cerr << "Error reading _plen[] array: " << r << ", " << _nPat*OFF_SIZE << endl;
 					throw 1;
 				}
 			}
@@ -300,8 +302,8 @@ void Ebwt::readIntoMemory(
 	// (i.e. everything up to and including join()).
 	if(justHeader) goto done;
 	
-	this->_nFrag = readU32(_in1, switchEndian);
-	bytesRead += 4;
+	this->_nFrag = readU<TIndexOffU>(_in1, switchEndian);
+	bytesRead += OFF_SIZE;
 	if(_verbose || startVerbose) {
 		cerr << "Reading rstarts (" << this->_nFrag*3 << "): ";
 		logTime(cerr);
@@ -311,24 +313,24 @@ void Ebwt::readIntoMemory(
 	if(loadRstarts) {
 		if(_useMm) {
 #ifdef BOWTIE_MM
-			_rstarts.init((uint32_t*)(mmFile[0] + bytesRead), _nFrag*3, false);
-			bytesRead += this->_nFrag*4*3;
-			MM_SEEK(_in1, this->_nFrag*4*3, SEEK_CUR);
+			_rstarts.init((TIndexOffU*)(mmFile[0] + bytesRead), _nFrag*3, false);
+			bytesRead += this->_nFrag*OFF_SIZE*3;
+			MM_SEEK(_in1, this->_nFrag*OFF_SIZE*3, SEEK_CUR);
 #endif
 		} else {
-			_rstarts.init(new uint32_t[_nFrag*3], _nFrag*3, true);
+			_rstarts.init(new TIndexOffU[_nFrag*3], _nFrag*3, true);
 			if(switchEndian) {
-				for(uint32_t i = 0; i < this->_nFrag*3; i += 3) {
+				for(TIndexOffU i = 0; i < this->_nFrag*3; i += 3) {
 					// fragment starting position in joined reference
 					// string, text id, and fragment offset within text
-					this->rstarts()[i]   = readU32(_in1, switchEndian);
-					this->rstarts()[i+1] = readU32(_in1, switchEndian);
-					this->rstarts()[i+2] = readU32(_in1, switchEndian);
+					this->rstarts()[i]   = readU<TIndexOffU>(_in1, switchEndian);
+					this->rstarts()[i+1] = readU<TIndexOffU>(_in1, switchEndian);
+					this->rstarts()[i+2] = readU<TIndexOffU>(_in1, switchEndian);
 				}
 			} else {
-				MM_READ_RET r = MM_READ(_in1, (void *)rstarts(), this->_nFrag*4*3);
-				if(r != (MM_READ_RET)(this->_nFrag*4*3)) {
-					cerr << "Error reading _rstarts[] array: " << r << ", " << (this->_nFrag*4*3) << endl;
+				MM_READ_RET r = MM_READ(_in1, (void *)rstarts(), this->_nFrag*OFF_SIZE*3);
+				if(r != (MM_READ_RET)(this->_nFrag*OFF_SIZE*3)) {
+					cerr << "Error reading _rstarts[] array: " << r << ", " << (this->_nFrag*OFF_SIZE*3) << endl;
 					throw 1;
 				}
 			}
@@ -336,8 +338,8 @@ void Ebwt::readIntoMemory(
 	} else {
 		// Skip em
 		assert(rstarts() == NULL);
-		bytesRead += this->_nFrag*4*3;
-		MM_SEEK(_in1, this->_nFrag*4*3, SEEK_CUR);
+		bytesRead += this->_nFrag*OFF_SIZE*3;
+		MM_SEEK(_in1, this->_nFrag*OFF_SIZE*3, SEEK_CUR);
 	}
 	
 	_ebwt.reset();
@@ -375,17 +377,25 @@ void Ebwt::readIntoMemory(
 		}
 		if(shmemLeader) {
 			// Read ebwt from primary stream
-			MM_READ_RET r = MM_READ(_in1, (void *)this->ebwt(), eh->_ebwtTotLen);
-			if(r != (MM_READ_RET)eh->_ebwtTotLen) {
-				cerr << "Error reading _ebwt[] array: " << r << ", " << (eh->_ebwtTotLen) << endl;
-				throw 1;
+			uint64_t bytesLeft = eh->_ebwtTotLen;
+			char *pebwt = (char*)this->ebwt();
+
+			while (bytesLeft>0){
+				MM_READ_RET r = MM_READ(_in1, (void *)pebwt, bytesLeft);
+				if(MM_IS_IO_ERR(_in1,r,bytesLeft)) {
+					cerr << "Error reading _ebwt[] array: " << r << ", "
+						 << bytesLeft << gLastIOErrMsg << endl;
+					throw 1;
+				}
+				pebwt += r;
+				bytesLeft -= r;
 			}
 			if(switchEndian) {
 				uint8_t *side = this->ebwt();
 				for(size_t i = 0; i < eh->_numSides; i++) {
-					uint32_t *cums = reinterpret_cast<uint32_t*>(side + eh->_sideSz - 8);
-					cums[0] = endianSwapU32(cums[0]);
-					cums[1] = endianSwapU32(cums[1]);
+					TIndexOffU *cums = reinterpret_cast<TIndexOffU*>(side + eh->_sideSz - OFF_SIZE*2);
+					cums[0] = endianSwapU(cums[0]);
+					cums[1] = endianSwapU(cums[1]);
 					side += this->_eh._sideSz;
 				}
 			}
@@ -402,8 +412,8 @@ void Ebwt::readIntoMemory(
 	}
 	
 	// Read zOff from primary stream
-	_zOff = readU32(_in1, switchEndian);
-	bytesRead += 4;
+	_zOff = readU<TIndexOffU>(_in1, switchEndian);
+	bytesRead += OFF_SIZE;
 	assert_lt(_zOff, len);
 	
 	try {
@@ -412,14 +422,14 @@ void Ebwt::readIntoMemory(
 		_fchr.reset();
 		if(_useMm) {
 #ifdef BOWTIE_MM
-			_fchr.init((uint32_t*)(mmFile[0] + bytesRead), 5, false);
-			bytesRead += 5*4;
-			MM_SEEK(_in1, 5*4, SEEK_CUR);
+			_fchr.init((TIndexOffU*)(mmFile[0] + bytesRead), 5, false);
+			bytesRead += 5*OFF_SIZE;
+			MM_SEEK(_in1, 5*OFF_SIZE, SEEK_CUR);
 #endif
 		} else {
-			_fchr.init(new uint32_t[5], 5, true);
+			_fchr.init(new TIndexOffU[5], 5, true);
 			for(int i = 0; i < 5; i++) {
-				this->fchr()[i] = readU32(_in1, switchEndian);
+				this->fchr()[i] = readU<TIndexOffU>(_in1, switchEndian);
 				assert_leq(this->fchr()[i], len);
 				assert(i <= 0 || this->fchr()[i] >= this->fchr()[i-1]);
 			}
@@ -438,19 +448,19 @@ void Ebwt::readIntoMemory(
 		if(loadFtab) {
 			if(_useMm) {
 #ifdef BOWTIE_MM
-				_ftab.init((uint32_t*)(mmFile[0] + bytesRead), eh->_ftabLen, false);
-				bytesRead += eh->_ftabLen*4;
-				MM_SEEK(_in1, eh->_ftabLen*4, SEEK_CUR);
+				_ftab.init((TIndexOffU*)(mmFile[0] + bytesRead), eh->_ftabLen, false);
+				bytesRead += eh->_ftabLen*OFF_SIZE;
+				MM_SEEK(_in1, eh->_ftabLen*OFF_SIZE, SEEK_CUR);
 #endif
 			} else {
-				_ftab.init(new uint32_t[eh->_ftabLen], eh->_ftabLen, true);
+				_ftab.init(new TIndexOffU[eh->_ftabLen], eh->_ftabLen, true);
 				if(switchEndian) {
-					for(uint32_t i = 0; i < eh->_ftabLen; i++)
-						this->ftab()[i] = readU32(_in1, switchEndian);
+					for(TIndexOffU i = 0; i < eh->_ftabLen; i++)
+						this->ftab()[i] = readU<TIndexOffU>(_in1, switchEndian);
 				} else {
-					MM_READ_RET r = MM_READ(_in1, (void *)ftab(), eh->_ftabLen*4);
-					if(r != (MM_READ_RET)(eh->_ftabLen*4)) {
-						cerr << "Error reading _ftab[] array: " << r << ", " << (eh->_ftabLen*4) << endl;
+					MM_READ_RET r = MM_READ(_in1, (void *)ftab(), eh->_ftabLen*OFF_SIZE);
+					if(r != (MM_READ_RET)(eh->_ftabLen*OFF_SIZE)) {
+						cerr << "Error reading _ftab[] array: " << r << ", " << (eh->_ftabLen*OFF_SIZE) << endl;
 						throw 1;
 					}
 				}
@@ -468,24 +478,24 @@ void Ebwt::readIntoMemory(
 			_eftab.reset();
 			if(_useMm) {
 #ifdef BOWTIE_MM
-				_eftab.init((uint32_t*)(mmFile[0] + bytesRead), eh->_eftabLen, false);
-				bytesRead += eh->_eftabLen*4;
-				MM_SEEK(_in1, eh->_eftabLen*4, SEEK_CUR);
+				_eftab.init((TIndexOffU*)(mmFile[0] + bytesRead), eh->_eftabLen, false);
+				bytesRead += eh->_eftabLen*OFF_SIZE;
+				MM_SEEK(_in1, eh->_eftabLen*OFF_SIZE, SEEK_CUR);
 #endif
 			} else {
-				_eftab.init(new uint32_t[eh->_eftabLen], eh->_eftabLen, true);
+				_eftab.init(new TIndexOffU[eh->_eftabLen], eh->_eftabLen, true);
 				if(switchEndian) {
-					for(uint32_t i = 0; i < eh->_eftabLen; i++)
-						this->eftab()[i] = readU32(_in1, switchEndian);
+					for(TIndexOffU i = 0; i < eh->_eftabLen; i++)
+						this->eftab()[i] = readU<TIndexOffU>(_in1, switchEndian);
 				} else {
-					MM_READ_RET r = MM_READ(_in1, (void *)this->eftab(), eh->_eftabLen*4);
-					if(r != (MM_READ_RET)(eh->_eftabLen*4)) {
-						cerr << "Error reading _eftab[] array: " << r << ", " << (eh->_eftabLen*4) << endl;
+					MM_READ_RET r = MM_READ(_in1, (void *)this->eftab(), eh->_eftabLen*OFF_SIZE);
+					if(r != (MM_READ_RET)(eh->_eftabLen*OFF_SIZE)) {
+						cerr << "Error reading _eftab[] array: " << r << ", " << (eh->_eftabLen*OFF_SIZE) << endl;
 						throw 1;
 					}
 				}
 			}
-			for(uint32_t i = 0; i < eh->_eftabLen; i++) {
+			for(TIndexOffU i = 0; i < eh->_eftabLen; i++) {
 				if(i > 0 && this->eftab()[i] > 0) {
 					assert_geq(this->eftab()[i], this->eftab()[i-1]);
 				} else if(i > 0 && this->eftab()[i-1] == 0) {
@@ -496,11 +506,11 @@ void Ebwt::readIntoMemory(
 			assert(ftab() == NULL);
 			assert(eftab() == NULL);
 			// Skip ftab
-			bytesRead += eh->_ftabLen*4;
-			MM_SEEK(_in1, eh->_ftabLen*4, SEEK_CUR);
+			bytesRead += eh->_ftabLen*OFF_SIZE;
+			MM_SEEK(_in1, eh->_ftabLen*OFF_SIZE, SEEK_CUR);
 			// Skip eftab
-			bytesRead += eh->_eftabLen*4;
-			MM_SEEK(_in1, eh->_eftabLen*4, SEEK_CUR);
+			bytesRead += eh->_eftabLen*OFF_SIZE;
+			MM_SEEK(_in1, eh->_eftabLen*OFF_SIZE, SEEK_CUR);
 		}
 	} catch(bad_alloc& e) {
 		cerr << "Out of memory allocating fchr[], ftab[] or eftab[] arrays for the Bowtie index." << endl
@@ -533,7 +543,7 @@ void Ebwt::readIntoMemory(
 		
 		shmemLeader = true;
 		if(_verbose || startVerbose) {
-			cerr << "Reading offs (" << offsLenSampled << " 32-bit words): ";
+			cerr << "Reading offs (" << offsLenSampled << " " << std::setw(2) << OFF_SIZE*8 <<"-bit words): ";
 			logTime(cerr);
 		}
 		
@@ -541,18 +551,18 @@ void Ebwt::readIntoMemory(
 			if(!useShmem_) {
 				// Allocate offs_
 				try {
-					_offs.init(new uint32_t[offsLenSampled], offsLenSampled, true);
+					_offs.init(new TIndexOffU[offsLenSampled], offsLenSampled, true);
 				} catch(bad_alloc& e) {
 					cerr << "Out of memory allocating the offs[] array  for the Bowtie index." << endl
 					<< "Please try again on a computer with more memory." << endl;
 					throw 1;
 				}
 			} else {
-				uint32_t *tmp = NULL;
-				shmemLeader = ALLOC_SHARED_U32(
-					(_in2Str + "[offs]"), offsLenSampled*4, &tmp,
+				TIndexOffU *tmp = NULL;
+				shmemLeader = ALLOC_SHARED_U(
+					(_in2Str + "[offs]"), offsLenSampled*OFF_SIZE, &tmp,
 					"offs", (_verbose || startVerbose));
-				_offs.init((uint32_t*)tmp, offsLenSampled, false);
+				_offs.init((TIndexOffU*)tmp, offsLenSampled, false);
 			}
 		}
 		
@@ -561,8 +571,8 @@ void Ebwt::readIntoMemory(
 				// Allocate offs (big allocation)
 				if(switchEndian || offRateDiff > 0) {
 					assert(!_useMm);
-					const uint32_t blockMaxSz = (2 * 1024 * 1024); // 2 MB block size
-					const uint32_t blockMaxSzU32 = (blockMaxSz >> 2); // # U32s per block
+					const TIndexOffU blockMaxSz = (2 * 1024 * 1024); // 2 MB block size
+					const TIndexOffU blockMaxSzU = (blockMaxSz >> (OFF_SIZE/4 + 1)); // # TIndexOffU words per block
 					char *buf;
 					try {
 						buf = new char[blockMaxSz];
@@ -570,19 +580,19 @@ void Ebwt::readIntoMemory(
 						cerr << "Error: Out of memory allocating part of _offs array: '" << e.what() << "'" << endl;
 						throw e;
 					}
-					for(uint32_t i = 0; i < offsLen; i += blockMaxSzU32) {
-						uint32_t block = min<uint32_t>(blockMaxSzU32, offsLen - i);
-						MM_READ_RET r = MM_READ(_in2, (void *)buf, block << 2);
-						if(r != (MM_READ_RET)(block << 2)) {
-							cerr << "Error reading block of _offs[] array: " << r << ", " << (block << 2) << endl;
+					for(TIndexOffU i = 0; i < offsLen; i += blockMaxSzU) {
+						TIndexOffU block = min<TIndexOffU>(blockMaxSzU, offsLen - i);
+						MM_READ_RET r = MM_READ(_in2, (void *)buf, block << (OFF_SIZE/4 + 1));
+						if(r != (MM_READ_RET)(block << (OFF_SIZE/4 + 1))) {
+							cerr << "Error reading block of _offs[] array: " << r << ", " << (block << (OFF_SIZE/4 + 1)) << endl;
 							throw 1;
 						}
-						uint32_t idx = i >> offRateDiff;
-						for(uint32_t j = 0; j < block; j += (1 << offRateDiff)) {
+						TIndexOffU idx = i >> offRateDiff;
+						for(TIndexOffU j = 0; j < block; j += (1 << offRateDiff)) {
 							assert_lt(idx, offsLenSampled);
-							this->offs()[idx] = ((uint32_t*)buf)[j];
+							this->offs()[idx] = ((TIndexOffU*)buf)[j];
 							if(switchEndian) {
-								this->offs()[idx] = endianSwapU32(this->offs()[idx]);
+								this->offs()[idx] = endianSwapU(this->offs()[idx]);
 							}
 							idx++;
 						}
@@ -591,45 +601,39 @@ void Ebwt::readIntoMemory(
 				} else {
 					if(_useMm) {
 #ifdef BOWTIE_MM
-						_offs.init((uint32_t*)(mmFile[1] + bytesRead), offsLen, false);
-						bytesRead += (offsLen << 2);
-						MM_SEEK(_in2, (offsLen << 2), SEEK_CUR);
+						_offs.init((TIndexOffU*)(mmFile[1] + bytesRead), offsLen, false);
+						bytesRead += offsSz;
+						// Argument to lseek can be 64 bits if compiled with
+						// _FILE_OFFSET_BITS
+						MM_SEEK(_in2, offsSz, SEEK_CUR);
 #endif
 					} else {
-						// If any of the high two bits are set
-						if((offsLen & 0xc0000000) != 0) {
-							if(sizeof(char *) <= 4) {
-								cerr << "Sanity error: sizeof(char *) <= 4 but offsLen is " << hex << offsLen << endl;
-								throw 1;
-							}
-							// offsLen << 2 overflows, so do it in four reads
-							char *offs = (char *)this->offs();
-							for(int i = 0; i < 4; i++) {
-								MM_READ_RET r = MM_READ(_in2, (void*)offs, offsLen);
-								if(r != (MM_READ_RET)(offsLen)) {
-									cerr << "Error reading block of _offs[] array: " << r << ", " << offsLen << endl;
-									throw 1;
-								}
-								offs += offsLen;
-							}
-						} else {
-							// Do it all in one read
-							MM_READ_RET r = MM_READ(_in2, (void*)this->offs(), offsLen << 2);
-							if(r != (MM_READ_RET)(offsLen << 2)) {
-								cerr << "Error reading _offs[] array: " << r << ", " << (offsLen << 2) << endl;
+						// Workaround for small-index mode where MM_READ may
+						// not be able to handle read amounts greater than 2^32
+						// bytes.
+						uint64_t bytesLeft = offsSz;
+						char *offs = (char *)this->offs();
+
+						while(bytesLeft > 0) {
+							MM_READ_RET r = MM_READ(_in2, (void*)offs, bytesLeft);
+							if(MM_IS_IO_ERR(_in2,r,bytesLeft)) {
+								cerr << "Error reading block of _offs[] array: "
+								     << r << ", " << bytesLeft << gLastIOErrMsg << endl;
 								throw 1;
 							}
+							offs += r;
+							bytesLeft -= r;
 						}
 					}
 				}
 #ifdef BOWTIE_SHARED_MEM				
-				if(useShmem_) NOTIFY_SHARED(offs(), offsLenSampled*4);
+				if(useShmem_) NOTIFY_SHARED(offs(), offsLenSampled*OFF_SIZE);
 #endif
 			} else {
 				// Not the shmem leader
-				MM_SEEK(_in2, offsLenSampled*4, SEEK_CUR);
+				MM_SEEK(_in2, offsLenSampled*OFF_SIZE, SEEK_CUR);
 #ifdef BOWTIE_SHARED_MEM				
-				if(useShmem_) WAIT_SHARED(offs(), offsLenSampled*4);
+				if(useShmem_) WAIT_SHARED(offs(), offsLenSampled*OFF_SIZE);
 #endif
 			}
 		}
@@ -659,28 +663,28 @@ done: // Exit hatch for both justHeader and !justHeader
  * file and store them in 'refnames'.
  */
 void
-readEbwtRefnames(istream& in, EList<string>& refnames) {
+readEbwtRefnames(FILE* fin, EList<string>& refnames) {
 	// _in1 must already be open with the get cursor at the
 	// beginning and no error flags set.
-	assert(in.good());
-	assert_eq((streamoff)in.tellg(), ios::beg);
+	assert(fin != NULL);
+	assert_eq(ftello(fin), 0);
 	
 	// Read endianness hints from both streams
 	bool switchEndian = false;
-	uint32_t one = readU32(in, switchEndian); // 1st word of primary stream
+	uint32_t one = readU<uint32_t>(fin, switchEndian); // 1st word of primary stream
 	if(one != 1) {
 		assert_eq((1u<<24), one);
 		switchEndian = true;
 	}
 	
 	// Reads header entries one by one from primary stream
-	uint32_t len          = readU32(in, switchEndian);
-	int32_t  lineRate     = readI32(in, switchEndian);
-	/*int32_t  linesPerSide =*/ readI32(in, switchEndian);
-	int32_t  offRate      = readI32(in, switchEndian);
-	int32_t  ftabChars    = readI32(in, switchEndian);
+	TIndexOffU len          = readU<TIndexOffU>(fin, switchEndian);
+	int32_t  lineRate     = readI<int32_t>(fin, switchEndian);
+	/*int32_t  linesPerSide =*/ readI<int32_t>(fin, switchEndian);
+	int32_t  offRate      = readI<int32_t>(fin, switchEndian);
+	int32_t  ftabChars    = readI<int32_t>(fin, switchEndian);
 	// BTL: chunkRate is now deprecated
-	int32_t flags = readI32(in, switchEndian);
+	int32_t flags = readI<int32_t>(fin, switchEndian);
 	bool color = false;
 	bool entireReverse = false;
 	if(flags < 0) {
@@ -691,33 +695,35 @@ readEbwtRefnames(istream& in, EList<string>& refnames) {
 	// Create a new EbwtParams from the entries read from primary stream
 	EbwtParams eh(len, lineRate, offRate, ftabChars, color, entireReverse);
 	
-	uint32_t nPat = readI32(in, switchEndian); // nPat
-	in.seekg(nPat*4, ios_base::cur); // skip plen
+	TIndexOffU nPat = readI<TIndexOffU>(fin, switchEndian); // nPat
+	fseeko(fin, nPat*OFF_SIZE, SEEK_CUR);
 	
 	// Skip rstarts
-	uint32_t nFrag = readU32(in, switchEndian);
-	in.seekg(nFrag*4*3, ios_base::cur);
+	TIndexOffU nFrag = readU<TIndexOffU>(fin, switchEndian);
+	fseeko(fin, nFrag*OFF_SIZE*3, SEEK_CUR);
 	
 	// Skip ebwt
-	in.seekg(eh._ebwtTotLen, ios_base::cur);
+	fseeko(fin, eh._ebwtTotLen, SEEK_CUR);
 	
 	// Skip zOff from primary stream
-	readU32(in, switchEndian);
+	readU<TIndexOffU>(fin, switchEndian);
 	
 	// Skip fchr
-	in.seekg(5 * 4, ios_base::cur);
+	fseeko(fin, 5 * OFF_SIZE, SEEK_CUR);
 	
 	// Skip ftab
-	in.seekg(eh._ftabLen*4, ios_base::cur);
+	fseeko(fin, eh._ftabLen*OFF_SIZE, SEEK_CUR);
 	
 	// Skip eftab
-	in.seekg(eh._eftabLen*4, ios_base::cur);
+	fseeko(fin, eh._eftabLen*OFF_SIZE, SEEK_CUR);
 	
 	// Read reference sequence names from primary index file
 	while(true) {
 		char c = '\0';
-		in.read(&c, 1);
-		if(in.eof()) break;
+		int read_value = 0;
+        read_value = fgetc(fin);
+		if(read_value == EOF) break;
+        c = read_value;
 		if(c == '\0') break;
 		else if(c == '\n') {
 			refnames.push_back("");
@@ -733,8 +739,8 @@ readEbwtRefnames(istream& in, EList<string>& refnames) {
 	}
 	
 	// Be kind
-	in.clear(); in.seekg(0, ios::beg);
-	assert(in.good());
+    fseeko(fin, 0, SEEK_SET);
+	assert(ferror(fin) == 0);
 }
 
 /**
@@ -743,16 +749,15 @@ readEbwtRefnames(istream& in, EList<string>& refnames) {
  */
 void
 readEbwtRefnames(const string& instr, EList<string>& refnames) {
-	ifstream in;
+    FILE* fin;
 	// Initialize our primary and secondary input-stream fields
-	in.open((instr + ".1.bt2").c_str(), ios_base::in | ios::binary);
-	if(!in.is_open()) {
+    fin = fopen((instr + ".1." + gEbwt_ext).c_str(),"rb");
+	if(fin == NULL) {
 		throw EbwtFileOpenException("Cannot open file " + instr);
 	}
-	assert(in.is_open());
-	assert(in.good());
-	assert_eq((streamoff)in.tellg(), ios::beg);
-	readEbwtRefnames(in, refnames);
+	assert_eq(ftello(fin), 0);
+	readEbwtRefnames(fin, refnames);
+    fclose(fin);
 }
 
 /**
@@ -761,25 +766,25 @@ readEbwtRefnames(const string& instr, EList<string>& refnames) {
 int32_t Ebwt::readFlags(const string& instr) {
 	ifstream in;
 	// Initialize our primary and secondary input-stream fields
-	in.open((instr + ".1.bt2").c_str(), ios_base::in | ios::binary);
+	in.open((instr + ".1." + gEbwt_ext).c_str(), ios_base::in | ios::binary);
 	if(!in.is_open()) {
 		throw EbwtFileOpenException("Cannot open file " + instr);
 	}
 	assert(in.is_open());
 	assert(in.good());
 	bool switchEndian = false;
-	uint32_t one = readU32(in, switchEndian); // 1st word of primary stream
+	uint32_t one = readU<uint32_t>(in, switchEndian); // 1st word of primary stream
 	if(one != 1) {
 		assert_eq((1u<<24), one);
 		assert_eq(1, endianSwapU32(one));
 		switchEndian = true;
 	}
-	readU32(in, switchEndian);
-	readI32(in, switchEndian);
-	readI32(in, switchEndian);
-	readI32(in, switchEndian);
-	readI32(in, switchEndian);
-	int32_t flags = readI32(in, switchEndian);
+	readU<TIndexOffU>(in, switchEndian);
+	readI<int32_t>(in, switchEndian);
+	readI<int32_t>(in, switchEndian);
+	readI<int32_t>(in, switchEndian);
+	readI<int32_t>(in, switchEndian);
+	int32_t flags = readI<int32_t>(in, switchEndian);
 	return flags;
 }
 
@@ -832,17 +837,17 @@ void Ebwt::writeFromMemory(bool justHeader,
 	// When building an Ebwt, these header parameters are known
 	// "up-front", i.e., they can be written to disk immediately,
 	// before we join() or buildToDisk()
-	writeI32(out1, 1, be); // endian hint for priamry stream
-	writeI32(out2, 1, be); // endian hint for secondary stream
-	writeU32(out1, eh._len,          be); // length of string (and bwt and suffix array)
-	writeI32(out1, eh._lineRate,     be); // 2^lineRate = size in bytes of 1 line
-	writeI32(out1, 2,                be); // not used
-	writeI32(out1, eh._offRate,      be); // every 2^offRate chars is "marked"
-	writeI32(out1, eh._ftabChars,    be); // number of 2-bit chars used to address ftab
+	writeI<int32_t>(out1, 1, be); // endian hint for primary stream
+	writeI<int32_t>(out2, 1, be); // endian hint for secondary stream
+	writeU<TIndexOffU>(out1, eh._len,          be); // length of string (and bwt and suffix array)
+	writeI<int32_t>(out1, eh._lineRate,     be); // 2^lineRate = size in bytes of 1 line
+	writeI<int32_t>(out1, 2,                be); // not used
+	writeI<int32_t>(out1, eh._offRate,      be); // every 2^offRate chars is "marked"
+	writeI<int32_t>(out1, eh._ftabChars,    be); // number of 2-bit chars used to address ftab
 	int32_t flags = 1;
 	if(eh._color) flags |= EBWT_COLOR;
 	if(eh._entireReverse) flags |= EBWT_ENTIRE_REV;
-	writeI32(out1, -flags, be); // BTL: chunkRate is now deprecated
+	writeI<int32_t>(out1, -flags, be); // BTL: chunkRate is now deprecated
 	
 	if(!justHeader) {
 		assert(rstarts() != NULL);
@@ -853,13 +858,13 @@ void Ebwt::writeFromMemory(bool justHeader,
 		// These Ebwt parameters are known after the inputs strings have
 		// been joined() but before they have been built().  These can
 		// written to the disk next and then discarded from memory.
-		writeU32(out1, this->_nPat,      be);
-		for(uint32_t i = 0; i < this->_nPat; i++)
-			writeU32(out1, this->plen()[i], be);
+		writeU<TIndexOffU>(out1, this->_nPat,      be);
+		for(TIndexOffU i = 0; i < this->_nPat; i++)
+			writeU<TIndexOffU>(out1, this->plen()[i], be);
 		assert_geq(this->_nFrag, this->_nPat);
-		writeU32(out1, this->_nFrag, be);
-		for(uint32_t i = 0; i < this->_nFrag*3; i++)
-			writeU32(out1, this->rstarts()[i], be);
+		writeU<TIndexOffU>(out1, this->_nFrag, be);
+		for(TIndexOffU i = 0; i < this->_nFrag*3; i++)
+			writeU<TIndexOffU>(out1, this->rstarts()[i], be);
 		
 		// These Ebwt parameters are discovered only as the Ebwt is being
 		// built (in buildToDisk()).  Of these, only 'offs' and 'ebwt' are
@@ -867,21 +872,21 @@ void Ebwt::writeFromMemory(bool justHeader,
 		// discarded from memory as it is built; 'offs' is similarly
 		// written to the secondary file and discarded.
 		out1.write((const char *)this->ebwt(), eh._ebwtTotLen);
-		writeU32(out1, this->zOff(), be);
-		uint32_t offsLen = eh._offsLen;
-		for(uint32_t i = 0; i < offsLen; i++)
-			writeU32(out2, this->offs()[i], be);
+		writeU<TIndexOffU>(out1, this->zOff(), be);
+		TIndexOffU offsLen = eh._offsLen;
+		for(TIndexOffU i = 0; i < offsLen; i++)
+			writeU<TIndexOffU>(out2, this->offs()[i], be);
 		
 		// 'fchr', 'ftab' and 'eftab' are not fully determined until the
 		// loop is finished, so they are written to the primary file after
 		// all of 'ebwt' has already been written and only then discarded
 		// from memory.
 		for(int i = 0; i < 5; i++)
-			writeU32(out1, this->fchr()[i], be);
-		for(uint32_t i = 0; i < eh._ftabLen; i++)
-			writeU32(out1, this->ftab()[i], be);
-		for(uint32_t i = 0; i < eh._eftabLen; i++)
-			writeU32(out1, this->eftab()[i], be);
+			writeU<TIndexOffU>(out1, this->fchr()[i], be);
+		for(TIndexOffU i = 0; i < eh._ftabLen; i++)
+			writeU<TIndexOffU>(out1, this->ftab()[i], be);
+		for(TIndexOffU i = 0; i < eh._eftabLen; i++)
+			writeU<TIndexOffU>(out1, this->eftab()[i], be);
 	}
 }
 
@@ -926,21 +931,21 @@ void Ebwt::writeFromMemory(bool justHeader,
 	    assert_eq(_zEbwtBpOff,       copy.zEbwtBpOff());
 	    assert_eq(_zEbwtByteOff,     copy.zEbwtByteOff());
 		assert_eq(_nPat,             copy.nPat());
-		for(uint32_t i = 0; i < _nPat; i++)
+		for(TIndexOffU i = 0; i < _nPat; i++)
 			assert_eq(this->_plen[i], copy.plen()[i]);
 		assert_eq(this->_nFrag, copy.nFrag());
-		for(uint32_t i = 0; i < this->nFrag*3; i++) {
+		for(TIndexOffU i = 0; i < this->nFrag*3; i++) {
 			assert_eq(this->_rstarts[i], copy.rstarts()[i]);
 		}
-		for(uint32_t i = 0; i < 5; i++)
+		for(int i = 0; i < 5; i++)
 			assert_eq(this->_fchr[i], copy.fchr()[i]);
-		for(uint32_t i = 0; i < eh._ftabLen; i++)
+		for(TIndexOffU i = 0; i < eh._ftabLen; i++)
 			assert_eq(this->ftab()[i], copy.ftab()[i]);
-		for(uint32_t i = 0; i < eh._eftabLen; i++)
+		for(TIndexOffU i = 0; i < eh._eftabLen; i++)
 			assert_eq(this->eftab()[i], copy.eftab()[i]);
-		for(uint32_t i = 0; i < eh._offsLen; i++)
+		for(TIndexOffU i = 0; i < eh._offsLen; i++)
 			assert_eq(this->_offs[i], copy.offs()[i]);
-		for(uint32_t i = 0; i < eh._ebwtTotLen; i++)
+		for(TIndexOffU i = 0; i < eh._ebwtTotLen; i++)
 			assert_eq(this->ebwt()[i], copy.ebwt()[i]);
 		copy.sanityCheckAll();
 		if(_verbose)
@@ -953,17 +958,17 @@ void Ebwt::writeFromMemory(bool justHeader,
  * Write the rstarts array given the szs array for the reference.
  */
 void Ebwt::szsToDisk(const EList<RefRecord>& szs, ostream& os, int reverse) {
-	size_t seq = 0;
-	uint32_t off = 0;
-	uint32_t totlen = 0;
+	TIndexOffU seq = 0;
+	TIndexOffU off = 0;
+	TIndexOffU totlen = 0;
 	for(unsigned int i = 0; i < szs.size(); i++) {
 		if(szs[i].len == 0) continue;
 		if(szs[i].first) off = 0;
 		off += szs[i].off;
 		if(szs[i].first && szs[i].len > 0) seq++;
-		size_t seqm1 = seq-1;
+		TIndexOffU seqm1 = seq-1;
 		assert_lt(seqm1, _nPat);
-		size_t fwoff = off;
+		TIndexOffU fwoff = off;
 		if(reverse == REF_READ_REVERSE) {
 			// Invert pattern idxs
 			seqm1 = _nPat - seqm1 - 1;
@@ -971,9 +976,9 @@ void Ebwt::szsToDisk(const EList<RefRecord>& szs, ostream& os, int reverse) {
 			assert_leq(off + szs[i].len, plen()[seqm1]);
 			fwoff = plen()[seqm1] - (off + szs[i].len);
 		}
-		writeU32(os, totlen, this->toBe()); // offset from beginning of joined string
-		writeU32(os, (uint32_t)seqm1,  this->toBe()); // sequence id
-		writeU32(os, (uint32_t)fwoff,  this->toBe()); // offset into sequence
+		writeU<TIndexOffU>(os, totlen, this->toBe()); // offset from beginning of joined string
+		writeU<TIndexOffU>(os, seqm1,  this->toBe()); // sequence id
+		writeU<TIndexOffU>(os, fwoff,  this->toBe()); // offset into sequence
 		totlen += szs[i].len;
 		off += szs[i].len;
 	}
diff --git a/bt2_search.cpp b/bt2_search.cpp
index b43bf84..5aa8684 100644
--- a/bt2_search.cpp
+++ b/bt2_search.cpp
@@ -161,7 +161,6 @@ static bool bwaSwLike;
 static float bwaSwLikeC;
 static float bwaSwLikeT;
 static bool qcFilter;
-static bool sortByScore;      // prioritize alignments to report by score?
 bool gReportOverhangs;        // false -> filter out alignments that fall off the end of a reference sequence
 static string rgid;           // ID: setting for @RG header line
 static string rgs;            // SAM outputs for @RG header line
@@ -191,7 +190,8 @@ static SimpleFunc scoreMin;   // minimum valid score as function of read len
 static SimpleFunc nCeil;      // max # Ns allowed as function of read len
 static SimpleFunc msIval;     // interval between seeds as function of read len
 static double descConsExp;    // how to adjust score minimum as we descent further into index-assisted alignment
-static size_t descentLanding; // don't place a search root if it's within this many positions of end
+static bool descPrioritizeRoots; // whether to prioritize search roots with scores
+static size_t descLanding;    // don't place a search root if it's within this many positions of end
 static SimpleFunc descentTotSz;    // maximum space a DescentDriver can use in bytes
 static SimpleFunc descentTotFmops; // maximum # FM ops a DescentDriver can perform
 static int    multiseedMms;   // mismatches permitted in a multiseed seed
@@ -233,6 +233,8 @@ static bool reorder;          // true -> reorder SAM recs in -p mode
 static float sampleFrac;      // only align random fraction of input reads
 static bool arbitraryRandom;  // pseudo-randoms no longer a function of read properties
 static bool bowtie2p5;
+static string logDps;         // log seed-extend dynamic programming problems
+static string logDpsOpp;      // log mate-search dynamic programming problems
 
 static string bt2index;      // read Bowtie 2 index from files with this prefix
 static EList<pair<int, string> > extra_opts;
@@ -347,7 +349,6 @@ static void resetOptions() {
 	bwaSwLikeC              = 5.5f;
 	bwaSwLikeT              = 20.0f;
 	qcFilter                = false; // don't believe upstream qc by default
-	sortByScore             = true;  // prioritize alignments to report by score?
 	rgid					= "";    // SAM outputs for @RG header line
 	rgs						= "";    // SAM outputs for @RG header line
 	rgs_optflag				= "";    // SAM optional flag to add corresponding to @RG ID
@@ -376,7 +377,8 @@ static void resetOptions() {
 	nCeil.init     (SIMPLE_FUNC_LINEAR, 0.0f, DMAX, 2.0f, 0.1f);
 	msIval.init    (SIMPLE_FUNC_LINEAR, 1.0f, DMAX, DEFAULT_IVAL_B, DEFAULT_IVAL_A);
 	descConsExp     = 2.0;
-	descentLanding  = 20;
+	descPrioritizeRoots = false;
+	descLanding = 20;
 	descentTotSz.init(SIMPLE_FUNC_LINEAR, 1024.0, DMAX, 0.0, 1024.0);
 	descentTotFmops.init(SIMPLE_FUNC_LINEAR, 100.0, DMAX, 0.0, 10.0);
 	multiseedMms    = DEFAULT_SEEDMMS;
@@ -421,6 +423,8 @@ static void resetOptions() {
 	sampleFrac = 1.1f;       // align all reads
 	arbitraryRandom = false; // let pseudo-random seeds be a function of read properties
 	bowtie2p5 = false;
+	logDps.clear();          // log seed-extend dynamic programming problems
+	logDpsOpp.clear();       // log mate-search dynamic programming problems
 }
 
 static const char *short_options = "fF:qbzhcu:rv:s:aP:t3:5:w:p:k:M:1:2:I:X:CQ:N:i:L:U:x:S:g:O:D:R:";
@@ -551,7 +555,6 @@ static struct option long_options[] = {
 	{(char*)"fast-local",           no_argument,   0,        ARG_PRESET_FAST_LOCAL},
 	{(char*)"sensitive-local",      no_argument,   0,        ARG_PRESET_SENSITIVE_LOCAL},
 	{(char*)"very-sensitive-local", no_argument,   0,        ARG_PRESET_VERY_SENSITIVE_LOCAL},
-	{(char*)"no-score-priority",no_argument,       0,        ARG_NO_SCORE_PRIORITY},
 	{(char*)"seedlen",          required_argument, 0,        'L'},
 	{(char*)"seedmms",          required_argument, 0,        'N'},
 	{(char*)"seedival",         required_argument, 0,        'i'},
@@ -604,7 +607,10 @@ static struct option long_options[] = {
 	{(char*)"desc-kb",          required_argument, 0,        ARG_DESC_KB},
 	{(char*)"desc-landing",     required_argument, 0,        ARG_DESC_LANDING},
 	{(char*)"desc-exp",         required_argument, 0,        ARG_DESC_EXP},
+	{(char*)"desc-prioritize",  no_argument,       0,        ARG_DESC_PRIORITIZE},
 	{(char*)"desc-fmops",       required_argument, 0,        ARG_DESC_FMOPS},
+	{(char*)"log-dp",           required_argument, 0,        ARG_LOG_DP},
+	{(char*)"log-dp-opp",       required_argument, 0,        ARG_LOG_DP_OPP},
 	{(char*)0, 0, 0, 0} // terminator
 };
 
@@ -657,7 +663,7 @@ static void printUsage(ostream& out) {
 	out << "Usage: " << endl
 	    << "  " << tool_name.c_str() << " [options]* -x <bt2-idx> {-1 <m1> -2 <m2> | -U <r>} [-S <sam>]" << endl
 	    << endl
-		<<     "  <bt2-idx>  Index filename prefix (minus trailing .X.bt2)." << endl
+		<<     "  <bt2-idx>  Index filename prefix (minus trailing .X." + gEbwt_ext + ")." << endl
 		<<     "             NOTE: Bowtie 1 and Bowtie 2 indexes are not compatible." << endl
 	    <<     "  <m1>       Files with #1 mates, paired with files in <m2>." << endl;
 	if(wrapper == "basic-0") {
@@ -779,7 +785,7 @@ static void printUsage(ostream& out) {
 	    << "  --omit-sec-seq     put '*' in SEQ and QUAL fields for secondary alignments." << endl
 		<< endl
 	    << " Performance:" << endl
-	    << "  -o/--offrate <int> override offrate of index; must be >= index's offrate" << endl
+	//    << "  -o/--offrate <int> override offrate of index; must be >= index's offrate" << endl
 	    << "  -p/--threads <int> number of alignment threads to launch (1)" << endl
 	    << "  --reorder          force SAM output order to match order of input reads" << endl
 #ifdef BOWTIE_MM
@@ -905,7 +911,16 @@ static void parseOption(int next_option, const char *arg) {
 		case ARG_TEST_25: bowtie2p5 = true; break;
 		case ARG_DESC_KB: descentTotSz = SimpleFunc::parse(arg, 0.0, 1024.0, 1024.0, DMAX); break;
 		case ARG_DESC_FMOPS: descentTotFmops = SimpleFunc::parse(arg, 0.0, 10.0, 100.0, DMAX); break;
-		case ARG_DESC_LANDING: descentLanding = parse<int>(arg); break;
+		case ARG_LOG_DP: logDps = arg; break;
+		case ARG_LOG_DP_OPP: logDpsOpp = arg; break;
+		case ARG_DESC_LANDING: {
+			descLanding = parse<int>(arg);
+			if(descLanding < 1) {
+				cerr << "Error: --desc-landing must be greater than or equal to 1" << endl;
+				throw 1;
+			}
+			break;
+		}
 		case ARG_DESC_EXP: {
 			descConsExp = parse<double>(arg);
 			if(descConsExp < 0.0) {
@@ -914,6 +929,7 @@ static void parseOption(int next_option, const char *arg) {
 			}
 			break;
 		}
+		case ARG_DESC_PRIORITIZE: descPrioritizeRoots = true; break;
 		case '1': tokenize(arg, ",", mates1); break;
 		case '2': tokenize(arg, ",", mates2); break;
 		case ARG_ONETWO: tokenize(arg, ",", mates12); format = TAB_MATE5; break;
@@ -1138,8 +1154,6 @@ static void parseOption(int next_option, const char *arg) {
 		case ARG_REORDER: reorder = true; break;
 		case ARG_MAPQ_EX: {
 			sam_print_zp = true;
-			sam_print_zu = true;
-			sam_print_xp = true;
 			sam_print_xss = true;
 			sam_print_yn = true;
 			break;
@@ -1223,7 +1237,6 @@ static void parseOption(int next_option, const char *arg) {
 		case ARG_CONTAIN:     gContainMatesOK  = true;  break;
 		case ARG_OVERLAP:     gOlapMatesOK     = true;  break;
 		case ARG_QC_FILTER: qcFilter = true; break;
-		case ARG_NO_SCORE_PRIORITY: sortByScore = false; break;
 		case ARG_IGNORE_QUALS: ignoreQuals = true; break;
 		case ARG_MAPQ_V: mapqv = parse<int>(arg); break;
 		case ARG_TIGHTEN: tighten = parse<int>(arg); break;
@@ -1774,14 +1787,16 @@ struct PerfMetrics {
 				/* 22 */ "AlUnpFail"      "\t"
 				
 				/* 23 */ "SeedSearch"     "\t"
-				/* 24 */ "IntraSCacheHit" "\t"
-				/* 25 */ "InterSCacheHit" "\t"
-				/* 26 */ "OutOfMemory"    "\t"
-				/* 27 */ "AlBWOp"         "\t"
-				/* 28 */ "AlBWBranch"     "\t"
-				/* 29 */ "ResBWOp"        "\t"
-				/* 30 */ "ResBWBranch"    "\t"
-				/* 31 */ "ResResolve"     "\t"
+				/* 24 */ "NRange"         "\t"
+				/* 25 */ "NElt"           "\t"
+				/* 26 */ "IntraSCacheHit" "\t"
+				/* 27 */ "InterSCacheHit" "\t"
+				/* 28 */ "OutOfMemory"    "\t"
+				/* 29 */ "AlBWOp"         "\t"
+				/* 30 */ "AlBWBranch"     "\t"
+				/* 31 */ "ResBWOp"        "\t"
+				/* 32 */ "ResBWBranch"    "\t"
+				/* 33 */ "ResResolve"     "\t"
 				/* 34 */ "ResReport"      "\t"
 				/* 35 */ "RedundantSHit"  "\t"
 
@@ -2017,38 +2032,46 @@ struct PerfMetrics {
 		itoa10<uint64_t>(sd.seedsearch, buf);
 		if(metricsStderr) stderrSs << buf << '\t';
 		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
-		// 24. Hits in 'current' cache
+		// 24. Seed ranges found
+		itoa10<uint64_t>(sd.nrange, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+		// 25. Seed elements found
+		itoa10<uint64_t>(sd.nelt, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+		// 26. Hits in 'current' cache
 		itoa10<uint64_t>(sd.intrahit, buf);
 		if(metricsStderr) stderrSs << buf << '\t';
 		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
-		// 25. Hits in 'local' cache
+		// 27. Hits in 'local' cache
 		itoa10<uint64_t>(sd.interhit, buf);
 		if(metricsStderr) stderrSs << buf << '\t';
 		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
-		// 26. Out of memory
+		// 28. Out of memory
 		itoa10<uint64_t>(sd.ooms, buf);
 		if(metricsStderr) stderrSs << buf << '\t';
 		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
-		// 27. Burrows-Wheeler ops in aligner
+		// 29. Burrows-Wheeler ops in aligner
 		itoa10<uint64_t>(sd.bwops, buf);
 		if(metricsStderr) stderrSs << buf << '\t';
 		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
-		// 28. Burrows-Wheeler branches (edits) in aligner
+		// 30. Burrows-Wheeler branches (edits) in aligner
 		itoa10<uint64_t>(sd.bweds, buf);
 		if(metricsStderr) stderrSs << buf << '\t';
 		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
 		
 		const WalkMetrics& wl = total ? wlm : wlmu;
 		
-		// 29. Burrows-Wheeler ops in resolver
+		// 31. Burrows-Wheeler ops in resolver
 		itoa10<uint64_t>(wl.bwops, buf);
 		if(metricsStderr) stderrSs << buf << '\t';
 		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
-		// 30. Burrows-Wheeler branches in resolver
+		// 32. Burrows-Wheeler branches in resolver
 		itoa10<uint64_t>(wl.branches, buf);
 		if(metricsStderr) stderrSs << buf << '\t';
 		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
-		// 31. Burrows-Wheeler offset resolutions
+		// 33. Burrows-Wheeler offset resolutions
 		itoa10<uint64_t>(wl.resolves, buf);
 		if(metricsStderr) stderrSs << buf << '\t';
 		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
@@ -2788,10 +2811,21 @@ static void multiseedSearchWorker(void *vp) {
 		rp,            // reporting parameters
 		*bmapq.get(),  // MAPQ calculator
 		(size_t)tid);  // thread id
-
+	
+	// Write dynamic-programming problem descriptions here
+	ofstream *dpLog = NULL, *dpLogOpp = NULL;
+	if(!logDps.empty()) {
+		dpLog = new ofstream(logDps.c_str(), ofstream::out);
+		dpLog->sync_with_stdio(false);
+	}
+	if(!logDpsOpp.empty()) {
+		dpLogOpp = new ofstream(logDpsOpp.c_str(), ofstream::out);
+		dpLogOpp->sync_with_stdio(false);
+	}
+	
 	SeedAligner al;
 	SwDriver sd(exactCacheCurrentMB * 1024 * 1024);
-	SwAligner sw, osw;
+	SwAligner sw(dpLog), osw(dpLogOpp);
 	SeedResults shs[2];
 	OuterLoopMetrics olm;
 	SeedSearchMetrics sdm;
@@ -3729,7 +3763,6 @@ static void multiseedSearchWorker(void *vp) {
 					lenfilt[1],
 					qcfilt[0],
 					qcfilt[1],
-					sortByScore,          // prioritize by alignment score
 					rnd,                  // pseudo-random generator
 					rpm,                  // reporting metrics
 					prm,                  // per-read metrics
@@ -3753,6 +3786,9 @@ static void multiseedSearchWorker(void *vp) {
 	
 	// One last metrics merge
 	MERGE_METRICS(metrics, nthreads > 1);
+	
+	if(dpLog    != NULL) dpLog->close();
+	if(dpLogOpp != NULL) dpLogOpp->close();
 
 	return;
 }
@@ -3837,12 +3873,13 @@ static void multiseedSearchWorker_2p5(void *vp) {
 		gExpandToFrag);
 	
 	AlignerDriver ald(
-		descConsExp,       // exponent for interpolating maximum penalty
-		msIval,            // interval length, as function of read length
-		descentLanding,    // landing length
-		gVerbose,          // verbose?
-		descentTotSz,      // limit on total bytes of best-first search data
-		descentTotFmops);  // limit on total number of FM index ops in BFS
+		descConsExp,         // exponent for interpolating maximum penalty
+		descPrioritizeRoots, // whether to select roots with scores and weights
+		msIval,              // interval length, as function of read length
+		descLanding,         // landing length
+		gVerbose,            // verbose?
+		descentTotSz,        // limit on total bytes of best-first search data
+		descentTotFmops);    // limit on total number of FM index ops in BFS
 	
 	PerfMetrics metricsPt; // per-thread metrics object; for read-level metrics
 	BTString nametmp;
@@ -4027,14 +4064,6 @@ static void multiseedSearchWorker_2p5(void *vp) {
 				assert_gt(streak[1], 0);
 			}
 			assert_gt(streak[0], 0);
-			// Calculate # seed rounds for each mate
-			size_t nrounds[2] = { nSeedRounds, nSeedRounds };
-			if(filt[0] && filt[1]) {
-				nrounds[0] = (size_t)ceil((double)nrounds[0] / 2.0);
-				nrounds[1] = (size_t)ceil((double)nrounds[1] / 2.0);
-				assert_gt(nrounds[1], 0);
-			}
-			assert_gt(nrounds[0], 0);
 			// Increment counters according to what got filtered
 			for(size_t mate = 0; mate < (pair ? 2:1); mate++) {
 				if(!filt[mate]) {
@@ -4070,7 +4099,6 @@ static void multiseedSearchWorker_2p5(void *vp) {
 				lenfilt[1],
 				qcfilt[0],
 				qcfilt[1],
-				sortByScore,          // prioritize by alignment score
 				rnd,                  // pseudo-random generator
 				rpm,                  // reporting metrics
 				prm,                  // per-read metrics
@@ -4567,7 +4595,7 @@ int bowtie(int argc, const char **argv) {
 
 			// Optionally summarize
 			if(gVerbose) {
-				cout << "Input bt2 file: \"" << bt2index.c_str() << "\"" << endl;
+				cout << "Input " + gEbwt_ext +" file: \"" << bt2index.c_str() << "\"" << endl;
 				cout << "Query inputs (DNA, " << file_format_names[format].c_str() << "):" << endl;
 				for(size_t i = 0; i < queries.size(); i++) {
 					cout << "  " << queries[i].c_str() << endl;
diff --git a/bt2_util.cpp b/bt2_util.cpp
index 51327d7..e7e476a 100644
--- a/bt2_util.cpp
+++ b/bt2_util.cpp
@@ -36,14 +36,14 @@
  * including) the given side index by re-counting the chars and
  * comparing against the embedded occ[] arrays.
  */
-void Ebwt::sanityCheckUpToSide(int upToSide) const {
+void Ebwt::sanityCheckUpToSide(TIndexOff upToSide) const {
 	assert(isInMemory());
-	uint32_t occ[] = {0, 0, 0, 0};
-	ASSERT_ONLY(uint32_t occ_save[] = {0, 0, 0, 0});
-	uint32_t cur = 0; // byte pointer
+	TIndexOffU occ[] = {0, 0, 0, 0};
+	ASSERT_ONLY(TIndexOffU occ_save[] = {0, 0, 0, 0});
+	TIndexOffU cur = 0; // byte pointer
 	const EbwtParams& eh = this->_eh;
 	bool fw = false;
-	while(cur < (upToSide * eh._sideSz)) {
+	while(cur < (TIndexOffU)(upToSide * eh._sideSz)) {
 		assert_leq(cur + eh._sideSz, eh._ebwtTotLen);
 		for(uint32_t i = 0; i < eh._sideBwtSz; i++) {
 			uint8_t by = this->ebwt()[cur + (fw ? i : eh._sideBwtSz-i-1)];
@@ -57,11 +57,11 @@ void Ebwt::sanityCheckUpToSide(int upToSide) const {
 		assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3]) % eh._sideBwtLen);
 		// Finished forward bucket; check saved [A], [C], [G] and [T]
 		// against the uint32_ts encoded here
-		ASSERT_ONLY(const uint32_t *u32ebwt = reinterpret_cast<const uint32_t*>(&ebwt()[cur + eh._sideBwtSz]));
-		ASSERT_ONLY(uint32_t as = u32ebwt[0]);
-		ASSERT_ONLY(uint32_t cs = u32ebwt[1]);
-		ASSERT_ONLY(uint32_t gs = u32ebwt[2]);
-		ASSERT_ONLY(uint32_t ts = u32ebwt[3]);
+		ASSERT_ONLY(const TIndexOffU *u_ebwt = reinterpret_cast<const TIndexOffU*>(&ebwt()[cur + eh._sideBwtSz]));
+		ASSERT_ONLY(TIndexOffU as = u_ebwt[0]);
+		ASSERT_ONLY(TIndexOffU cs = u_ebwt[1]);
+		ASSERT_ONLY(TIndexOffU gs = u_ebwt[2]);
+		ASSERT_ONLY(TIndexOffU ts = u_ebwt[3]);
 		assert(as == occ_save[0] || as == occ_save[0]-1);
 		assert_eq(cs, occ_save[1]);
 		assert_eq(gs, occ_save[2]);
@@ -83,7 +83,7 @@ void Ebwt::sanityCheckAll(int reverse) const {
 	const EbwtParams& eh = this->_eh;
 	assert(isInMemory());
 	// Check ftab
-	for(uint32_t i = 1; i < eh._ftabLen; i++) {
+	for(TIndexOffU i = 1; i < eh._ftabLen; i++) {
 		assert_geq(this->ftabHi(i), this->ftabLo(i-1));
 		assert_geq(this->ftabLo(i), this->ftabHi(i-1));
 		assert_leq(this->ftabHi(i), eh._bwtLen+1);
@@ -91,20 +91,20 @@ void Ebwt::sanityCheckAll(int reverse) const {
 	assert_eq(this->ftabHi(eh._ftabLen-1), eh._bwtLen);
 	
 	// Check offs
-	int seenLen = (eh._bwtLen + 31) >> 5;
-	uint32_t *seen;
+	TIndexOff seenLen = (eh._bwtLen + 31) >> ((TIndexOffU)5);
+	TIndexOff *seen;
 	try {
-		seen = new uint32_t[seenLen]; // bitvector marking seen offsets
+		seen = new TIndexOff[seenLen]; // bitvector marking seen offsets
 	} catch(bad_alloc& e) {
 		cerr << "Out of memory allocating seen[] at " << __FILE__ << ":" << __LINE__ << endl;
 		throw e;
 	}
-	memset(seen, 0, 4 * seenLen);
-	uint32_t offsLen = eh._offsLen;
-	for(uint32_t i = 0; i < offsLen; i++) {
+	memset(seen, 0, OFF_SIZE * seenLen);
+	TIndexOffU offsLen = eh._offsLen;
+	for(TIndexOffU i = 0; i < offsLen; i++) {
 		assert_lt(this->offs()[i], eh._bwtLen);
-		int w = this->offs()[i] >> 5;
-		int r = this->offs()[i] & 31;
+		TIndexOff w = this->offs()[i] >> 5;
+		TIndexOff r = this->offs()[i] & 31;
 		assert_eq(0, (seen[w] >> r) & 1); // shouldn't have been seen before
 		seen[w] |= (1 << r);
 	}
@@ -114,13 +114,13 @@ void Ebwt::sanityCheckAll(int reverse) const {
 	assert_gt(this->_nPat, 0);
 	
 	// Check plen, flen
-	for(uint32_t i = 0; i < this->_nPat; i++) {
+	for(TIndexOffU i = 0; i < this->_nPat; i++) {
 		assert_geq(this->plen()[i], 0);
 	}
 	
 	// Check rstarts
 	if(this->rstarts() != NULL) {
-		for(uint32_t i = 0; i < this->_nFrag-1; i++) {
+		for(TIndexOffU i = 0; i < this->_nFrag-1; i++) {
 			assert_gt(this->rstarts()[(i+1)*3], this->rstarts()[i*3]);
 			if(reverse == REF_READ_REVERSE) {
 				assert(this->rstarts()[(i*3)+1] >= this->rstarts()[((i+1)*3)+1]);
@@ -144,14 +144,14 @@ void Ebwt::sanityCheckAll(int reverse) const {
 void Ebwt::restore(SString<char>& s) const {
 	assert(isInMemory());
 	s.resize(this->_eh._len);
-	uint32_t jumps = 0;
-	uint32_t i = this->_eh._len; // should point to final SA elt (starting with '$')
+	TIndexOffU jumps = 0;
+	TIndexOffU i = this->_eh._len; // should point to final SA elt (starting with '$')
 	SideLocus l(i, this->_eh, this->ebwt());
 	while(i != _zOff) {
 		assert_lt(jumps, this->_eh._len);
 		//if(_verbose) cout << "restore: i: " << i << endl;
 		// Not a marked row; go back a char in the original string
-		uint32_t newi = mapLF(l ASSERT_ONLY(, false));
+		TIndexOffU newi = mapLF(l ASSERT_ONLY(, false));
 		assert_neq(newi, i);
 		s[this->_eh._len - jumps - 1] = rowL(l);
 		i = newi;
@@ -172,7 +172,7 @@ void Ebwt::checkOrigs(
 {
 	SString<char> rest;
 	restore(rest);
-	uint32_t restOff = 0;
+	TIndexOffU restOff = 0;
 	size_t i = 0, j = 0;
 	if(mirror) {
 		// TODO: FIXME
diff --git a/multikey_qsort.cpp b/btypes.h
similarity index 57%
copy from multikey_qsort.cpp
copy to btypes.h
index 36fdd7d..cc4225c 100644
--- a/multikey_qsort.cpp
+++ b/btypes.h
@@ -17,7 +17,31 @@
  * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#include "multikey_qsort.h"
 
-// 5 64-element buckets for bucket-sorting A, C, G, T, $
-uint32_t bkts[4][4 * 1024 * 1024];
+#ifndef BOWTIE_INDEX_TYPES_H
+#define	BOWTIE_INDEX_TYPES_H
+
+#ifdef BOWTIE_64BIT_INDEX
+#define OFF_MASK 0xffffffffffffffff
+#define OFF_LEN_MASK 0xc000000000000000
+#define LS_SIZE 0x100000000000000
+#define OFF_SIZE 8
+
+typedef uint64_t TIndexOffU;
+typedef int64_t TIndexOff;
+    
+#else
+#define OFF_MASK 0xffffffff
+#define OFF_LEN_MASK 0xc0000000
+#define LS_SIZE 0x10000000
+#define OFF_SIZE 4
+
+typedef uint32_t TIndexOffU;
+typedef int TIndexOff;
+
+#endif /* BOWTIE_64BIT_INDEX */
+
+extern const std::string gEbwt_ext;
+
+#endif	/* BOWTIE_INDEX_TYPES_H */
+
diff --git a/diff_sample.h b/diff_sample.h
index 66b0b17..a0ebd31 100644
--- a/diff_sample.h
+++ b/diff_sample.h
@@ -28,6 +28,7 @@
 #include "ds.h"
 #include "mem_ids.h"
 #include "ls.h"
+#include "btypes.h"
 
 using namespace std;
 
@@ -452,7 +453,7 @@ public:
 		_isaPrime(),
 		_dInv(),
 		_log2v(myLog2(_v)),
-		_vmask(0xffffffff << _log2v),
+		_vmask(OFF_MASK << _log2v),
 		_logger(__logger)
 	{
 		assert_gt(_d, 0);
@@ -478,15 +479,15 @@ public:
 		size_t sPrimeSz = (len / v) * ds.size();
 		// sPrime, sPrimeOrder, _isaPrime all exist in memory at
 		// once and that's the peak
-		AutoArray<uint32_t> aa(sPrimeSz * 3 + (1024 * 1024 /*out of caution*/), EBWT_CAT);
+		AutoArray<TIndexOffU> aa(sPrimeSz * 3 + (1024 * 1024 /*out of caution*/), EBWT_CAT);
 		return sPrimeSz * 4; // sPrime array
 	}
 
 	uint32_t v() const                   { return _v; }
 	uint32_t log2v() const               { return _log2v; }
 	uint32_t vmask() const               { return _vmask; }
-	uint32_t modv(uint32_t i) const      { return i & ~_vmask; }
-	uint32_t divv(uint32_t i) const      { return i >> _log2v; }
+	uint32_t modv(TIndexOffU i) const    { return (uint32_t)(i & ~_vmask); }
+	TIndexOffU divv(TIndexOffU i) const  { return i >> _log2v; }
 	uint32_t d() const                   { return _d; }
 	bool verbose() const                 { return _verbose; }
 	bool sanityCheck() const             { return _sanity; }
@@ -496,10 +497,10 @@ public:
 	ostream& log() const                 { return _logger; }
 
 	void     build();
-	uint32_t tieBreakOff(uint32_t i, uint32_t j) const;
-	int64_t  breakTie(uint32_t i, uint32_t j) const;
-	bool     isCovered(uint32_t i) const;
-	uint32_t rank(uint32_t i) const;
+	uint32_t tieBreakOff(TIndexOffU i, TIndexOffU j) const;
+	int64_t  breakTie(TIndexOffU i, TIndexOffU j) const;
+	bool     isCovered(TIndexOffU i) const;
+	TIndexOffU rank(TIndexOffU i) const;
 
 	/**
 	 * Print out the suffix array such that every sample offset has its
@@ -522,7 +523,7 @@ public:
 private:
 
 	void doBuiltSanityCheck() const;
-	void buildSPrime(EList<uint32_t>& sPrime, size_t padding);
+	void buildSPrime(EList<TIndexOffU>& sPrime, size_t padding);
 
 	bool built() const {
 		return _isaPrime.size() > 0;
@@ -542,11 +543,11 @@ private:
 	EList<uint32_t>  _ds;       // samples: idx -> d
 	EList<uint32_t>  _dmap;     // delta map
 	uint32_t         _d;        // |D| - size of sample
-	EList<uint32_t>  _doffs;    // offsets into sPrime/isaPrime for each d idx
-	EList<uint32_t>  _isaPrime; // ISA' array
+	EList<TIndexOffU>  _doffs;    // offsets into sPrime/isaPrime for each d idx
+	EList<TIndexOffU>  _isaPrime; // ISA' array
 	EList<uint32_t>  _dInv;     // Map from d -> idx
 	uint32_t         _log2v;
-	uint32_t         _vmask;
+	TIndexOffU         _vmask;
 	ostream&         _logger;
 };
 
@@ -559,17 +560,17 @@ void DifferenceCoverSample<TStr>::doBuiltSanityCheck() const {
 	uint32_t v = this->v();
 	assert(built());
 	VMSG_NL("  Doing sanity check");
-	uint32_t added = 0;
-	EList<uint32_t> sorted;
+	TIndexOffU added = 0;
+	EList<TIndexOffU> sorted;
 	sorted.resizeExact(_isaPrime.size());
-	sorted.fill(0xffffffff);
+	sorted.fill(OFF_MASK);
 	for(size_t di = 0; di < this->d(); di++) {
 		uint32_t d = _ds[di];
 		size_t i = 0;
 		for(size_t doi = _doffs[di]; doi < _doffs[di+1]; doi++, i++) {
-			assert_eq(0xffffffff, sorted[_isaPrime[doi]]);
+			assert_eq(OFF_MASK, sorted[_isaPrime[doi]]);
 			// Maps the offset of the suffix to its rank
-			sorted[_isaPrime[doi]] = (uint32_t)(v*i + d);
+			sorted[_isaPrime[doi]] = (TIndexOffU)(v*i + d);
 			added++;
 		}
 	}
@@ -591,25 +592,25 @@ void DifferenceCoverSample<TStr>::doBuiltSanityCheck() const {
  */
 template <typename TStr>
 void DifferenceCoverSample<TStr>::buildSPrime(
-	EList<uint32_t>& sPrime,
+	EList<TIndexOffU>& sPrime,
 	size_t padding)
 {
 	const TStr& t = this->text();
 	const EList<uint32_t>& ds = this->ds();
-	uint32_t tlen = (uint32_t)t.length();
+	TIndexOffU tlen = (TIndexOffU)t.length();
 	uint32_t v = this->v();
 	uint32_t d = this->d();
 	assert_gt(v, 2);
 	assert_lt(d, v);
 	// Record where each d section should begin in sPrime
-	uint32_t tlenDivV = this->divv(tlen);
+	TIndexOffU tlenDivV = this->divv(tlen);
 	uint32_t tlenModV = this->modv(tlen);
-	uint32_t sPrimeSz = 0;
+	TIndexOffU sPrimeSz = 0;
 	assert(_doffs.empty());
 	_doffs.resizeExact((size_t)d+1);
 	for(uint32_t di = 0; di < d; di++) {
 		// mu mapping
-		uint32_t sz = tlenDivV + ((ds[di] <= tlenModV) ? 1 : 0);
+		TIndexOffU sz = tlenDivV + ((ds[di] <= tlenModV) ? 1 : 0);
 		assert_geq(sz, 0);
 		_doffs[di] = sPrimeSz;
 		sPrimeSz += sz;
@@ -619,7 +620,7 @@ void DifferenceCoverSample<TStr>::buildSPrime(
 	if(tlenDivV > 0) {
 		for(size_t i = 0; i < d; i++) {
 			assert_gt(_doffs[i+1], _doffs[i]);
-			uint32_t diff = _doffs[i+1] - _doffs[i];
+			TIndexOffU diff = _doffs[i+1] - _doffs[i];
 			assert(diff == tlenDivV || diff == tlenDivV+1);
 		}
 	}
@@ -627,20 +628,20 @@ void DifferenceCoverSample<TStr>::buildSPrime(
 	assert_eq(_doffs.size(), d+1);
 	// Size sPrime appropriately
 	sPrime.resizeExact((size_t)sPrimeSz + padding);
-	sPrime.fill(0xffffffff);
+	sPrime.fill(OFF_MASK);
 	// Slot suffixes from text into sPrime according to the mu
 	// mapping; where the mapping would leave a blank, insert a 0
-	uint32_t added = 0;
-	uint32_t i = 0;
-	for(uint32_t ti = 0; ti <= tlen; ti += v) {
+	TIndexOffU added = 0;
+	TIndexOffU i = 0;
+	for(uint64_t ti = 0; ti <= tlen; ti += v) {
 		for(uint32_t di = 0; di < d; di++) {
-			uint32_t tti = ti + ds[di];
+			TIndexOffU tti = ti + ds[di];
 			if(tti > tlen) break;
-			uint32_t spi = _doffs[di] + i;
+			TIndexOffU spi = _doffs[di] + i;
 			assert_lt(spi, _doffs[di+1]);
 			assert_leq(tti, tlen);
 			assert_lt(spi, sPrimeSz);
-			assert_eq(0xffffffff, sPrime[spi]);
+			assert_eq(OFF_MASK, sPrime[spi]);
 			sPrime[spi] = tti; added++;
 		}
 		i++;
@@ -655,11 +656,11 @@ void DifferenceCoverSample<TStr>::buildSPrime(
 template <typename TStr>
 static inline bool suffixSameUpTo(
 	const TStr& host,
-	uint32_t suf1,
-	uint32_t suf2,
-	uint32_t v)
+	TIndexOffU suf1,
+	TIndexOffU suf2,
+	TIndexOffU v)
 {
-	for(uint32_t i = 0; i < v; i++) {
+	for(TIndexOffU i = 0; i < v; i++) {
 		bool endSuf1 = suf1+i >= host.length();
 		bool endSuf2 = suf2+i >= host.length();
 		if((endSuf1 && !endSuf2) || (!endSuf1 && endSuf2)) return false;
@@ -682,7 +683,7 @@ void DifferenceCoverSample<TStr>::build() {
 	uint32_t v = this->v();
 	assert_gt(v, 2);
 	// Build s'
-	EList<uint32_t> sPrime;
+	EList<TIndexOffU> sPrime;
 	// Need to allocate 2 extra elements at the end of the sPrime and _isaPrime
 	// arrays.  One element that's less than all others, and another that acts
 	// as needed padding for the Larsson-Sadakane sorting code.
@@ -692,12 +693,12 @@ void DifferenceCoverSample<TStr>::build() {
 	size_t sPrimeSz = sPrime.size() - padding;
 	assert_gt(sPrime.size(), padding);
 	assert_leq(sPrime.size(), t.length() + padding + 1);
-	uint32_t nextRank = 0;
+	TIndexOffU nextRank = 0;
 	{
 		VMSG_NL("  Building sPrimeOrder");
-		EList<uint32_t> sPrimeOrder;
+		EList<TIndexOffU> sPrimeOrder;
 		sPrimeOrder.resizeExact(sPrimeSz);
-		for(uint32_t i = 0; i < sPrimeSz; i++) {
+		for(TIndexOffU i = 0; i < sPrimeSz; i++) {
 			sPrimeOrder[i] = i;
 		}
 		// sPrime now holds suffix-offsets for DC samples.
@@ -707,10 +708,10 @@ void DifferenceCoverSample<TStr>::build() {
 			// Extract backing-store array from sPrime and sPrimeOrder;
 			// the mkeyQSortSuf2 routine works on the array for maximum
 			// efficiency
-			uint32_t *sPrimeArr = (uint32_t*)sPrime.ptr();
+			TIndexOffU *sPrimeArr = (TIndexOffU*)sPrime.ptr();
 			assert_eq(sPrimeArr[0], sPrime[0]);
 			assert_eq(sPrimeArr[sPrimeSz-1], sPrime[sPrimeSz-1]);
-			uint32_t *sPrimeOrderArr = (uint32_t*)sPrimeOrder.ptr();
+			TIndexOffU *sPrimeOrderArr = (TIndexOffU*)sPrimeOrder.ptr();
 			assert_eq(sPrimeOrderArr[0], sPrimeOrder[0]);
 			assert_eq(sPrimeOrderArr[sPrimeSz-1], sPrimeOrder[sPrimeSz-1]);
 			// Sort sample suffixes up to the vth character using a
@@ -734,7 +735,7 @@ void DifferenceCoverSample<TStr>::build() {
 		// arrays back into sPrime.
 		VMSG_NL("  Allocating rank array");
 		_isaPrime.resizeExact(sPrime.size());
-		ASSERT_ONLY(_isaPrime.fill(0xffffffff));
+		ASSERT_ONLY(_isaPrime.fill(OFF_MASK));
 		assert_gt(_isaPrime.size(), 0);
 		{
 			Timer timer(cout, "  Ranking v-sort output time: ", this->verbose());
@@ -749,7 +750,7 @@ void DifferenceCoverSample<TStr>::build() {
 			_isaPrime[sPrimeOrder[sPrimeSz-1]] = nextRank; // finish off
 #ifndef NDEBUG
 			for(size_t i = 0; i < sPrimeSz; i++) {
-				assert_neq(0xffffffff, _isaPrime[i]);
+				assert_neq(OFF_MASK, _isaPrime[i]);
 				assert_lt(_isaPrime[i], sPrimeSz);
 			}
 #endif
@@ -757,23 +758,23 @@ void DifferenceCoverSample<TStr>::build() {
 		// sPrimeOrder is destroyed
 		// All the information we need is now in _isaPrime
 	}
-	_isaPrime[_isaPrime.size()-1] = (uint32_t)sPrimeSz;
-	sPrime[sPrime.size()-1] = (uint32_t)sPrimeSz;
+	_isaPrime[_isaPrime.size()-1] = (TIndexOffU)sPrimeSz;
+	sPrime[sPrime.size()-1] = (TIndexOffU)sPrimeSz;
 	// _isaPrime[_isaPrime.size()-1] and sPrime[sPrime.size()-1] are just
 	// spacer for the Larsson-Sadakane routine to use
 	{
 		Timer timer(cout, "  Invoking Larsson-Sadakane on ranks time: ", this->verbose());
 		VMSG_NL("  Invoking Larsson-Sadakane on ranks");
-		if(sPrime.size() >= 0x10000000) {
+		if(sPrime.size() >= LS_SIZE) {
 			cerr << "Error; sPrime array has so many elements that it can't be converted to a signed array without overflow." << endl;
 			throw 1;
 		}
-		LarssonSadakane<int> ls;
+		LarssonSadakane<TIndexOff> ls;
 		ls.suffixsort(
-			(int*)_isaPrime.ptr(),
-			(int*)sPrime.ptr(),
-			(int)sPrimeSz,
-			(int)sPrime.size(),
+			(TIndexOff*)_isaPrime.ptr(),
+			(TIndexOff*)sPrime.ptr(),
+			(TIndexOff)sPrimeSz,
+			(TIndexOff)sPrime.size(),
 			0);
 	}
 	// chop off final character of _isaPrime
@@ -797,7 +798,7 @@ void DifferenceCoverSample<TStr>::build() {
  * logic elsewhere.
  */
 template <typename TStr>
-bool DifferenceCoverSample<TStr>::isCovered(uint32_t i) const {
+bool DifferenceCoverSample<TStr>::isCovered(TIndexOffU i) const {
 	assert(built());
 	uint32_t modi = this->modv(i);
 	assert_lt(modi, _dInv.size());
@@ -809,16 +810,16 @@ bool DifferenceCoverSample<TStr>::isCovered(uint32_t i) const {
  * among the sample suffixes.
  */
 template <typename TStr>
-uint32_t DifferenceCoverSample<TStr>::rank(uint32_t i) const {
+TIndexOffU DifferenceCoverSample<TStr>::rank(TIndexOffU i) const {
 	assert(built());
 	assert_lt(i, this->text().length());
 	uint32_t imodv = this->modv(i);
 	assert_neq(0xffffffff, _dInv[imodv]); // must be in the sample
-	uint32_t ioff = this->divv(i);
+	TIndexOffU ioff = this->divv(i);
 	assert_lt(ioff, _doffs[_dInv[imodv]+1] - _doffs[_dInv[imodv]]);
-	uint32_t isaIIdx = _doffs[_dInv[imodv]] + ioff;
+	TIndexOffU isaIIdx = _doffs[_dInv[imodv]] + ioff;
 	assert_lt(isaIIdx, _isaPrime.size());
-	uint32_t isaPrimeI = _isaPrime[isaIIdx];
+	TIndexOffU isaPrimeI = _isaPrime[isaIIdx];
 	assert_leq(isaPrimeI, _isaPrime.size());
 	return isaPrimeI;
 }
@@ -828,7 +829,7 @@ uint32_t DifferenceCoverSample<TStr>::rank(uint32_t i) const {
  * if suffix j is lexicographically greater.
  */
 template <typename TStr>
-int64_t DifferenceCoverSample<TStr>::breakTie(uint32_t i, uint32_t j) const {
+int64_t DifferenceCoverSample<TStr>::breakTie(TIndexOffU i, TIndexOffU j) const {
 	assert(built());
 	assert_neq(i, j);
 	assert_lt(i, this->text().length());
@@ -839,20 +840,20 @@ int64_t DifferenceCoverSample<TStr>::breakTie(uint32_t i, uint32_t j) const {
 	assert_neq(0xffffffff, _dInv[jmodv]); // must be in the sample
 	uint32_t dimodv = _dInv[imodv];
 	uint32_t djmodv = _dInv[jmodv];
-	uint32_t ioff = this->divv(i);
-	uint32_t joff = this->divv(j);
+	TIndexOffU ioff = this->divv(i);
+	TIndexOffU joff = this->divv(j);
 	assert_lt(dimodv+1, _doffs.size());
 	assert_lt(djmodv+1, _doffs.size());
 	// assert_lt: expected (32024) < (0)
 	assert_lt(ioff, _doffs[dimodv+1] - _doffs[dimodv]);
 	assert_lt(joff, _doffs[djmodv+1] - _doffs[djmodv]);
-	uint32_t isaIIdx = _doffs[dimodv] + ioff;
-	uint32_t isaJIdx = _doffs[djmodv] + joff;
+	TIndexOffU isaIIdx = _doffs[dimodv] + ioff;
+	TIndexOffU isaJIdx = _doffs[djmodv] + joff;
 	assert_lt(isaIIdx, _isaPrime.size());
 	assert_lt(isaJIdx, _isaPrime.size());
 	assert_neq(isaIIdx, isaJIdx); // ranks must be unique
-	uint32_t isaPrimeI = _isaPrime[isaIIdx];
-	uint32_t isaPrimeJ = _isaPrime[isaJIdx];
+	TIndexOffU isaPrimeI = _isaPrime[isaIIdx];
+	TIndexOffU isaPrimeJ = _isaPrime[isaJIdx];
 	assert_neq(isaPrimeI, isaPrimeJ); // ranks must be unique
 	assert_leq(isaPrimeI, _isaPrime.size());
 	assert_leq(isaPrimeJ, _isaPrime.size());
@@ -864,7 +865,7 @@ int64_t DifferenceCoverSample<TStr>::breakTie(uint32_t i, uint32_t j) const {
  * be compared before the difference cover can break the tie.
  */
 template <typename TStr>
-uint32_t DifferenceCoverSample<TStr>::tieBreakOff(uint32_t i, uint32_t j) const {
+uint32_t DifferenceCoverSample<TStr>::tieBreakOff(TIndexOffU i, TIndexOffU j) const {
 	const TStr& t = this->text();
 	const EList<uint32_t>& dmap = this->dmap();
 	assert(built());
diff --git a/doc/manual.html b/doc/manual.html
index 1bf2954..9d5aee2 100644
--- a/doc/manual.html
+++ b/doc/manual.html
@@ -55,7 +55,8 @@
 <li><a href="#presets-setting-many-settings-at-once">Presets: setting many settings at once</a></li>
 <li><a href="#filtering">Filtering</a></li>
 <li><a href="#alignment-summmary">Alignment summmary</a></li>
-<li><a href="#wrapper">Wrapper</a></li>
+<li><a href="#wrapper-scripts">Wrapper scripts</a></li>
+<li><a href="#small-and-large-indexes">Small and large indexes</a></li>
 <li><a href="#performance-tuning">Performance tuning</a></li>
 <li><a href="#command-line">Command Line</a><ul>
 <li><a href="#setting-function-options">Setting function options</a></li>
@@ -132,7 +133,7 @@
 <h2 id="what-does-it-mean-that-some-older-bowtie-2-versions-are-beta"><a href="#TOC">What does it mean that some older Bowtie 2 versions are "beta"?</a></h2>
 <p>We said those Bowtie 2 versions were in "beta" to convey that it was not as polished as a tool that had been around for a while, and was still in flux. Since version 2.0.1, we declared Bowtie 2 was no longer "beta".</p>
 <h1 id="obtaining-bowtie-2"><a href="#TOC">Obtaining Bowtie 2</a></h1>
-<p>Download Bowtie 2 sources and binaries from the <a href="https://sourceforge.net/projects/bowtie-bio/files/bowtie2/">Download</a> section of the Sourceforge site. Binaries are available for Intel architectures (<code>i386</code> and <code>x86_64</code>) running Linux, and Mac OS X. A 32-bit version is available for Windows. If you plan to compile Bowtie 2 yourself, make sure to get the source package, i.e., the filename that ends in "-source.zip".</p>
+<p>Download Bowtie 2 sources and binaries from the <a href="https://sourceforge.net/projects/bowtie-bio/files/bowtie2/">Download</a> section of the Sourceforge site. Binaries are available for the Intel <code>x86_64</code> architecture running Linux, Mac OS X, and Windows. If you plan to compile Bowtie 2 yourself, make sure to get the source package, i.e., the filename that ends in "-source.zip".</p>
 <h2 id="building-from-source"><a href="#TOC">Building from source</a></h2>
 <p>Building Bowtie 2 from source requires a GNU-like environment with GCC, GNU Make and other basics. It should be possible to build Bowtie 2 on most vanilla Linux installations or on a Mac installation with <a href="http://developer.apple.com/xcode/">Xcode</a> installed. Bowtie 2 can also be built on Windows using <a href="http://www.cygwin.com/">Cygwin</a> or <a href="http://www.mingw.org/">MinGW</a> (MinGW recommended). For a MinGW build the choice of what compiler is to be used is im [...]
 <p>First, download the source package from the <a href="https://sourceforge.net/projects/bowtie-bio/files/bowtie2/">sourceforge site</a>. Make sure you're getting the source package; the file downloaded should end in <code>-source.zip</code>. Unzip the file, change to the unzipped directory, and build the Bowtie 2 tools by running GNU <code>make</code> (usually with the command <code>make</code>, but sometimes with <code>gmake</code>) with no arguments. If building with MinGW, run <code> [...]
@@ -291,14 +292,19 @@ Reference: GCAGATTATATGAGTCAGCTACGATATTGTTTGGGGTGACACATTACGCGTCTTTGAC</code></pr
         1 (0.08%) aligned >1 times
 96.70% overall alignment rate</code></pre>
 <p>The indentation indicates how subtotals relate to totals.</p>
-<h2 id="wrapper"><a href="#TOC">Wrapper</a></h2>
-<p>The <code>bowtie2</code> executable is actually a Perl wrapper script that calls the compiled <code>bowtie2-align</code> binary. It is recommended that you always run the <code>bowtie2</code> wrapper and not run <code>bowtie2-align</code> directly.</p>
+<h2 id="wrapper-scripts"><a href="#TOC">Wrapper scripts</a></h2>
+<p>The <code>bowtie2</code>, <code>bowtie2-build</code> and <code>bowtie2-inspect</code> executables are actually wrapper scripts that call binary programs as appropriate. The wrappers shield users from having to distinguish between "small" and "large" index formats, discussed briefly in the following section. Also, the <code>bowtie2</code> wrapper provides some key functionality, like the ability to handle compressed inputs, and the functionality for <a href="#bowtie [...]
+<p>It is recommended that you always run the bowtie2 wrappers and not run the binaries directly.</p>
+<h2 id="small-and-large-indexes"><a href="#TOC">Small and large indexes</a></h2>
+<p><code>bowtie2-build</code> can index reference genomes of any size. For genomes less than about 4 billion nucleotides in length, <code>bowtie2-build</code> builds a "small" index using 32-bit numbers in various parts of the index. When the genome is longer, <code>bowtie2-build</code> builds a "large" index using 64-bit numbers. Small indexes are stored in files with the <code>.bt2</code> extension, and large indexes are stored in files with the <code>.bt2l</code> e [...]
 <h2 id="performance-tuning"><a href="#TOC">Performance tuning</a></h2>
 <ol style="list-style-type: decimal">
-<li><p>Use 64-bit version if possible</p>
-<p>The 64-bit version of Bowtie 2 is faster than the 32-bit version, owing to its use of 64-bit arithmetic. If possible, download the 64-bit binaries for Bowtie 2 and run on a 64-bit computer. If you are building Bowtie 2 from sources, you may need to pass the <code>-m64</code> option to <code>g++</code> to compile the 64-bit version; you can do this by including <code>BITS=64</code> in the arguments to the <code>make</code> command; e.g.: <code>make BITS=64 bowtie2</code>. To determine  [...]
 <li><p>If your computer has multiple processors/cores, use <code>-p</code></p>
 <p>The <a href="#bowtie2-options-p"><code>-p</code></a> option causes Bowtie 2 to launch a specified number of parallel search threads. Each thread runs on a different processor/core and all threads find alignments in parallel, increasing alignment throughput by approximately a multiple of the number of threads (though in practice, speedup is somewhat worse than linear).</p></li>
+<li><p>If reporting many alignments per read, try reducing <code>bowtie2-build --offrate</code></p>
+<p>If you are using <a href="#bowtie2-options-k"><code>-k</code></a> or <a href="#bowtie2-options-a"><code>-a</code></a> options and Bowtie 2 is reporting many alignments per read, using an index with a denser SA sample can speed things up considerably. To do this, specify a smaller-than-default <a href="#bowtie2-build-options-o"><code>-o</code>/<code>--offrate</code></a> value when running <code>bowtie2-build</code>. A denser SA sample yields a larger index, but is also particularly eff [...]
+<li><p>If <code>bowtie2</code> "thrashes", try increasing <code>bowtie2-build --offrate</code></p>
+<p>If <code>bowtie2</code> runs very slowly on a relatively low-memory computer, try setting <a href="#bowtie2-options-o"><code>-o</code>/<code>--offrate</code></a> to a <em>larger</em> value when building the index. This decreases the memory footprint of the index.</p></li>
 </ol>
 <h2 id="command-line"><a href="#TOC">Command Line</a></h2>
 <h3 id="setting-function-options"><a href="#TOC">Setting function options</a></h3>
@@ -1196,7 +1202,7 @@ Seed 4 rc:                   TTATGCATGA</code></pre>
 </td>
 <td>
 
-<p>Alignment score for second-best alignment. Can be negative. Can be greater than 0 in <a href="#bowtie2-options-local"><code>--local</code></a> mode (but not in <a href="#bowtie2-options-end-to-end"><code>--end-to-end</code></a> mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read.</p>
+<p>Alignment score for the best-scoring alignment found other than the alignment reported. Can be negative. Can be greater than 0 in <a href="#bowtie2-options-local"><code>--local</code></a> mode (but not in <a href="#bowtie2-options-end-to-end"><code>--end-to-end</code></a> mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read. Note that, when the read is part of a concordantly-aligned pair, this score could be greater than <a hr [...]
 </td></tr>
 <tr><td id="bowtie2-build-opt-fields-ys">
 
@@ -1294,12 +1300,11 @@ Seed 4 rc:                   TTATGCATGA</code></pre>
 </li>
 </ol>
 <h1 id="the-bowtie2-build-indexer"><a href="#TOC">The <code>bowtie2-build</code> indexer</a></h1>
-<p><code>bowtie2-build</code> builds a Bowtie index from a set of DNA sequences. <code>bowtie2-build</code> outputs a set of 6 files with suffixes <code>.1.bt2</code>, <code>.2.bt2</code>, <code>.3.bt2</code>, <code>.4.bt2</code>, <code>.rev.1.bt2</code>, and <code>.rev.2.bt2</code>. These files together constitute the index: they are all that is needed to align reads to that reference. The original sequence FASTA files are no longer used by Bowtie 2 once the index is built.</p>
+<p><code>bowtie2-build</code> builds a Bowtie index from a set of DNA sequences. <code>bowtie2-build</code> outputs a set of 6 files with suffixes <code>.1.bt2</code>, <code>.2.bt2</code>, <code>.3.bt2</code>, <code>.4.bt2</code>, <code>.rev.1.bt2</code>, and <code>.rev.2.bt2</code>. In the case of a large index these suffixes will have a <code>bt2l</code> termination. These files together constitute the index: they are all that is needed to align reads to that reference. The original se [...]
 <p>Bowtie 2's <code>.bt2</code> index format is different from Bowtie 1's <code>.ebwt</code> format, and they are not compatible with each other.</p>
 <p>Use of Karkkainen's <a href="http://portal.acm.org/citation.cfm?id=1314852">blockwise algorithm</a> allows <code>bowtie2-build</code> to trade off between running time and memory usage. <code>bowtie2-build</code> has three options governing how it makes this trade: <a href="#bowtie2-build-options-p"><code>-p</code>/<code>--packed</code></a>, <a href="#bowtie2-build-options-bmax"><code>--bmax</code></a>/<a href="#bowtie2-build-options-bmaxdivn"><code>--bmaxdivn</code></a>, and <a href= [...]
 <p>The indexer provides options pertaining to the "shape" of the index, e.g. <a href="#bowtie2-build-options-o"><code>--offrate</code></a> governs the fraction of <a href="http://en.wikipedia.org/wiki/Burrows-Wheeler_transform">Burrows-Wheeler</a> rows that are "marked" (i.e., the density of the suffix-array sample; see the original <a href="http://portal.acm.org/citation.cfm?id=796543">FM Index</a> paper for details). All of these options are potentially profitable t [...]
-<p>Because <code>bowtie2-build</code> uses 32-bit pointers internally, it can handle up to a theoretical maximum of 2^32-1 (somewhat more than 4 billion) characters in an index, though, with other constraints, the actual ceiling is somewhat less than that. If your reference exceeds 2^32-1 characters, <code>bowtie2-build</code> will print an error message and abort. To resolve this, divide your reference sequences into smaller batches and/or chunks and build a separate index for each.</p>
-<p>If your computer has more than 3-4 GB of memory and you would like to exploit that fact to make index building faster, use a 64-bit version of the <code>bowtie2-build</code> binary. The 32-bit version of the binary is restricted to using less than 4 GB of memory. If a 64-bit pre-built binary does not yet exist for your platform on the sourceforge download site, you will need to build one from source.</p>
+<p><code>bowtie2-build</code> can generate either <a href="#small-and-large-indexes">small or large indexes</a>. The wrapper will decide which based on the length of the input genome. If the reference does not exceed 4 billion characters but a large index is preferred, the user can specify <a href="#bowtie2-build-options-large-index"><code>--large-index</code></a> to force <code>bowtie2-build</code> to build a large index instead.</p>
 <p>The Bowtie 2 index is based on the <a href="http://portal.acm.org/citation.cfm?id=796543">FM Index</a> of Ferragina and Manzini, which in turn is based on the <a href="http://en.wikipedia.org/wiki/Burrows-Wheeler_transform">Burrows-Wheeler</a> transform. The algorithm used to build the index is based on the <a href="http://portal.acm.org/citation.cfm?id=1314852">blockwise algorithm</a> of Karkkainen.</p>
 <h2 id="command-line-1"><a href="#TOC">Command Line</a></h2>
 <p>Usage:</p>
@@ -1333,6 +1338,15 @@ Seed 4 rc:                   TTATGCATGA</code></pre>
 
 <p>The reference sequences are given on the command line. I.e. <code><reference_in></code> is a comma-separated list of sequences rather than a list of FASTA files.</p>
 </td></tr>
+</td></tr><tr><td id="bowtie2-build-options-large-index">
+
+
+
+<pre><code>--large-index</code></pre>
+</td><td>
+
+<p>Force <code>bowtie2-build</code> to build a <a href="#small-and-large-indexes">large index</a>, even if the reference is less than ~ 4 billion nucleotides long.</p>
+</td></tr>
 <tr><td id="bowtie2-build-options-a">
 
 
diff --git a/dp_framer.h b/dp_framer.h
index 4209f41..e27dca5 100644
--- a/dp_framer.h
+++ b/dp_framer.h
@@ -87,6 +87,12 @@ struct DPRect {
 	
 	size_t  maxgap; // max # gaps - width of the gap bands
 	
+	void write(std::ostream& os) const {
+		os << refl << ',' << refr << ',' << refl_pretrim << ','
+		   << refr_pretrim << ',' << triml << ',' << trimr << ','
+		   << corel << ',' << corer << ',' << maxgap;
+	}
+	
 	/**
 	 * Return true iff the combined effect of triml and trimr is to trim away
 	 * the entire rectangle.
diff --git a/ds.h b/ds.h
index 4eea349..5453762 100644
--- a/ds.h
+++ b/ds.h
@@ -29,6 +29,7 @@
 #include "assert_helpers.h"
 #include "threading.h"
 #include "random_source.h"
+#include "btypes.h"
 
 /**
  * Tally how much memory is allocated to certain 
@@ -780,7 +781,7 @@ public:
 		if(num < 2) return;
 		size_t left = num;
 		for(size_t i = begin; i < begin + num - 1; i++) {
-			uint32_t rndi = rnd.nextU32() % left;
+			size_t rndi = rnd.nextSizeT() % left;
 			if(rndi > 0) {
 				std::swap(list_[i], list_[i + rndi]);
 			}
@@ -2970,10 +2971,10 @@ public:
         size_t c1 = ((cur+1) << 1) - 1;
         size_t c2 = c1 + 1;
 		if(c1 < l_.size()) {
-			assert_leq(l_[cur], l_[c1]);
+			assert(l_[cur] <= l_[c1]);
 		}
 		if(c2 < l_.size()) {
-			assert_leq(l_[cur], l_[c2]);
+			assert(l_[cur] <= l_[c2]);
 		}
 		if(c2 < l_.size()) {
 			return repOkNode(c1) && repOkNode(c2);
@@ -3072,9 +3073,9 @@ public:
 
 private:
 	int             cat_;    // memory category, for accounting purposes
-	uint32_t        cur_;    // next page to hand out
-	const uint64_t  bytes_;  // total bytes in the pool
-	const uint32_t  pagesz_; // size of a single page
+	size_t          cur_;    // next page to hand out
+	const size_t    bytes_;  // total bytes in the pool
+	const size_t    pagesz_; // size of a single page
 	EList<uint8_t*> pages_;  // the pages themselves
 };
 
@@ -3434,7 +3435,7 @@ public:
 	 * include elements that fall off the end of list_).
 	 */
 	void setLength(size_t nlen) {
-		len_ = (uint32_t)nlen;
+		len_ = nlen;
 	}
 	
 protected:
@@ -3458,8 +3459,8 @@ public:
 
 	PListSlice(
 		PList<T, S>& list,
-		uint32_t i,
-		uint32_t len) :
+		TIndexOffU i,
+		TIndexOffU len) :
 		i_(i),
 		len_(len),
 		list_(&list)
@@ -3471,8 +3472,8 @@ public:
 	void init(const PListSlice<T, S>& sl, size_t first, size_t last) {
 		assert_gt(last, first);
 		assert_leq(last - first, sl.len_);
-		i_ = (uint32_t)(sl.i_ + first);
-		len_ = (uint32_t)(last - first);
+		i_ = (TIndexOffU)(sl.i_ + first);
+		len_ = (TIndexOffU)(last - first);
 		list_ = sl.list_;
 	}
 	
@@ -3566,12 +3567,12 @@ public:
 	 * include elements that fall off the end of list_).
 	 */
 	void setLength(size_t nlen) {
-		len_ = (uint32_t)nlen;
+		len_ = (TIndexOffU)nlen;
 	}
 	
 protected:
-	uint32_t i_;
-	uint32_t len_;
+	TIndexOffU i_;
+	TIndexOffU len_;
 	PList<T, S>* list_;
 };
 
@@ -3698,7 +3699,7 @@ public:
 	 * Initialize the current-edit pointer to 0 and set the number of
 	 * edits per memory page.
 	 */
-	RedBlack(uint32_t pageSz, int cat = 0) :
+	RedBlack(size_t pageSz, int cat = 0) :
 		perPage_(pageSz/sizeof(TNode)), pages_(cat) { clear(); }
 
 	/**
diff --git a/endian_swap.h b/endian_swap.h
index e5626ff..64c5889 100644
--- a/endian_swap.h
+++ b/endian_swap.h
@@ -22,6 +22,15 @@
 
 #include <stdint.h>
 #include <inttypes.h>
+#include "assert_helpers.h"
+
+#ifdef BOWTIE_64BIT_INDEX
+#   define endianSwapU(x) endianSwapU64(x)
+#   define endianSwapI(x) endianSwapI64(x)
+#else
+#   define endianSwapU(x) endianSwapU32(x)
+#   define endianSwapI(x) endianSwapI32(x)
+#endif
 
 /**
  * Return true iff the machine running this program is big-endian.
@@ -73,25 +82,56 @@ static inline int32_t endianSwapI32(int32_t i) {
 }
 
 /**
- * Convert uint32_t argument to the specified endianness.  It's assumed
+ * Return copy of int64_t argument with byte order reversed.
+ */
+static inline int64_t endianSwapI64(int64_t u) {
+        int64_t tmp = 0;
+        tmp |= ((u >> 56) & (0xffull <<  0));
+        tmp |= ((u >> 40) & (0xffull <<  8));
+        tmp |= ((u >> 24) & (0xffull << 16));
+        tmp |= ((u >>  8) & (0xffull << 24));
+        tmp |= ((u <<  8) & (0xffull << 32));
+        tmp |= ((u << 24) & (0xffull << 40));
+        tmp |= ((u << 40) & (0xffull << 48));
+        tmp |= ((u << 56) & (0xffull << 56));
+        return tmp;
+}
+
+/**
+ * Convert uint32_t/uint64_t argument to the specified endianness.  It's assumed
  * that u currently has the endianness of the current machine.
  */
-static inline uint32_t endianizeU32(uint32_t u, bool toBig) {
+template <typename T>
+static inline T endianizeU(T u, bool toBig) {
 	if(toBig == currentlyBigEndian()) {
 		return u;
 	}
-	return endianSwapU32(u);
+	if(sizeof(T) == 4) {
+		return endianSwapU32((uint32_t)u);
+	} else if(sizeof(T) == 8) {
+		return endianSwapU64((uint64_t)u);
+	} else {
+		assert(false);
+	}
 }
 
+
 /**
- * Convert int32_t argument to the specified endianness.  It's assumed
+ * Convert int32_t/int64_t argument to the specified endianness.  It's assumed
  * that u currently has the endianness of the current machine.
  */
-static inline int32_t endianizeI32(int32_t i, bool toBig) {
+template <typename T>
+static inline T endianizeI(T i, bool toBig) {
 	if(toBig == currentlyBigEndian()) {
 		return i;
 	}
-	return endianSwapI32(i);
+	if(sizeof(T) == 4) {
+		return endianSwapI32((int32_t)i);
+	} else if(sizeof(T) == 8) {
+		return endianSwapI64((int64_t)i);
+	} else {
+		assert(false);
+	}
 }
 
 #endif
diff --git a/fast_mutex.h b/fast_mutex.h
old mode 100755
new mode 100644
diff --git a/filebuf.h b/filebuf.h
index a296608..66dffb4 100644
--- a/filebuf.h
+++ b/filebuf.h
@@ -512,7 +512,7 @@ private:
 	static const size_t BUF_SZ = 128 * 1024;
 	FILE    *out_;
 	int      bpPtr_;
-	uint32_t cur_;
+	size_t   cur_;
 	char     buf_[BUF_SZ]; // (large) input buffer
 };
 
@@ -536,6 +536,8 @@ public:
 			std::cerr << "Error: Could not open alignment output file " << out.c_str() << std::endl;
 			throw 1;
 		}
+		if(setvbuf(out_, NULL, _IOFBF, 10* 1024* 1024)) 
+			std::cerr << "Warning: Could not allocate the proper buffer size for output file stream. " << std::endl;
 	}
 
 	/**
diff --git a/group_walk.h b/group_walk.h
index d3fe013..2309bb1 100644
--- a/group_walk.h
+++ b/group_walk.h
@@ -88,8 +88,6 @@
 #include "reference.h"
 #include "mem_ids.h"
 
-typedef uint32_t TIndexOff;
-
 /**
  * Encapsulate an SA range and an associated list of slots where the resolved
  * offsets can be placed.
@@ -101,24 +99,24 @@ public:
 
 	SARangeWithOffs() { reset(); };
 
-	SARangeWithOffs(TIndexOff tf, size_t len, const T& o) {
+	SARangeWithOffs(TIndexOffU tf, size_t len, const T& o) {
 		init(tf, len, o);
 	}
 	
-	void init(TIndexOff tf, size_t len_, const T& o) {
-		topf = tf; len = len_, offs = o;
+	void init(TIndexOffU tf, const T& o) {
+		topf = tf, offs = o;
 	}
 
 	/**
 	 * Reset to uninitialized state.
 	 */
-	void reset() { topf = std::numeric_limits<TIndexOff>::max(); }
+	void reset() { topf = std::numeric_limits<TIndexOffU>::max(); }
 	
 	/**
 	 * Return true if this is initialized.
 	 */
 	bool inited() const {
-		return topf != std::numeric_limits<TIndexOff>::max();
+		return topf != std::numeric_limits<TIndexOffU>::max();
 	}
 	
 	/**
@@ -127,7 +125,7 @@ public:
 	 */
 	size_t size() const { return offs.size(); }
 
-	TIndexOff topf; // top in BWT index
+	TIndexOffU topf; // top in BWT index
 	size_t    len;  // length of the reference sequence involved
 	T         offs; // offsets
 };
@@ -146,7 +144,7 @@ struct GroupWalkState {
 	}
 
 	EList<bool> masks[4];      // temporary list for masks; used in GWState
-	EList<uint32_t, 16> map;   // temporary list of GWState maps
+	EList<TIndexOffU, 16> map;   // temporary list of GWState maps
 };
 
 /**
@@ -198,7 +196,7 @@ struct GWElt {
 	 * Reset GWElt to uninitialized state.
 	 */
 	void reset() {
-		offidx = range = elt = len = 0xffffffff;
+		offidx = range = elt = len = OFF_MASK;
 		fw = false;
 	}
 
@@ -206,11 +204,11 @@ struct GWElt {
 	 * Initialize this WalkResult.
 	 */
 	void init(
-		uint32_t oi,
+		TIndexOffU oi,
 		bool f,
-		uint32_t r,
-		uint32_t e,
-		uint32_t l)
+		TIndexOffU r,
+		TIndexOffU e,
+		TIndexOffU l)
 	{
 		offidx = oi;
 		fw = f;
@@ -239,11 +237,11 @@ struct GWElt {
 		return !(*this == o);
 	}
 
-	uint32_t offidx; // seed offset index
+	TIndexOffU offidx; // seed offset index
 	bool     fw;     // strand
-	uint32_t range;  // range
-	uint32_t elt;    // element
-	uint32_t len;    // length
+	TIndexOffU range;  // range
+	TIndexOffU elt;    // element
+	TIndexOffU len;    // length
 };
 
 /**
@@ -259,20 +257,20 @@ struct WalkResult {
 	 */
 	void reset() {
 		elt.reset();
-		bwrow = toff = 0xffffffff;
+		bwrow = toff = OFF_MASK; // OFF_MASK replaces the old 0xffffffff "no value" constant
 	}
 
 	/**
 	 * Initialize this WalkResult.
 	 */
 	void init(
-		uint32_t oi,  // seed offset index
+		TIndexOffU oi,  // seed offset index
 		bool f,       // strand
-		uint32_t r,   // range
-		uint32_t e,   // element
-		uint32_t bwr, // BW row
-		uint32_t len, // length
-		uint32_t to)  // text offset
+		TIndexOffU r,   // range
+		TIndexOffU e,   // element
+		TIndexOffU bwr, // BW row
+		TIndexOffU len, // length
+		TIndexOffU to)  // text offset
 	{
 		elt.init(oi, f, r, e, len);
 		bwrow = bwr;
@@ -280,8 +278,8 @@ struct WalkResult {
 	}
 
 	GWElt    elt;   // element resolved
-	uint32_t bwrow; // SA row resolved
-	uint32_t toff;  // resolved offset from SA sample
+	TIndexOffU bwrow; // SA row resolved
+	TIndexOffU toff;  // resolved offset from SA sample
 };
 
 /**
@@ -296,10 +294,10 @@ public:
 
 	GWHit() :
 		fmap(0, GW_CAT),
-		offidx(0xffffffff),
+		offidx(OFF_MASK),
 		fw(false),
-		range(0xffffffff),
-		len(0xffffffff),
+		range(OFF_MASK),
+		len(OFF_MASK),
 		reported_(0, GW_CAT),
 		nrep_(0)
 	{
@@ -312,19 +310,19 @@ public:
 	 */
 	void init(
 		SARangeWithOffs<T>& sa,
-		uint32_t oi,
+		TIndexOffU oi, // offset idx, stored in 'offidx'
 		bool f,
-		uint32_t r)
+		TIndexOffU r) // original range index, stored in 'range'
 	{
 		nrep_ = 0;
 		offidx = oi;
 		fw = f;
 		range = r;
-		len = (uint32_t)sa.len;
+		len = (TIndexOffU)sa.len; // sa.len is a size_t; cast to the index offset type
 		reported_.resize(sa.offs.size());
 		reported_.fill(false);
 		fmap.resize(sa.offs.size());
-		fmap.fill(make_pair(0xffffffff, 0xffffffff));
+		fmap.fill(make_pair(OFF_MASK, OFF_MASK)); // one (OFF_MASK, OFF_MASK) pair per entry of sa.offs
 	}
 	
 	/**
@@ -334,10 +332,10 @@ public:
 		reported_.clear();
 		fmap.clear();
 		nrep_ = 0;
-		offidx = 0xffffffff;
+		offidx = OFF_MASK;
 		fw = false;
-		range = 0xffffffff;
-		len = 0xffffffff;
+		range = OFF_MASK;
+		len = OFF_MASK;
 	}
 	
 #ifndef NDEBUG
@@ -354,11 +352,11 @@ public:
 		size_t nrep = 0;
 		for(size_t i = 0; i < fmap.size(); i++) {
 			if(reported_[i]) nrep++;
-			if(sa.offs[i] != 0xffffffff) {
+			if(sa.offs[i] != OFF_MASK) {
 				continue;
 			}
 			for(size_t j = i+1; j < fmap.size(); j++) {
-				if(sa.offs[j] != 0xffffffff) {
+				if(sa.offs[j] != OFF_MASK) {
 					continue;
 				}
 				assert(fmap[i] != fmap[j]);
@@ -402,11 +400,11 @@ public:
 		return nrep_ == reported_.size();
 	}
 
-	EList<std::pair<uint32_t, uint32_t>, 16> fmap; // forward map; to GWState & elt
-	uint32_t offidx; // offset idx
+	EList<std::pair<TIndexOffU, TIndexOffU>, 16> fmap; // forward map; to GWState & elt
+	TIndexOffU offidx; // offset idx
 	bool fw;         // orientation
-	uint32_t range;  // original range index
-	uint32_t len;    // length of hit
+	TIndexOffU range;  // original range index
+	TIndexOffU len;    // length of hit
 
 protected:
 
@@ -435,18 +433,18 @@ public:
 	 * Returns true iff at least one elt was resolved.
 	 */
 	template<int S>
-	pair<int, int> init(
+	pair<TIndexOff, TIndexOff> init(
 		const Ebwt& ebwt,             // index to walk left in
 		const BitPairReference& ref,  // bitpair-encoded reference
 		SARangeWithOffs<T>& sa,       // SA range with offsets
 		EList<GWState, S>& sts,       // EList of GWStates for range being advanced
 		GWHit<T>& hit,                // Corresponding hit structure
-		uint32_t range,               // which range is this?
+		TIndexOffU range,               // which range is this?
 		bool reportList,              // if true, "report" resolved offsets immediately by adding them to 'res' list
 		EList<WalkResult, 16>* res,   // EList where resolved offsets should be appended
-		uint32_t tp,                  // top of range at this step
-		uint32_t bt,                  // bot of range at this step
-		uint32_t st,                  // # steps taken to get to this step
+		TIndexOffU tp,                  // top of range at this step
+		TIndexOffU bt,                  // bot of range at this step
+		TIndexOffU st,                  // # steps taken to get to this step
 		WalkMetrics& met)
 	{
 		assert_gt(bt, tp);
@@ -472,13 +470,13 @@ public:
 	 * second being the number of as-yet-unresolved offsets.
 	 */
 	template<int S>
-	pair<int, int> init(
+	pair<TIndexOff, TIndexOff> init(
 		const Ebwt& ebwt,             // forward Bowtie index
 		const BitPairReference& ref,  // bitpair-encoded reference
 		SARangeWithOffs<T>& sa,       // SA range with offsets
 		EList<GWState, S>& st,        // EList of GWStates for advancing range
 		GWHit<T>& hit,                // Corresponding hit structure
-		uint32_t range,               // range being inited
+		TIndexOffU range,               // range being inited
 		bool reportList,              // report resolutions, adding to 'res' list?
 		EList<WalkResult, 16>* res,   // EList to append resolutions
 		WalkMetrics& met)             // update these metrics
@@ -486,22 +484,22 @@ public:
 		assert(inited_);
 		assert_eq(step, lastStep_+1);
 		ASSERT_ONLY(lastStep_++);
-		assert_leq((uint32_t)step, ebwt.eh().len());
+		assert_leq((TIndexOffU)step, ebwt.eh().len());
 		assert_lt(range, st.size());
-		pair<int, int> ret = make_pair(0, 0);
-		uint32_t trimBegin = 0, trimEnd = 0;
+		pair<TIndexOff, TIndexOff> ret = make_pair(0, 0);
+		TIndexOffU trimBegin = 0, trimEnd = 0;
 		bool empty = true; // assume all resolved until proven otherwise
 		// Commit new information, if any, to the PListSlide.  Also,
 		// trim and check if we're done.
 		for(size_t i = mapi_; i < map_.size(); i++) {
-			bool resolved = (off(i, sa) != 0xffffffff);
+			bool resolved = (off(i, sa) != OFF_MASK);
 			if(!resolved) {
 				// Elt not resolved yet; try to resolve it now
-				uint32_t bwrow = (uint32_t)(top - mapi_ + i);
-				uint32_t toff = ebwt.tryOffset(bwrow);
-				ASSERT_ONLY(uint32_t origBwRow = sa.topf + map(i));
+				TIndexOffU bwrow = (TIndexOff)(top - mapi_ + i);
+				TIndexOffU toff = ebwt.tryOffset(bwrow);
+				ASSERT_ONLY(TIndexOffU origBwRow = sa.topf + map(i));
 				assert_eq(bwrow, ebwt.walkLeft(origBwRow, step));
-				if(toff != 0xffffffff) {
+				if(toff != OFF_MASK) {
 					// Yes, toff was resolvable
 					assert_eq(toff, ebwt.getOffset(bwrow));
 					met.resolves++;
@@ -523,7 +521,7 @@ public:
 					// respective BW rows but they WILL all be correct w/r/t
 					// the reference sequence underneath, which is what really
 					// matters here.
-					uint32_t tidx = 0xffffffff, tof, tlen;
+					TIndexOffU tidx = OFF_MASK, tof, tlen;
 					bool straddled = false;
 					ebwt.joinedToTextOff(
 						hit.len, // length of seed
@@ -533,7 +531,7 @@ public:
 						tlen,    // length of reference sequence
 						true,    // don't reject straddlers
 						straddled);
-					if(tidx != 0xffffffff &&
+					if(tidx != OFF_MASK &&
 					   hit.satup->key.seq != std::numeric_limits<uint64_t>::max())
 					{
 						// key: 2-bit characters packed into a 64-bit word with
@@ -572,13 +570,13 @@ public:
 			// resolved (whether this function did it just now, whether it did
 			// it a while ago, or whether some other function outside GroupWalk
 			// did it).
-			if(off(i, sa) != 0xffffffff) {
+			if(off(i, sa) != OFF_MASK) {
 				if(reportList && !hit.reported(map(i))) {
 					// Report it
-					uint32_t toff = off(i, sa);
+					TIndexOffU toff = off(i, sa);
 					assert(res != NULL);
 					res->expand();
-					uint32_t origBwRow = sa.topf + map(i);
+					TIndexOffU origBwRow = sa.topf + map(i);
 					res->back().init(
 						hit.offidx, // offset idx
 						hit.fw,     // orientation
@@ -607,12 +605,12 @@ public:
 				// object to point to the appropriate element of our
 				// range
 				assert_geq(i, mapi_);
-				uint32_t bmap = map(i);
+				TIndexOffU bmap = map(i);
 				hit.fmap[bmap].first = range;
-				hit.fmap[bmap].second = (uint32_t)i;
+				hit.fmap[bmap].second = (TIndexOffU)i;
 #ifndef NDEBUG
 				for(size_t j = 0; j < bmap; j++) {
-					if(sa.offs[j] == 0xffffffff &&
+					if(sa.offs[j] == OFF_MASK &&
 					   hit.fmap[j].first == range)
 					{
 						assert_neq(i, hit.fmap[j].second);
@@ -636,13 +634,13 @@ public:
 			// If range is done, all elements from map should be
 			// resolved
 			for(size_t i = mapi_; i < map_.size(); i++) {
-				assert_neq(0xffffffff, off(i, sa));
+				assert_neq(OFF_MASK, off(i, sa));
 			}
 			// If this range is done, then it should be the case that
 			// all elements in the corresponding GWHit that point to
 			// this range are resolved.
 			for(size_t i = 0; i < hit.fmap.size(); i++) {
-				if(sa.offs[i] == 0xffffffff) {
+				if(sa.offs[i] == OFF_MASK) {
 					assert_neq(range, hit.fmap[i].first);
 				}
 			}
@@ -659,14 +657,14 @@ public:
 			// must split it into the two ranges on either side of the
 			// dollar.  Let 'bot' and 'top' delimit the portion of the
 			// range prior to the dollar.
-			uint32_t oldbot = bot;
+			TIndexOffU oldbot = bot;
 			bot = ebwt._zOff;
 			// Note: might be able to do additional trimming off the
 			// end.
 			// Create a new range for the portion after the dollar.
 			st.expand();
 			st.back().reset();
-			uint32_t ztop = ebwt._zOff+1;
+			TIndexOffU ztop = ebwt._zOff+1;
 			st.back().initMap(oldbot - ztop);
 			assert_eq(map_.size(), oldbot-top+mapi_);
 			for(size_t i = ztop; i < oldbot; i++) {
@@ -679,7 +677,7 @@ public:
 				sa,
 				st,
 				hit,
-				(uint32_t)st.size()-1,
+				(TIndexOffU)st.size()-1,
 				reportList,
 				res,
 				ztop,
@@ -708,7 +706,7 @@ public:
 	bool repOk(
 		const Ebwt& ebwt,
 		GWHit<T>& hit,
-		uint32_t range) const
+		TIndexOffU range) const
 	{
 		assert(done() || bot > top);
 		assert(doneResolving(hit) || (tloc.valid() && tloc.repOk(ebwt.eh())));
@@ -716,14 +714,14 @@ public:
 		assert_eq(map_.size()-mapi_, bot-top);
 		// Make sure that 'done' is compatible with whether we have >=
 		// 1 elements left to resolve.
-		int left = 0;
+		TIndexOff left = 0;
 		for(size_t i = mapi_; i < map_.size(); i++) {
-			ASSERT_ONLY(uint32_t row = (uint32_t)(top + i - mapi_));
-			ASSERT_ONLY(uint32_t origRow = hit.satup->topf + map(i));
+			ASSERT_ONLY(TIndexOffU row = (TIndexOffU)(top + i - mapi_));
+			ASSERT_ONLY(TIndexOffU origRow = hit.satup->topf + map(i));
 			assert(step == 0 || row != origRow);
 			assert_eq(row, ebwt.walkLeft(origRow, step));
 			assert_lt(map_[i], hit.satup->offs.size());
-			if(off(i, hit) == 0xffffffff) left++;
+			if(off(i, hit) == OFF_MASK) left++;
 		}
 		assert(repOkMapRepeats());
 		assert(repOkMapInclusive(hit, range));
@@ -742,9 +740,9 @@ public:
 	 * Check that the fmap elements pointed to by our map_ include all
 	 * of the fmap elements that point to this range.
 	 */
-	bool repOkMapInclusive(GWHit<T>& hit, uint32_t range) const {
+	bool repOkMapInclusive(GWHit<T>& hit, TIndexOffU range) const {
 		for(size_t i = 0; i < hit.fmap.size(); i++) {
-			if(hit.satup->offs[i] == 0xffffffff) {
+			if(hit.satup->offs[i] == OFF_MASK) {
 				if(range == hit.fmap[i].first) {
 					ASSERT_ONLY(bool found = false);
 					for(size_t j = mapi_; j < map_.size(); j++) {
@@ -777,7 +775,7 @@ public:
 	 * Return the offset currently assigned to the ith element.  If it
-	 * has not yet been resolved, return 0xffffffff.
+	 * has not yet been resolved, return OFF_MASK.
 	 */
-	uint32_t off(
+	TIndexOffU off(
 		size_t i,
 		const SARangeWithOffs<T>& sa)
 	{
@@ -791,7 +789,7 @@ public:
 	 * Return the offset of the element within the original range's
 	 * PListSlice that the ith element of this range corresponds to.
 	 */
-	uint32_t map(size_t i) const {
+	TIndexOffU map(size_t i) const {
 		assert_geq(i, mapi_);
 		assert_lt(i, map_.size());
 		return map_[i];
@@ -800,7 +798,7 @@ public:
 	/**
 	 * Return the offset of the first untrimmed offset in the map.
 	 */
-	uint32_t mapi() const {
+	TIndexOffU mapi() const {
 		return mapi_;
 	}
 
@@ -826,7 +824,7 @@ public:
 	 */
 	void setOff(
 		size_t i,
-		uint32_t off,
+		TIndexOffU off,
 		SARangeWithOffs<T>& sa,
 		WalkMetrics& met)
 	{
@@ -849,12 +847,12 @@ public:
 	 * second being the number of as-yet-unresolved offsets.
 	 */
 	template <int S>
-	pair<int, int> advance(
+	pair<TIndexOff, TIndexOff> advance(
 		const Ebwt& ebwt,            // the forward Bowtie index, for stepping left
 		const BitPairReference& ref, // bitpair-encoded reference
 		SARangeWithOffs<T>& sa,      // SA range with offsets
 		GWHit<T>& hit,               // the associated GWHit object
-		uint32_t range,              // which range is this?
+		TIndexOffU range,              // which range is this?
 		bool reportList,             // if true, "report" resolved offsets immediately by adding them to 'res' list
 		EList<WalkResult, 16>* res,  // EList where resolved offsets should be appended
 		EList<GWState, S>& st,       // EList of GWStates for range being advanced
@@ -862,19 +860,19 @@ public:
 		WalkMetrics& met,
 		PerReadMetrics& prm)
 	{
-		ASSERT_ONLY(uint32_t origTop = top);
-		ASSERT_ONLY(uint32_t origBot = bot);
+		ASSERT_ONLY(TIndexOffU origTop = top);
+		ASSERT_ONLY(TIndexOffU origBot = bot);
 		assert_geq(step, 0);
 		assert_eq(step, lastStep_);
 		assert_geq(st.capacity(), st.size() + 4);
 		assert(tloc.valid()); assert(tloc.repOk(ebwt.eh()));
 		assert_eq(bot-top, map_.size()-mapi_);
-		pair<int, int> ret = make_pair(0, 0);
+		pair<TIndexOff, TIndexOff> ret = make_pair(0, 0);
 		assert_eq(top, tloc.toBWRow());
 		if(bloc.valid()) {
 			// Still multiple elements being tracked
 			assert_lt(top+1, bot);
-			uint32_t upto[4], in[4];
+			TIndexOffU upto[4], in[4];
 			upto[0] = in[0] = upto[1] = in[1] =
 			upto[2] = in[2] = upto[3] = in[3] = 0;
 			assert_eq(bot, bloc.toBWRow());
@@ -890,8 +888,8 @@ public:
 			}
 #endif
 			bool first = true;
-			ASSERT_ONLY(uint32_t sum = 0);
-			uint32_t newtop = 0, newbot = 0;
+			ASSERT_ONLY(TIndexOffU sum = 0);
+			TIndexOffU newtop = 0, newbot = 0;
 			gws.map.clear();
 			for(int i = 0; i < 4; i++) {
 				if(in[i] > 0) {
@@ -915,7 +913,7 @@ public:
 								// of the corresponding element in the
 								// root range
 								assert_lt(gws.map.back(), sa.size());
-								if(sa.offs[gws.map.back()] == 0xffffffff) {
+								if(sa.offs[gws.map.back()] == OFF_MASK) {
 									assert_eq(newtop + gws.map.size() - 1,
 											  ebwt.walkLeft(sa.topf + gws.map.back(), step+1));
 								}
@@ -931,8 +929,8 @@ public:
 						// pointing to bad memory.
 						st.expand();
 						st.back().reset();
-						uint32_t ntop = upto[i];
-						uint32_t nbot = ntop + in[i];
+						TIndexOffU ntop = upto[i];
+						TIndexOffU nbot = ntop + in[i];
 						assert_lt(nbot-ntop, bot-top);
 						st.back().mapi_ = 0;
 						st.back().map_.clear();
@@ -941,14 +939,14 @@ public:
 						for(size_t j = 0; j < gws.masks[i].size(); j++) {
 							if(gws.masks[i][j]) st.back().map_.push_back(map_[j+mapi_]);
 						}
-						pair<int, int> rret =
+						pair<TIndexOff, TIndexOff> rret =
 						st.back().init(
 							ebwt,        // forward Bowtie index
 							ref,         // bitpair-encodede reference
 							sa,          // SA range with offsets
 							st,          // EList of all GWStates associated with original range
 							hit,         // associated GWHit object
-							(uint32_t)st.size()-1, // range offset
+							(TIndexOffU)st.size()-1, // range offset
 							reportList,  // if true, report hits to 'res' list
 							res,         // report hits here if reportList is true
 							ntop,        // BW top of new range
@@ -980,7 +978,7 @@ public:
 			assert_eq(bot, top+1);
 			assert_eq(1, map_.size()-mapi_);
 			// Sets top, returns char walked through (which we ignore)
-			ASSERT_ONLY(uint32_t oldtop = top);
+			ASSERT_ONLY(TIndexOffU oldtop = top);
 			met.bwops++;
 			prm.nExFmops++;
 			ebwt.mapLF1(top, tloc);
@@ -995,8 +993,8 @@ public:
 		assert(top != origTop || bot != origBot);
 		step++;
 		assert_gt(step, 0);
-		assert_leq((uint32_t)step, ebwt.eh().len());
-		pair<int, int> rret =
+		assert_leq((TIndexOffU)step, ebwt.eh().len());
+		pair<TIndexOff, TIndexOff> rret =
 		init<S>(
 			ebwt,       // forward Bowtie index
 			ref,        // bitpair-encodede reference
@@ -1031,7 +1029,7 @@ public:
 		mapi_ = 0;
 		map_.resize(newsz);
 		for(size_t i = 0; i < newsz; i++) {
-			map_[i] = (uint32_t)i;
+			map_[i] = (TIndexOffU)i;
 		}
 	}
 
@@ -1052,23 +1050,23 @@ public:
 	 */
 	bool doneResolving(const SARangeWithOffs<T>& sa) const {
 		for(size_t i = mapi_; i < map_.size(); i++) {
-			if(sa.offs[map(i)] == 0xffffffff) return false;
+			if(sa.offs[map(i)] == OFF_MASK) return false; // OFF_MASK entry: this tracked element has no offset yet
 		}
 		return true;
 	}
 
 	SideLocus tloc;      // SideLocus for top
 	SideLocus bloc;      // SideLocus for bottom
-	uint32_t  top;       // top elt of range in BWT
-	uint32_t  bot;       // bot elt of range in BWT
-	int       step;      // how many steps have we walked to the left so far
+	TIndexOffU  top;       // top elt of range in BWT
+	TIndexOffU  bot;       // bot elt of range in BWT
+	TIndexOff   step;      // how many steps have we walked to the left so far
 
 protected:
 	
 	ASSERT_ONLY(bool inited_);
-	ASSERT_ONLY(int lastStep_);
-	EList<uint32_t, 16> map_; // which elts in range 'range' we're tracking
-	uint32_t mapi_;           // first untrimmed element of map
+	ASSERT_ONLY(TIndexOff lastStep_);
+	EList<TIndexOffU, 16> map_; // which elts in range 'range' we're tracking
+	TIndexOffU mapi_;           // first untrimmed element of map
 };
 
 template<typename T, int S>
@@ -1108,8 +1106,8 @@ public:
 		st_.resize(1);
 		st_.back().reset();
 		assert(st_.back().repOkBasic());
-		uint32_t top = sa.topf;
-		uint32_t bot = (uint32_t)(top + sa.size());
+		TIndexOffU top = sa.topf;
+		TIndexOffU bot = (TIndexOffU)(top + sa.size());
 		st_.back().initMap(bot-top);
 		st_.ensure(4);
 		st_.back().init(
@@ -1139,7 +1137,7 @@ public:
 	void resolveAll(WalkMetrics& met, PerReadMetrics& prm) {
 		WalkResult res; // ignore results for now
 		for(size_t i = 0; i < elt_; i++) {
-			advanceElement((uint32_t)i, res, met, prm);
+			advanceElement((TIndexOffU)i, res, met, prm); // NOTE(review): the advanceElement declared after this function starts with (elt, ebwtFw, ref, sa, ...); confirm a 4-argument overload exists
 		}
 	}
 
@@ -1148,7 +1146,7 @@ public:
 	 * resolved.
 	 */
 	bool advanceElement(
-		uint32_t elt,                // element within the range
+		TIndexOffU elt,                // element within the range
 		const Ebwt& ebwtFw,          // forward Bowtie index for walking left
 		const BitPairReference& ref, // bitpair-encoded reference
 		SARangeWithOffs<T>& sa,      // SA range with offsets
@@ -1162,7 +1160,7 @@ public:
 		assert(hit_.repOk(sa));
 		assert_lt(elt, sa.size()); // elt must fall within range
 		// Until we've resolved our element of interest...
-		while(sa.offs[elt] == 0xffffffff) {
+		while(sa.offs[elt] == OFF_MASK) {
 			// Get the GWState that contains our element of interest
 			size_t range = hit_.fmap[elt].first;
 			st_.ensure(4);
@@ -1176,17 +1174,17 @@ public:
 				ref,
 				sa,
 				hit_,
-				(uint32_t)range,
+				(TIndexOffU)range,
 				false,
 				NULL,
 				st_,
 				gws,
 				met,
 				prm);
-			assert(sa.offs[elt] != 0xffffffff ||
+			assert(sa.offs[elt] != OFF_MASK ||
 			       !st_[hit_.fmap[elt].first].doneResolving(sa));
 		}
-		assert_neq(0xffffffff, sa.offs[elt]);
+		assert_neq(OFF_MASK, sa.offs[elt]);
 		// Report it!
 		if(!hit_.reported(elt)) {
 			hit_.setReported(elt);
@@ -1198,7 +1196,7 @@ public:
 			0,              // range
 			elt,            // element
 			sa.topf + elt,  // bw row
-			(uint32_t)sa.len, // length of hit
+			(TIndexOffU)sa.len, // length of hit
 			sa.offs[elt]);  // resolved text offset
 		rep_++;
 		return true;
@@ -1222,7 +1220,7 @@ public:
 		const size_t sz = sa.size();
 		for(size_t m = 0; m < sz; m++) {
 			// Is it resolved?
-			if(sa.offs[m] != 0xffffffff) {
+			if(sa.offs[m] != OFF_MASK) {
 				resolved++;
 			} else {
 				assert(!hit_.reported(m));
diff --git a/ls.h b/ls.h
index 9b0bc88..e333f7c 100644
--- a/ls.h
+++ b/ls.h
@@ -237,7 +237,7 @@ class LarssonSadakane {
 		  b=b<<s|(x[r]-l+1);        /* b is start of x in chunk alphabet.*/
 		  d=c;                      /* d is max symbol in chunk alphabet.*/
 	   }
-	   m=(1<<(r-1)*s)-1;            /* m masks off top old symbol from chunk.*/
+	   m=(((T)1)<<(r-1)*s)-1;            /* m masks off top old symbol from chunk.*/
 	   x[n]=l-1;                    /* emulate zero terminator.*/
 	   if (d<=n) {                  /* if bucketing possible, compact alphabet.*/
 		  for (pi=p; pi<=p+d; ++pi)
diff --git a/mm.h b/mm.h
index c192cc5..a0d9301 100644
--- a/mm.h
+++ b/mm.h
@@ -36,6 +36,7 @@
 #define MM_SEEK lseek
 #define MM_FILE int
 #define MM_FILE_INIT -1
+#define MM_IS_IO_ERR(fdesc, ret, count) is_read_err(fdesc, ret, count)
 #else
 #define MM_FILE_CLOSE(x) if(x != NULL) { fclose(x); }
 #define MM_READ_RET size_t
@@ -43,6 +44,7 @@
 #define MM_SEEK fseek
 #define MM_FILE FILE*
 #define MM_FILE_INIT NULL
+#define MM_IS_IO_ERR(file_hd, ret, count) is_fread_err(file_hd, ret, count)
 #endif
 
 #endif /* MM_H_ */
diff --git a/multikey_qsort.cpp b/multikey_qsort.cpp
index 36fdd7d..adbd2ed 100644
--- a/multikey_qsort.cpp
+++ b/multikey_qsort.cpp
@@ -20,4 +20,4 @@
 #include "multikey_qsort.h"
 
-// 5 64-element buckets for bucket-sorting A, C, G, T, $
+// 4 buckets of 4 * 1024 * 1024 entries each, scratch space for bucket-sorting suffixes
-uint32_t bkts[4][4 * 1024 * 1024];
+TIndexOffU bkts[4][4 * 1024 * 1024];
diff --git a/multikey_qsort.h b/multikey_qsort.h
index 4b69942..5c2e041 100644
--- a/multikey_qsort.h
+++ b/multikey_qsort.h
@@ -26,6 +26,7 @@
 #include "assert_helpers.h"
 #include "diff_sample.h"
 #include "sstring.h"
+#include "btypes.h"
 
 using namespace std;
 
@@ -248,7 +249,7 @@ static inline void vecswap2(TVal* s, size_t slen, TVal* s2, TPos i, TPos j, TPos
 template<typename THost>
 bool assertPartitionedSuf(
 	const THost& host,
-	uint32_t *s,
+	TIndexOffU *s,
 	size_t slen,
 	int hi,
 	int pivot,
@@ -287,7 +288,7 @@ bool assertPartitionedSuf(
 template<typename THost>
 bool assertPartitionedSuf2(
 	const THost& host,
-	uint32_t *s,
+	TIndexOffU *s,
 	size_t slen,
 	int hi,
 	int pivot,
@@ -319,7 +320,7 @@ bool assertPartitionedSuf2(
  * legitimate suffix-offset list (at this time, we just check that it doesn't
  * list any suffix twice).
  */
-static inline void sanityCheckInputSufs(uint32_t *s, size_t slen) {
+static inline void sanityCheckInputSufs(TIndexOffU *s, size_t slen) {
 	assert_gt(slen, 0);
 	for(size_t i = 0; i < slen; i++) {
 		// Actually, it's convenient to allow the caller to provide
@@ -340,11 +341,11 @@ template <typename T>
 void sanityCheckOrderedSufs(
 	const T& host,
 	size_t hlen,
-	uint32_t *s,
+	TIndexOffU *s,
 	size_t slen,
 	size_t upto,
 	size_t lower = 0,
-	size_t upper = 0xffffffff)
+	size_t upper = OFF_MASK)
 {
 	assert_lt(s[0], hlen);
 	upper = min<size_t>(upper, slen-1);
@@ -353,7 +354,7 @@ void sanityCheckOrderedSufs(
 		// convenient for some callers
 		if(s[i+1] >= hlen) continue;
 #ifndef NDEBUG
-		if(upto == 0xffffffff) {
+		if(upto == OFF_MASK) {
 			assert(sstr_suf_lt(host, s[i], hlen, host, s[i+1], hlen, false));
 		} else {
 			if(sstr_suf_upto_lt(host, s[i], host, s[i+1], upto, false)) {
@@ -392,13 +393,13 @@ template<typename T>
 void mkeyQSortSuf(
 	const T& host,
 	size_t hlen,
-	uint32_t *s,
+	TIndexOffU *s,
 	size_t slen,
 	int hi,
 	size_t begin,
 	size_t end,
 	size_t depth,
-	size_t upto = 0xffffffff)
+	size_t upto = OFF_MASK)
 {
 	// Helper for making the recursive call; sanity-checks arguments to
 	// make sure that the problem actually got smaller.
@@ -481,12 +482,12 @@ void mkeyQSortSuf(
 template<typename T>
 void mkeyQSortSuf(
 	const T& host,
-	uint32_t *s,
+	TIndexOffU *s,
 	size_t slen,
 	int hi,
 	bool verbose = false,
 	bool sanityCheck = false,
-	size_t upto = 0xffffffff)
+	size_t upto = OFF_MASK)
 {
 	size_t hlen = host.length();
 	assert_gt(slen, 0);
@@ -506,14 +507,14 @@ template<typename T>
 void mkeyQSortSuf2(
 	const T& host,
 	size_t hlen,
-	uint32_t *s,
+	TIndexOffU *s,
 	size_t slen,
-	uint32_t *s2,
+	TIndexOffU *s2,
 	int hi,
 	size_t begin,
 	size_t end,
 	size_t depth,
-	size_t upto = 0xffffffff)
+	size_t upto = OFF_MASK)
 {
 	// Helper for making the recursive call; sanity-checks arguments to
 	// make sure that the problem actually got smaller.
@@ -598,20 +599,20 @@ void mkeyQSortSuf2(
 template<typename T>
 void mkeyQSortSuf2(
 	const T& host,
-	uint32_t *s,
+	TIndexOffU *s,
 	size_t slen,
-	uint32_t *s2,
+	TIndexOffU *s2,
 	int hi,
 	bool verbose = false,
 	bool sanityCheck = false,
-	size_t upto = 0xffffffff)
+	size_t upto = OFF_MASK)
 {
 	size_t hlen = host.length();
 	if(sanityCheck) sanityCheckInputSufs(s, slen);
-	uint32_t *sOrig = NULL;
+	TIndexOffU *sOrig = NULL;
 	if(sanityCheck) {
-		sOrig = new uint32_t[slen];
-		memcpy(sOrig, s, 4 * slen);
+		sOrig = new TIndexOffU[slen];
+		memcpy(sOrig, s, OFF_SIZE * slen);
 	}
 	mkeyQSortSuf2(host, hlen, s, slen, s2, hi, (size_t)0, slen, (size_t)0, upto);
 	if(sanityCheck) {
@@ -665,7 +666,7 @@ template<typename T> inline
 void qsortSufDc(
 	const T& host,
 	size_t hlen,
-	uint32_t* s,
+	TIndexOffU* s,
 	size_t slen,
 	const DifferenceCoverSample<T>& dc,
 	size_t begin,
@@ -709,7 +710,7 @@ void mkeyQSortSufDcU8(
 	const T1& host1,
 	const T2& host,
 	size_t hlen,
-	uint32_t* s,
+	TIndexOffU* s,
 	size_t slen,
 	const DifferenceCoverSample<T1>& dc,
 	int hi,
@@ -718,7 +719,7 @@ void mkeyQSortSufDcU8(
 {
 	if(sanityCheck) sanityCheckInputSufs(s, slen);
 	mkeyQSortSufDcU8(host1, host, hlen, s, slen, dc, hi, 0, slen, 0, sanityCheck);
-	if(sanityCheck) sanityCheckOrderedSufs(host1, hlen, s, slen, 0xffffffff);
+	if(sanityCheck) sanityCheckOrderedSufs(host1, hlen, s, slen, OFF_MASK);
 }
 
 /**
@@ -736,7 +737,7 @@ bool sufDcLtU8(
 	bool sanityCheck = false)
 {
 	hlen += 0;
-	size_t diff = dc.tieBreakOff((uint32_t)s1, (uint32_t)s2);
+	size_t diff = dc.tieBreakOff((TIndexOffU)s1, (TIndexOffU)s2);
 	assert_lt(diff, dc.v());
 	assert_lt(diff, hlen-s1);
 	assert_lt(diff, hlen-s2);
@@ -745,7 +746,7 @@ bool sufDcLtU8(
 			assert_eq(host[s1+i], host1[s2+i]);
 		}
 	}
-	bool ret = dc.breakTie((uint32_t)(s1+diff), (uint32_t)(s2+diff)) < 0;
+	bool ret = dc.breakTie((TIndexOffU)(s1+diff), (TIndexOffU)(s2+diff)) < 0;
 	// Sanity-check return value using dollarLt
 #ifndef NDEBUG
 	bool ret2 = sstr_suf_lt(host1, s1, hlen, host, s2, hlen, false);
@@ -762,7 +763,7 @@ void qsortSufDcU8(
 	const T1& host1,
 	const T2& host,
 	size_t hlen,
-	uint32_t* s,
+	TIndexOffU* s,
 	size_t slen,
 	const DifferenceCoverSample<T1>& dc,
 	size_t begin,
@@ -805,7 +806,7 @@ void qsortSufDcU8(
 #define SELECTION_SORT_CUTOFF 6
 
-// 5 64-element buckets for bucket-sorting A, C, G, T, $
+// 4 buckets of 4 * 1024 * 1024 entries each, scratch space for bucket-sorting suffixes
-extern uint32_t bkts[4][4 * 1024 * 1024];
+extern TIndexOffU bkts[4][4 * 1024 * 1024];
 
 /**
  * Straightforwardly obtain a uint8_t-ized version of t[off].  This
@@ -834,7 +835,7 @@ template<typename TStr>
 static inline int char_at_suf_u8(
 	const TStr& host,
 	size_t hlen,
-	uint32_t* s,
+	TIndexOffU* s,
 	size_t si,
 	size_t off,
 	uint8_t hi)
@@ -847,7 +848,7 @@ static void selectionSortSufDcU8(
 		const T1& host1,
 		const T2& host,
         size_t hlen,
-        uint32_t* s,
+        TIndexOffU* s,
         size_t slen,
         const DifferenceCoverSample<T1>& dc,
         uint8_t hi,
@@ -871,16 +872,16 @@ static void selectionSortSufDcU8(
 		if(off + s[begin] >= hlen ||
 		   off + s[begin+1] >= hlen)
 		{
-			off = 0xffffffff;
+			off = OFF_MASK;
 		}
-		if(off != 0xffffffff) {
+		if(off != OFF_MASK) {
 			if(off < depth) {
 				qsortSufDcU8<T1,T2>(host1, host, hlen, s, slen, dc,
 				                    begin, end, sanityCheck);
 				// It's helpful for debugging if we call this here
 				if(sanityCheck) {
 					sanityCheckOrderedSufs(host1, hlen, s, slen,
-					                       0xffffffff, begin, end);
+					                       OFF_MASK, begin, end);
 				}
 				return;
 			}
@@ -955,7 +956,7 @@ static void selectionSortSufDcU8(
 		if(i != targ) {
 			ASSERT_SUF_LT(targ, i);
 			// swap i and targ
-			uint32_t tmp = s[i];
+			TIndexOffU tmp = s[i];
 			s[i] = s[targ];
 			s[targ] = tmp;
 		}
@@ -964,7 +965,7 @@ static void selectionSortSufDcU8(
 		}
 	}
 	if(sanityCheck) {
-		sanityCheckOrderedSufs(host1, hlen, s, slen, 0xffffffff, begin, end);
+		sanityCheckOrderedSufs(host1, hlen, s, slen, OFF_MASK, begin, end);
 	}
 }
 
@@ -973,7 +974,7 @@ static void bucketSortSufDcU8(
 		const T1& host1,
 		const T2& host,
         size_t hlen,
-        uint32_t* s,
+        TIndexOffU* s,
         size_t slen,
         const DifferenceCoverSample<T1>& dc,
         uint8_t hi,
@@ -1004,7 +1005,7 @@ static void bucketSortSufDcU8(
 		                     begin, end, depth, sanityCheck);
 		if(sanityCheck) {
 			sanityCheckOrderedSufs(host1, hlen, s, slen,
-			                       0xffffffff, begin, end);
+			                       OFF_MASK, begin, end);
 		}
 		return;
 	}
@@ -1020,10 +1021,10 @@ static void bucketSortSufDcU8(
 	}
 	assert_eq(cnts[0] + cnts[1] + cnts[2] + cnts[3] + cnts[4], end - begin);
 	size_t cur = begin + cnts[0];
-	if(cnts[1] > 0) { memcpy(&s[cur], bkts[0], cnts[1] << 2); cur += cnts[1]; }
-	if(cnts[2] > 0) { memcpy(&s[cur], bkts[1], cnts[2] << 2); cur += cnts[2]; }
-	if(cnts[3] > 0) { memcpy(&s[cur], bkts[2], cnts[3] << 2); cur += cnts[3]; }
-	if(cnts[4] > 0) { memcpy(&s[cur], bkts[3], cnts[4] << 2); }
+	if(cnts[1] > 0) { memcpy(&s[cur], bkts[0], cnts[1] << (OFF_SIZE/4 + 1)); cur += cnts[1]; }
+	if(cnts[2] > 0) { memcpy(&s[cur], bkts[1], cnts[2] << (OFF_SIZE/4 + 1)); cur += cnts[2]; }
+	if(cnts[3] > 0) { memcpy(&s[cur], bkts[2], cnts[3] << (OFF_SIZE/4 + 1)); cur += cnts[3]; }
+	if(cnts[4] > 0) { memcpy(&s[cur], bkts[3], cnts[4] << (OFF_SIZE/4 + 1)); }
 	// This frame is now totally finished with bkts[][], so recursive
 	// callees can safely clobber it; we're not done with cnts[], but
 	// that's local to the stack frame.
@@ -1067,7 +1068,7 @@ void mkeyQSortSufDcU8(
 	const T1& host1,
 	const T2& host,
 	size_t hlen,
-	uint32_t* s,
+	TIndexOffU* s,
 	size_t slen,
 	const DifferenceCoverSample<T1>& dc,
 	int hi,
@@ -1092,7 +1093,7 @@ void mkeyQSortSufDcU8(
 		// k=(end-begin)
 		qsortSufDcU8<T1,T2>(host1, host, hlen, s, slen, dc, begin, end, sanityCheck);
 		if(sanityCheck) {
-			sanityCheckOrderedSufs(host1, hlen, s, slen, 0xffffffff, begin, end);
+			sanityCheckOrderedSufs(host1, hlen, s, slen, OFF_MASK, begin, end);
 		}
 		return;
 	}
@@ -1101,7 +1102,7 @@ void mkeyQSortSufDcU8(
 		bucketSortSufDcU8(host1, host, hlen, s, slen, dc,
 		                  (uint8_t)hi, begin, end, depth, sanityCheck);
 		if(sanityCheck) {
-			sanityCheckOrderedSufs(host1, hlen, s, slen, 0xffffffff, begin, end);
+			sanityCheckOrderedSufs(host1, hlen, s, slen, OFF_MASK, begin, end);
 		}
 		return;
 	}
diff --git a/opts.h b/opts.h
index 69364c6..bba213d 100644
--- a/opts.h
+++ b/opts.h
@@ -97,7 +97,6 @@ enum {
 	ARG_PRESET_FAST_LOCAL,           // --fast-local
 	ARG_PRESET_SENSITIVE_LOCAL,      // --sensitive-local
 	ARG_PRESET_VERY_SENSITIVE_LOCAL, // --very-sensitive-local
-	ARG_NO_SCORE_PRIORITY,      // --no-score-priority
 	ARG_IGNORE_QUALS,           // --ignore-quals
 	ARG_DESC,                   // --arg-desc
 	ARG_TAB5,                   // --tab5
@@ -149,7 +148,10 @@ enum {
 	ARG_DESC_KB,                // --desc-kb
 	ARG_DESC_LANDING,           // --desc-landing
 	ARG_DESC_EXP,               // --desc-exp
-	ARG_DESC_FMOPS              // --desc-fmops
+	ARG_DESC_PRIORITIZE,        // --desc-prioritize
+	ARG_DESC_FMOPS,             // --desc-fmops
+	ARG_LOG_DP,                 // --log-dp
+	ARG_LOG_DP_OPP              // --log-dp-opp
 };
 
 #endif
diff --git a/pat.cpp b/pat.cpp
index 95c6109..0969a2b 100644
--- a/pat.cpp
+++ b/pat.cpp
@@ -412,6 +412,12 @@ PairedPatternSource* PairedPatternSource::setupPatternSources(
 	return patsrc;
 }
 
+void PairedPatternSource::free_EList_pmembers(const EList<PatternSource*>& elist) {
+    // Release each PatternSource held in the list; deleting NULL does nothing.
+    for (size_t idx = 0; idx != elist.size(); ++idx)
+        delete elist[idx];
+}
+
 VectorPatternSource::VectorPatternSource(
 	const EList<string>& v,
 	const PatternParams& p) :
diff --git a/pat.h b/pat.h
index 041802a..d260724 100644
--- a/pat.h
+++ b/pat.h
@@ -327,6 +327,7 @@ public:
 		bool verbose);              // be talkative?
 
 protected:
+        static void free_EList_pmembers(const EList<PatternSource*>&);
 
 	MUTEX_T mutex_m; /// mutex for syncing over critical regions
 	uint32_t seed_;
@@ -353,7 +354,10 @@ public:
 		}
 	}
 
-	virtual ~PairedSoloPatternSource() { delete src_; }
+	virtual ~PairedSoloPatternSource() { 
+            free_EList_pmembers(*src_);
+            delete src_; 
+        }
 
 	/**
 	 * Call this whenever this PairedPatternSource is wrapped by a new
@@ -402,7 +406,6 @@ public:
 	}
 
 protected:
-
 	volatile uint32_t cur_; // current element in parallel srca_, srcb_ vectors
 	const EList<PatternSource*>* src_; /// PatternSources for paired-end reads
 };
@@ -437,7 +440,9 @@ public:
 	}
 
 	virtual ~PairedDualPatternSource() {
-		delete srca_;
+		free_EList_pmembers(*srca_);
+                delete srca_;
+                free_EList_pmembers(*srcb_);
 		delete srcb_;
 	}
 
diff --git a/processor_support.h b/processor_support.h
new file mode 100644
index 0000000..f68ee65
--- /dev/null
+++ b/processor_support.h
@@ -0,0 +1,70 @@
+#ifndef PROCESSOR_SUPPORT_H_
+#define PROCESSOR_SUPPORT_H_
+
+// Utility class ProcessorSupport provides POPCNTenabled() to determine
+// processor support for the POPCNT instruction. It uses CPUID to
+// retrieve the processor capabilities.
+// for Intel ICC compiler __cpuid() is an intrinsic
+// for Microsoft compiler __cpuid() is provided by #include <intrin.h>
+// for GCC compiler __get_cpuid() is provided by #include <cpuid.h>
+
+// Intel compiler defines __GNUC__, so this is needed to disambiguate
+
+#if defined(__INTEL_COMPILER)
+#   define USING_INTEL_COMPILER
+#elif defined(__GNUC__)
+#   define USING_GCC_COMPILER
+#   include <cpuid.h>
+#elif defined(_MSC_VER)
+#   define USING_MSC_COMPILER
+#   include <intrin.h>
+#endif
+
+struct regs_t {unsigned int EAX, EBX, ECX, EDX;};
+#define BIT(n) (1u << (n))
+
+class ProcessorSupport {
+
+#ifdef POPCNT_CAPABILITY
+
+public:
+    ProcessorSupport() { }
+    bool POPCNTenabled()
+    {
+    // Returns true only when CPUID leaf 1 reports both SSE4.2 (ECX bit 20)
+    // and POPCNT (ECX bit 23); returns false otherwise.
+    // from: Intel(R) 64 and IA-32 Architectures Software Developer's Manual, 325462-036US, March 2013
+    // Before an application attempts to use the POPCNT instruction, it must check that the
+    // processor supports SSE4.2
+    // "(if CPUID.01H:ECX.SSE4_2[bit 20] = 1) and POPCNT (if CPUID.01H:ECX.POPCNT[bit 23] = 1)"
+    // see p.272 of http://download.intel.com/products/processor/manual/253667.pdf available at
+    // http://www.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html
+    // Also http://en.wikipedia.org/wiki/SSE4 talks about availability on Intel & AMD processors
+    regs_t regs = {0, 0, 0, 0}; // zeroed so an unfilled register set reads as "no support"
+
+    try {
+#if ( defined(USING_INTEL_COMPILER) || defined(USING_MSC_COMPILER) )
+        __cpuid(reinterpret_cast<int *>(&regs), 0); // test if __cpuid() works, if not catch the exception
+        __cpuid(reinterpret_cast<int *>(&regs), 0x1); // POPCNT bit is bit 23 in ECX
+#elif defined(USING_GCC_COMPILER)
+        if( !__get_cpuid(0x1, &regs.EAX, &regs.EBX, &regs.ECX, &regs.EDX) ) return false; // leaf 1 unavailable
+#else
+        std::cerr << "ERROR: please define __cpuid() for this build.\n";
+        assert(0);
+#endif
+        if( !( (regs.ECX & BIT(20)) && (regs.ECX & BIT(23)) ) ) return false;
+    }
+    catch (int e) {
+        return false;
+    }
+    return true;
+    }
+
+#endif // POPCNT_CAPABILITY
+};
+
+#endif /*PROCESSOR_SUPPORT_H_*/
+
+
+
+
diff --git a/random_source.h b/random_source.h
index 9304d98..0a4be15 100644
--- a/random_source.h
+++ b/random_source.h
@@ -60,6 +60,23 @@ public:
 		return ret;
 	}
 
+    uint64_t nextU64() {
+		assert(inited_);
+		// Two successive 32-bit draws; the first one supplies the high word.
+		const uint64_t hiWord = nextU32();
+		const uint64_t loWord = nextU32();
+		return (hiWord << 32) | loWord;
+	}
+    
+	
+	size_t nextSizeT() {
+		// Use a single 32-bit draw when size_t is 4 bytes wide, else a 64-bit draw.
+		if(sizeof(size_t) != 4) {
+			return nextU64();
+		}
+		return nextU32();
+	}
+	
 	/**
 	 * Return a pseudo-random unsigned 32-bit integer sampled uniformly
 	 * from [lo, hi].
diff --git a/random_util.h b/random_util.h
index 39f8c04..2246694 100644
--- a/random_util.h
+++ b/random_util.h
@@ -31,7 +31,7 @@
  */
 class Random1toN {
 
-	typedef uint32_t T;
+	typedef size_t T;
 
 public:
 
diff --git a/ref_read.cpp b/ref_read.cpp
index 2b97986..40dd454 100644
--- a/ref_read.cpp
+++ b/ref_read.cpp
@@ -35,7 +35,7 @@ RefRecord fastaRefReadSize(
 	static int lastc = '>'; // last character seen
 
 	// RefRecord params
-	size_t len = 0; // 'len' counts toward total length
+	TIndexOffU len = 0; // 'len' counts toward total length
 	// 'off' counts number of ambiguous characters before first
 	// unambiguous character
 	size_t off = 0;
@@ -78,7 +78,7 @@ RefRecord fastaRefReadSize(
 			// Don't emit a warning, since this might legitimately be
 			// a gap on the end of the final sequence in the file
 			lastc = -1;
-			return RefRecord((uint32_t)off, (uint32_t)len, first);
+			return RefRecord((TIndexOffU)off, (TIndexOffU)len, first);
 		}
 	}
 
@@ -116,7 +116,7 @@ RefRecord fastaRefReadSize(
 			}
 			lastc = '>';
 			//return RefRecord(off, 0, false);
-			return RefRecord((uint32_t)off, 0, first);
+			return RefRecord((TIndexOffU)off, 0, first);
 		}
 		c = in.get();
 		if(c == -1) {
@@ -128,7 +128,7 @@ RefRecord fastaRefReadSize(
 			}
 			lastc = -1;
 			//return RefRecord(off, 0, false);
-			return RefRecord((uint32_t)off, 0, first);
+			return RefRecord((TIndexOffU)off, 0, first);
 		}
 	}
 	assert(!rparms.color || (lc != -1));
@@ -150,6 +150,10 @@ RefRecord fastaRefReadSize(
 		if(cat == 1) {
 			// It's a DNA character
 			assert(cc == 'A' || cc == 'C' || cc == 'G' || cc == 'T');
+			// Check for overflow
+			if((TIndexOffU)(len + 1) < len) {
+				throw RefTooLongException();
+			}
 			// Consume it
 			len++;
 			// Output it
@@ -167,7 +171,7 @@ RefRecord fastaRefReadSize(
 			// It's an N or a gap
 			lastc = c;
 			assert(cc != 'A' && cc != 'C' && cc != 'G' && cc != 'T');
-			return RefRecord((uint32_t)off, (uint32_t)len, first);
+			return RefRecord((TIndexOffU)off, (TIndexOffU)len, first);
 		} else {
 			// Not DNA and not a gap, ignore it
 #ifndef NDEBUG
@@ -184,7 +188,7 @@ RefRecord fastaRefReadSize(
 		c = in.get();
 	}
 	lastc = c;
-	return RefRecord((uint32_t)off, (uint32_t)len, first);
+	return RefRecord((TIndexOffU)off, (TIndexOffU)len, first);
 }
 
 #if 0
@@ -273,10 +277,10 @@ fastaRefReadSizes(
 	EList<RefRecord>& recs,
 	const RefReadInParams& rparms,
 	BitpairOutFileBuf* bpout,
-	int& numSeqs)
+	TIndexOff& numSeqs)
 {
-	uint32_t unambigTot = 0;
-	uint32_t bothTot = 0;
+	TIndexOffU unambigTot = 0;
+	size_t bothTot = 0;
 	assert_gt(in.size(), 0);
 	// For each input istream
 	for(size_t i = 0; i < in.size(); i++) {
@@ -284,11 +288,15 @@ fastaRefReadSizes(
 		assert(!in[i]->eof());
 		// For each pattern in this istream
 		while(!in[i]->eof()) {
-			RefRecord rec = fastaRefReadSize(*in[i], rparms, first, bpout);
-			if((unambigTot + rec.len) < unambigTot) {
-				cerr << "Error: Reference sequence has more than 2^32-1 characters!  Please divide the" << endl
-				     << "reference into batches or chunks of about 3.6 billion characters or less each" << endl
-				     << "and index each independently." << endl;
+			RefRecord rec;
+			try {
+				rec = fastaRefReadSize(*in[i], rparms, first, bpout);
+				if((unambigTot + rec.len) < unambigTot) {
+					throw RefTooLongException();
+				}
+			}
+			catch(RefTooLongException& e) {
+				cerr << e.what() << endl;
 				throw 1;
 			}
 			// Add the length of this record.
diff --git a/ref_read.h b/ref_read.h
index a6654ff..a387737 100644
--- a/ref_read.h
+++ b/ref_read.h
@@ -31,9 +31,38 @@
 #include "filebuf.h"
 #include "word_io.h"
 #include "ds.h"
+#include "endian_swap.h"
 
 using namespace std;
 
+class RefTooLongException : public exception {
+	// Exception whose what() text tells the user the reference has too many characters.
+public:
+	RefTooLongException() {
+#ifdef BOWTIE_64BIT_INDEX
+		// This should never happen!
+		msg = "Error: Reference sequence has more than 2^64-1 characters!  "
+		      "Please divide the reference into smaller chunks and index each "
+			  "independently.";
+#else
+		msg = "Error: Reference sequence has more than 2^32-1 characters!  "
+		      "Please build a large index by passing the --large-index option "
+			  "to bowtie2-build";
+#endif
+	}
+	// Destructor does nothing itself and is declared non-throwing.
+	~RefTooLongException() throw() {}
+	// Returns the message selected in the constructor; the pointer refers to msg's storage.
+	const char* what() const throw() {
+		return msg.c_str();
+	}
+
+protected:
+	// Text chosen at compile time according to whether BOWTIE_64BIT_INDEX is defined.
+	string msg;
+	
+};
+
 /**
  * Encapsulates a stretch of the reference containing only unambiguous
  * characters.  From an ordered list of RefRecords, one can (almost)
@@ -43,29 +72,29 @@ using namespace std;
  */
 struct RefRecord {
 	RefRecord() : off(), len(), first() { }
-	RefRecord(uint32_t _off, uint32_t _len, bool _first) :
+	RefRecord(TIndexOffU _off, TIndexOffU _len, bool _first) :
 		off(_off), len(_len), first(_first)
 	{ }
 
 	RefRecord(FILE *in, bool swap) {
 		assert(in != NULL);
-		if(!fread(&off, 4, 1, in)) {
+		if(!fread(&off, OFF_SIZE, 1, in)) {
 			cerr << "Error reading RefRecord offset from FILE" << endl;
 			throw 1;
 		}
-		if(swap) off = endianSwapU32(off);
-		if(!fread(&len, 4, 1, in)) {
+		if(swap) off = endianSwapU(off);
+		if(!fread(&len, OFF_SIZE, 1, in)) {
 			cerr << "Error reading RefRecord offset from FILE" << endl;
 			throw 1;
 		}
-		if(swap) len = endianSwapU32(len);
+		if(swap) len = endianSwapU(len);
 		first = fgetc(in) ? true : false;
 	}
 
 #ifdef BOWTIE_MM
 	RefRecord(int in, bool swap) {
-		off = readU32(in, swap);
-		len = readU32(in, swap);
+		off = readU<TIndexOffU>(in, swap);
+		len = readU<TIndexOffU>(in, swap);
 		char c;
 		if(!read(in, &c, 1)) {
 			cerr << "Error reading RefRecord 'first' flag" << endl;
@@ -76,13 +105,13 @@ struct RefRecord {
 #endif
 
 	void write(std::ostream& out, bool be) {
-		writeU32(out, off, be);
-		writeU32(out, len, be);
+		writeU<TIndexOffU>(out, off, be);
+		writeU<TIndexOffU>(out, len, be);
 		out.put(first ? 1 : 0);
 	}
 
-	uint32_t off; /// Offset of the first character in the record
-	uint32_t len; /// Length of the record
+	TIndexOffU off; /// Offset of the first character in the record
+	TIndexOffU len; /// Length of the record
 	bool   first; /// Whether this record is the first for a reference sequence
 };
 
@@ -121,7 +150,7 @@ fastaRefReadSizes(
 	EList<RefRecord>& recs,
 	const RefReadInParams& rparms,
 	BitpairOutFileBuf* bpout,
-	int& numSeqs);
+	TIndexOff& numSeqs);
 
 extern void
 reverseRefRecords(
@@ -139,7 +168,7 @@ static RefRecord fastaRefReadAppend(
 	FileBuf& in,             // input file
 	bool first,              // true iff this is the first record in the file
 	TStr& dst,               // destination buf for parsed characters
-	size_t& dstoff,          // index of next character in dst to assign
+	TIndexOffU& dstoff,          // index of next character in dst to assign
 	RefReadInParams& rparms, // 
 	string* name = NULL)     // put parsed FASTA name here
 {
@@ -292,7 +321,7 @@ static RefRecord fastaRefReadAppend(
 		size_t nlen = dstoff;
 		dst.reverseWindow(ilen, nlen);
 	}
-	return RefRecord((uint32_t)off, (uint32_t)len, first);
+	return RefRecord((TIndexOffU)off, (TIndexOffU)len, first);
 }
 
 #endif /*ndef REF_READ_H_*/
diff --git a/reference.cpp b/reference.cpp
index a1dc0c9..06c76e0 100644
--- a/reference.cpp
+++ b/reference.cpp
@@ -25,7 +25,7 @@
 using namespace std;
 
 /**
- * Load from .3.bt2/.4.bt2 Bowtie index files.
+ * Load from the .3.<ext>/.4.<ext> Bowtie index files, where <ext> is gEbwt_ext.
  */
 BitPairReference::BitPairReference(
 	const string& in,
@@ -47,8 +47,8 @@ BitPairReference::BitPairReference(
 	useShmem_(useShmem),
 	verbose_(verbose)
 {
-	string s3 = in + ".3.bt2";
-	string s4 = in + ".4.bt2";
+	string s3 = in + ".3." + gEbwt_ext;
+	string s4 = in + ".4." + gEbwt_ext;
 	
 #ifdef BOWTIE_MM
 	int f3, f4;
@@ -85,9 +85,9 @@ BitPairReference::BitPairReference(
 			throw 1;
 		}
 		if(mmSweep) {
-			int sum = 0;
+			TIndexOff sum = 0;
 			for(off_t i = 0; i < sbuf.st_size; i += 1024) {
-				sum += (int) mmFile[i];
+				sum += (TIndexOff) mmFile[i];
 			}
 			if(startVerbose) {
 				cerr << "  Swept the memory-mapped ref index file; checksum: " << sum << ": ";
@@ -115,7 +115,7 @@ BitPairReference::BitPairReference(
 	// Read endianness sentinel, set 'swap'
 	uint32_t one;
 	bool swap = false;
-	one = readU32(f3, swap);
+	one = readU<int32_t>(f3, swap);
 	if(one != 1) {
 		if(useMm_) {
 			cerr << "Error: Can't use memory-mapped files when the index is the opposite endianness" << endl;
@@ -126,8 +126,8 @@ BitPairReference::BitPairReference(
 	}
 	
 	// Read # records
-	uint32_t sz;
-	sz = readU32(f3, swap);
+	TIndexOffU sz;
+	sz = readU<TIndexOffU>(f3, swap);
 	if(sz == 0) {
 		cerr << "Error: number of reference records is 0 in " << s3.c_str() << endl;
 		throw 1;
@@ -139,15 +139,15 @@ BitPairReference::BitPairReference(
 	// Cumulative count of all unambiguous characters on a per-
 	// stretch 8-bit alignment (i.e. count of bytes we need to
 	// allocate in buf_)
-	uint32_t cumsz = 0;
-	uint32_t cumlen = 0;
+	TIndexOffU cumsz = 0;
+	TIndexOffU cumlen = 0;
 	// For each unambiguous stretch...
-	for(uint32_t i = 0; i < sz; i++) {
+	for(TIndexOffU i = 0; i < sz; i++) {
 		recs_.push_back(RefRecord(f3, swap));
 		if(recs_.back().first) {
 			// This is the first record for this reference sequence (and the
 			// last record for the one before)
-			refRecOffs_.push_back((uint32_t)recs_.size()-1);
+			refRecOffs_.push_back((TIndexOffU)recs_.size()-1);
 			// refOffs_ links each reference sequence with the total number of
 			// unambiguous characters preceding it in the pasted reference
 			refOffs_.push_back(cumsz);
@@ -173,13 +173,13 @@ BitPairReference::BitPairReference(
 		logTime(cerr);
 	}
 	// Store a cap entry for the end of the last reference seq
-	refRecOffs_.push_back((uint32_t)recs_.size());
+	refRecOffs_.push_back((TIndexOffU)recs_.size());
 	refOffs_.push_back(cumsz);
 	refLens_.push_back(cumlen);
 	bufSz_ = cumsz;
 	assert_eq(nrefs_, refLens_.size());
 	assert_eq(sz, recs_.size());
-	MM_FILE_CLOSE(f3); // done with .3.bt2 file
+	MM_FILE_CLOSE(f3); // done with .3.gEbwt_ext file
 	// Round cumsz up to nearest byte boundary
 	if((cumsz & 3) != 0) {
 		cumsz += (4 - (cumsz & 3));
@@ -339,26 +339,26 @@ BitPairReference::~BitPairReference() {
  * there are many records, binary search would be more appropriate.
  */
 int BitPairReference::getBase(size_t tidx, size_t toff) const {
-	uint32_t reci = refRecOffs_[tidx];   // first record for target reference sequence
-	uint32_t recf = refRecOffs_[tidx+1]; // last record (exclusive) for target seq
+	uint64_t reci = refRecOffs_[tidx];   // first record for target reference sequence
+	uint64_t recf = refRecOffs_[tidx+1]; // last record (exclusive) for target seq
 	assert_gt(recf, reci);
-	uint32_t bufOff = refOffs_[tidx];
-	uint32_t off = 0;
+	uint64_t bufOff = refOffs_[tidx];
+	uint64_t off = 0;
 	// For all records pertaining to the target reference sequence...
-	for(uint32_t i = reci; i < recf; i++) {
+	for(uint64_t i = reci; i < recf; i++) {
 		assert_geq(toff, off);
 		off += recs_[i].off;
 		if(toff < off) {
 			return 4;
 		}
 		assert_geq(toff, off);
-		uint32_t recOff = off + recs_[i].len;
+		uint64_t recOff = off + recs_[i].len;
 		if(toff < recOff) {
 			toff -= off;
-			bufOff += (uint32_t)toff;
+			bufOff += (uint64_t)toff;
 			assert_lt(bufOff, bufSz_);
-			const uint32_t bufElt = (bufOff) >> 2;
-			const uint32_t shift = (bufOff & 3) << 1;
+			const uint64_t bufElt = (bufOff) >> 2;
+			const uint64_t shift = (bufOff & 3) << 1;
 			return ((buf_[bufElt] >> shift) & 3);
 		}
 		bufOff += recs_[i].len;
@@ -382,14 +382,14 @@ int BitPairReference::getStretchNaive(
 	size_t count) const
 {
 	uint8_t *dest = (uint8_t*)destU32;
-	uint32_t reci = refRecOffs_[tidx];   // first record for target reference sequence
-	uint32_t recf = refRecOffs_[tidx+1]; // last record (exclusive) for target seq
+	uint64_t reci = refRecOffs_[tidx];   // first record for target reference sequence
+	uint64_t recf = refRecOffs_[tidx+1]; // last record (exclusive) for target seq
 	assert_gt(recf, reci);
-	uint32_t cur = 0;
-	uint32_t bufOff = refOffs_[tidx];
-	uint32_t off = 0;
+	uint64_t cur = 0;
+	uint64_t bufOff = refOffs_[tidx];
+	uint64_t off = 0;
 	// For all records pertaining to the target reference sequence...
-	for(uint32_t i = reci; i < recf; i++) {
+	for(uint64_t i = reci; i < recf; i++) {
 		assert_geq(toff, off);
 		off += recs_[i].off;
 		for(; toff < off && count > 0; toff++) {
@@ -399,15 +399,15 @@ int BitPairReference::getStretchNaive(
 		if(count == 0) break;
 		assert_geq(toff, off);
 		if(toff < off + recs_[i].len) {
-			bufOff += (uint32_t)(toff - off); // move bufOff pointer forward
+			bufOff += (TIndexOffU)(toff - off); // move bufOff pointer forward
 		} else {
 			bufOff += recs_[i].len;
 		}
 		off += recs_[i].len;
 		for(; toff < off && count > 0; toff++) {
 			assert_lt(bufOff, bufSz_);
-			const uint32_t bufElt = (bufOff) >> 2;
-			const uint32_t shift = (bufOff & 3) << 1;
+			const uint64_t bufElt = (bufOff) >> 2;
+			const uint64_t shift = (bufOff & 3) << 1;
 			dest[cur++] = (buf_[bufElt] >> shift) & 3;
 			bufOff++;
 			count--;
@@ -454,31 +454,31 @@ int BitPairReference::getStretch(
 	}
 #endif
 	destU32[0] = 0x04040404; // Add Ns, which we might end up using later
-	uint32_t reci = refRecOffs_[tidx];   // first record for target reference sequence
-	uint32_t recf = refRecOffs_[tidx+1]; // last record (exclusive) for target seq
+	uint64_t reci = refRecOffs_[tidx];   // first record for target reference sequence
+	uint64_t recf = refRecOffs_[tidx+1]; // last record (exclusive) for target seq
 	assert_gt(recf, reci);
-	uint32_t cur = 4; // keep a cushion of 4 bases at the beginning
-	uint32_t bufOff = refOffs_[tidx];
-	uint32_t off = 0;
-	int offset = 4;
+	uint64_t cur = 4; // keep a cushion of 4 bases at the beginning
+	uint64_t bufOff = refOffs_[tidx];
+	uint64_t off = 0;
+	int64_t offset = 4;
 	bool firstStretch = true;
 	// For all records pertaining to the target reference sequence...
-	for(uint32_t i = reci; i < recf; i++) {
-		ASSERT_ONLY(uint32_t origBufOff = bufOff);
+	for(uint64_t i = reci; i < recf; i++) {
+		ASSERT_ONLY(uint64_t origBufOff = bufOff);
 		assert_geq(toff, off);
 		off += recs_[i].off;
 		assert_gt(count, 0);
 		if(toff < off) {
-			size_t cpycnt = min(off - toff, count);
+			size_t cpycnt = min((size_t)(off - toff), count);
 			memset(&dest[cur], 4, cpycnt);
 			count -= cpycnt;
 			toff += cpycnt;
-			cur += (uint32_t)cpycnt;
+			cur += cpycnt;
 			if(count == 0) break;
 		}
 		assert_geq(toff, off);
 		if(toff < off + recs_[i].len) {
-			bufOff += (uint32_t)(toff - off); // move bufOff pointer forward
+			bufOff += toff - off; // move bufOff pointer forward
 		} else {
 			bufOff += recs_[i].len;
 		}
@@ -492,11 +492,11 @@ int BitPairReference::getStretch(
 					if(cur & 3) {
 						offset -= (cur & 3);
 					}
-					uint32_t curU32 = cur >> 2;
+					uint64_t curU32 = cur >> 2;
 					// Do the initial few bases
 					if(bufOff & 3) {
-						const uint32_t bufElt = (bufOff) >> 2;
-						const int low2 = bufOff & 3;
+						const uint64_t bufElt = (bufOff) >> 2;
+						const int64_t low2 = bufOff & 3;
 						// Lots of cache misses on the following line
 						destU32[curU32] = byteToU32_[buf_[bufElt]];
 						for(int j = 0; j < low2; j++) {
@@ -504,18 +504,18 @@ int BitPairReference::getStretch(
 						}
 						curU32++;
 						offset += low2;
-						const int chars = 4 - low2;
+						const int64_t chars = 4 - low2;
 						count -= chars;
 						bufOff += chars;
 						toff += chars;
 					}
 					assert_eq(0, bufOff & 3);
-					uint32_t bufOffU32 = bufOff >> 2;
-					uint32_t countLim = (uint32_t)count >> 2;
-					uint32_t offLim = (uint32_t)((off - (toff + 4)) >> 2);
-					uint32_t lim = min(countLim, offLim);
+					uint64_t bufOffU32 = bufOff >> 2;
+					uint64_t countLim = count >> 2;
+					uint64_t offLim = ((off - (toff + 4)) >> 2);
+					uint64_t lim = min(countLim, offLim);
 					// Do the fast thing for as far as possible
-					for(uint32_t j = 0; j < lim; j++) {
+					for(uint64_t j = 0; j < lim; j++) {
 						// Lots of cache misses on the following line
 						destU32[curU32] = byteToU32_[buf_[bufOffU32++]];
 #ifndef NDEBUG
@@ -538,8 +538,8 @@ int BitPairReference::getStretch(
 				// Do the slow thing for the rest
 				for(; toff < off && count > 0; toff++) {
 					assert_lt(bufOff, bufSz_);
-					const uint32_t bufElt = (bufOff) >> 2;
-					const uint32_t shift = (bufOff & 3) << 1;
+					const uint64_t bufElt = (bufOff) >> 2;
+					const uint64_t shift = (bufOff & 3) << 1;
 					dest[cur++] = (buf_[bufElt] >> shift) & 3;
 					bufOff++;
 					count--;
@@ -549,8 +549,8 @@ int BitPairReference::getStretch(
 				// Do the slow thing
 				for(; toff < off && count > 0; toff++) {
 					assert_lt(bufOff, bufSz_);
-					const uint32_t bufElt = (bufOff) >> 2;
-					const uint32_t shift = (bufOff & 3) << 1;
+					const uint64_t bufElt = (bufOff) >> 2;
+					const uint64_t shift = (bufOff & 3) << 1;
 					dest[cur++] = (buf_[bufElt] >> shift) & 3;
 					bufOff++;
 					count--;
@@ -568,13 +568,13 @@ int BitPairReference::getStretch(
 		dest[cur++] = 4;
 	}
 	assert_eq(0, count);
-	return offset;
+	return (int)offset;
 }
 
 
 /**
  * Parse the input fasta files, populating the szs list and writing the
- * .3.bt2 and .4.bt2 portions of the index as we go.
+ * .3.<ext> and .4.<ext> portions of the index (<ext> = gEbwt_ext) as we go.
  */
 pair<size_t, size_t>
 BitPairReference::szsFromFasta(
@@ -588,9 +588,9 @@ BitPairReference::szsFromFasta(
 	RefReadInParams parms = refparams;
 	std::pair<size_t, size_t> sztot;
 	if(!outfile.empty()) {
-		string file3 = outfile + ".3.bt2";
-		string file4 = outfile + ".4.bt2";
-		// Open output stream for the '.3.bt2' file which will
+		string file3 = outfile + ".3." + gEbwt_ext;
+		string file4 = outfile + ".4." + gEbwt_ext;
+		// Open output stream for the '.3.gEbwt_ext' file which will
 		// hold the size records.
 		ofstream fout3(file3.c_str(), ios::binary);
 		if(!fout3.good()) {
@@ -603,31 +603,31 @@ BitPairReference::szsFromFasta(
 		// Read in the sizes of all the unambiguous stretches of the genome
 		// into a vector of RefRecords.  The input streams are reset once
 		// it's done.
-		writeU32(fout3, 1, bigEndian); // endianness sentinel
+		writeU<int32_t>(fout3, 1, bigEndian); // endianness sentinel
 		bool color = parms.color;
 		if(color) {
 			parms.color = false;
-			// Make sure the .3.bt2 and .4.bt2 files contain
+			// Make sure the .3.gEbwt_ext and .4.gEbwt_ext files contain
 			// nucleotides; not colors
-			int numSeqs = 0;
+			TIndexOff numSeqs = 0;
 			ASSERT_ONLY(std::pair<size_t, size_t> sztot2 =)
 			fastaRefReadSizes(is, szs, parms, &bpout, numSeqs);
 			parms.color = true;
-			writeU32(fout3, (uint32_t)szs.size(), bigEndian); // write # records
+			writeU<TIndexOffU>(fout3, (TIndexOffU)szs.size(), bigEndian); // write # records
 			for(size_t i = 0; i < szs.size(); i++) {
 				szs[i].write(fout3, bigEndian);
 			}
 			szs.clear();
 			// Now read in the colorspace size records; these are
 			// the ones that were indexed
-			int numSeqs2 = 0;
+			TIndexOff numSeqs2 = 0;
 			sztot = fastaRefReadSizes(is, szs, parms, NULL, numSeqs2);
 			assert_eq(numSeqs, numSeqs2);
 			assert_eq(sztot2.second, sztot.second + numSeqs);
 		} else {
-			int numSeqs = 0;
+			TIndexOff numSeqs = 0;
 			sztot = fastaRefReadSizes(is, szs, parms, &bpout, numSeqs);
-			writeU32(fout3, (uint32_t)szs.size(), bigEndian); // write # records
+			writeU<TIndexOffU>(fout3, (TIndexOffU)szs.size(), bigEndian); // write # records
 			for(size_t i = 0; i < szs.size(); i++) szs[i].write(fout3, bigEndian);
 		}
 		if(sztot.first == 0) {
@@ -641,13 +641,13 @@ BitPairReference::szsFromFasta(
 	} else {
 		// Read in the sizes of all the unambiguous stretches of the
 		// genome into a vector of RefRecords
-		int numSeqs = 0;
+		TIndexOff numSeqs = 0;
 		sztot = fastaRefReadSizes(is, szs, parms, NULL, numSeqs);
 #ifndef NDEBUG
 		if(parms.color) {
 			parms.color = false;
 			EList<RefRecord> szs2(EBWTB_CAT);
-			int numSeqs2 = 0;
+			TIndexOff numSeqs2 = 0;
 			ASSERT_ONLY(std::pair<size_t, size_t> sztot2 =)
 			fastaRefReadSizes(is, szs2, parms, NULL, numSeqs2);
 			assert_eq(numSeqs, numSeqs2);
diff --git a/reference.h b/reference.h
index 711cff3..c955303 100644
--- a/reference.h
+++ b/reference.h
@@ -35,6 +35,8 @@
 #include "shmem.h"
 #include "timer.h"
 #include "sstring.h"
+#include "btypes.h"
+
 
 /**
  * Concrete reference representation that bulk-loads the reference from
@@ -116,7 +118,7 @@ public:
 	/**
 	 * Return the number of reference sequences.
 	 */
-	uint32_t numRefs() const {
+	TIndexOffU numRefs() const {
 		return nrefs_;
 	}
 
@@ -126,7 +128,7 @@ public:
 	 *
 	 * TODO: Is it still true that it might leave off Ns?
 	 */
-	uint32_t approxLen(uint32_t elt) const {
+	TIndexOffU approxLen(TIndexOffU elt) const {
 		assert_lt(elt, nrefs_);
 		return refLens_[elt];
 	}
@@ -143,7 +145,7 @@ public:
 	 * reference string; i.e., return the number of unambiguous nucleotides
 	 * preceding it.
 	 */
-	uint32_t pastedOffset(uint32_t idx) const {
+	TIndexOffU pastedOffset(TIndexOffU idx) const {
 		return refOffs_[idx];
 	}
 
@@ -165,14 +167,14 @@ protected:
 	uint32_t byteToU32_[256];
 
 	EList<RefRecord> recs_;       /// records describing unambiguous stretches
-	EList<uint32_t>  refLens_;    /// approx lens of ref seqs (excludes trailing ambig chars)
-	EList<uint32_t>  refOffs_;    /// buf_ begin offsets per ref seq
-	EList<uint32_t>  refRecOffs_; /// record begin/end offsets per ref seq
+	EList<TIndexOffU>  refLens_;    /// approx lens of ref seqs (excludes trailing ambig chars)
+	EList<TIndexOffU>  refOffs_;    /// buf_ begin offsets per ref seq
+	EList<TIndexOffU>  refRecOffs_; /// record begin/end offsets per ref seq
 	uint8_t *buf_;      /// the whole reference as a big bitpacked byte array
 	uint8_t *sanityBuf_;/// for sanity-checking buf_
-	uint32_t bufSz_;    /// size of buf_
-	uint32_t bufAllocSz_;
-	uint32_t nrefs_;    /// the number of reference sequences
+	TIndexOffU bufSz_;    /// size of buf_
+	TIndexOffU bufAllocSz_;
+	TIndexOffU nrefs_;    /// the number of reference sequences
 	bool     loaded_;   /// whether it's loaded
 	bool     sanity_;   /// do sanity checking
 	bool     useMm_;    /// load the reference as a memory-mapped file
diff --git a/sam.cpp b/sam.cpp
index 243ac84..358dba5 100644
--- a/sam.cpp
+++ b/sam.cpp
@@ -104,6 +104,9 @@ void SamConfig::printPgLine(BTString& o) const {
 	o.append(pg_pn_.c_str());
 	o.append("\tVN:");
 	o.append(pg_vn_.c_str());
+	o.append("\tCL:\"");
+	o.append(pg_cl_.c_str());
+	o.append('"');
 	o.append('\n');
 }
 
@@ -119,6 +122,7 @@ void SamConfig::printAlignedOptFlags(
 	BTString& o,               // output buffer
 	bool first,                // first opt flag printed is first overall?
 	const Read& rd,            // the read
+	const Read* rdo,           // the opposite read
 	AlnRes& res,               // individual alignment result
 	StackedAln& staln,         // stacked alignment buffer
 	const AlnFlags& flags,     // alignment flags
@@ -130,6 +134,7 @@ void SamConfig::printAlignedOptFlags(
 	const
 {
 	char buf[1024];
+	assert(summ.best(rd.mate < 2).valid());
 	if(print_as_) {
 		// AS:i: Alignment score generated by aligner
 		itoa10<TAlScore>(res.score().score(), buf);
@@ -139,7 +144,12 @@ void SamConfig::printAlignedOptFlags(
 	}
 	if(print_xs_) {
 		// XS:i: Suboptimal alignment score
-		AlnScore sco = summ.secbestMate(rd.mate < 2);
+		AlnScore sco;
+		if(flags.alignedConcordant()) {
+			sco = summ.bestUnchosen(rd.mate < 2);
+		} else {
+			sco = summ.secbest(rd.mate < 2);
+		}
 		if(sco.valid()) {
 			itoa10<TAlScore>(sco.score(), buf);
 			WRITE_SEP();
@@ -246,6 +256,21 @@ void SamConfig::printAlignedOptFlags(
 		WRITE_SEP();
 		o.append("Yn:i:");
 		o.append(buf);
+		if(summ.paired()) {
+			assert(rdo != NULL);
+			// ZN:i: Minimum valid score for opposite mate
+			TAlScore mn = sc.scoreMin.f<TAlScore>(rdo->length());
+			itoa10<TAlScore>(mn, buf);
+			WRITE_SEP();
+			o.append("ZN:i:");
+			o.append(buf);
+			// Zn:i: Perfect score for opposite mate
+			TAlScore pe = sc.perfectScore(rdo->length());
+			itoa10<TAlScore>(pe, buf);
+			WRITE_SEP();
+			o.append("Zn:i:");
+			o.append(buf);
+		}
 	}
 	if(print_xss_) {
 		// Xs:i: Best invalid alignment score of this mate
@@ -308,22 +333,18 @@ void SamConfig::printAlignedOptFlags(
 	}
 	if(flags.partOfPair() && print_zp_) {
 		// ZP:i: Score of best concordant paired-end alignment
-		WRITE_SEP();
-		o.append("ZP:Z:");
 		if(summ.bestPaired().valid()) {
+			WRITE_SEP();
+			o.append("ZP:i:");
 			itoa10<TAlScore>(summ.bestPaired().score(), buf);
 			o.append(buf);
-		} else {
-			o.append("NA");
 		}
-		// Zp:i: Second-best concordant paired-end alignment score
-		WRITE_SEP();
-		o.append("Zp:Z:");
+		// Zp:i: Score of second-best concordant paired-end alignment
 		if(summ.secbestPaired().valid()) {
+			WRITE_SEP();
+			o.append("Zp:i:");
 			itoa10<TAlScore>(summ.secbestPaired().score(), buf);
 			o.append(buf);
-		} else {
-			o.append("NA");
 		}
 	}
 	if(print_zu_) {
@@ -552,7 +573,7 @@ void SamConfig::printEmptyOptFlags(
 		flags.printYM(o);
 	}
 	if(print_yf_ && flags.filtered()) {
-		// YM:i: Read was repetitive when aligned unpaired?
+		// YF:i: Why read was filtered out prior to alignment
 		first = flags.printYF(o, first) && first;
 	}
 	if(!rgs_.empty()) {
diff --git a/sam.h b/sam.h
index 644d0b5..2c6e921 100644
--- a/sam.h
+++ b/sam.h
@@ -234,7 +234,7 @@ public:
 			namelen = 255;
 		}
 		for(size_t i = 0; i < namelen; i++) {
-			if(isspace(name[i])) {
+			if(truncQname_ && isspace(name[i])) {
 				return;
 			}
 			o.append(name[i]);
@@ -283,6 +283,7 @@ public:
 		BTString& o,               // output buffer
 		bool first,                // first opt flag printed is first overall?
 		const Read& rd,            // the read
+		const Read* rdo,           // the opposite read
 		AlnRes& res,               // individual alignment result
 		StackedAln& staln,         // stacked alignment
 		const AlnFlags& flags,     // alignment flags
diff --git a/scripts/infer_fraglen.pl b/scripts/infer_fraglen.pl
index 65ebe86..caf2f77 100755
--- a/scripts/infer_fraglen.pl
+++ b/scripts/infer_fraglen.pl
@@ -38,6 +38,8 @@ my $bowtie_args = "";
 my $bowtie2 = "$Bin/../bowtie2";
 my $debug = 0;
 my $binsz = 10;
+my $mapq_cutoff = 30;
+my $upto = undef;
 
 sub dieusage {
 	my $msg = shift;
@@ -52,12 +54,13 @@ sub dieusage {
 #
 sub checkIndex($) {
 	my $idx = shift;
-	return -f "$idx.1.ebwt" &&
-	       -f "$idx.2.ebwt" &&
-	       -f "$idx.3.ebwt" &&
-	       -f "$idx.4.ebwt" &&
-	       -f "$idx.rev.1.ebwt" &&
-	       -f "$idx.rev.2.ebwt";
+	my $ext = "bt2";
+	return -f "$idx.1.$ext" &&
+	       -f "$idx.2.$ext" &&
+	       -f "$idx.3.$ext" &&
+	       -f "$idx.4.$ext" &&
+	       -f "$idx.rev.1.$ext" &&
+	       -f "$idx.rev.2.$ext";
 }
 
 GetOptions (
@@ -65,6 +68,8 @@ GetOptions (
 	"index=s"       => \$index,
 	"m1=s"          => \$m1,
 	"m2=s"          => \$m2,
+	"upto=i"        => \$upto,
+	"mapq_cutoff=i" => \$mapq_cutoff,
 	"debug"         => \$debug,
 	"bowtie-args=s" => \$bowtie_args) || dieusage("Bad option", 1);
 
@@ -81,8 +86,9 @@ die "Bad index: $index" if !checkIndex($index);
 my %fragments = ();
 my $m1cmd = ($m1 =~ /\.gz$/) ? "gzip -dc $m1" : "cat $m1";
 my $m2cmd = ($m2 =~ /\.gz$/) ? "gzip -dc $m2" : "cat $m2";
-my $cmd1 = "$m1cmd | $bowtie2 $bowtie_args -m 1 -S --sam-nohead $index - > .infer_fraglen.tmp";
-my $cmd2 = "$m2cmd | $bowtie2 $bowtie_args -m 1 -S --sam-nohead $index - |";
+my $cmd1 = "$m1cmd | $bowtie2 $bowtie_args --sam-nohead -x $index - > .infer_fraglen.tmp";
+my $cmd2 = "$m2cmd | $bowtie2 $bowtie_args --sam-nohead -x $index - |";
+my $tot = 0;
 system($cmd1) == 0 || die "Error running '$cmd1'";
 open (M1, ".infer_fraglen.tmp") || die "Could not open '.infer_fraglen.tmp'";
 open (M2, $cmd2) || die "Could not open '$cmd2'";
@@ -92,12 +98,14 @@ while(<M1>) {
 	chomp($lm1); chomp($lm2);
 	my @lms1 = split(/\t/, $lm1);
 	my @lms2 = split(/\t/, $lm2);
-	my ($name1, $flags1, $chr1, $off1, $slen1) = ($lms1[0], $lms1[1], $lms1[2], $lms1[3], length($lms1[9]));
-	my ($name2, $flags2, $chr2, $off2, $slen2) = ($lms2[0], $lms2[1], $lms2[2], $lms2[3], length($lms2[9]));
+	my ($name1, $flags1, $chr1, $off1, $mapq1, $slen1) = ($lms1[0], $lms1[1], $lms1[2], $lms1[3], $lms1[4], length($lms1[9]));
+	my ($name2, $flags2, $chr2, $off2, $mapq2, $slen2) = ($lms2[0], $lms2[1], $lms2[2], $lms2[3], $lms2[4], length($lms2[9]));
 	# One or both mates didn't align uniquely?
 	next if $chr1 eq "*" || $chr2 eq "*";
 	# Mates aligned to different chromosomes?
 	next if $chr1 ne $chr2;
+	# MAPQs too low?
+	next if $mapq1 < $mapq_cutoff || $mapq2 < $mapq_cutoff;
 	# This pairing can be used as an observation of fragment orientation and length
 	my $fw1 = (($flags1 & 16) == 0) ? "F" : "R";
 	my $fw2 = (($flags2 & 16) == 0) ? "F" : "R";
@@ -108,6 +116,7 @@ while(<M1>) {
 	# Install into bin
 	$frag = int(($frag + ($binsz/2))/$binsz); # Round to nearest bin
 	$fragments{"$fw1$fw2"}{$frag}++;
+	$tot++;
 }
 close(M1);
 close(M2);
diff --git a/scripts/make_e_coli.sh b/scripts/make_e_coli.sh
index ced879a..4ca9046 100755
--- a/scripts/make_e_coli.sh
+++ b/scripts/make_e_coli.sh
@@ -24,11 +24,11 @@ if [ ! -f NC_008253.fna ] ; then
 			echo curl not found either, aborting...
 		else
 			# Use curl
-			curl ${GENOMES_MIRROR}/Bacteria/Escherichia_coli_536/NC_008253.fna -o NC_008253.fna
+			curl ${GENOMES_MIRROR}/Bacteria/Escherichia_coli_536_uid58531/NC_008253.fna -o NC_008253.fna
 		fi
 	else
 		# Use wget
-		wget ${GENOMES_MIRROR}/Bacteria/Escherichia_coli_536/NC_008253.fna
+		wget ${GENOMES_MIRROR}/Bacteria/Escherichia_coli_536_uid58531/NC_008253.fna
 	fi
 fi
 
diff --git a/shmem.h b/shmem.h
index 374ee0c..b36f3ad 100644
--- a/shmem.h
+++ b/shmem.h
@@ -30,11 +30,13 @@
 #include <stdint.h>
 #include <stdexcept>
 #include "str_util.h"
+#include "btypes.h"
 
 extern void notifySharedMem(void *mem, size_t len);
 
 extern void waitSharedMem(void *mem, size_t len);
 
+#define ALLOC_SHARED_U allocSharedMem<TIndexOffU>
 #define ALLOC_SHARED_U8 allocSharedMem<uint8_t>
 #define ALLOC_SHARED_U32 allocSharedMem<uint32_t>
 #define FREE_SHARED shmdt
@@ -147,6 +149,7 @@ bool allocSharedMem(std::string fname,
 
 #else
 
+#define ALLOC_SHARED_U(...) 0
 #define ALLOC_SHARED_U8(...) 0
 #define ALLOC_SHARED_U32(...) 0
 #define FREE_SHARED(...)
diff --git a/sse_util.h b/sse_util.h
index b5781f1..929b4f6 100644
--- a/sse_util.h
+++ b/sse_util.h
@@ -222,6 +222,7 @@ private:
 			std::cerr << "Error: Out of memory allocating " << sz << " __m128i's for DP matrix: '" << e.what() << "'" << std::endl;
 			throw e;
 		}
+                this->last_alloc_ = last_alloc_;
 		__m128i* tmp = last_alloc_;
 		size_t tmpint = (size_t)tmp;
 		// Align it!
@@ -267,6 +268,7 @@ private:
 	 */
 	void expandCopyExact(size_t newsz) {
 		if(newsz <= sz_) return;
+                __m128i* prev_last_alloc = last_alloc_;
 		__m128i* tmp = alloc(newsz);
 		assert(tmp != NULL);
 		size_t cur = cur_;
@@ -275,7 +277,10 @@ private:
 				// Note: operator= is used
 				tmp[i] = list_[i];
 			}
+                        __m128i* current_last_alloc = last_alloc_;
+                        last_alloc_ = prev_last_alloc;
 			free();
+                        last_alloc_ = current_last_alloc;
 		}
 		list_ = tmp;
 		sz_ = newsz;
diff --git a/third_party/cpuid.h b/third_party/cpuid.h
new file mode 100644
index 0000000..6a9688f
--- /dev/null
+++ b/third_party/cpuid.h
@@ -0,0 +1,187 @@
+/*
+ * Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc.
+ *
+ * This file is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 3, or (at your option) any
+ * later version.
+ * 
+ * This file is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * Under Section 7 of GPL version 3, you are granted additional
+ * permissions described in the GCC Runtime Library Exception, version
+ * 3.1, as published by the Free Software Foundation.
+ * 
+ * You should have received a copy of the GNU General Public License and
+ * a copy of the GCC Runtime Library Exception along with this program;
+ * see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+
+/* %ecx */
+#define bit_SSE3	(1 << 0)
+#define bit_PCLMUL	(1 << 1)
+#define bit_SSSE3	(1 << 9)
+#define bit_FMA		(1 << 12)
+#define bit_CMPXCHG16B	(1 << 13)
+#define bit_SSE4_1	(1 << 19)
+#define bit_SSE4_2	(1 << 20)
+#define bit_MOVBE	(1 << 22)
+#define bit_POPCNT	(1 << 23)
+#define bit_AES		(1 << 25)
+#define bit_XSAVE	(1 << 26)
+#define bit_OSXSAVE	(1 << 27)
+#define bit_AVX		(1 << 28)
+#define bit_F16C	(1 << 29)
+#define bit_RDRND	(1 << 30)
+
+/* %edx */
+#define bit_CMPXCHG8B	(1 << 8)
+#define bit_CMOV	(1 << 15)
+#define bit_MMX		(1 << 23)
+#define bit_FXSAVE	(1 << 24)
+#define bit_SSE		(1 << 25)
+#define bit_SSE2	(1 << 26)
+
+/* Extended Features */
+/* %ecx */
+#define bit_LAHF_LM	(1 << 0)
+#define bit_ABM		(1 << 5)
+#define bit_SSE4a	(1 << 6)
+#define bit_XOP         (1 << 11)
+#define bit_LWP 	(1 << 15)
+#define bit_FMA4        (1 << 16)
+#define bit_TBM         (1 << 21)
+
+/* %edx */
+#define bit_LM		(1 << 29)
+#define bit_3DNOWP	(1 << 30)
+#define bit_3DNOW	(1 << 31)
+
+/* Extended Features (%eax == 7) */
+#define bit_FSGSBASE	(1 << 0)
+#define bit_BMI		(1 << 3)
+
+#if defined(__i386__) && defined(__PIC__)
+/* %ebx may be the PIC register.  */
+#if __GNUC__ >= 3
+#define __cpuid(level, a, b, c, d)			\
+  __asm__ ("xchg{l}\t{%%}ebx, %1\n\t"			\
+	   "cpuid\n\t"					\
+	   "xchg{l}\t{%%}ebx, %1\n\t"			\
+	   : "=a" (a), "=r" (b), "=c" (c), "=d" (d)	\
+	   : "0" (level))
+
+#define __cpuid_count(level, count, a, b, c, d)		\
+  __asm__ ("xchg{l}\t{%%}ebx, %1\n\t"			\
+	   "cpuid\n\t"					\
+	   "xchg{l}\t{%%}ebx, %1\n\t"			\
+	   : "=a" (a), "=r" (b), "=c" (c), "=d" (d)	\
+	   : "0" (level), "2" (count))
+#else
+/* Host GCCs older than 3.0 weren't supporting Intel asm syntax
+   nor alternatives in i386 code.  */
+#define __cpuid(level, a, b, c, d)			\
+  __asm__ ("xchgl\t%%ebx, %1\n\t"			\
+	   "cpuid\n\t"					\
+	   "xchgl\t%%ebx, %1\n\t"			\
+	   : "=a" (a), "=r" (b), "=c" (c), "=d" (d)	\
+	   : "0" (level))
+
+#define __cpuid_count(level, count, a, b, c, d)		\
+  __asm__ ("xchgl\t%%ebx, %1\n\t"			\
+	   "cpuid\n\t"					\
+	   "xchgl\t%%ebx, %1\n\t"			\
+	   : "=a" (a), "=r" (b), "=c" (c), "=d" (d)	\
+	   : "0" (level), "2" (count))
+#endif
+#else
+#define __cpuid(level, a, b, c, d)			\
+  __asm__ ("cpuid\n\t"					\
+	   : "=a" (a), "=b" (b), "=c" (c), "=d" (d)	\
+	   : "0" (level))
+
+#define __cpuid_count(level, count, a, b, c, d)		\
+  __asm__ ("cpuid\n\t"					\
+	   : "=a" (a), "=b" (b), "=c" (c), "=d" (d)	\
+	   : "0" (level), "2" (count))
+#endif
+
+/* Return highest supported input value for cpuid instruction.  ext can
+   be either 0x0 or 0x80000000 to return highest supported value for
+   basic or extended cpuid information.  Function returns 0 if cpuid
+   is not supported or whatever cpuid returns in eax register.  If sig
+   pointer is non-null, then first four bytes of the signature
+   (as found in ebx register) are returned in location pointed by sig.  */
+
+static __inline unsigned int
+__get_cpuid_max (unsigned int __ext, unsigned int *__sig)
+{
+  unsigned int __eax, __ebx, __ecx, __edx;
+
+#ifndef __x86_64__
+#if __GNUC__ >= 3
+  /* See if we can use cpuid.  On AMD64 we always can.  */
+  __asm__ ("pushf{l|d}\n\t"
+	   "pushf{l|d}\n\t"
+	   "pop{l}\t%0\n\t"
+	   "mov{l}\t{%0, %1|%1, %0}\n\t"
+	   "xor{l}\t{%2, %0|%0, %2}\n\t"
+	   "push{l}\t%0\n\t"
+	   "popf{l|d}\n\t"
+	   "pushf{l|d}\n\t"
+	   "pop{l}\t%0\n\t"
+	   "popf{l|d}\n\t"
+	   : "=&r" (__eax), "=&r" (__ebx)
+	   : "i" (0x00200000));
+#else
+/* Host GCCs older than 3.0 weren't supporting Intel asm syntax
+   nor alternatives in i386 code.  */
+  __asm__ ("pushfl\n\t"
+	   "pushfl\n\t"
+	   "popl\t%0\n\t"
+	   "movl\t%0, %1\n\t"
+	   "xorl\t%2, %0\n\t"
+	   "pushl\t%0\n\t"
+	   "popfl\n\t"
+	   "pushfl\n\t"
+	   "popl\t%0\n\t"
+	   "popfl\n\t"
+	   : "=&r" (__eax), "=&r" (__ebx)
+	   : "i" (0x00200000));
+#endif
+
+  if (!((__eax ^ __ebx) & 0x00200000))
+    return 0;
+#endif
+
+  /* Host supports cpuid.  Return highest supported cpuid input value.  */
+  __cpuid (__ext, __eax, __ebx, __ecx, __edx);
+
+  if (__sig)
+    *__sig = __ebx;
+
+  return __eax;
+}
+
+/* Return cpuid data for requested cpuid level, as found in returned
+   eax, ebx, ecx and edx registers.  The function checks if cpuid is
+   supported and returns 1 for valid cpuid information or 0 for
+   unsupported cpuid level.  All pointers are required to be non-null.  */
+
+static __inline int
+__get_cpuid (unsigned int __level,
+	     unsigned int *__eax, unsigned int *__ebx,
+	     unsigned int *__ecx, unsigned int *__edx)
+{
+  unsigned int __ext = __level & 0x80000000;
+
+  if (__get_cpuid_max (__ext, 0) < __level)
+    return 0;
+
+  __cpuid (__level, *__eax, *__ebx, *__ecx, *__edx);
+  return 1;
+}
diff --git a/tinythread.cpp b/tinythread.cpp
old mode 100755
new mode 100644
diff --git a/tinythread.h b/tinythread.h
old mode 100755
new mode 100644
diff --git a/word_io.h b/word_io.h
index 345073a..48fb57b 100644
--- a/word_io.h
+++ b/word_io.h
@@ -26,70 +26,112 @@
 #include <fstream>
 #include "assert_helpers.h"
 #include "endian_swap.h"
+#include "btypes.h"
 
 /**
- * Write a 32-bit unsigned to an output stream being careful to
+ * Write a 32/64 bit unsigned to an output stream being careful to
  * re-endianize if caller-requested endianness differs from current
  * host.
  */
-static inline void writeU32(std::ostream& out, uint32_t x, bool toBigEndian) {
-	uint32_t y = endianizeU32(x, toBigEndian);
-	out.write((const char*)&y, 4);
+template <typename T>
+static inline void writeU(std::ostream& out, T x, bool toBigEndian) {
+	T y = endianizeU<T>(x, toBigEndian);
+	out.write((const char*)&y, sizeof(T));
 }
 
 /**
- * Write a 32-bit unsigned to an output stream using the native
+ * Write a 32/64 bit unsigned to an output stream using the native
  * endianness.
  */
-static inline void writeU32(std::ostream& out, uint32_t x) {
-	out.write((const char*)&x, 4);
+template <typename T>
+static inline void writeU(std::ostream& out, T x) {
+	out.write((const char*)&x, sizeof(T));
 }
 
 /**
- * Write a 32-bit signed int to an output stream being careful to
+ * Write a 32/64 bit signed int to an output stream being careful to
  * re-endianize if caller-requested endianness differs from current
  * host.
  */
-static inline void writeI32(std::ostream& out, int32_t x, bool toBigEndian) {
-	int32_t y = endianizeI32(x, toBigEndian);
-	out.write((const char*)&y, 4);
+template <typename T>
+static inline void writeI(std::ostream& out, T x, bool toBigEndian) {
+	T y = endianizeI<T>(x, toBigEndian);
+	out.write((const char*)&y, sizeof(T));
 }
 
 /**
- * Write a 32-bit unsigned to an output stream using the native
+ * Write a 32/64 bit signed int to an output stream using the native
  * endianness.
  */
-static inline void writeI32(std::ostream& out, int32_t x) {
-	out.write((const char*)&x, 4);
+template <typename T>
+static inline void writeI(std::ostream& out, T x) {
+	out.write((const char*)&x, sizeof(T));
 }
 
 /**
- * Read a 32-bit unsigned from an input stream, inverting endianness
+ * Read a 32/64 bit unsigned from an input stream, inverting endianness
  * if necessary.
  */
-static inline uint32_t readU32(std::istream& in, bool swap) {
-	uint32_t x;
-	in.read((char *)&x, 4);
-	assert_eq(4, in.gcount());
+//template <typename T>
+//static inline T readU(std::istream& in, bool swap) {
+//	T x;
+//	in.read((char *)&x, OFF_SIZE);
+//	assert_eq(OFF_SIZE, in.gcount());
+//	if(swap) {
+//		return endianSwapU(x);
+//	} else {
+//		return x;
+//	}
+//}
+template <typename T>
+static inline T readU(std::istream& in, bool swap) {
+	T x;
+	in.read((char *)&x, sizeof(T));
+	assert_eq(sizeof(T), in.gcount());
 	if(swap) {
-		return endianSwapU32(x);
+		if(sizeof(T) == 4) {
+			return endianSwapU32(x);
+		} else if(sizeof(T) == 8) {
+			return endianSwapU64(x);
+		} else {
+			assert(false);
+		}
 	} else {
 		return x;
 	}
 }
 
 /**
- * Read a 32-bit unsigned from a file descriptor, optionally inverting
+ * Read a 32/64 bit unsigned from a file descriptor, optionally inverting
  * endianness.
  */
 #ifdef BOWTIE_MM
-static inline uint32_t readU32(int in, bool swap) {
-	uint32_t x;
-	if(read(in, (void *)&x, 4) != 4) {
+//template <typename T>
+//static inline T readU(int in, bool swap) {
+//	T x;
+//	if(read(in, (void *)&x, OFF_SIZE) != OFF_SIZE) {
+//		assert(false);
+//	}
+//	if(swap) {
+//		return endianSwapU(x);
+//	} else {
+//		return x;
+//	}
+//}
+template <typename T>
+static inline T readU(int in, bool swap) {
+	T x;
+	if(read(in, (void *)&x, sizeof(T)) != sizeof(T)) {
 		assert(false);
 	}
 	if(swap) {
-		return endianSwapU32(x);
+		if(sizeof(T) == 4) {
+			return endianSwapU32(x);
+		} else if(sizeof(T) == 8) {
+			return endianSwapU64(x);
+		} else {
+			assert(false);
+		}
 	} else {
 		return x;
 	}
@@ -97,16 +139,35 @@ static inline uint32_t readU32(int in, bool swap) {
 #endif
 
 /**
- * Read a 32-bit unsigned from a FILE*, optionally inverting
+ * Read a 32/64 bit unsigned from a FILE*, optionally inverting
  * endianness.
  */
-static inline uint32_t readU32(FILE* in, bool swap) {
-	uint32_t x;
-	if(fread((void *)&x, 1, 4, in) != 4) {
+//template <typename T>
+//static inline T readU(FILE* in, bool swap) {
+//	T x;
+//	if(fread((void *)&x, 1, OFF_SIZE, in) != OFF_SIZE) {
+//		assert(false);
+//	}
+//	if(swap) {
+//		return endianSwapU(x);
+//	} else {
+//		return x;
+//	}
+//}
+template <typename T>
+static inline T readU(FILE* in, bool swap) {
+	T x;
+	if(fread((void *)&x, 1, sizeof(T), in) != sizeof(T)) {
 		assert(false);
 	}
 	if(swap) {
-		return endianSwapU32(x);
+		if(sizeof(T) == 4) {
+			return endianSwapU32(x);
+		} else if(sizeof(T) == 8) {
+			return endianSwapU64(x);
+		} else {
+			assert(false);
+		}
 	} else {
 		return x;
 	}
@@ -114,32 +175,69 @@ static inline uint32_t readU32(FILE* in, bool swap) {
 
 
 /**
- * Read a 32-bit signed from an input stream, inverting endianness
+ * Read a 32/64 bit signed from an input stream, inverting endianness
  * if necessary.
  */
-static inline int32_t readI32(std::istream& in, bool swap) {
-	int32_t x;
-	in.read((char *)&x, 4);
-	assert_eq(4, in.gcount());
+//template <typename T>
+//static inline T readI(std::istream& in, bool swap) {
+//	T x;
+//	in.read((char *)&x, OFF_SIZE);
+//	assert_eq(OFF_SIZE, in.gcount());
+//	if(swap) {
+//		return endianSwapI(x);
+//	} else {
+//		return x;
+//	}
+//}
+template <typename T>
+static inline T readI(std::istream& in, bool swap) {
+	T x;
+	in.read((char *)&x, sizeof(T));
+	assert_eq(sizeof(T), in.gcount());
 	if(swap) {
-		return endianSwapI32(x);
+		if(sizeof(T) == 4) {
+			return endianSwapI32(x);
+		} else if(sizeof(T) == 8) {
+			return endianSwapI64(x);
+		} else {
+			assert(false);
+		}
 	} else {
 		return x;
 	}
 }
 
 /**
- * Read a 32-bit unsigned from a file descriptor, optionally inverting
+ * Read a 32/64 bit signed from a file descriptor, optionally inverting
  * endianness.
  */
 #ifdef BOWTIE_MM
-static inline uint32_t readI32(int in, bool swap) {
-	int32_t x;
-	if(read(in, (void *)&x, 4) != 4) {
+//template <typename T>
+//static inline T readI(int in, bool swap) {
+//	T x;
+//	if(read(in, (void *)&x, OFF_SIZE) != OFF_SIZE) {
+//		assert(false);
+//	}
+//	if(swap) {
+//		return endianSwapI(x);
+//	} else {
+//		return x;
+//	}
+//}
+template <typename T>
+static inline T readI(int in, bool swap) {
+	T x;
+	if(read(in, (void *)&x, sizeof(T)) != sizeof(T)) {
 		assert(false);
 	}
 	if(swap) {
-		return endianSwapI32(x);
+		if(sizeof(T) == 4) {
+			return endianSwapI32(x);
+		} else if(sizeof(T) == 8) {
+			return endianSwapI64(x);
+		} else {
+			assert(false);
+		}
 	} else {
 		return x;
 	}
@@ -147,16 +245,35 @@ static inline uint32_t readI32(int in, bool swap) {
 #endif
 
 /**
- * Read a 32-bit unsigned from a FILE*, optionally inverting
+ * Read a 32/64 bit signed from a FILE*, optionally inverting
  * endianness.
  */
-static inline uint32_t readI32(FILE* in, bool swap) {
-	int32_t x;
-	if(fread((void *)&x, 1, 4, in) != 4) {
+//template <typename T>
+//static inline T readI(FILE* in, bool swap) {
+//	T x;
+//	if(fread((void *)&x, 1, OFF_SIZE, in) != OFF_SIZE) {
+//		assert(false);
+//	}
+//	if(swap) {
+//		return endianSwapI(x);
+//	} else {
+//		return x;
+//	}
+//}
+template <typename T>
+static inline T readI(FILE* in, bool swap) {
+	T x;
+	if(fread((void *)&x, 1, sizeof(T), in) != sizeof(T)) {
 		assert(false);
 	}
 	if(swap) {
-		return endianSwapI32(x);
+		if(sizeof(T) == 4) {
+			return endianSwapI32(x);
+		} else if(sizeof(T) == 8) {
+			return endianSwapI64(x);
+		} else {
+			assert(false);
+		}
 	} else {
 		return x;
 	}
diff --git a/zbox.h b/zbox.h
index 63de784..6ef1456 100644
--- a/zbox.h
+++ b/zbox.h
@@ -20,6 +20,8 @@
 #ifndef ZBOX_H_
 #define ZBOX_H_
 
+#include "btypes.h"
+
 /**
  * Fill z with Z-box information for s.  String z will not be resized
  * and will only be filled up to its size cap.  This is the linear-time
@@ -28,8 +30,8 @@
  */
 template<typename T>
 void calcZ(const T& s,
-           uint32_t off,
-           EList<uint32_t>& z,
+           TIndexOffU off,
+           EList<TIndexOffU>& z,
            bool verbose = false,
            bool sanityCheck = false)
 {
@@ -46,7 +48,7 @@ void calcZ(const T& s,
 			// compare starting at k with prefix starting at 0
 			size_t ki = k;
 			while(off+ki < s.length() && s[off+ki] == s[off+ki-k]) ki++;
-			z[k] = (uint32_t)(ki - k);
+			z[k] = (TIndexOffU)(ki - k);
 			assert_lt(off+z[k], slen);
 			if(z[k] > 0) {
 				lCur = k;
@@ -64,7 +66,7 @@ void calcZ(const T& s,
 			} else if (z[kPrime] > 0) {
 				int q = 0;
 				while (off+q+rCur+1 < s.length() && s[off+q+rCur+1] == s[off+betaLen+q]) q++;
-				z[k] = (uint32_t)(betaLen + q);
+				z[k] = (TIndexOffU)(betaLen + q);
 				assert_lt(off+z[k], slen);
 				rCur = rCur + q;
 				assert_geq(k, lCur);

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/bowtie2.git