[med-svn] [centrifuge] 01/02: New upstream version 1.0.2~beta

Andreas Tille tille at debian.org
Sat Nov 25 20:47:01 UTC 2017


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository centrifuge.

commit d9e5f020a21cd26dbd5ffd0e3c081c20b85eb565
Author: Andreas Tille <tille at debian.org>
Date:   Sat Nov 25 21:25:02 2017 +0100

    New upstream version 1.0.2~beta
---
 .gitignore                                   |   14 +
 AUTHORS                                      |   29 +
 LICENSE                                      |  674 ++++
 MANUAL                                       |  940 ++++++
 MANUAL.markdown                              | 1505 +++++++++
 Makefile                                     |  413 +++
 NEWS                                         |    3 +
 README.md                                    |   40 +
 TUTORIAL                                     |    4 +
 VERSION                                      |    1 +
 aligner_bt.cpp                               | 1773 +++++++++++
 aligner_bt.h                                 |  947 ++++++
 aligner_cache.cpp                            |  181 ++
 aligner_cache.h                              | 1013 ++++++
 aligner_metrics.h                            |  352 +++
 aligner_result.h                             |  469 +++
 aligner_seed.cpp                             |  532 ++++
 aligner_seed.h                               | 2866 +++++++++++++++++
 aligner_seed_policy.cpp                      |  888 ++++++
 aligner_seed_policy.h                        |  233 ++
 aligner_sw.cpp                               | 2990 ++++++++++++++++++
 aligner_sw.h                                 |  648 ++++
 aligner_sw_common.h                          |  304 ++
 aligner_sw_nuc.h                             |  262 ++
 aligner_swsse.cpp                            |   88 +
 aligner_swsse.h                              |  500 +++
 aligner_swsse_ee_i16.cpp                     | 1914 ++++++++++++
 aligner_swsse_ee_u8.cpp                      | 1905 ++++++++++++
 aligner_swsse_loc_i16.cpp                    | 2275 ++++++++++++++
 aligner_swsse_loc_u8.cpp                     | 2269 ++++++++++++++
 aln_sink.h                                   | 2427 +++++++++++++++
 alphabet.cpp                                 |  440 +++
 alphabet.h                                   |  199 ++
 assert_helpers.h                             |  283 ++
 binary_sa_search.h                           |  102 +
 bitpack.h                                    |   53 +
 blockwise_sa.h                               | 1120 +++++++
 bt2_idx.cpp                                  |   70 +
 bt2_idx.h                                    | 3940 +++++++++++++++++++++++
 bt2_io.h                                     | 1030 ++++++
 bt2_util.h                                   |  229 ++
 btypes.h                                     |   47 +
 ccnt_lut.cpp                                 |   60 +
 centrifuge                                   |  559 ++++
 centrifuge-BuildSharedSequence.pl            |  526 ++++
 centrifuge-RemoveEmptySequence.pl            |   28 +
 centrifuge-RemoveN.pl                        |   57 +
 centrifuge-build                             |   79 +
 centrifuge-compress.pl                       |  575 ++++
 centrifuge-download                          |  298 ++
 centrifuge-inspect                           |   55 +
 centrifuge-kreport                           |  161 +
 centrifuge-sort-nt.pl                        |   63 +
 centrifuge.cpp                               | 3201 +++++++++++++++++++
 centrifuge.xcodeproj/project.pbxproj         |  825 +++++
 centrifuge_build.cpp                         |  748 +++++
 centrifuge_build_main.cpp                    |   70 +
 centrifuge_compress.cpp                      | 1433 +++++++++
 centrifuge_inspect.cpp                       |  674 ++++
 centrifuge_main.cpp                          |   69 +
 centrifuge_report.cpp                        |  186 ++
 classifier.h                                 | 1053 +++++++
 diff_sample.cpp                              |  117 +
 diff_sample.h                                | 1000 ++++++
 doc/README                                   |    4 +
 doc/add.css                                  |   57 +
 doc/faq.shtml                                |   45 +
 doc/footer.inc.html                          |    7 +
 doc/index.shtml                              |   87 +
 doc/manual.html                              | 1060 +++++++
 doc/manual.inc.html                          |  884 ++++++
 doc/manual.inc.html.old                      | 1186 +++++++
 doc/manual.shtml                             |   37 +
 doc/sidebar.inc.shtml                        |  124 +
 doc/strip_markdown.pl                        |   45 +
 doc/style.css                                |  306 ++
 dp_framer.cpp                                |  910 ++++++
 dp_framer.h                                  |  261 ++
 ds.cpp                                       |  155 +
 ds.h                                         | 4305 ++++++++++++++++++++++++++
 edit.cpp                                     |  486 +++
 edit.h                                       |  394 +++
 endian_swap.h                                |  160 +
 evaluation/centrifuge_evaluate.py            |  614 ++++
 evaluation/centrifuge_simulate_reads.py      |  875 ++++++
 evaluation/test/centrifuge_evaluate_mason.py |  187 ++
 example/index/test.1.cf                      |  Bin 0 -> 8389424 bytes
 example/index/test.2.cf                      |  Bin 0 -> 140 bytes
 example/index/test.3.cf                      |  Bin 0 -> 716 bytes
 example/reads/input.fa                       |   24 +
 example/reference/gi_to_tid.dmp              |    2 +
 example/reference/names.dmp                  |   90 +
 example/reference/nodes.dmp                  |   35 +
 example/reference/test.fa                    |   16 +
 fast_mutex.h                                 |  248 ++
 filebuf.h                                    |  718 +++++
 formats.h                                    |   57 +
 functions.sh                                 |   60 +
 group_walk.cpp                               |   20 +
 group_walk.h                                 | 1285 ++++++++
 hi_aligner.h                                 | 1033 ++++++
 hier_idx.h                                   | 1877 +++++++++++
 hier_idx_common.h                            |   41 +
 hyperloglogbias.h                            |  133 +
 hyperloglogplus.h                            |  623 ++++
 indices/Makefile                             |  321 ++
 limit.cpp                                    |   43 +
 limit.h                                      |   48 +
 ls.cpp                                       |  142 +
 ls.h                                         |  333 ++
 mask.cpp                                     |   36 +
 mask.h                                       |   79 +
 mem_ids.h                                    |   35 +
 mm.h                                         |   51 +
 multikey_qsort.h                             | 1232 ++++++++
 opts.h                                       |  179 ++
 outq.cpp                                     |  201 ++
 outq.h                                       |  149 +
 pat.cpp                                      | 1783 +++++++++++
 pat.h                                        | 1788 +++++++++++
 pe.cpp                                       |  940 ++++++
 pe.h                                         |  321 ++
 presets.cpp                                  |   87 +
 presets.h                                    |   67 +
 processor_support.h                          |   70 +
 qual.cpp                                     |   85 +
 qual.h                                       |  235 ++
 random_source.cpp                            |  128 +
 random_source.h                              |  239 ++
 random_util.cpp                              |   24 +
 random_util.h                                |  221 ++
 read.h                                       |  533 ++++
 read_qseq.cpp                                |  304 ++
 ref_coord.cpp                                |   33 +
 ref_coord.h                                  |  424 +++
 ref_read.cpp                                 |  327 ++
 ref_read.h                                   |  314 ++
 reference.cpp                                |  670 ++++
 reference.h                                  |  189 ++
 scoring.cpp                                  |  286 ++
 scoring.h                                    |  519 ++++
 search_globals.h                             |   48 +
 sequence_io.h                                |  125 +
 shmem.cpp                                    |   49 +
 shmem.h                                      |  161 +
 simple_func.cpp                              |   93 +
 simple_func.h                                |  125 +
 sse_util.cpp                                 |   33 +
 sse_util.h                                   |  574 ++++
 sstring.cpp                                  |  202 ++
 sstring.h                                    | 3537 +++++++++++++++++++++
 str_util.h                                   |   47 +
 taxonomy.h                                   |  338 ++
 third_party/MurmurHash3.cpp                  |  335 ++
 third_party/MurmurHash3.h                    |   37 +
 third_party/cpuid.h                          |  187 ++
 threading.h                                  |   57 +
 timer.h                                      |   87 +
 tinythread.cpp                               |  303 ++
 tinythread.h                                 |  714 +++++
 tokenize.h                                   |   62 +
 util.h                                       |   94 +
 word_io.h                                    |  376 +++
 zbox.h                                       |   97 +
 164 files changed, 86377 insertions(+)

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..628c190
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,14 @@
+*~
+*.dSYM
+.DS_Store
+*-debug
+*-s
+*-l
+centrifuge.xcodeproj/project.xcworkspace
+centrifuge.xcodeproj/xcuserdata
+centrifuge.xcodeproj/xcshareddata
+
+centrifuge-build-bin
+centrifuge-buildc
+centrifuge-class
+centrifuge-inspect-bin
diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 0000000..d22b8b2
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,29 @@
+Ben Langmead <langmea at cs.jhu.edu> wrote Bowtie 2, which is based partially on
+Bowtie.  Bowtie was written by Ben Langmead and Cole Trapnell.
+
+  Bowtie & Bowtie 2:  http://bowtie-bio.sf.net
+
+A DLL from the pthreads for Win32 library is distributed with the Win32 version
+of Bowtie 2.  The pthreads for Win32 library and the GnuWin32 package have many
+contributors (see their respective web sites).
+
+  pthreads for Win32: http://sourceware.org/pthreads-win32
+  GnuWin32:           http://gnuwin32.sf.net
+
+The ForkManager.pm perl module is used in Bowtie 2's random testing framework,
+and is included as scripts/sim/contrib/ForkManager.pm.  ForkManager.pm is
+written by dLux (Szabo, Balazs), with contributions by others.  See the perldoc
+in ForkManager.pm for the complete list.
+
+The file ls.h includes an implementation of the Larsson-Sadakane suffix sorting
+algorithm.  The implementation is by N. Jesper Larsson and was adapted somewhat
+for use in Bowtie 2.
+
+TinyThreads is a portable thread implementation with a fairly compatible subset 
+of C++11 thread management classes written by Marcus Geelnard. For more info
+check http://tinythreadpp.bitsnbites.eu/ 
+
+Various users have kindly supplied patches, bug reports and feature requests
+over the years.  Many, many thanks go to them.
+
+September 2011
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..94a9ed0
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,674 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.  We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors.  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+  To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights.  Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received.  You must make sure that they, too, receive
+or can get the source code.  And you must show them these terms so they
+know their rights.
+
+  Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+  For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software.  For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+  Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so.  This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software.  The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable.  Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products.  If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+  Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary.  To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                       TERMS AND CONDITIONS
+
+  0. Definitions.
+
+  "This License" refers to version 3 of the GNU General Public License.
+
+  "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+  "The Program" refers to any copyrightable work licensed under this
+License.  Each licensee is addressed as "you".  "Licensees" and
+"recipients" may be individuals or organizations.
+
+  To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy.  The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+  A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+  To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy.  Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+  To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies.  Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+  An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License.  If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+  1. Source Code.
+
+  The "source code" for a work means the preferred form of the work
+for making modifications to it.  "Object code" means any non-source
+form of a work.
+
+  A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+  The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form.  A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+  The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities.  However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work.  For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+  The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+  The Corresponding Source for a work in source code form is that
+same work.
+
+  2. Basic Permissions.
+
+  All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met.  This License explicitly affirms your unlimited
+permission to run the unmodified Program.  The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work.  This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+  You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force.  You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright.  Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+  Conveying under any other circumstances is permitted solely under
+the conditions stated below.  Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+  No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+  When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+  4. Conveying Verbatim Copies.
+
+  You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+  You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+  5. Conveying Modified Source Versions.
+
+  You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+    a) The work must carry prominent notices stating that you modified
+    it, and giving a relevant date.
+
+    b) The work must carry prominent notices stating that it is
+    released under this License and any conditions added under section
+    7.  This requirement modifies the requirement in section 4 to
+    "keep intact all notices".
+
+    c) You must license the entire work, as a whole, under this
+    License to anyone who comes into possession of a copy.  This
+    License will therefore apply, along with any applicable section 7
+    additional terms, to the whole of the work, and all its parts,
+    regardless of how they are packaged.  This License gives no
+    permission to license the work in any other way, but it does not
+    invalidate such permission if you have separately received it.
+
+    d) If the work has interactive user interfaces, each must display
+    Appropriate Legal Notices; however, if the Program has interactive
+    interfaces that do not display Appropriate Legal Notices, your
+    work need not make them do so.
+
+  A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit.  Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+  6. Conveying Non-Source Forms.
+
+  You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+    a) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by the
+    Corresponding Source fixed on a durable physical medium
+    customarily used for software interchange.
+
+    b) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by a
+    written offer, valid for at least three years and valid for as
+    long as you offer spare parts or customer support for that product
+    model, to give anyone who possesses the object code either (1) a
+    copy of the Corresponding Source for all the software in the
+    product that is covered by this License, on a durable physical
+    medium customarily used for software interchange, for a price no
+    more than your reasonable cost of physically performing this
+    conveying of source, or (2) access to copy the
+    Corresponding Source from a network server at no charge.
+
+    c) Convey individual copies of the object code with a copy of the
+    written offer to provide the Corresponding Source.  This
+    alternative is allowed only occasionally and noncommercially, and
+    only if you received the object code with such an offer, in accord
+    with subsection 6b.
+
+    d) Convey the object code by offering access from a designated
+    place (gratis or for a charge), and offer equivalent access to the
+    Corresponding Source in the same way through the same place at no
+    further charge.  You need not require recipients to copy the
+    Corresponding Source along with the object code.  If the place to
+    copy the object code is a network server, the Corresponding Source
+    may be on a different server (operated by you or a third party)
+    that supports equivalent copying facilities, provided you maintain
+    clear directions next to the object code saying where to find the
+    Corresponding Source.  Regardless of what server hosts the
+    Corresponding Source, you remain obligated to ensure that it is
+    available for as long as needed to satisfy these requirements.
+
+    e) Convey the object code using peer-to-peer transmission, provided
+    you inform other peers where the object code and Corresponding
+    Source of the work are being offered to the general public at no
+    charge under subsection 6d.
+
+  A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+  A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling.  In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage.  For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product.  A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+  "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source.  The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+  If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information.  But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+  The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed.  Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+  Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+  7. Additional Terms.
+
+  "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law.  If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+  When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it.  (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.)  You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+  Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+    a) Disclaiming warranty or limiting liability differently from the
+    terms of sections 15 and 16 of this License; or
+
+    b) Requiring preservation of specified reasonable legal notices or
+    author attributions in that material or in the Appropriate Legal
+    Notices displayed by works containing it; or
+
+    c) Prohibiting misrepresentation of the origin of that material, or
+    requiring that modified versions of such material be marked in
+    reasonable ways as different from the original version; or
+
+    d) Limiting the use for publicity purposes of names of licensors or
+    authors of the material; or
+
+    e) Declining to grant rights under trademark law for use of some
+    trade names, trademarks, or service marks; or
+
+    f) Requiring indemnification of licensors and authors of that
+    material by anyone who conveys the material (or modified versions of
+    it) with contractual assumptions of liability to the recipient, for
+    any liability that these contractual assumptions directly impose on
+    those licensors and authors.
+
+  All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10.  If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term.  If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+  If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+  Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+  8. Termination.
+
+  You may not propagate or modify a covered work except as expressly
+provided under this License.  Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+  However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+  Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+  Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License.  If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+  9. Acceptance Not Required for Having Copies.
+
+  You are not required to accept this License in order to receive or
+run a copy of the Program.  Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance.  However,
+nothing other than this License grants you permission to propagate or
+modify any covered work.  These actions infringe copyright if you do
+not accept this License.  Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+  10. Automatic Licensing of Downstream Recipients.
+
+  Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License.  You are not responsible
+for enforcing compliance by third parties with this License.
+
+  An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations.  If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+  You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License.  For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+  11. Patents.
+
+  A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based.  The
+work thus licensed is called the contributor's "contributor version".
+
+  A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version.  For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+  Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+  In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement).  To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+  If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients.  "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+  If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+  A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License.  You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+  Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+  12. No Surrender of Others' Freedom.
+
+  If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all.  For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+  13. Use with the GNU Affero General Public License.
+
+  Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work.  The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+  14. Revised Versions of this License.
+
+  The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+  Each version is given a distinguishing version number.  If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation.  If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+  If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+  Later license versions may give you additional or different
+permissions.  However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+  15. Disclaimer of Warranty.
+
+  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. Limitation of Liability.
+
+  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+  17. Interpretation of Sections 15 and 16.
+
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    <program>  Copyright (C) <year>  <name of author>
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<http://www.gnu.org/licenses/>.
+
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/MANUAL b/MANUAL
new file mode 100644
index 0000000..c139009
--- /dev/null
+++ b/MANUAL
@@ -0,0 +1,940 @@
+
+Introduction
+============
+
+What is Centrifuge?
+-----------------
+
+[Centrifuge] is a novel microbial classification engine that enables
+rapid, accurate, and sensitive labeling of reads and quantification of
+species on desktop computers.  The system uses a novel indexing scheme
+based on the Burrows-Wheeler transform (BWT) and the Ferragina-Manzini
+(FM) index, optimized specifically for the metagenomic classification
+problem. Centrifuge requires a relatively small index (5.8 GB for all
+complete bacterial and viral genomes plus the human genome) and
+classifies sequences at a very high speed, allowing it to process the
+millions of reads from a typical high-throughput DNA sequencing run
+within a few minutes.  Together these advances enable timely and
+accurate analysis of large metagenomics data sets on conventional
+desktop computers.
+
+[Centrifuge]:     http://www.ccb.jhu.edu/software/centrifuge
+
+[Burrows-Wheeler Transform]: http://en.wikipedia.org/wiki/Burrows-Wheeler_transform
+[FM Index]:        http://en.wikipedia.org/wiki/FM-index
+
+[GPLv3 license]:   http://www.gnu.org/licenses/gpl-3.0.html
+
+Obtaining Centrifuge
+==================
+
+Download Centrifuge and binaries from the Releases sections on the right side.
+Binaries are available for Intel architectures (`x86_64`) running Linux, and Mac OS X.
+
+Building from source
+--------------------
+
+Building Centrifuge from source requires a GNU-like environment with GCC, GNU Make
+and other basics.  It should be possible to build Centrifuge on most vanilla Linux
+installations or on a Mac installation with [Xcode] installed.  Centrifuge can
+also be built on Windows using [Cygwin] or [MinGW] (MinGW recommended). For a 
+MinGW build the choice of what compiler is to be used is important since this
+will determine if a 32 or 64 bit code can be successfully compiled using it. If 
+there is a need to generate both 32 and 64 bit on the same machine then a multilib 
+MinGW has to be properly installed. [MSYS], the [zlib] library, and depending on 
+architecture [pthreads] library are also required. We are recommending a 64 bit
+build since it has some clear advantages in real life research problems. In order 
+to simplify the MinGW setup it might be worth investigating popular MinGW personal 
+builds since these are coming already prepared with most of the toolchains needed.
+
+First, download the [source package] from the Releases section on the right side.
+Unzip the file, change to the unzipped directory, and build the
+Centrifuge tools by running GNU `make` (usually with the command `make`, but
+sometimes with `gmake`) with no arguments.  If building with MinGW, run `make`
+from the MSYS environment.
+
+Centrifuge is using the multithreading software model in order to speed up 
+execution times on SMP architectures where this is possible. On POSIX 
+platforms (like linux, Mac OS, etc) it needs the pthread library. Although
+it is possible to use pthread library on non-POSIX platform like Windows, due
+to performance reasons Centrifuge will try to use Windows native multithreading
+if possible.
+
+For the support of SRA data access in Centrifuge, please download and install the [NCBI-NGS] toolkit.
+When running `make`, specify additional variables as follows.
+`make USE_SRA=1 NCBI_NGS_DIR=/path/to/NCBI-NGS-directory NCBI_VDB_DIR=/path/to/NCBI-NGS-directory`,
+where `NCBI_NGS_DIR` and `NCBI_VDB_DIR` will be used in Makefile for -I and -L compilation options.
+For example, $(NCBI_NGS_DIR)/include and $(NCBI_NGS_DIR)/lib64 will be used.  
+
+[Cygwin]:   http://www.cygwin.com/
+[MinGW]:    http://www.mingw.org/
+[MSYS]:     http://www.mingw.org/wiki/msys
+[zlib]:     http://cygwin.com/packages/mingw-zlib/
+[pthreads]: http://sourceware.org/pthreads-win32/
+[GnuWin32]: http://gnuwin32.sf.net/packages/coreutils.htm
+[Xcode]:    http://developer.apple.com/xcode/
+[Github site]: https://github.com/infphilo/centrifuge
+[NCBI-NGS]: https://github.com/ncbi/ngs/wiki/Downloads
+
+Running Centrifuge
+=============
+
+Adding to PATH
+--------------
+
+By adding your new Centrifuge directory to your [PATH environment variable], you
+ensure that whenever you run `centrifuge`, `centrifuge-build`, `centrifuge-download` or `centrifuge-inspect`
+from the command line, you will get the version you just installed without
+having to specify the entire path.  This is recommended for most users.  To do
+this, follow your operating system's instructions for adding the directory to
+your [PATH].
+
+If you would like to install Centrifuge by copying the Centrifuge executable files
+to an existing directory in your [PATH], make sure that you copy all the
+executables, including `centrifuge`, `centrifuge-class`, `centrifuge-build`, `centrifuge-build-bin`, `centrifuge-download` `centrifuge-inspect`
+and `centrifuge-inspect-bin`. Furthermore you need the programs
+in the scripts/ folder if you opt for genome compression in the database construction.
+
+[PATH environment variable]: http://en.wikipedia.org/wiki/PATH_(variable)
+[PATH]: http://en.wikipedia.org/wiki/PATH_(variable)
+
+Before running Centrifuge
+-----------------
+
+Classification is considerably different from alignment in that classification is performed on a large set of genomes as opposed to on just one reference genome as in alignment.  Currently, an enormous number of complete genomes are available at the GenBank (e.g. >4,000 bacterial genomes, >10,000 viral genomes, …).  These genomes are organized in a taxonomic tree where each genome is located at the bottom of the tree, at the strain or subspecies level.  On the taxonomic tree, genomes hav [...]
+
+Given the gigantic number of genomes available, which continues to expand at a rapid rate, and the development of the taxonomic tree, which continues to evolve with new advancements in research, we have designed Centrifuge to be flexible and general enough to reflect this huge database.  We provide several standard indexes that will meet most of users’ needs (see the side panel - Indexes).  In our approach our indexes not only include raw genome sequences, but also genome names/sizes and [...]
+
+We encourage first time users to take a look at and follow a `small example` that illustrates how to build an index, how to run Centrifuge using the index, how to interpret the classification results, and how to extract additional genomic information from the index.  For those who choose to build customized indexes, please take a close look at the following description.
+
+Database download and index building
+-----------------
+
+Centrifuge indexes can be built with arbitrary sequences. Standard choices are
+all of the complete bacterial and viral genomes, or using the sequences that
+are part of the BLAST nt database. Centrifuge always needs the
+nodes.dmp file from the NCBI taxonomy dump to build the taxonomy tree,
+as well as a sequence ID to taxonomy ID map. The map is a tab-separated
+file with the sequence ID to taxonomy ID map.
+
+To download all of the complete archaeal, viral, and bacterial genomes from RefSeq, and
+build the index:
+
+Centrifuge indices can be built on arbitrary sequences. Usually an ensemble of
+genomes is used - such as all complete microbial genomes in the RefSeq database,
+or all sequences in the BLAST nt database. 
+
+To map sequence identifiers to taxonomy IDs, and taxonomy IDs to names and 
+its parents, three files are necessary in addition to the sequence files:
+
+ - taxonomy tree: typically nodes.dmp from the NCBI taxonomy dump. Links taxonomy IDs to their parents
+ - names file: typically names.dmp from the NCBI taxonomy dump. Links taxonomy IDs to their scientific name
+ - a tab-separated sequence ID to taxonomy ID mapping
+
+When using the provided scripts to download the genomes, these files are automatically downloaded or generated. 
+When using a custom taxonomy or sequence files, please refer to the section `TODO` to learn more about their format.
+
+### Building index on all complete bacterial and viral genomes
+
+Use `centrifuge-download` to download genomes from NCBI. The following two commands download
+the NCBI taxonomy to `taxonomy/` in the current directory, and all complete archaeal,
+bacterial and viral genomes to `library/`. Low-complexity regions in the genomes are masked after
+download (parameter `-m`) using blast+'s `dustmasker`. `centrifuge-download` outputs tab-separated 
+sequence ID to taxonomy ID mappings to standard out, which are required by `centrifuge-build`.
+
+    centrifuge-download -o taxonomy taxonomy
+    centrifuge-download -o library -m -d "archaea,bacteria,viral" refseq > seqid2taxid.map
+
+To build the index, first concatenate all downloaded sequences into a single file, and then
+run `centrifuge-build`:
+    
+    cat library/*/*.fna > input-sequences.fna
+
+    ## build centrifuge index with 4 threads
+    centrifuge-build -p 4 --conversion-table seqid2taxid.map \
+                     --taxonomy-tree taxonomy/nodes.dmp --name-table taxonomy/names.dmp \
+                     input-sequences.fna abv
+
+After building the index, all files except the index *.[123].cf files may be removed.
+If you also want to include the human and/or the mouse genome, add their sequences to 
+the library folder before building the index with one of the following commands:
+
+After the index building, all but the *.[123].cf index files may be removed. I.e. the files in
+the `library/` and `taxonomy/` directories are no longer needed.
+
+### Adding human or mouse genome to the index
+The human and mouse genomes can also be downloaded using `centrifuge-download`. They are in the
+domain "vertebrate_mammalian" (argument `-d`), are assembled at the chromosome level (argument `-a`)
+and categorized as reference genomes by RefSeq (`-c`). The argument `-t` takes a comma-separated
+list of taxonomy IDs - e.g. `9606` for human and `10090` for mouse:
+
+    # download mouse and human reference genomes
+    centrifuge-download -o library -d "vertebrate_mammalian" -a "Chromosome" -t 9606,10090 -c 'reference genome' >> seqid2taxid.map
+    # only human
+    centrifuge-download -o library -d "vertebrate_mammalian" -a "Chromosome" -t 9606 -c 'reference genome' >> seqid2taxid.map
+    # only mouse
+    centrifuge-download -o library -d "vertebrate_mammalian" -a "Chromosome" -t 10090 -c 'reference genome' >> seqid2taxid.map
+
+### nt database
+
+NCBI BLAST's nt database contains all spliced non-redundant coding
+sequences from multiple databases, inferred from genomic
+sequences. Traditionally used with BLAST, a download of the FASTA is
+provided on the NCBI homepage. Building an index with any database 
+requires the user to create a sequence ID to taxonomy ID map that 
+can be generated from a GI taxid dump:
+
+    wget ftp://ftp.ncbi.nih.gov/blast/db/FASTA/nt.gz
+    gunzip nt.gz && mv -v nt nt.fa
+
+    # Get mapping file
+    wget ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_nucl.dmp.gz
+    gunzip -c gi_taxid_nucl.dmp.gz | sed 's/^/gi|/' > gi_taxid_nucl.map
+
+    # build index using 16 cores and a small bucket size, which will require less memory
+    centrifuge-build -p 16 --bmax 1342177280 --conversion-table gi_taxid_nucl.map \
+                     --taxonomy-tree taxonomy/nodes.dmp --name-table taxonomy/names.dmp \ 
+                     nt.fa nt
+
+### Custom database
+
+TODO: Add toy example for nodes.dmp, names.dmp and seqid2taxid.map
+
+### Centrifuge classification output
+
+The following example shows classification assignments for a read.  The assignment output has 8 columns.
+
+    readID    seqID   taxID score	   2ndBestScore	   hitLength	queryLength	numMatches
+    1_1	      gi|4    9646  4225	   0		       80	80		1
+
+    The first column is the read ID from a raw sequencing read (e.g., 1_1 in the example).
+    The second column is the sequence ID of the genomic sequence, where the read is classified (e.g., gi|4).
+    The third column is the taxonomic ID of the genomic sequence in the second column (e.g., 9646).
+    The fourth column is the score for the classification, which is the weighted sum of hits (e.g., 4225)
+    The fifth column is the score for the next best classification (e.g., 0).
+    The sixth column is the approximate number of base pairs of the read that match the genomic sequence (e.g., 80).
+    The seventh column is the length of a read or, for paired-end reads, the combined length of the mate pairs (e.g., 80).
+    The eighth column is the number of classifications, indicating how many assignments were made (e.g.,1).
+
+### Centrifuge summary output (the default filename is centrifuge_report.tsv)
+
+The following example shows a classification summary for each genome or taxonomic unit.  The assignment output has 7 columns.
+
+    name      	      	      		     	     	      	     	taxID	taxRank	   genomeSize 	numReads   numUniqueReads   abundance
+    Wigglesworthia glossinidia endosymbiont of Glossina brevipalpis	36870	leaf	   703004		5981	   5964	            0.0152317
+
+    The first column is the name of a genome, or the name corresponding to a taxonomic ID (the second column) at a rank higher than the strain (e.g., Wigglesworthia glossinidia endosymbiont of Glossina brevipalpis).
+    The second column is the taxonomic ID (e.g., 36870).
+    The third column is the taxonomic rank (e.g., leaf).
+    The fourth column is the length of the genome sequence (e.g., 703004).
+    The fifth column is the number of reads classified to this genomic sequence including multi-classified reads (e.g., 5981).
+    The sixth column is the number of reads uniquely classified to this genomic sequence (e.g., 5964).
+    The seventh column is the proportion of this genome normalized by its genomic length (e.g., 0.0152317).
+
+Inspecting the Centrifuge index
+-----------------------
+
+The index can be inspected with `centrifuge-inspect`.  To extract raw sequences:
+
+    centrifuge-inspect <centrifuge index>
+
+Extract the sequence ID to taxonomy ID conversion table from the index
+
+    centrifuge-inspect --conversion-table <centrifuge index>
+
+Extract the taxonomy tree from the index:
+
+    centrifuge-inspect --taxonomy-tree <centrifuge index>
+
+Extract the lengths of the sequences from the index (each row has two columns: taxonomic ID and length):
+
+    centrifuge-inspect --size-table <centrifuge index>
+
+Extract the names from the index (each row has two columns: taxonomic ID and name):
+
+    centrifuge-inspect --name-table <centrifuge index>
+    
+Wrapper
+-------
+
+The `centrifuge`, `centrifuge-build` and `centrifuge-inspect` executables are actually 
+wrapper scripts that call binary programs as appropriate. Also, the `centrifuge` wrapper
+provides some key functionality, like the ability to handle compressed inputs,
+and the functionality for `--un`, `--al` and related options.
+
+It is recommended that you always run the centrifuge wrappers and not run the
+binaries directly.
+
+Performance tuning
+------------------
+
+1.  If your computer has multiple processors/cores, use `-p NTHREADS`
+
+    The `-p` option causes Centrifuge to launch a specified number of parallel
+    search threads.  Each thread runs on a different processor/core and all
+    threads find alignments in parallel, increasing alignment throughput by
+    approximately a multiple of the number of threads (though in practice,
+    speedup is somewhat worse than linear).
+
+Command Line
+------------
+
+### Usage
+
+    centrifuge [options]* -x <centrifuge-idx> {-1 <m1> -2 <m2> | -U <r> | --sra-acc <SRA accession number>} [--report-file <report file name> -S <classification output file name>]
+
+### Main arguments
+
+    -x <centrifuge-idx>
+
+The basename of the index for the reference genomes.  The basename is the name of
+any of the index files up to but not including the final `.1.cf` / etc.  
+`centrifuge` looks for the specified index first in the current directory,
+then in the directory specified in the `CENTRIFUGE_INDEXES` environment variable.
+
+    -1 <m1>
+
+Comma-separated list of files containing mate 1s (filename usually includes
+`_1`), e.g. `-1 flyA_1.fq,flyB_1.fq`.  Sequences specified with this option must
+correspond file-for-file and read-for-read with those specified in `<m2>`. Reads
+may be a mix of different lengths. If `-` is specified, `centrifuge` will read the
+mate 1s from the "standard in" or "stdin" filehandle.
+
+    -2 <m2>
+
+Comma-separated list of files containing mate 2s (filename usually includes
+`_2`), e.g. `-2 flyA_2.fq,flyB_2.fq`.  Sequences specified with this option must
+correspond file-for-file and read-for-read with those specified in `<m1>`. Reads
+may be a mix of different lengths. If `-` is specified, `centrifuge` will read the
+mate 2s from the "standard in" or "stdin" filehandle.
+
+    -U <r>
+
+Comma-separated list of files containing unpaired reads to be aligned, e.g.
+`lane1.fq,lane2.fq,lane3.fq,lane4.fq`.  Reads may be a mix of different lengths.
+If `-` is specified, `centrifuge` gets the reads from the "standard in" or "stdin"
+filehandle.
+
+    --sra-acc <SRA accession number>
+
+Comma-separated list of SRA accession numbers, e.g. `--sra-acc SRR353653,SRR353654`.
+Information about read types is available at http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?sp=runinfo&acc=<b>sra-acc</b>&retmode=xml,
+where <b>sra-acc</b> is SRA accession number.  If users run Centrifuge on a computer cluster, it is recommended to disable SRA-related caching (see the instruction at [SRA-MANUAL]).
+
+[SRA-MANUAL]:	     https://github.com/ncbi/sra-tools/wiki/Toolkit-Configuration
+
+    -S <filename>
+
+File to write classification results to.  By default, assignments are written to the
+"standard out" or "stdout" filehandle (i.e. the console).
+
+    --report-file <filename>
+
+File to write a classification summary to (default: centrifuge_report.tsv).
+
+### Options
+
+#### Input options
+
+    -q
+
+Reads (specified with `<m1>`, `<m2>`, `<s>`) are FASTQ files.  FASTQ files
+usually have extension `.fq` or `.fastq`.  FASTQ is the default format.  See
+also: `--solexa-quals` and `--int-quals`.
+
+    --qseq
+
+Reads (specified with `<m1>`, `<m2>`, `<s>`) are QSEQ files.  QSEQ files usually
+end in `_qseq.txt`.  See also: `--solexa-quals` and `--int-quals`.
+
+    -f
+
+Reads (specified with `<m1>`, `<m2>`, `<s>`) are FASTA files.  FASTA files
+usually have extension `.fa`, `.fasta`, `.mfa`, `.fna` or similar.  FASTA files
+do not have a way of specifying quality values, so when `-f` is set, the result
+is as if `--ignore-quals` is also set.
+
+    -r
+
+Reads (specified with `<m1>`, `<m2>`, `<s>`) are files with one input sequence
+per line, without any other information (no read names, no qualities).  When
+`-r` is set, the result is as if `--ignore-quals` is also set.
+
+    -c
+
+The read sequences are given on command line.  I.e. `<m1>`, `<m2>` and
+`<singles>` are comma-separated lists of reads rather than lists of read files.
+There is no way to specify read names or qualities, so `-c` also implies
+`--ignore-quals`.
+
+    -s/--skip <int>
+
+Skip (i.e. do not align) the first `<int>` reads or pairs in the input.
+
+    -u/--qupto <int>
+
+Align the first `<int>` reads or read pairs from the input (after the
+`-s`/`--skip` reads or pairs have been skipped), then stop.  Default: no limit.
+
+    -5/--trim5 <int>
+
+Trim `<int>` bases from 5' (left) end of each read before alignment (default: 0).
+
+    -3/--trim3 <int>
+
+Trim `<int>` bases from 3' (right) end of each read before alignment (default:
+0).
+
+    --phred33
+
+Input qualities are ASCII chars equal to the [Phred quality] plus 33.  This is
+also called the "Phred+33" encoding, which is used by the very latest Illumina
+pipelines.
+
+[Phred quality]: http://en.wikipedia.org/wiki/Phred_quality_score
+
+    --phred64
+
+Input qualities are ASCII chars equal to the [Phred quality] plus 64.  This is
+also called the "Phred+64" encoding.
+
+    --solexa-quals
+
+Convert input qualities from [Solexa][Phred quality] (which can be negative) to
+[Phred][Phred quality] (which can't).  This scheme was used in older Illumina GA
+Pipeline versions (prior to 1.3).  Default: off.
+
+    --int-quals
+
+Quality values are represented in the read input file as space-separated ASCII
+integers, e.g., `40 40 30 40`..., rather than ASCII characters, e.g., `II?I`....
+ Integers are treated as being on the [Phred quality] scale unless
+`--solexa-quals` is also specified. Default: off.
+
+#### Classification
+
+    -k <int>
+
+It searches for at most `<int>` distinct, primary assignments for each read or pair.  
+Primary assignments mean assignments whose assignment score is equal or higher than any other assignments.
+If there are more primary assignments than this value, 
+the search will merge some of the assignments into a higher taxonomic rank.
+The assignment score for a paired-end assignment equals the sum of the assignment scores of the individual mates. 
+Default: 5
+
+    --host-taxids
+
+A comma-separated list of taxonomic IDs that will be preferred in the classification procedure.
+The descendants from these IDs will also be preferred.  In case some of a read's assignments correspond to
+these taxonomic IDs, only those corresponding assignments will be reported.
+
+    --exclude-taxids
+
+A comma-separated list of taxonomic IDs that will be excluded in the classification procedure.
+The descendants from these IDs will also be excluded.
+
+#### Alignment options
+
+    --n-ceil <func>
+
+Sets a function governing the maximum number of ambiguous characters (usually
+`N`s and/or `.`s) allowed in a read as a function of read length.  For instance,
+specifying `-L,0,0.15` sets the N-ceiling function `f` to `f(x) = 0 + 0.15 * x`,
+where x is the read length.  See also: [setting function options].  Reads
+exceeding this ceiling are [filtered out].  Default: `L,0,0.15`.
+
+    --ignore-quals
+
+When calculating a mismatch penalty, always consider the quality value at the
+mismatched position to be the highest possible, regardless of the actual value. 
+I.e. input is treated as though all quality values are high.  This is also the
+default behavior when the input doesn't specify quality values (e.g. in `-f`,
+`-r`, or `-c` modes).
+
+    --nofw/--norc
+
+If `--nofw` is specified, `centrifuge` will not attempt to align unpaired reads to
+the forward (Watson) reference strand.  If `--norc` is specified, `centrifuge` will
+not attempt to align unpaired reads against the reverse-complement (Crick)
+reference strand. In paired-end mode, `--nofw` and `--norc` pertain to the
+fragments; i.e. specifying `--nofw` causes `centrifuge` to explore only those
+paired-end configurations corresponding to fragments from the reverse-complement
+(Crick) strand.  Default: both strands enabled. 
+
+#### Paired-end options
+
+    --fr/--rf/--ff
+
+The upstream/downstream mate orientations for a valid paired-end alignment
+against the forward reference strand.  E.g., if `--fr` is specified and there is
+a candidate paired-end alignment where mate 1 appears upstream of the reverse
+complement of mate 2 and the fragment length constraints (`-I` and `-X`) are
+met, that alignment is valid.  Also, if mate 2 appears upstream of the reverse
+complement of mate 1 and all other constraints are met, that too is valid.
+`--rf` likewise requires that an upstream mate1 be reverse-complemented and a
+downstream mate2 be forward-oriented. `--ff` requires both an upstream mate 1
+and a downstream mate 2 to be forward-oriented.  Default: `--fr` (appropriate
+for Illumina's Paired-end Sequencing Assay).
+
+#### Output options
+
+    -t/--time
+
+Print the wall-clock time required to load the index files and align the reads. 
+This is printed to the "standard error" ("stderr") filehandle.  Default: off.
+
+    --un <path>
+    --un-gz <path>
+    --un-bz2 <path>
+
+Write unpaired reads that fail to align to file at `<path>`.  These reads
+correspond to the SAM records with the FLAGS `0x4` bit set and neither the
+`0x40` nor `0x80` bits set.  If `--un-gz` is specified, output will be gzip
+compressed. If `--un-bz2` is specified, output will be bzip2 compressed.  Reads
+written in this way will appear exactly as they did in the input file, without
+any modification (same sequence, same name, same quality string, same quality
+encoding).  Reads will not necessarily appear in the same order as they did in
+the input.
+
+    --al <path>
+    --al-gz <path>
+    --al-bz2 <path>
+
+Write unpaired reads that align at least once to file at `<path>`.  These reads
+correspond to the SAM records with the FLAGS `0x4`, `0x40`, and `0x80` bits
+unset.  If `--al-gz` is specified, output will be gzip compressed. If `--al-bz2`
+is specified, output will be bzip2 compressed.  Reads written in this way will
+appear exactly as they did in the input file, without any modification (same
+sequence, same name, same quality string, same quality encoding).  Reads will
+not necessarily appear in the same order as they did in the input.
+
+    --un-conc <path>
+    --un-conc-gz <path>
+    --un-conc-bz2 <path>
+
+Write paired-end reads that fail to align concordantly to file(s) at `<path>`.
+These reads correspond to the SAM records with the FLAGS `0x4` bit set and
+either the `0x40` or `0x80` bit set (depending on whether it's mate #1 or #2).
+`.1` and `.2` strings are added to the filename to distinguish which file
+contains mate #1 and mate #2.  If a percent symbol, `%`, is used in `<path>`,
+the percent symbol is replaced with `1` or `2` to make the per-mate filenames.
+Otherwise, `.1` or `.2` are added before the final dot in `<path>` to make the
+per-mate filenames.  Reads written in this way will appear exactly as they did
+in the input files, without any modification (same sequence, same name, same
+quality string, same quality encoding).  Reads will not necessarily appear in
+the same order as they did in the inputs.
+
+    --al-conc <path>
+    --al-conc-gz <path>
+    --al-conc-bz2 <path>
+
+Write paired-end reads that align concordantly at least once to file(s) at
+`<path>`. These reads correspond to the SAM records with the FLAGS `0x4` bit
+unset and either the `0x40` or `0x80` bit set (depending on whether it's mate #1
+or #2). `.1` and `.2` strings are added to the filename to distinguish which
+file contains mate #1 and mate #2.  If a percent symbol, `%`, is used in
+`<path>`, the percent symbol is replaced with `1` or `2` to make the per-mate
+filenames. Otherwise, `.1` or `.2` are added before the final dot in `<path>` to
+make the per-mate filenames.  Reads written in this way will appear exactly as
+they did in the input files, without any modification (same sequence, same name,
+same quality string, same quality encoding).  Reads will not necessarily appear
+in the same order as they did in the inputs.
+
+    --quiet
+
+Print nothing besides alignments and serious errors.
+
+    --met-file <path>
+
+Write `centrifuge` metrics to file `<path>`.  Having alignment metric can be useful
+for debugging certain problems, especially performance issues.  See also:
+`--met`.  Default: metrics disabled.
+
+    --met-stderr
+
+Write `centrifuge` metrics to the "standard error" ("stderr") filehandle.  This is
+not mutually exclusive with `--met-file`.  Having alignment metric can be
+useful for debugging certain problems, especially performance issues.  See also:
+`--met`.  Default: metrics disabled.
+
+    --met <int>
+
+Write a new `centrifuge` metrics record every `<int>` seconds.  Only matters if
+either `--met-stderr` or `--met-file` are specified.  Default: 1.
+
+#### Performance options
+
+    -o/--offrate <int>
+
+Override the offrate of the index with `<int>`.  If `<int>` is greater
+than the offrate used to build the index, then some row markings are
+discarded when the index is read into memory.  This reduces the memory
+footprint of the aligner but requires more time to calculate text
+offsets.  `<int>` must be greater than the value used to build the
+index.
+
+    -p/--threads NTHREADS
+
+Launch `NTHREADS` parallel search threads (default: 1).  Threads will run on
+separate processors/cores and synchronize when parsing reads and outputting
+alignments.  Searching for alignments is highly parallel, and speedup is close
+to linear.  Increasing `-p` increases Centrifuge's memory footprint. E.g. when
+aligning to a human genome index, increasing `-p` from 1 to 8 increases the
+memory footprint by a few hundred megabytes.  This option is only available if
+`centrifuge` is linked with the `pthreads` library (i.e. if `BOWTIE_PTHREADS=0` is
+not specified at build time).
+
+    --reorder
+
+Guarantees that output records are printed in an order corresponding to the
+order of the reads in the original input file, even when `-p` is set greater
+than 1.  Specifying `--reorder` and setting `-p` greater than 1 causes Centrifuge
+to run somewhat slower and use somewhat more memory than if `--reorder` were
+not specified.  Has no effect if `-p` is set to 1, since output order will
+naturally correspond to input order in that case.
+
+    --mm
+
+Use memory-mapped I/O to load the index, rather than typical file I/O.
+Memory-mapping allows many concurrent `centrifuge` processes on the same computer to
+share the same memory image of the index (i.e. you pay the memory overhead just
+once).  This facilitates memory-efficient parallelization of `centrifuge` in
+situations where using `-p` is not possible or not preferable.
+
+#### Other options
+
+    --qc-filter
+
+Filter out reads for which the QSEQ filter field is non-zero.  Only has an
+effect when read format is `--qseq`.  Default: off.
+
+    --seed <int>
+
+Use `<int>` as the seed for pseudo-random number generator.  Default: 0.
+
+    --non-deterministic
+
+Normally, Centrifuge re-initializes its pseudo-random generator for each read.  It
+seeds the generator with a number derived from (a) the read name, (b) the
+nucleotide sequence, (c) the quality sequence, (d) the value of the `--seed`
+option.  This means that if two reads are identical (same name, same
+nucleotides, same qualities) Centrifuge will find and report the same classification(s)
+for both, even if there was ambiguity.  When `--non-deterministic` is specified,
+Centrifuge re-initializes its pseudo-random generator for each read using the
+current time.  This means that Centrifuge will not necessarily report the same
+classification for two identical reads.  This is counter-intuitive for some users,
+but might be more appropriate in situations where the input consists of many
+identical reads.
+
+    --version
+
+Print version information and quit.
+
+    -h/--help
+
+Print usage information and quit.
+
+The `centrifuge-build` indexer
+===========================
+
+`centrifuge-build` builds a Centrifuge index from a set of DNA sequences.
+`centrifuge-build` outputs a set of 3 files with suffixes `.1.cf`, `.2.cf`, and
+`.3.cf`.  These files together
+constitute the index: they are all that is needed to align reads to that
+reference.  The original sequence FASTA files are no longer used by Centrifuge
+once the index is built.
+
+Use of Karkkainen's [blockwise algorithm] allows `centrifuge-build` to trade off
+between running time and memory usage. `centrifuge-build` has two options
+governing how it makes this trade: `--bmax`/`--bmaxdivn`,
+and `--dcv`.  By default, `centrifuge-build` will automatically search for the
+settings that yield the best running time without exhausting memory.  This
+behavior can be disabled using the `-a`/`--noauto` option.
+
+The indexer provides options pertaining to the "shape" of the index, e.g.
+`--offrate` governs the fraction of [Burrows-Wheeler]
+rows that are "marked" (i.e., the density of the suffix-array sample; see the
+original [FM Index] paper for details).  All of these options are potentially
+profitable trade-offs depending on the application.  They have been set to
+defaults that are reasonable for most cases according to our experiments.  See
+[Performance tuning] for details.
+
+The Centrifuge index is based on the [FM Index] of Ferragina and Manzini, which in
+turn is based on the [Burrows-Wheeler] transform.  The algorithm used to build
+the index is based on the [blockwise algorithm] of Karkkainen.
+
+[Blockwise algorithm]: http://portal.acm.org/citation.cfm?id=1314852
+[Burrows-Wheeler]: http://en.wikipedia.org/wiki/Burrows-Wheeler_transform
+
+Command Line
+------------
+
+Usage:
+
+    centrifuge-build [options]* --conversion-table <table_in> --taxonomy-tree <taxonomy_in> --name-table <table_in2> <reference_in> <cf_base>
+
+### Main arguments
+
+    <reference_in>
+
+A comma-separated list of FASTA files containing the reference sequences to be
+aligned to, or, if `-c` is specified, the sequences
+themselves. E.g., `<reference_in>` might be `chr1.fa,chr2.fa,chrX.fa,chrY.fa`,
+or, if `-c` is specified, this might be
+`GGTCATCCT,ACGGGTCGT,CCGTTCTATGCGGCTTA`.
+
+    <cf_base>
+
+The basename of the index files to write.  By default, `centrifuge-build` writes
+files named `NAME.1.cf`, `NAME.2.cf`, and `NAME.3.cf`, where `NAME` is `<cf_base>`.
+
+### Options
+
+    -f
+
+The reference input files (specified as `<reference_in>`) are FASTA files
+(usually having extension `.fa`, `.mfa`, `.fna` or similar).
+
+    -c
+
+The reference sequences are given on the command line.  I.e. `<reference_in>` is
+a comma-separated list of sequences rather than a list of FASTA files.
+
+    -a/--noauto
+
+Disable the default behavior whereby `centrifuge-build` automatically selects
+values for the `--bmax`, `--dcv` and `--packed` parameters according to
+available memory.  Instead, user may specify values for those parameters.  If
+memory is exhausted during indexing, an error message will be printed; it is up
+to the user to try new parameters.
+
+    -p/--threads <int>
+
+Launch `<int>` parallel search threads (default: 1).
+
+    --conversion-table <file>
+
+List of UIDs (unique ID) and corresponding taxonomic IDs.
+
+    --taxonomy-tree <file>
+
+Taxonomic tree (e.g. nodes.dmp).
+
+    --name-table <file>
+
+Name table (e.g. names.dmp).
+
+    --size-table <file>
+
+List of taxonomic IDs and lengths of the sequences belonging to the same taxonomic IDs.
+
+    --bmax <int>
+
+The maximum number of suffixes allowed in a block.  Allowing more suffixes per
+block makes indexing faster, but increases peak memory usage.  Setting this
+option overrides any previous setting for `--bmax`, or `--bmaxdivn`. 
+Default (in terms of the `--bmaxdivn` parameter) is `--bmaxdivn` 4.  This is
+configured automatically by default; use `-a`/`--noauto` to configure manually.
+
+    --bmaxdivn <int>
+
+The maximum number of suffixes allowed in a block, expressed as a fraction of
+the length of the reference.  Setting this option overrides any previous setting
+for `--bmax`, or `--bmaxdivn`.  Default: `--bmaxdivn` 4.  This is
+configured automatically by default; use `-a`/`--noauto` to configure manually.
+
+    --dcv <int>
+
+Use `<int>` as the period for the difference-cover sample.  A larger period
+yields less memory overhead, but may make suffix sorting slower, especially if
+repeats are present.  Must be a power of 2 no greater than 4096.  Default: 1024.
+ This is configured automatically by default; use `-a`/`--noauto` to configure
+manually.
+
+    --nodc
+
+Disable use of the difference-cover sample.  Suffix sorting becomes
+quadratic-time in the worst case (where the worst case is an extremely
+repetitive reference).  Default: off.
+
+    -o/--offrate <int>
+
+To map alignments back to positions on the reference sequences, it's necessary
+to annotate ("mark") some or all of the [Burrows-Wheeler] rows with their
+corresponding location on the genome. 
+`-o`/`--offrate` governs how many rows get marked:
+the indexer will mark every 2^`<int>` rows.  Marking more rows makes
+reference-position lookups faster, but requires more memory to hold the
+annotations at runtime.  The default is 4 (every 16th row is marked; for human
+genome, annotations occupy about 680 megabytes).  
+
+    -t/--ftabchars <int>
+
+The ftab is the lookup table used to calculate an initial [Burrows-Wheeler]
+range with respect to the first `<int>` characters of the query.  A larger
+`<int>` yields a larger lookup table but faster query times.  The ftab has size
+4^(`<int>`+1) bytes.  The default setting is 10 (ftab is 4MB).
+
+    --seed <int>
+
+Use `<int>` as the seed for pseudo-random number generator.
+
+    --kmer-count <int>
+
+Use `<int>` as kmer-size for counting the distinct number of k-mers in the input sequences.
+
+    -q/--quiet
+
+`centrifuge-build` is verbose by default.  With this option `centrifuge-build` will
+print only error messages.
+
+    -h/--help
+
+Print usage information and quit.
+
+    --version
+
+Print version information and quit.
+
+The `centrifuge-inspect` index inspector
+=====================================
+
+`centrifuge-inspect` extracts information from a Centrifuge index about what kind of
+index it is and what reference sequences were used to build it. When run without
+any options, the tool will output a FASTA file containing the sequences of the
+original references (with all non-`A`/`C`/`G`/`T` characters converted to `N`s).
+ It can also be used to extract just the reference sequence names using the
+`-n`/`--names` option or a more verbose summary using the `-s`/`--summary`
+option.
+
+Command Line
+------------
+
+Usage:
+
+    centrifuge-inspect [options]* <cf_base>
+
+### Main arguments
+
+The basename of the index to be inspected.  The basename is name of any of the
+index files but with the `.X.cf` suffix omitted.
+`centrifuge-inspect` first looks in the current directory for the index files, then
+in the directory specified in the `CENTRIFUGE_INDEXES` environment variable.
+
+### Options
+
+    -a/--across <int>
+
+When printing FASTA output, output a newline character every `<int>` bases
+(default: 60).
+
+    -n/--names
+
+Print reference sequence names, one per line, and quit.
+
+    -s/--summary
+
+Print a summary that includes information about index settings, as well as the
+names and lengths of the input sequences.  The summary has this format: 
+
+    Colorspace	<0 or 1>
+    SA-Sample	1 in <sample>
+    FTab-Chars	<chars>
+    Sequence-1	<name>	<len>
+    Sequence-2	<name>	<len>
+    ...
+    Sequence-N	<name>	<len>
+
+Fields are separated by tabs.  Colorspace is always set to 0 for Centrifuge.
+
+    --conversion-table
+
+Print a list of UIDs (unique ID) and corresponding taxonomic IDs.
+
+    --taxonomy-tree
+
+Print taxonomic tree.
+
+    --name-table
+
+Print name table.
+
+    --size-table
+
+Print a list of taxonomic IDs and lengths of the sequences belonging to the same taxonomic IDs.
+
+    -v/--verbose
+
+Print verbose output (for debugging).
+
+    --version
+
+Print version information and quit.
+
+    -h/--help
+
+Print usage information and quit.
+
+Getting started with Centrifuge
+===================================================
+
+Centrifuge comes with some example files to get you started.  The example files
+are not scientifically significant; these files will simply let you start running Centrifuge and
+downstream tools right away.
+
+First follow the manual instructions to [obtain Centrifuge].  Set the `CENTRIFUGE_HOME`
+environment variable to point to the new Centrifuge directory containing the
+`centrifuge`, `centrifuge-build` and `centrifuge-inspect` binaries.  This is important,
+as the `CENTRIFUGE_HOME` variable is used in the commands below to refer to that
+directory.
+
+Indexing a reference genome
+---------------------------
+
+To create an index for two small sequences included with Centrifuge, create a new temporary directory (it doesn't matter where), change into that directory, and run:
+
+    $CENTRIFUGE_HOME/centrifuge-build --conversion-table $CENTRIFUGE_HOME/example/reference/gi_to_tid.dmp --taxonomy-tree $CENTRIFUGE_HOME/example/reference/nodes.dmp --name-table $CENTRIFUGE_HOME/example/reference/names.dmp $CENTRIFUGE_HOME/example/reference/test.fa test
+
+The command should print many lines of output then quit. When the command
+completes, the current directory will contain three new files that all start with
+`test` and end with `.1.cf`, `.2.cf`, `.3.cf`.  These files constitute the index - you're done!
+
+You can use `centrifuge-build` to create an index for a set of FASTA files obtained
+from any source, including sites such as [UCSC], [NCBI], and [Ensembl]. When
+indexing multiple FASTA files, specify all the files using commas to separate
+file names.  For more details on how to create an index with `centrifuge-build`,
+see the [manual section on index building].  You may also want to bypass this
+process by obtaining a pre-built index.
+
+[UCSC]: http://genome.ucsc.edu/cgi-bin/hgGateway
+[NCBI]: http://www.ncbi.nlm.nih.gov/sites/genome
+[Ensembl]: http://www.ensembl.org/
+
+Classifying example reads
+----------------------
+
+Stay in the directory created in the previous step, which now contains the
+`test` index files.  Next, run:
+
+    $CENTRIFUGE_HOME/centrifuge -f -x test $CENTRIFUGE_HOME/example/reads/input.fa
+
+This runs the Centrifuge classifier, which classifies a set of unpaired reads to
+the genomes using the index generated in the previous step.
+The classification results are reported to stdout, and a
+short classification summary is written to centrifuge_report.tsv.
+
+You will see something like this:
+
+    readID  seqID taxID     score	2ndBestScore	hitLength	numMatches
+    C_1 gi|7     9913      4225	4225		80		2
+    C_1 gi|4     9646      4225	4225		80		2
+    C_2 gi|4     9646      4225	4225		80		2
+    C_2 gi|7     9913      4225	4225		80		2
+    C_3 gi|7     9913      4225	4225		80		2
+    C_3 gi|4     9646      4225	4225		80		2
+    C_4 gi|4     9646      4225	4225		80		2
+    C_4 gi|7     9913      4225	4225		80		2
+    1_1 gi|4     9646      4225	0		80		1
+    1_2 gi|4     9646      4225	0		80		1
+    2_1 gi|7     9913      4225	0		80		1
+    2_2 gi|7     9913      4225	0		80		1
+    2_3 gi|7     9913      4225	0		80		1
+    2_4 gi|7     9913      4225	0		80		1
+    2_5 gi|7     9913      4225	0		80		1
+    2_6 gi|7     9913      4225	0		80		1
diff --git a/MANUAL.markdown b/MANUAL.markdown
new file mode 100644
index 0000000..714e4fc
--- /dev/null
+++ b/MANUAL.markdown
@@ -0,0 +1,1505 @@
+
+
+<!--
+ ! This manual is written in "markdown" format and thus contains some
+ ! distracting formatting clutter.  See 'MANUAL' for an easier-to-read version
+ ! of this text document, or see the HTML manual online.
+ ! -->
+
+Introduction
+============
+
+What is Centrifuge?
+-----------------
+
+[Centrifuge] is a novel microbial classification engine that enables
+rapid, accurate, and sensitive labeling of reads and quantification of
+species on desktop computers.  The system uses a novel indexing scheme
+based on the Burrows-Wheeler transform (BWT) and the Ferragina-Manzini
+(FM) index, optimized specifically for the metagenomic classification
+problem. Centrifuge requires a relatively small index (5.8 GB for all
+complete bacterial and viral genomes plus the human genome) and
+classifies sequences at a very high speed, allowing it to process the
+millions of reads from a typical high-throughput DNA sequencing run
+within a few minutes.  Together these advances enable timely and
+accurate analysis of large metagenomics data sets on conventional
+desktop computers.
+
+[Centrifuge]:     http://www.ccb.jhu.edu/software/centrifuge
+
+[Burrows-Wheeler Transform]: http://en.wikipedia.org/wiki/Burrows-Wheeler_transform
+[FM Index]:        http://en.wikipedia.org/wiki/FM-index
+
+[GPLv3 license]:   http://www.gnu.org/licenses/gpl-3.0.html
+
+
+Obtaining Centrifuge
+==================
+
+Download Centrifuge and binaries from the Releases sections on the right side.
+Binaries are available for Intel architectures (`x86_64`) running Linux, and Mac OS X.
+
+Building from source
+--------------------
+
+Building Centrifuge from source requires a GNU-like environment with GCC, GNU Make
+and other basics.  It should be possible to build Centrifuge on most vanilla Linux
+installations or on a Mac installation with [Xcode] installed.  Centrifuge can
+also be built on Windows using [Cygwin] or [MinGW] (MinGW recommended). For a 
+MinGW build the choice of what compiler is to be used is important since this
+will determine if a 32 or 64 bit code can be successfully compiled using it. If 
+there is a need to generate both 32 and 64 bit on the same machine then a multilib 
+MinGW has to be properly installed. [MSYS], the [zlib] library, and depending on 
+architecture [pthreads] library are also required. We are recommending a 64 bit
+build since it has some clear advantages in real life research problems. In order 
+to simplify the MinGW setup it might be worth investigating popular MinGW personal 
+builds since these are coming already prepared with most of the toolchains needed.
+
+First, download the [source package] from the Releases section on the right side.
+Unzip the file, change to the unzipped directory, and build the
+Centrifuge tools by running GNU `make` (usually with the command `make`, but
+sometimes with `gmake`) with no arguments.  If building with MinGW, run `make`
+from the MSYS environment.
+
+Centrifuge is using the multithreading software model in order to speed up 
+execution times on SMP architectures where this is possible. On POSIX 
+platforms (like linux, Mac OS, etc) it needs the pthread library. Although
+it is possible to use pthread library on non-POSIX platform like Windows, due
+to performance reasons Centrifuge will try to use Windows native multithreading
+if possible.
+
+For the support of SRA data access in Centrifuge, please download and install the [NCBI-NGS] toolkit.
+When running `make`, specify additional variables as follow.
+`make USE_SRA=1 NCBI_NGS_DIR=/path/to/NCBI-NGS-directory NCBI_VDB_DIR=/path/to/NCBI-NGS-directory`,
+where `NCBI_NGS_DIR` and `NCBI_VDB_DIR` will be used in Makefile for -I and -L compilation options.
+For example, $(NCBI_NGS_DIR)/include and $(NCBI_NGS_DIR)/lib64 will be used.  
+
+[Cygwin]:   http://www.cygwin.com/
+[MinGW]:    http://www.mingw.org/
+[MSYS]:     http://www.mingw.org/wiki/msys
+[zlib]:     http://cygwin.com/packages/mingw-zlib/
+[pthreads]: http://sourceware.org/pthreads-win32/
+[GnuWin32]: http://gnuwin32.sf.net/packages/coreutils.htm
+[Xcode]:    http://developer.apple.com/xcode/
+[Github site]: https://github.com/infphilo/centrifuge
+[NCBI-NGS]: https://github.com/ncbi/ngs/wiki/Downloads
+
+Running Centrifuge
+=============
+
+Adding to PATH
+--------------
+
+By adding your new Centrifuge directory to your [PATH environment variable], you
+ensure that whenever you run `centrifuge`, `centrifuge-build`, `centrifuge-download` or `centrifuge-inspect`
+from the command line, you will get the version you just installed without
+having to specify the entire path.  This is recommended for most users.  To do
+this, follow your operating system's instructions for adding the directory to
+your [PATH].
+
+If you would like to install Centrifuge by copying the Centrifuge executable files
+to an existing directory in your [PATH], make sure that you copy all the
+executables, including `centrifuge`, `centrifuge-class`, `centrifuge-build`, `centrifuge-build-bin`, `centrifuge-download` `centrifuge-inspect`
+and `centrifuge-inspect-bin`. Furthermore you need the programs
+in the scripts/ folder if you opt for genome compression in the database construction.
+
+[PATH environment variable]: http://en.wikipedia.org/wiki/PATH_(variable)
+[PATH]: http://en.wikipedia.org/wiki/PATH_(variable)
+
+
+Before running Centrifuge
+-----------------
+
+Classification is considerably different from alignment in that classification is performed on a large set of genomes as opposed to on just one reference genome as in alignment.  Currently, an enormous number of complete genomes are available at the GenBank (e.g. >4,000 bacterial genomes, >10,000 viral genomes, …).  These genomes are organized in a taxonomic tree where each genome is located at the bottom of the tree, at the strain or subspecies level.  On the taxonomic tree, genomes hav [...]
+
+Given the gigantic number of genomes available, which continues to expand at a rapid rate, and the development of the taxonomic tree, which continues to evolve with new advancements in research, we have designed Centrifuge to be flexible and general enough to reflect this huge database.  We provide several standard indexes that will meet most of users’ needs (see the side panel - Indexes).  In our approach our indexes not only include raw genome sequences, but also genome names/sizes and [...]
+
+We encourage first time users to take a look at and follow a [`small example`] that illustrates how to build an index, how to run Centrifuge using the index, how to interpret the classification results, and how to extract additional genomic information from the index.  For those who choose to build customized indexes, please take a close look at the following description.
+
+Database download and index building
+-----------------
+
+Centrifuge indexes can be built with arbitrary sequences. Standard choices are
+all of the complete bacterial and viral genomes, or using the sequences that
+are part of the BLAST nt database. Centrifuge always needs the
+nodes.dmp file from the NCBI taxonomy dump to build the taxonomy tree,
+as well as a sequence ID to taxonomy ID map. The map is a tab-separated
+file with the sequence ID to taxonomy ID map.
+
+To download all of the complete archaeal, viral, and bacterial genomes from RefSeq, and
+build the index:
+
+Centrifuge indices can be built on arbitrary sequences. Usually an ensemble of
+genomes is used - such as all complete microbial genomes in the RefSeq database,
+or all sequences in the BLAST nt database. 
+
+
+To map sequence identifiers to taxonomy IDs, and taxonomy IDs to names and 
+its parents, three files are necessary in addition to the sequence files:
+
+ - taxonomy tree: typically nodes.dmp from the NCBI taxonomy dump. Links taxonomy IDs to their parents
+ - names file: typically names.dmp from the NCBI taxonomy dump. Links taxonomy IDs to their scientific name
+ - a tab-separated sequence ID to taxonomy ID mapping
+
+When using the provided scripts to download the genomes, these files are automatically downloaded or generated. 
+When using a custom taxonomy or sequence files, please refer to the section `TODO` to learn more about their format.
+
+### Building index on all complete bacterial and viral genomes
+
+Use `centrifuge-download` to download genomes from NCBI. The following two commands download
+the NCBI taxonomy to `taxonomy/` in the current directory, and all complete archaeal,
+bacterial and viral genomes to `library/`. Low-complexity regions in the genomes are masked after
+download (parameter `-m`) using blast+'s `dustmasker`. `centrifuge-download` outputs tab-separated 
+sequence ID to taxonomy ID mappings to standard out, which are required by `centrifuge-build`.
+
+    centrifuge-download -o taxonomy taxonomy
+    centrifuge-download -o library -m -d "archaea,bacteria,viral" refseq > seqid2taxid.map
+
+To build the index, first concatenate all downloaded sequences into a single file, and then
+run `centrifuge-build`:
+    
+    cat library/*/*.fna > input-sequences.fna
+
+    ## build centrifuge index with 4 threads
+    centrifuge-build -p 4 --conversion-table seqid2taxid.map \
+                     --taxonomy-tree taxonomy/nodes.dmp --name-table taxonomy/names.dmp \
+                     input-sequences.fna abv
+
+After building the index, all files except the index *.[123].cf files may be removed.
+If you also want to include the human and/or the mouse genome, add their sequences to 
+the library folder before building the index, using one of the commands shown in the next section.
+
+After the index building, all but the *.[123].cf index files may be removed. I.e. the files in
+the `library/` and `taxonomy/` directories are no longer needed.
+
+### Adding human or mouse genome to the index
+The human and mouse genomes can also be downloaded using `centrifuge-download`. They are in the
+domain "vertebrate_mammalian" (argument `-d`), are assembled at the chromosome level (argument `-a`)
+and categorized as reference genomes by RefSeq (`-c`). The argument `-t` takes a comma-separated
+list of taxonomy IDs - e.g. `9606` for human and `10090` for mouse:
+
+    # download mouse and human reference genomes
+    centrifuge-download -o library -d "vertebrate_mammalian" -a "Chromosome" -t 9606,10090 -c 'reference genome' >> seqid2taxid.map
+    # only human
+    centrifuge-download -o library -d "vertebrate_mammalian" -a "Chromosome" -t 9606 -c 'reference genome' >> seqid2taxid.map
+    # only mouse
+    centrifuge-download -o library -d "vertebrate_mammalian" -a "Chromosome" -t 10090 -c 'reference genome' >> seqid2taxid.map
+
+
+### nt database
+
+NCBI BLAST's nt database contains all spliced non-redundant coding
+sequences from multiple databases, inferred from genomic
+sequences. Traditionally used with BLAST, a download of the FASTA is
+provided on the NCBI homepage. Building an index with any database 
+requires the user to create a sequence ID to taxonomy ID map that 
+can be generated from a GI taxid dump:
+
+    wget ftp://ftp.ncbi.nih.gov/blast/db/FASTA/nt.gz
+    gunzip nt.gz && mv -v nt nt.fa
+
+    # Get mapping file
+    wget ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_nucl.dmp.gz
+    gunzip -c gi_taxid_nucl.dmp.gz | sed 's/^/gi|/' > gi_taxid_nucl.map
+
+    # build index using 16 cores and a small bucket size, which will require less memory
+    centrifuge-build -p 16 --bmax 1342177280 --conversion-table gi_taxid_nucl.map \
+                     --taxonomy-tree taxonomy/nodes.dmp --name-table taxonomy/names.dmp \ 
+                     nt.fa nt
+
+
+
+### Custom database
+
+TODO: Add toy example for nodes.dmp, names.dmp and seqid2taxid.map
+
+
+### Centrifuge classification output
+
+The following example shows classification assignments for a read.  The assignment output has 8 columns.
+
+    readID    seqID   taxID score	   2ndBestScore	   hitLength	queryLength	numMatches
+    1_1	      gi|4    9646  4225	   0		       80	80		1
+
+    The first column is the read ID from a raw sequencing read (e.g., 1_1 in the example).
+    The second column is the sequence ID of the genomic sequence, where the read is classified (e.g., gi|4).
+    The third column is the taxonomic ID of the genomic sequence in the second column (e.g., 9646).
+    The fourth column is the score for the classification, which is the weighted sum of hits (e.g., 4225)
+    The fifth column is the score for the next best classification (e.g., 0).
+    The sixth column is an approximate number of base pairs of the read that match the genomic sequence (e.g., 80).
+    The seventh column is the length of a read or, in the case of paired-end reads, the combined length of the mate pairs (e.g., 80). 
+    The eighth column is the number of classifications, indicating how many assignments were made (e.g.,1).
+
+### Centrifuge summary output (the default filename is centrifuge_report.tsv)
+
+The following example shows a classification summary for each genome or taxonomic unit.  The assignment output has 7 columns.
+
+    name      	      	      		     	     	      	     	taxID	taxRank	   genomeSize 	numReads   numUniqueReads   abundance
+    Wigglesworthia glossinidia endosymbiont of Glossina brevipalpis	36870	leaf	   703004		5981	   5964	            0.0152317
+
+    The first column is the name of a genome, or the name corresponding to a taxonomic ID (the second column) at a rank higher than the strain (e.g., Wigglesworthia glossinidia endosymbiont of Glossina brevipalpis).
+    The second column is the taxonomic ID (e.g., 36870).
+    The third column is the taxonomic rank (e.g., leaf).
+    The fourth column is the length of the genome sequence (e.g., 703004).
+    The fifth column is the number of reads classified to this genomic sequence including multi-classified reads (e.g., 5981).
+    The sixth column is the number of reads uniquely classified to this genomic sequence (e.g., 5964).
+    The seventh column is the proportion of this genome normalized by its genomic length (e.g., 0.0152317).
+
+
+
+
+Inspecting the Centrifuge index
+-----------------------
+
+The index can be inspected with `centrifuge-inspect`.  To extract raw sequences:
+
+    centrifuge-inspect <centrifuge index>
+
+Extract the sequence ID to taxonomy ID conversion table from the index
+
+    centrifuge-inspect --conversion-table <centrifuge index>
+
+Extract the taxonomy tree from the index:
+
+    centrifuge-inspect --taxonomy-tree <centrifuge index>
+
+Extract the lengths of the sequences from the index (each row has two columns: taxonomic ID and length):
+
+    centrifuge-inspect --size-table <centrifuge index>
+
+Extract the names from the index (each row has two columns: taxonomic ID and name):
+
+    centrifuge-inspect --name-table <centrifuge index>
+    
+
+
+Wrapper
+-------
+
+The `centrifuge`, `centrifuge-build` and `centrifuge-inspect` executables are actually 
+wrapper scripts that call binary programs as appropriate. Also, the `centrifuge` wrapper
+provides some key functionality, like the ability to handle compressed inputs,
+and the functionality for [`--un`], [`--al`] and related options.
+
+It is recommended that you always run the centrifuge wrappers and not run the
+binaries directly.
+
+Performance tuning
+------------------
+
+1.  If your computer has multiple processors/cores, use `-p NTHREADS`
+
+    The [`-p`] option causes Centrifuge to launch a specified number of parallel
+    search threads.  Each thread runs on a different processor/core and all
+    threads find alignments in parallel, increasing alignment throughput by
+    approximately a multiple of the number of threads (though in practice,
+    speedup is somewhat worse than linear).
+
+Command Line
+------------
+
+
+### Usage
+
+    centrifuge [options]* -x <centrifuge-idx> {-1 <m1> -2 <m2> | -U <r> | --sra-acc <SRA accession number>} [--report-file <report file name> -S <classification output file name>]
+
+### Main arguments
+
+<table><tr><td>
+
+[`-x`]: #centrifuge-options-x
+
+    -x <centrifuge-idx>
+
+</td><td>
+
+The basename of the index for the reference genomes.  The basename is the name of
+any of the index files up to but not including the final `.1.cf` / etc.  
+`centrifuge` looks for the specified index first in the current directory,
+then in the directory specified in the `CENTRIFUGE_INDEXES` environment variable.
+
+</td></tr><tr><td>
+
+[`-1`]: #centrifuge-options-1
+
+    -1 <m1>
+
+</td><td>
+
+Comma-separated list of files containing mate 1s (filename usually includes
+`_1`), e.g. `-1 flyA_1.fq,flyB_1.fq`.  Sequences specified with this option must
+correspond file-for-file and read-for-read with those specified in `<m2>`. Reads
+may be a mix of different lengths. If `-` is specified, `centrifuge` will read the
+mate 1s from the "standard in" or "stdin" filehandle.
+
+</td></tr><tr><td>
+
+[`-2`]: #centrifuge-options-2
+
+    -2 <m2>
+
+</td><td>
+
+Comma-separated list of files containing mate 2s (filename usually includes
+`_2`), e.g. `-2 flyA_2.fq,flyB_2.fq`.  Sequences specified with this option must
+correspond file-for-file and read-for-read with those specified in `<m1>`. Reads
+may be a mix of different lengths. If `-` is specified, `centrifuge` will read the
+mate 2s from the "standard in" or "stdin" filehandle.
+
+</td></tr><tr><td>
+
+[`-U`]: #centrifuge-options-U
+
+    -U <r>
+
+</td><td>
+
+Comma-separated list of files containing unpaired reads to be aligned, e.g.
+`lane1.fq,lane2.fq,lane3.fq,lane4.fq`.  Reads may be a mix of different lengths.
+If `-` is specified, `centrifuge` gets the reads from the "standard in" or "stdin"
+filehandle.
+
+</td></tr><tr><td>
+
+[`--sra-acc`]: #hisat2-options-sra-acc
+
+    --sra-acc <SRA accession number>
+
+</td><td>
+
+Comma-separated list of SRA accession numbers, e.g. `--sra-acc SRR353653,SRR353654`.
+Information about read types is available at http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?sp=runinfo&acc=<b>sra-acc</b>&retmode=xml,
+where <b>sra-acc</b> is SRA accession number.  If users run Centrifuge on a computer cluster, it is recommended to disable SRA-related caching (see the instruction at [SRA-MANUAL]).
+
+[SRA-MANUAL]:	     https://github.com/ncbi/sra-tools/wiki/Toolkit-Configuration
+
+</td></tr><tr><td>
+
+[`-S`]: #centrifuge-options-S
+
+    -S <filename>
+
+</td><td>
+
+File to write classification results to.  By default, assignments are written to the
+"standard out" or "stdout" filehandle (i.e. the console).
+
+</td></tr><tr><td>
+
+[`--report-file`]: #centrifuge-options-report-file
+
+    --report-file <filename>
+
+</td><td>
+
+File to write a classification summary to (default: centrifuge_report.tsv).
+
+</td></tr></table>
+
+### Options
+
+#### Input options
+
+<table>
+<tr><td id="centrifuge-options-q">
+
+[`-q`]: #centrifuge-options-q
+
+    -q
+
+</td><td>
+
+Reads (specified with `<m1>`, `<m2>`, `<s>`) are FASTQ files.  FASTQ files
+usually have extension `.fq` or `.fastq`.  FASTQ is the default format.  See
+also: [`--solexa-quals`] and [`--int-quals`].
+
+</td></tr>
+<tr><td id="centrifuge-options-qseq">
+
+[`--qseq`]: #centrifuge-options-qseq
+
+    --qseq
+
+</td><td>
+
+Reads (specified with `<m1>`, `<m2>`, `<s>`) are QSEQ files.  QSEQ files usually
+end in `_qseq.txt`.  See also: [`--solexa-quals`] and [`--int-quals`].
+
+</td></tr>
+<tr><td id="centrifuge-options-f">
+
+[`-f`]: #centrifuge-options-f
+
+    -f
+
+</td><td>
+
+Reads (specified with `<m1>`, `<m2>`, `<s>`) are FASTA files.  FASTA files
+usually have extension `.fa`, `.fasta`, `.mfa`, `.fna` or similar.  FASTA files
+do not have a way of specifying quality values, so when `-f` is set, the result
+is as if `--ignore-quals` is also set.
+
+</td></tr>
+<tr><td id="centrifuge-options-r">
+
+[`-r`]: #centrifuge-options-r
+
+    -r
+
+</td><td>
+
+Reads (specified with `<m1>`, `<m2>`, `<s>`) are files with one input sequence
+per line, without any other information (no read names, no qualities).  When
+`-r` is set, the result is as if `--ignore-quals` is also set.
+
+</td></tr>
+<tr><td id="centrifuge-options-c">
+
+[`-c`]: #centrifuge-options-c
+
+    -c
+
+</td><td>
+
+The read sequences are given on command line.  I.e. `<m1>`, `<m2>` and
+`<singles>` are comma-separated lists of reads rather than lists of read files.
+There is no way to specify read names or qualities, so `-c` also implies
+`--ignore-quals`.
+
+</td></tr>
+<tr><td id="centrifuge-options-s">
+
+[`-s`/`--skip`]: #centrifuge-options-s
+[`-s`]: #centrifuge-options-s
+
+    -s/--skip <int>
+
+</td><td>
+
+Skip (i.e. do not align) the first `<int>` reads or pairs in the input.
+
+</td></tr>
+<tr><td id="centrifuge-options-u">
+
+[`-u`/`--qupto`]: #centrifuge-options-u
+[`-u`]: #centrifuge-options-u
+
+    -u/--qupto <int>
+
+</td><td>
+
+Align the first `<int>` reads or read pairs from the input (after the
+[`-s`/`--skip`] reads or pairs have been skipped), then stop.  Default: no limit.
+
+</td></tr>
+<tr><td id="centrifuge-options-5">
+
+[`-5`/`--trim5`]: #centrifuge-options-5
+[`-5`]: #centrifuge-options-5
+
+    -5/--trim5 <int>
+
+</td><td>
+
+Trim `<int>` bases from 5' (left) end of each read before alignment (default: 0).
+
+</td></tr>
+<tr><td id="centrifuge-options-3">
+
+[`-3`/`--trim3`]: #centrifuge-options-3
+[`-3`]: #centrifuge-options-3
+
+    -3/--trim3 <int>
+
+</td><td>
+
+Trim `<int>` bases from 3' (right) end of each read before alignment (default:
+0).
+
+</td></tr><tr><td id="centrifuge-options-phred33-quals">
+
+[`--phred33`]: #centrifuge-options-phred33-quals
+
+    --phred33
+
+</td><td>
+
+Input qualities are ASCII chars equal to the [Phred quality] plus 33.  This is
+also called the "Phred+33" encoding, which is used by the very latest Illumina
+pipelines.
+
+[Phred quality]: http://en.wikipedia.org/wiki/Phred_quality_score
+
+</td></tr>
+<tr><td id="centrifuge-options-phred64-quals">
+
+[`--phred64`]: #centrifuge-options-phred64-quals
+
+    --phred64
+
+</td><td>
+
+Input qualities are ASCII chars equal to the [Phred quality] plus 64.  This is
+also called the "Phred+64" encoding.
+
+</td></tr>
+<tr><td id="centrifuge-options-solexa-quals">
+
+[`--solexa-quals`]: #centrifuge-options-solexa-quals
+
+    --solexa-quals
+
+</td><td>
+
+Convert input qualities from [Solexa][Phred quality] (which can be negative) to
+[Phred][Phred quality] (which can't).  This scheme was used in older Illumina GA
+Pipeline versions (prior to 1.3).  Default: off.
+
+</td></tr>
+<tr><td id="centrifuge-options-int-quals">
+
+[`--int-quals`]: #centrifuge-options-int-quals
+
+    --int-quals
+
+</td><td>
+
+Quality values are represented in the read input file as space-separated ASCII
+integers, e.g., `40 40 30 40`..., rather than ASCII characters, e.g., `II?I`....
+ Integers are treated as being on the [Phred quality] scale unless
+[`--solexa-quals`] is also specified. Default: off.
+
+</td></tr></table>
+
+#### Classification
+
+<table>
+
+<tr><td id="centrifuge-options-k">
+
+[`-k`]: #centrifuge-options-k
+
+    -k <int>
+
+</td><td>
+
+It searches for at most `<int>` distinct, primary assignments for each read or pair.  
+Primary assignments mean assignments whose assignment score is equal or higher than any other assignments.
+If there are more primary assignments than this value, 
+the search will merge some of the assignments into a higher taxonomic rank.
+The assignment score for a paired-end assignment equals the sum of the assignment scores of the individual mates. 
+Default: 5
+
+</td></tr>
+
+<tr><td id="centrifuge-options-host-taxids">
+
+[`--host-taxids`]: #centrifuge-options-host-taxids
+
+    --host-taxids
+
+</td><td>
+
+A comma-separated list of taxonomic IDs that will be preferred in classification procedure.
+The descendants from these IDs will also be preferred.  In case some of a read's assignments correspond to
+these taxonomic IDs, only those corresponding assignments will be reported.
+
+</td></tr>
+
+<tr><td id="centrifuge-options-exclude-taxids">
+
+[`--exclude-taxids`]: #centrifuge-options-exclude-taxids
+
+    --exclude-taxids
+
+</td><td>
+
+A comma-separated list of taxonomic IDs that will be excluded in classification procedure.
+The descendants from these IDs will also be excluded. 
+
+</td></tr>
+
+</table>
+
+
+<!--
+#### Alignment options
+
+<table>
+
+<tr><td id="centrifuge-options-n-ceil">
+
+[`--n-ceil`]: #centrifuge-options-n-ceil
+
+    --n-ceil <func>
+
+</td><td>
+
+Sets a function governing the maximum number of ambiguous characters (usually
+`N`s and/or `.`s) allowed in a read as a function of read length.  For instance,
+specifying `-L,0,0.15` sets the N-ceiling function `f` to `f(x) = 0 + 0.15 * x`,
+where x is the read length.  See also: [setting function options].  Reads
+exceeding this ceiling are [filtered out].  Default: `L,0,0.15`.
+
+[filtered out]: #filtering
+
+</td></tr>
+
+<tr><td id="centrifuge-options-ignore-quals">
+
+[`--ignore-quals`]: #centrifuge-options-ignore-quals
+
+    --ignore-quals
+
+</td><td>
+
+When calculating a mismatch penalty, always consider the quality value at the
+mismatched position to be the highest possible, regardless of the actual value. 
+I.e. input is treated as though all quality values are high.  This is also the
+default behavior when the input doesn't specify quality values (e.g. in [`-f`],
+[`-r`], or [`-c`] modes).
+
+</td></tr>
+<tr><td id="centrifuge-options-nofw">
+
+[`--nofw`]: #centrifuge-options-nofw
+
+    --nofw/--norc
+
+</td><td>
+
+If `--nofw` is specified, `centrifuge` will not attempt to align unpaired reads to
+the forward (Watson) reference strand.  If `--norc` is specified, `centrifuge` will
+not attempt to align unpaired reads against the reverse-complement (Crick)
+reference strand. In paired-end mode, `--nofw` and `--norc` pertain to the
+fragments; i.e. specifying `--nofw` causes `centrifuge` to explore only those
+paired-end configurations corresponding to fragments from the reverse-complement
+(Crick) strand.  Default: both strands enabled. 
+
+</td></tr>
+
+</table>
+
+#### Paired-end options
+
+<table>
+
+<tr><td id="centrifuge-options-fr">
+
+[`--fr`/`--rf`/`--ff`]: #centrifuge-options-fr
+[`--fr`]: #centrifuge-options-fr
+[`--rf`]: #centrifuge-options-fr
+[`--ff`]: #centrifuge-options-fr
+
+    --fr/--rf/--ff
+
+</td><td>
+
+The upstream/downstream mate orientations for a valid paired-end alignment
+against the forward reference strand.  E.g., if `--fr` is specified and there is
+a candidate paired-end alignment where mate 1 appears upstream of the reverse
+complement of mate 2 and the fragment length constraints ([`-I`] and [`-X`]) are
+met, that alignment is valid.  Also, if mate 2 appears upstream of the reverse
+complement of mate 1 and all other constraints are met, that too is valid.
+`--rf` likewise requires that an upstream mate1 be reverse-complemented and a
+downstream mate2 be forward-oriented. ` --ff` requires both an upstream mate 1
+and a downstream mate 2 to be forward-oriented.  Default: `--fr` (appropriate
+for Illumina's Paired-end Sequencing Assay).
+
+</td></tr></table>
+-->
+
+#### Output options
+
+<table>
+
+<tr><td id="centrifuge-options-t">
+
+[`-t`/`--time`]: #centrifuge-options-t
+[`-t`]: #centrifuge-options-t
+
+    -t/--time
+
+</td><td>
+
+Print the wall-clock time required to load the index files and align the reads. 
+This is printed to the "standard error" ("stderr") filehandle.  Default: off.
+
+</td></tr>
+
+<!--
+<tr><td id="centrifuge-options-un">
+
+[`--un`]: #centrifuge-options-un
+[`--un-gz`]: #centrifuge-options-un
+[`--un-bz2`]: #centrifuge-options-un
+
+    --un <path>
+    --un-gz <path>
+    --un-bz2 <path>
+
+</td><td>
+
+Write unpaired reads that fail to align to file at `<path>`.  These reads
+correspond to the SAM records with the FLAGS `0x4` bit set and neither the
+`0x40` nor `0x80` bits set.  If `--un-gz` is specified, output will be gzip
+compressed. If `--un-bz2` is specified, output will be bzip2 compressed.  Reads
+written in this way will appear exactly as they did in the input file, without
+any modification (same sequence, same name, same quality string, same quality
+encoding).  Reads will not necessarily appear in the same order as they did in
+the input.
+
+</td></tr>
+<tr><td id="centrifuge-options-al">
+
+[`--al`]: #centrifuge-options-al
+[`--al-gz`]: #centrifuge-options-al
+[`--al-bz2`]: #centrifuge-options-al
+
+    --al <path>
+    --al-gz <path>
+    --al-bz2 <path>
+
+</td><td>
+
+Write unpaired reads that align at least once to file at `<path>`.  These reads
+correspond to the SAM records with the FLAGS `0x4`, `0x40`, and `0x80` bits
+unset.  If `--al-gz` is specified, output will be gzip compressed. If `--al-bz2`
+is specified, output will be bzip2 compressed.  Reads written in this way will
+appear exactly as they did in the input file, without any modification (same
+sequence, same name, same quality string, same quality encoding).  Reads will
+not necessarily appear in the same order as they did in the input.
+
+</td></tr>
+<tr><td id="centrifuge-options-un-conc">
+
+[`--un-conc`]: #centrifuge-options-un-conc
+[`--un-conc-gz`]: #centrifuge-options-un-conc
+[`--un-conc-bz2`]: #centrifuge-options-un-conc
+
+    --un-conc <path>
+    --un-conc-gz <path>
+    --un-conc-bz2 <path>
+
+</td><td>
+
+Write paired-end reads that fail to align concordantly to file(s) at `<path>`.
+These reads correspond to the SAM records with the FLAGS `0x4` bit set and
+either the `0x40` or `0x80` bit set (depending on whether it's mate #1 or #2).
+`.1` and `.2` strings are added to the filename to distinguish which file
+contains mate #1 and mate #2.  If a percent symbol, `%`, is used in `<path>`,
+the percent symbol is replaced with `1` or `2` to make the per-mate filenames.
+Otherwise, `.1` or `.2` are added before the final dot in `<path>` to make the
+per-mate filenames.  Reads written in this way will appear exactly as they did
+in the input files, without any modification (same sequence, same name, same
+quality string, same quality encoding).  Reads will not necessarily appear in
+the same order as they did in the inputs.
+
+</td></tr>
+<tr><td id="centrifuge-options-al-conc">
+
+[`--al-conc`]: #centrifuge-options-al-conc
+[`--al-conc-gz`]: #centrifuge-options-al-conc
+[`--al-conc-bz2`]: #centrifuge-options-al-conc
+
+    --al-conc <path>
+    --al-conc-gz <path>
+    --al-conc-bz2 <path>
+
+</td><td>
+
+Write paired-end reads that align concordantly at least once to file(s) at
+`<path>`. These reads correspond to the SAM records with the FLAGS `0x4` bit
+unset and either the `0x40` or `0x80` bit set (depending on whether it's mate #1
+or #2). `.1` and `.2` strings are added to the filename to distinguish which
+file contains mate #1 and mate #2.  If a percent symbol, `%`, is used in
+`<path>`, the percent symbol is replaced with `1` or `2` to make the per-mate
+filenames. Otherwise, `.1` or `.2` are added before the final dot in `<path>` to
+make the per-mate filenames.  Reads written in this way will appear exactly as
+they did in the input files, without any modification (same sequence, same name,
+same quality string, same quality encoding).  Reads will not necessarily appear
+in the same order as they did in the inputs.
+
+</td></tr>
+-->
+
+<tr><td id="centrifuge-options-quiet">
+
+[`--quiet`]: #centrifuge-options-quiet
+
+    --quiet
+
+</td><td>
+
+Print nothing besides alignments and serious errors.
+
+</td></tr>
+<tr><td id="centrifuge-options-met-file">
+
+[`--met-file`]: #centrifuge-options-met-file
+
+    --met-file <path>
+
+</td><td>
+
+Write `centrifuge` metrics to file `<path>`.  Having alignment metric can be useful
+for debugging certain problems, especially performance issues.  See also:
+[`--met`].  Default: metrics disabled.
+
+</td></tr>
+<tr><td id="centrifuge-options-met-stderr">
+
+[`--met-stderr`]: #centrifuge-options-met-stderr
+
+    --met-stderr
+
+</td><td>
+
+Write `centrifuge` metrics to the "standard error" ("stderr") filehandle.  This is
+not mutually exclusive with [`--met-file`].  Having alignment metric can be
+useful for debugging certain problems, especially performance issues.  See also:
+[`--met`].  Default: metrics disabled.
+
+</td></tr>
+<tr><td id="centrifuge-options-met">
+
+[`--met`]: #centrifuge-options-met
+
+    --met <int>
+
+</td><td>
+
+Write a new `centrifuge` metrics record every `<int>` seconds.  Only matters if
+either [`--met-stderr`] or [`--met-file`] are specified.  Default: 1.
+
+</td></tr>
+</table>
+
+#### Performance options
+
+<table><tr>
+
+<td id="centrifuge-options-o">
+
+[`-o`/`--offrate`]: #centrifuge-options-o
+[`-o`]: #centrifuge-options-o
+[`--offrate`]: #centrifuge-options-o
+
+    -o/--offrate <int>
+
+</td><td>
+
+Override the offrate of the index with `<int>`.  If `<int>` is greater
+than the offrate used to build the index, then some row markings are
+discarded when the index is read into memory.  This reduces the memory
+footprint of the aligner but requires more time to calculate text
+offsets.  `<int>` must be greater than the value used to build the
+index.
+
+</td></tr>
+<tr><td id="centrifuge-options-p">
+
+[`-p`/`--threads`]: #centrifuge-options-p
+[`-p`]: #centrifuge-options-p
+
+    -p/--threads NTHREADS
+
+</td><td>
+
+Launch `NTHREADS` parallel search threads (default: 1).  Threads will run on
+separate processors/cores and synchronize when parsing reads and outputting
+alignments.  Searching for alignments is highly parallel, and speedup is close
+to linear.  Increasing `-p` increases Centrifuge's memory footprint. E.g. when
+aligning to a human genome index, increasing `-p` from 1 to 8 increases the
+memory footprint by a few hundred megabytes.  This option is only available if
+`centrifuge` is linked with the `pthreads` library (i.e. if `BOWTIE_PTHREADS=0` is
+not specified at build time).
+
+</td></tr>
+<tr><td id="centrifuge-options-reorder">
+
+[`--reorder`]: #centrifuge-options-reorder
+
+    --reorder
+
+</td><td>
+
+Guarantees that output records are printed in an order corresponding to the
+order of the reads in the original input file, even when [`-p`] is set greater
+than 1.  Specifying `--reorder` and setting [`-p`] greater than 1 causes Centrifuge
+to run somewhat slower and use somewhat more memory than if `--reorder` were
+not specified.  Has no effect if [`-p`] is set to 1, since output order will
+naturally correspond to input order in that case.
+
+</td></tr>
+<tr><td id="centrifuge-options-mm">
+
+[`--mm`]: #centrifuge-options-mm
+
+    --mm
+
+</td><td>
+
+Use memory-mapped I/O to load the index, rather than typical file I/O.
+Memory-mapping allows many concurrent `bowtie` processes on the same computer to
+share the same memory image of the index (i.e. you pay the memory overhead just
+once).  This facilitates memory-efficient parallelization of `bowtie` in
+situations where using [`-p`] is not possible or not preferable.
+
+</td></tr></table>
+
+#### Other options
+
+<table>
+<tr><td id="centrifuge-options-qc-filter">
+
+[`--qc-filter`]: #centrifuge-options-qc-filter
+
+    --qc-filter
+
+</td><td>
+
+Filter out reads for which the QSEQ filter field is non-zero.  Only has an
+effect when read format is [`--qseq`].  Default: off.
+
+</td></tr>
+<tr><td id="centrifuge-options-seed">
+
+[`--seed`]: #centrifuge-options-seed
+
+    --seed <int>
+
+</td><td>
+
+Use `<int>` as the seed for pseudo-random number generator.  Default: 0.
+
+</td></tr>
+<tr><td id="centrifuge-options-non-deterministic">
+
+[`--non-deterministic`]: #centrifuge-options-non-deterministic
+
+    --non-deterministic
+
+</td><td>
+
+Normally, Centrifuge re-initializes its pseudo-random generator for each read.  It
+seeds the generator with a number derived from (a) the read name, (b) the
+nucleotide sequence, (c) the quality sequence, (d) the value of the [`--seed`]
+option.  This means that if two reads are identical (same name, same
+nucleotides, same qualities) Centrifuge will find and report the same classification(s)
+for both, even if there was ambiguity.  When `--non-deterministic` is specified,
+Centrifuge re-initializes its pseudo-random generator for each read using the
+current time.  This means that Centrifuge will not necessarily report the same
+classification for two identical reads.  This is counter-intuitive for some users,
+but might be more appropriate in situations where the input consists of many
+identical reads.
+
+</td></tr>
+<tr><td id="centrifuge-options-version">
+
+[`--version`]: #centrifuge-options-version
+
+    --version
+
+</td><td>
+
+Print version information and quit.
+
+</td></tr>
+<tr><td id="centrifuge-options-h">
+
+    -h/--help
+
+</td><td>
+
+Print usage information and quit.
+
+</td></tr></table>
+
+
+The `centrifuge-build` indexer
+===========================
+
+`centrifuge-build` builds a Centrifuge index from a set of DNA sequences.
+`centrifuge-build` outputs a set of 3 files with suffixes `.1.cf`, `.2.cf`, and
+`.3.cf`.  These files together
+constitute the index: they are all that is needed to align reads to that
+reference.  The original sequence FASTA files are no longer used by Centrifuge
+once the index is built.
+
+Use of Karkkainen's [blockwise algorithm] allows `centrifuge-build` to trade off
+between running time and memory usage. `centrifuge-build` has two options
+governing how it makes this trade: [`--bmax`]/[`--bmaxdivn`],
+and [`--dcv`].  By default, `centrifuge-build` will automatically search for the
+settings that yield the best running time without exhausting memory.  This
+behavior can be disabled using the [`-a`/`--noauto`] option.
+
+The indexer provides options pertaining to the "shape" of the index, e.g.
+[`--offrate`](#centrifuge-build-options-o) governs the fraction of [Burrows-Wheeler]
+rows that are "marked" (i.e., the density of the suffix-array sample; see the
+original [FM Index] paper for details).  All of these options are potentially
+profitable trade-offs depending on the application.  They have been set to
+defaults that are reasonable for most cases according to our experiments.  See
+[Performance tuning] for details.
+
+The Centrifuge index is based on the [FM Index] of Ferragina and Manzini, which in
+turn is based on the [Burrows-Wheeler] transform.  The algorithm used to build
+the index is based on the [blockwise algorithm] of Karkkainen.
+
+[Blockwise algorithm]: http://portal.acm.org/citation.cfm?id=1314852
+[Burrows-Wheeler]: http://en.wikipedia.org/wiki/Burrows-Wheeler_transform
+[Performance tuning]: #performance-tuning
+
+Command Line
+------------
+
+Usage:
+
+    centrifuge-build [options]* --conversion-table <table_in> --taxonomy-tree <taxonomy_in> --name-table <table_in2> <reference_in> <cf_base>
+
+### Main arguments
+
+<table><tr><td>
+
+    <reference_in>
+
+</td><td>
+
+A comma-separated list of FASTA files containing the reference sequences to be
+aligned to, or, if [`-c`](#centrifuge-build-options-c) is specified, the sequences
+themselves. E.g., `<reference_in>` might be `chr1.fa,chr2.fa,chrX.fa,chrY.fa`,
+or, if [`-c`](#centrifuge-build-options-c) is specified, this might be
+`GGTCATCCT,ACGGGTCGT,CCGTTCTATGCGGCTTA`.
+
+</td></tr><tr><td>
+
+    <cf_base>
+
+</td><td>
+
+The basename of the index files to write.  By default, `centrifuge-build` writes
+files named `NAME.1.cf`, `NAME.2.cf`, and `NAME.3.cf`, where `NAME` is `<cf_base>`.
+
+</td></tr></table>
+
+### Options
+
+<table><tr><td>
+
+    -f
+
+</td><td>
+
+The reference input files (specified as `<reference_in>`) are FASTA files
+(usually having extension `.fa`, `.mfa`, `.fna` or similar).
+
+</td></tr><tr><td id="centrifuge-build-options-c">
+
+    -c
+
+</td><td>
+
+The reference sequences are given on the command line.  I.e. `<reference_in>` is
+a comma-separated list of sequences rather than a list of FASTA files.
+
+</td></tr>
+<tr><td id="centrifuge-build-options-a">
+
+[`-a`/`--noauto`]: #centrifuge-build-options-a
+
+    -a/--noauto
+
+</td><td>
+
+Disable the default behavior whereby `centrifuge-build` automatically selects
+values for the [`--bmax`], [`--dcv`] and [`--packed`] parameters according to
+available memory.  Instead, user may specify values for those parameters.  If
+memory is exhausted during indexing, an error message will be printed; it is up
+to the user to try new parameters.
+
+</td></tr><tr><td id="centrifuge-build-options-p">
+
+[`-p`]: #centrifuge-build-options-p
+
+    -p/--threads <int>
+
+</td><td>
+
+Launch `<int>` parallel threads (default: 1).
+
+</td></tr><tr><td id="centrifuge-build-options-conversion-table">
+
+[`--conversion-table`]: #centrifuge-build-options-conversion-table
+
+    --conversion-table <file>
+
+</td><td>
+
+List of UIDs (unique ID) and corresponding taxonomic IDs.
+
+</td></tr><tr><td id="centrifuge-build-options-taxonomy-tree">
+
+[`--taxonomy-tree`]: #centrifuge-build-options-taxonomy-tree
+
+    --taxonomy-tree <file>
+
+</td><td>
+
+Taxonomic tree (e.g. nodes.dmp).
+
+</td></tr><tr><td id="centrifuge-build-options-name-table">
+
+[`--name-table`]: #centrifuge-build-options-name-table
+
+    --name-table <file>
+
+</td><td>
+
+Name table (e.g. names.dmp).
+
+</td></tr><tr><td id="centrifuge-build-options-size-table">
+
+[`--size-table`]: #centrifuge-build-options-size-table
+
+    --size-table <file>
+
+</td><td>
+
+List of taxonomic IDs and lengths of the sequences belonging to the same taxonomic IDs.
+
+</td></tr><tr><td id="centrifuge-build-options-bmax">
+
+[`--bmax`]: #centrifuge-build-options-bmax
+
+    --bmax <int>
+
+</td><td>
+
+The maximum number of suffixes allowed in a block.  Allowing more suffixes per
+block makes indexing faster, but increases peak memory usage.  Setting this
+option overrides any previous setting for [`--bmax`], or [`--bmaxdivn`]. 
+Default (in terms of the [`--bmaxdivn`] parameter) is [`--bmaxdivn`] 4.  This is
+configured automatically by default; use [`-a`/`--noauto`] to configure manually.
+
+</td></tr><tr><td id="centrifuge-build-options-bmaxdivn">
+
+[`--bmaxdivn`]: #centrifuge-build-options-bmaxdivn
+
+    --bmaxdivn <int>
+
+</td><td>
+
+The maximum number of suffixes allowed in a block, expressed as a fraction of
+the length of the reference.  Setting this option overrides any previous setting
+for [`--bmax`], or [`--bmaxdivn`].  Default: [`--bmaxdivn`] 4.  This is
+configured automatically by default; use [`-a`/`--noauto`] to configure manually.
+
+</td></tr><tr><td id="centrifuge-build-options-dcv">
+
+[`--dcv`]: #centrifuge-build-options-dcv
+
+    --dcv <int>
+
+</td><td>
+
+Use `<int>` as the period for the difference-cover sample.  A larger period
+yields less memory overhead, but may make suffix sorting slower, especially if
+repeats are present.  Must be a power of 2 no greater than 4096.  Default: 1024.
+ This is configured automatically by default; use [`-a`/`--noauto`] to configure
+manually.
+
+</td></tr><tr><td id="centrifuge-build-options-nodc">
+
+[`--nodc`]: #centrifuge-build-options-nodc
+
+    --nodc
+
+</td><td>
+
+Disable use of the difference-cover sample.  Suffix sorting becomes
+quadratic-time in the worst case (where the worst case is an extremely
+repetitive reference).  Default: off.
+
+</td></tr><tr><td id="centrifuge-build-options-o">
+
+    -o/--offrate <int>
+
+</td><td>
+
+To map alignments back to positions on the reference sequences, it's necessary
+to annotate ("mark") some or all of the [Burrows-Wheeler] rows with their
+corresponding location on the genome. 
+[`-o`/`--offrate`](#centrifuge-build-options-o) governs how many rows get marked:
+the indexer will mark every 2^`<int>` rows.  Marking more rows makes
+reference-position lookups faster, but requires more memory to hold the
+annotations at runtime.  The default is 4 (every 16th row is marked; for human
+genome, annotations occupy about 680 megabytes).  
+
+</td></tr><tr><td>
+
+    -t/--ftabchars <int>
+
+</td><td>
+
+The ftab is the lookup table used to calculate an initial [Burrows-Wheeler]
+range with respect to the first `<int>` characters of the query.  A larger
+`<int>` yields a larger lookup table but faster query times.  The ftab has size
+4^(`<int>`+1) bytes.  The default setting is 10 (ftab is 4MB).
+
+</td></tr><tr><td>
+
+    --seed <int>
+
+</td><td>
+
+Use `<int>` as the seed for pseudo-random number generator.
+
+</td></tr><tr><td>
+
+    --kmer-count <int>
+
+</td><td>
+
+Use `<int>` as kmer-size for counting the distinct number of k-mers in the input sequences.
+
+</td></tr><tr><td>
+
+    -q/--quiet
+
+</td><td>
+
+`centrifuge-build` is verbose by default.  With this option `centrifuge-build` will
+print only error messages.
+
+</td></tr><tr><td>
+
+    -h/--help
+
+</td><td>
+
+Print usage information and quit.
+
+</td></tr><tr><td>
+
+    --version
+
+</td><td>
+
+Print version information and quit.
+
+</td></tr></table>
+
+The `centrifuge-inspect` index inspector
+=====================================
+
+`centrifuge-inspect` extracts information from a Centrifuge index about what kind of
+index it is and what reference sequences were used to build it. When run without
+any options, the tool will output a FASTA file containing the sequences of the
+original references (with all non-`A`/`C`/`G`/`T` characters converted to `N`s).
+ It can also be used to extract just the reference sequence names using the
+[`-n`/`--names`] option or a more verbose summary using the [`-s`/`--summary`]
+option.
+
+Command Line
+------------
+
+Usage:
+
+    centrifuge-inspect [options]* <cf_base>
+
+### Main arguments
+
+<table><tr><td>
+
+    <cf_base>
+
+</td><td>
+
+The basename of the index to be inspected.  The basename is name of any of the
+index files but with the `.X.cf` suffix omitted.
+`centrifuge-inspect` first looks in the current directory for the index files, then
+in the directory specified in the `Centrifuge_INDEXES` environment variable.
+
+</td></tr></table>
+
+### Options
+
+<table><tr><td>
+
+    -a/--across <int>
+
+</td><td>
+
+When printing FASTA output, output a newline character every `<int>` bases
+(default: 60).
+
+</td></tr><tr><td id="centrifuge-inspect-options-n">
+
+[`-n`/`--names`]: #centrifuge-inspect-options-n
+
+    -n/--names
+
+</td><td>
+
+Print reference sequence names, one per line, and quit.
+
+</td></tr><tr><td id="centrifuge-inspect-options-s">
+
+[`-s`/`--summary`]: #centrifuge-inspect-options-s
+
+    -s/--summary
+
+</td><td>
+
+Print a summary that includes information about index settings, as well as the
+names and lengths of the input sequences.  The summary has this format: 
+
+    Colorspace	<0 or 1>
+    SA-Sample	1 in <sample>
+    FTab-Chars	<chars>
+    Sequence-1	<name>	<len>
+    Sequence-2	<name>	<len>
+    ...
+    Sequence-N	<name>	<len>
+
+Fields are separated by tabs.  Colorspace is always set to 0 for Centrifuge.
+
+</td></tr><tr><td id="centrifuge-inspect-options-conversion-table">
+
+[`--conversion-table`]: #centrifuge-inspect-options-conversion-table
+
+    --conversion-table
+
+</td><td>
+
+Print a list of UIDs (unique ID) and corresponding taxonomic IDs.
+
+</td></tr><tr><td id="centrifuge-inspect-options-taxonomy-tree">
+
+[`--taxonomy-tree`]: #centrifuge-inspect-options-taxonomy-tree
+
+    --taxonomy-tree
+
+</td><td>
+
+Print taxonomic tree.
+
+</td></tr><tr><td id="centrifuge-inspect-options-name-table">
+
+[`--name-table`]: #centrifuge-inspect-options-name-table
+
+    --name-table
+
+</td><td>
+
+Print name table.
+
+</td></tr><tr><td id="centrifuge-inspect-options-size-table">
+
+[`--size-table`]: #centrifuge-inspect-options-size-table
+
+    --size-table
+
+</td><td>
+
+Print a list of taxonomic IDs and lengths of the sequences belonging to the same taxonomic IDs.
+
+</td></tr><tr><td>
+
+    -v/--verbose
+
+</td><td>
+
+Print verbose output (for debugging).
+
+</td></tr><tr><td>
+
+    --version
+
+</td><td>
+
+Print version information and quit.
+
+</td></tr><tr><td>
+
+    -h/--help
+
+</td><td>
+
+Print usage information and quit.
+
+</td></tr></table>
+
+[`small example`]: #centrifuge-example
+
+Getting started with Centrifuge
+===================================================
+
+Centrifuge comes with some example files to get you started.  The example files
+are not scientifically significant; these files will simply let you start running Centrifuge and
+downstream tools right away.
+
+First follow the manual instructions to [obtain Centrifuge].  Set the `CENTRIFUGE_HOME`
+environment variable to point to the new Centrifuge directory containing the
+`centrifuge`, `centrifuge-build` and `centrifuge-inspect` binaries.  This is important,
+as the `CENTRIFUGE_HOME` variable is used in the commands below to refer to that
+directory.
+
+[obtain Centrifuge]: #obtaining-centrifuge
+
+Indexing a reference genome
+---------------------------
+
+To create an index for two small sequences included with Centrifuge, create a new temporary directory (it doesn't matter where), change into that directory, and run:
+
+    $CENTRIFUGE_HOME/centrifuge-build --conversion-table $CENTRIFUGE_HOME/example/reference/gi_to_tid.dmp --taxonomy-tree $CENTRIFUGE_HOME/example/reference/nodes.dmp --name-table $CENTRIFUGE_HOME/example/reference/names.dmp $CENTRIFUGE_HOME/example/reference/test.fa test
+
+The command should print many lines of output then quit. When the command
+completes, the current directory will contain three new files that all start with
+`test` and end with `.1.cf`, `.2.cf`, `.3.cf`.  These files constitute the index - you're done!
+
+You can use `centrifuge-build` to create an index for a set of FASTA files obtained
+from any source, including sites such as [UCSC], [NCBI], and [Ensembl]. When
+indexing multiple FASTA files, specify all the files using commas to separate
+file names.  For more details on how to create an index with `centrifuge-build`,
+see the [manual section on index building].  You may also want to bypass this
+process by obtaining a pre-built index.
+
+[UCSC]: http://genome.ucsc.edu/cgi-bin/hgGateway
+[NCBI]: http://www.ncbi.nlm.nih.gov/sites/genome
+[Ensembl]: http://www.ensembl.org/
+[manual section on index building]: #the-centrifuge-build-indexer
+[using a pre-built index]: #using-a-pre-built-index
+
+Classifying example reads
+----------------------
+
+Stay in the directory created in the previous step, which now contains the
+`test` index files.  Next, run:
+
+    $CENTRIFUGE_HOME/centrifuge -f -x test $CENTRIFUGE_HOME/example/reads/input.fa
+
+This runs the Centrifuge classifier, which classifies a set of unpaired reads to
+the genomes using the index generated in the previous step.
+The classification results are reported to stdout, and a
+short classification summary is written to centrifuge-species_report.tsv.
+
+You will see something like this:
+
+    readID  seqID taxID     score	2ndBestScore	hitLength	numMatches
+    C_1 gi|7     9913      4225	4225		80		2
+    C_1 gi|4     9646      4225	4225		80		2
+    C_2 gi|4     9646      4225	4225		80		2
+    C_2 gi|7     9913      4225	4225		80		2
+    C_3 gi|7     9913      4225	4225		80		2
+    C_3 gi|4     9646      4225	4225		80		2
+    C_4 gi|4     9646      4225	4225		80		2
+    C_4 gi|7     9913      4225	4225		80		2
+    1_1 gi|4     9646      4225	0		80		1
+    1_2 gi|4     9646      4225	0		80		1
+    2_1 gi|7     9913      4225	0		80		1
+    2_2 gi|7     9913      4225	0		80		1
+    2_3 gi|7     9913      4225	0		80		1
+    2_4 gi|7     9913      4225	0		80		1
+    2_5 gi|7     9913      4225	0		80		1
+    2_6 gi|7     9913      4225	0		80		1
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..d6de0bf
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,413 @@
+#
+# Copyright 2014, Daehwan Kim <infphilo at gmail.com>
+#
+# This file is part of Centrifuge, which is copied and modified from Makefile in the Bowtie2 package.
+#
+# Centrifuge is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Centrifuge is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Centrifuge.  If not, see <http://www.gnu.org/licenses/>.
+#
+#
+# Makefile for centrifuge-bin, centrifuge-build, centrifuge-inspect
+#
+
+INC =
+GCC_PREFIX = $(shell dirname `which gcc`)
+GCC_SUFFIX =
+CC = $(GCC_PREFIX)/gcc$(GCC_SUFFIX)
+CPP = $(GCC_PREFIX)/g++$(GCC_SUFFIX)
+CXX = $(CPP) #-fdiagnostics-color=always
+HEADERS = $(wildcard *.h)
+BOWTIE_MM = 1
+BOWTIE_SHARED_MEM = 0
+
+# Detect Cygwin or MinGW
+WINDOWS = 0
+CYGWIN = 0
+MINGW = 0
+ifneq (,$(findstring CYGWIN,$(shell uname)))
+	WINDOWS = 1 
+	CYGWIN = 1
+	# POSIX memory-mapped files not currently supported on Windows
+	BOWTIE_MM = 0
+	BOWTIE_SHARED_MEM = 0
+else
+	ifneq (,$(findstring MINGW,$(shell uname)))
+		WINDOWS = 1
+		MINGW = 1
+		# POSIX memory-mapped files not currently supported on Windows
+		BOWTIE_MM = 0
+		BOWTIE_SHARED_MEM = 0
+	endif
+endif
+
+MACOS = 0
+ifneq (,$(findstring Darwin,$(shell uname)))
+	MACOS = 1
+endif
+
+POPCNT_CAPABILITY ?= 1
+ifeq (1, $(POPCNT_CAPABILITY))
+    EXTRA_FLAGS += -DPOPCNT_CAPABILITY
+    INC += -I third_party
+endif
+
+MM_DEF = 
+
+ifeq (1,$(BOWTIE_MM))
+	MM_DEF = -DBOWTIE_MM
+endif
+
+SHMEM_DEF = 
+
+ifeq (1,$(BOWTIE_SHARED_MEM))
+	SHMEM_DEF = -DBOWTIE_SHARED_MEM
+endif
+
+PTHREAD_PKG =
+PTHREAD_LIB = 
+
+ifeq (1,$(MINGW))
+	PTHREAD_LIB = 
+else
+	PTHREAD_LIB = -lpthread
+endif
+
+SEARCH_LIBS = 
+BUILD_LIBS = 
+INSPECT_LIBS =
+
+ifeq (1,$(MINGW))
+	BUILD_LIBS = 
+	INSPECT_LIBS = 
+endif
+
+USE_SRA = 0
+SRA_DEF =
+SRA_LIB =
+SEARCH_INC = 
+ifeq (1,$(USE_SRA))
+	SRA_DEF = -DUSE_SRA
+	SRA_LIB = -lncbi-ngs-c++-static -lngs-c++-static -lncbi-vdb-static -ldl
+	SEARCH_INC += -I$(NCBI_NGS_DIR)/include -I$(NCBI_VDB_DIR)/include
+	SEARCH_LIBS += -L$(NCBI_NGS_DIR)/lib64 -L$(NCBI_VDB_DIR)/lib64
+endif
+
+LIBS = $(PTHREAD_LIB)
+
+SHARED_CPPS = ccnt_lut.cpp ref_read.cpp alphabet.cpp shmem.cpp \
+	edit.cpp bt2_idx.cpp \
+	reference.cpp ds.cpp limit.cpp \
+	random_source.cpp tinythread.cpp
+SEARCH_CPPS = qual.cpp pat.cpp \
+	read_qseq.cpp ref_coord.cpp mask.cpp \
+	pe.cpp aligner_seed_policy.cpp \
+	scoring.cpp presets.cpp \
+	simple_func.cpp random_util.cpp outq.cpp
+
+BUILD_CPPS = diff_sample.cpp
+
+CENTRIFUGE_CPPS_MAIN = $(SEARCH_CPPS) centrifuge_main.cpp
+CENTRIFUGE_BUILD_CPPS_MAIN = $(BUILD_CPPS) centrifuge_build_main.cpp
+CENTRIFUGE_COMPRESS_CPPS_MAIN = $(BUILD_CPPS) \
+	aligner_seed.cpp \
+	aligner_sw.cpp \
+	aligner_cache.cpp \
+	dp_framer.cpp \
+	aligner_bt.cpp sse_util.cpp \
+	aligner_swsse.cpp \
+	aligner_swsse_loc_i16.cpp \
+	aligner_swsse_ee_i16.cpp \
+	aligner_swsse_loc_u8.cpp \
+	aligner_swsse_ee_u8.cpp \
+	scoring.cpp \
+	mask.cpp \
+	qual.cpp
+
+CENTRIFUGE_REPORT_CPPS_MAIN=$(BUILD_CPPS)
+
+SEARCH_FRAGMENTS = $(wildcard search_*_phase*.c)
+VERSION = $(shell cat VERSION)
+GIT_VERSION = $(shell command -v git > /dev/null 2>&1 && git describe --long --tags --dirty --always --abbrev=10 || cat VERSION)
+
+# Convert BITS=?? to a -m flag
+BITS=32
+ifeq (x86_64,$(shell uname -m))
+BITS=64
+endif
+# msys will always be 32 bit so look at the cpu arch instead.
+ifneq (,$(findstring AMD64,$(PROCESSOR_ARCHITEW6432)))
+	ifeq (1,$(MINGW))
+		BITS=64
+	endif
+endif
+BITS_FLAG =
+
+ifeq (32,$(BITS))
+	BITS_FLAG = -m32
+endif
+
+ifeq (64,$(BITS))
+	BITS_FLAG = -m64
+endif
+SSE_FLAG=-msse2
+
+DEBUG_FLAGS    = -O0 -g3 $(BITS_FLAG) $(SSE_FLAG)
+DEBUG_DEFS     = -DCOMPILER_OPTIONS="\"$(DEBUG_FLAGS) $(EXTRA_FLAGS)\""
+RELEASE_FLAGS  = -O3 $(BITS_FLAG) $(SSE_FLAG) -funroll-loops -g3
+RELEASE_DEFS   = -DCOMPILER_OPTIONS="\"$(RELEASE_FLAGS) $(EXTRA_FLAGS)\""
+NOASSERT_FLAGS = -DNDEBUG
+FILE_FLAGS     = -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE
+CFLAGS         = 
+#CFLAGS         = -fdiagnostics-color=always
+
+ifeq (1,$(USE_SRA))
+	ifeq (1, $(MACOS))
+		DEBUG_FLAGS += -mmacosx-version-min=10.6
+		RELEASE_FLAGS += -mmacosx-version-min=10.6
+	endif
+endif
+
+
+CENTRIFUGE_BIN_LIST = centrifuge-build-bin \
+	centrifuge-class \
+	centrifuge-inspect-bin
+
+CENTRIFUGE_BIN_LIST_AUX = centrifuge-build-bin-debug \
+	centrifuge-class-debug \
+	centrifuge-inspect-bin-debug
+
+GENERAL_LIST = $(wildcard scripts/*.sh) \
+	$(wildcard scripts/*.pl) \
+	$(wildcard *.py) \
+	$(wildcard *.pl) \
+	doc/manual.inc.html \
+	doc/README \
+	doc/style.css \
+	$(wildcard example/index/*.cf) \
+	$(wildcard example/reads/*.fa) \
+	$(wildcard example/reference/*) \
+	indices/Makefile \
+	$(PTHREAD_PKG) \
+	centrifuge \
+	centrifuge-build \
+	centrifuge-inspect \
+	AUTHORS \
+	LICENSE \
+	NEWS \
+	MANUAL \
+	MANUAL.markdown \
+	TUTORIAL \
+	VERSION
+
+ifeq (1,$(WINDOWS))
+	CENTRIFUGE_BIN_LIST := $(CENTRIFUGE_BIN_LIST) centrifuge.bat centrifuge-build.bat centrifuge-inspect.bat 
+endif
+
+# This is helpful on Windows under MinGW/MSYS, where Make might go for
+# the Windows FIND tool instead.
+FIND=$(shell which find)
+
+SRC_PKG_LIST = $(wildcard *.h) \
+	$(wildcard *.hh) \
+	$(wildcard *.c) \
+	$(wildcard *.cpp) \
+	$(wildcard third_party/*.h) \
+	$(wildcard third_party/*.cpp) \
+	doc/strip_markdown.pl \
+	Makefile \
+	$(GENERAL_LIST)
+
+BIN_PKG_LIST = $(GENERAL_LIST)
+
+.PHONY: all allall both both-debug
+
+all: $(CENTRIFUGE_BIN_LIST)
+
+allall: $(CENTRIFUGE_BIN_LIST) $(CENTRIFUGE_BIN_LIST_AUX)
+
+both: centrifuge-class centrifuge-build-bin
+
+both-debug: centrifuge-class-debug centrifuge-build-bin-debug
+
+DEFS=-fno-strict-aliasing \
+     -DCENTRIFUGE_VERSION="\"$(GIT_VERSION)\"" \
+     -DBUILD_HOST="\"`hostname`\"" \
+     -DBUILD_TIME="\"`date`\"" \
+     -DCOMPILER_VERSION="\"`$(CXX) -v 2>&1 | tail -1`\"" \
+     $(FILE_FLAGS) \
+	 $(CFLAGS) \
+     $(PREF_DEF) \
+     $(MM_DEF) \
+     $(SHMEM_DEF)
+
+#
+# centrifuge targets
+#
+
+centrifuge-class: centrifuge.cpp $(SEARCH_CPPS) $(SHARED_CPPS) $(HEADERS) $(SEARCH_FRAGMENTS)
+	$(CXX) $(RELEASE_FLAGS) $(RELEASE_DEFS) $(EXTRA_FLAGS) \
+	$(DEFS) $(SRA_DEF) -DCENTRIFUGE -DBOWTIE2 -DBOWTIE_64BIT_INDEX $(NOASSERT_FLAGS) -Wall \
+	$(INC) $(SEARCH_INC) \
+	-o $@ $< \
+	$(SHARED_CPPS) $(CENTRIFUGE_CPPS_MAIN) \
+	$(LIBS) $(SRA_LIB) $(SEARCH_LIBS)
+
+centrifuge-class-debug: centrifuge.cpp $(SEARCH_CPPS) $(SHARED_CPPS) $(HEADERS) $(SEARCH_FRAGMENTS)
+	$(CXX) $(DEBUG_FLAGS) $(DEBUG_DEFS) $(EXTRA_FLAGS) \
+	$(DEFS) $(SRA_DEF) -DCENTRIFUGE -DBOWTIE2 -DBOWTIE_64BIT_INDEX -Wall \
+	$(INC) $(SEARCH_INC) \
+	-o $@ $< \
+	$(SHARED_CPPS) $(CENTRIFUGE_CPPS_MAIN) \
+	$(LIBS) $(SRA_LIB) $(SEARCH_LIBS)
+
+centrifuge-build-bin: centrifuge_build.cpp $(SHARED_CPPS) $(HEADERS)
+	$(CXX) $(RELEASE_FLAGS) $(RELEASE_DEFS) $(EXTRA_FLAGS) \
+	$(DEFS) -DCENTRIFUGE -DBOWTIE2 -DBOWTIE_64BIT_INDEX $(NOASSERT_FLAGS) -Wall \
+	$(INC) \
+	-o $@ $< \
+	$(SHARED_CPPS) $(CENTRIFUGE_BUILD_CPPS_MAIN) \
+	$(LIBS) $(BUILD_LIBS)
+
+centrifuge-build-bin-debug: centrifuge_build.cpp $(SHARED_CPPS) $(HEADERS)
+	$(CXX) $(DEBUG_FLAGS) $(DEBUG_DEFS) $(EXTRA_FLAGS) \
+	$(DEFS) -DCENTRIFUGE -DBOWTIE2 -DBOWTIE_64BIT_INDEX -Wall \
+	$(INC) \
+	-o $@ $< \
+	$(SHARED_CPPS) $(CENTRIFUGE_BUILD_CPPS_MAIN) \
+	$(LIBS) $(BUILD_LIBS)
+
+centrifuge-compress-bin: centrifuge_compress.cpp $(SHARED_CPPS) $(CENTRIFUGE_COMPRESS_CPPS_MAIN) $(HEADERS)
+	$(CXX) $(RELEASE_FLAGS) $(RELEASE_DEFS) $(EXTRA_FLAGS) \
+	$(DEFS) -DCENTRIFUGE -DBOWTIE2 -DBOWTIE_64BIT_INDEX $(NOASSERT_FLAGS) -Wall \
+	$(INC) \
+	-o $@ $< \
+	$(SHARED_CPPS) $(CENTRIFUGE_COMPRESS_CPPS_MAIN) \
+	$(LIBS) $(BUILD_LIBS)
+
+centrifuge-compress-bin-debug: centrifuge_compress.cpp $(SHARED_CPPS) $(CENTRIFUGE_COMPRESS_CPPS_MAIN) $(HEADERS)
+	$(CXX) $(DEBUG_FLAGS) $(DEBUG_DEFS) $(EXTRA_FLAGS) \
+	$(DEFS) -DCENTRIFUGE -DBOWTIE2 -DBOWTIE_64BIT_INDEX -Wall \
+	$(INC) \
+	-o $@ $< \
+	$(SHARED_CPPS) $(CENTRIFUGE_COMPRESS_CPPS_MAIN) \
+	$(LIBS) $(BUILD_LIBS)
+
+centrifuge-report-bin: centrifuge_report.cpp $(SHARED_CPPS) $(CENTRIFUGE_REPORT_CPPS_MAIN) $(HEADERS)
+	$(CXX) $(RELEASE_FLAGS) $(RELEASE_DEFS) $(EXTRA_FLAGS) \
+	$(DEFS) -DCENTRIFUGE -DBOWTIE2 -DBOWTIE_64BIT_INDEX $(NOASSERT_FLAGS) -Wall \
+	$(INC) \
+	-o $@ $< \
+	$(SHARED_CPPS) $(CENTRIFUGE_REPORT_CPPS_MAIN) \
+	$(LIBS) $(BUILD_LIBS)
+
+centrifuge-report-bin-debug: centrifuge_report.cpp $(SHARED_CPPS) $(CENTRIFUGE_REPORT_CPPS_MAIN) $(HEADERS)
+	$(CXX) $(DEBUG_FLAGS) $(DEBUG_DEFS) $(EXTRA_FLAGS) \
+	$(DEFS) -DCENTRIFUGE -DBOWTIE2 -DBOWTIE_64BIT_INDEX -Wall \
+	$(INC) \
+	-o $@ $< \
+	$(SHARED_CPPS) $(CENTRIFUGE_REPORT_CPPS_MAIN) \
+	$(LIBS) $(BUILD_LIBS)
+
+#centrifuge-RemoveN: centrifuge-RemoveN.cpp 
+#	$(CXX) $(RELEASE_FLAGS) $(RELEASE_DEFS) $(EXTRA_FLAGS) \
+#	$(DEFS) -DCENTRIFUGE -DBOWTIE2 -DBOWTIE_64BIT_INDEX $(NOASSERT_FLAGS) -Wall \
+#	$(INC) \
+#	-o $@ $< 
+
+
+#
+# centrifuge-inspect targets
+#
+
+centrifuge-inspect-bin: centrifuge_inspect.cpp $(HEADERS) $(SHARED_CPPS)
+	$(CXX) $(RELEASE_FLAGS) \
+	$(RELEASE_DEFS) $(EXTRA_FLAGS) \
+	$(DEFS) -DCENTRIFUGE -DBOWTIE2 -DBOWTIE_64BIT_INDEX -Wall \
+	$(INC) -I . \
+	-o $@ $< \
+	$(SHARED_CPPS) \
+	$(LIBS) $(INSPECT_LIBS)
+
+centrifuge-inspect-bin-debug: centrifuge_inspect.cpp $(HEADERS) $(SHARED_CPPS) 
+	$(CXX) $(DEBUG_FLAGS) \
+	$(DEBUG_DEFS) $(EXTRA_FLAGS) \
+	$(DEFS) -DCENTRIFUGE -DBOWTIE2 -DBOWTIE_64BIT_INDEX -Wall \
+	$(INC) -I . \
+	-o $@ $< \
+	$(SHARED_CPPS) \
+	$(LIBS) $(INSPECT_LIBS)
+
+
+centrifuge: ;
+
+centrifuge.bat:
+	echo "@echo off" > centrifuge.bat
+	echo "perl %~dp0/centrifuge %*" >> centrifuge.bat
+
+centrifuge-build.bat:
+	echo "@echo off" > centrifuge-build.bat
+	echo "python %~dp0/centrifuge-build %*" >> centrifuge-build.bat
+
+centrifuge-inspect.bat:
+	echo "@echo off" > centrifuge-inspect.bat
+	echo "python %~dp0/centrifuge-inspect %*" >> centrifuge-inspect.bat
+
+
+.PHONY: centrifuge-src
+centrifuge-src: $(SRC_PKG_LIST)
+	mkdir .src.tmp
+	mkdir .src.tmp/centrifuge-$(VERSION)
+	zip tmp.zip $(SRC_PKG_LIST)
+	mv tmp.zip .src.tmp/centrifuge-$(VERSION)
+	cd .src.tmp/centrifuge-$(VERSION) ; unzip tmp.zip ; rm -f tmp.zip
+	cd .src.tmp ; zip -r centrifuge-$(VERSION)-source.zip centrifuge-$(VERSION)
+	cp .src.tmp/centrifuge-$(VERSION)-source.zip .
+	rm -rf .src.tmp
+
+.PHONY: centrifuge-bin
+centrifuge-bin: $(BIN_PKG_LIST) $(CENTRIFUGE_BIN_LIST) $(CENTRIFUGE_BIN_LIST_AUX) 
+	rm -rf .bin.tmp
+	mkdir .bin.tmp
+	mkdir .bin.tmp/centrifuge-$(VERSION)
+	if [ -f centrifuge.exe ] ; then \
+		zip tmp.zip $(BIN_PKG_LIST) $(addsuffix .exe,$(CENTRIFUGE_BIN_LIST) $(CENTRIFUGE_BIN_LIST_AUX)) ; \
+	else \
+		zip tmp.zip $(BIN_PKG_LIST) $(CENTRIFUGE_BIN_LIST) $(CENTRIFUGE_BIN_LIST_AUX) ; \
+	fi
+	mv tmp.zip .bin.tmp/centrifuge-$(VERSION)
+	cd .bin.tmp/centrifuge-$(VERSION) ; unzip tmp.zip ; rm -f tmp.zip
+	cd .bin.tmp ; zip -r centrifuge-$(VERSION)-$(BITS).zip centrifuge-$(VERSION)
+	cp .bin.tmp/centrifuge-$(VERSION)-$(BITS).zip .
+	rm -rf .bin.tmp
+
+.PHONY: doc
+doc: doc/manual.inc.html MANUAL
+
+doc/manual.inc.html: MANUAL.markdown
+	pandoc -T "Centrifuge Manual" -o $@ \
+	 --from markdown --to HTML --toc $^
+	perl -i -ne \
+	 '$$w=0 if m|^</body>|;print if $$w;$$w=1 if m|^<body>|;' $@
+
+MANUAL: MANUAL.markdown
+	perl doc/strip_markdown.pl < $^ > $@
+
+.PHONY: clean
+clean:
+	rm -f $(CENTRIFUGE_BIN_LIST) $(CENTRIFUGE_BIN_LIST_AUX) \
+	$(addsuffix .exe,$(CENTRIFUGE_BIN_LIST) $(CENTRIFUGE_BIN_LIST_AUX)) \
+	centrifuge-src.zip centrifuge-bin.zip
+	rm -f core.* .tmp.head
+	rm -rf *.dSYM
+push-doc: doc/manual.inc.html
+	scp doc/*.*html igm1:/data1/igm3/www/ccb.jhu.edu/html/software/centrifuge/
diff --git a/NEWS b/NEWS
new file mode 100644
index 0000000..d5ae0ed
--- /dev/null
+++ b/NEWS
@@ -0,0 +1,3 @@
+Centrifuge NEWS
+===============
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..bb4ce51
--- /dev/null
+++ b/README.md
@@ -0,0 +1,40 @@
+# Centrifuge
+Classifier for metagenomic sequences
+
+[Centrifuge] is a novel microbial classification engine that enables
+rapid, accurate and sensitive labeling of reads and quantification of
+species on desktop computers.  The system uses a novel indexing scheme
+based on the Burrows-Wheeler transform (BWT) and the Ferragina-Manzini
+(FM) index, optimized specifically for the metagenomic classification
+problem. Centrifuge requires a relatively small index (4.7 GB for all
+complete bacterial and viral genomes plus the human genome) and
+classifies sequences at very high speed, allowing it to process the
+millions of reads from a typical high-throughput DNA sequencing run
+within a few minutes.  Together these advances enable timely and
+accurate analysis of large metagenomics data sets on conventional
+desktop computers.
+
+The Centrifuge homepage is http://www.ccb.jhu.edu/software/centrifuge
+
+The Centrifuge poster is available at http://www.ccb.jhu.edu/people/infphilo/data/Centrifuge-poster.pdf
+
+For more details on installing and running Centrifuge, look at MANUAL
+
+## Quick guide
+### Installation from source
+
+    git clone https://github.com/infphilo/centrifuge
+    cd centrifuge
+    make
+
+### Building indexes
+
+We provide several indexes on the Centrifuge homepage at http://www.ccb.jhu.edu/software/centrifuge.
+Centrifuge needs sequence and taxonomy files, as well as sequence ID to taxonomy ID mapping.
+See the MANUAL files for details. We provide a Makefile that simplifies the building of several
+standard and custom indices.
+
+    cd indices
+    make b+h+v                   # bacterial, human, and viral genomes [~12G]
+    make b_compressed            # bacterial genomes compressed at the species level [~4.2G]
+    make b_compressed+h+v        # combination of the two above [~8G]
diff --git a/TUTORIAL b/TUTORIAL
new file mode 100644
index 0000000..695b3b4
--- /dev/null
+++ b/TUTORIAL
@@ -0,0 +1,4 @@
+See section toward end of MANUAL entitled "Getting started with Bowtie 2: Lambda
+phage example".  Or, for tutorial for latest Bowtie 2 version, visit:
+
+http://bowtie-bio.sf.net/bowtie2/manual.shtml#getting-started-with-bowtie-2-lambda-phage-example
diff --git a/VERSION b/VERSION
new file mode 100644
index 0000000..ed69ddf
--- /dev/null
+++ b/VERSION
@@ -0,0 +1 @@
+1.0.2-beta
diff --git a/aligner_bt.cpp b/aligner_bt.cpp
new file mode 100644
index 0000000..846a941
--- /dev/null
+++ b/aligner_bt.cpp
@@ -0,0 +1,1773 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "aligner_bt.h"
+#include "mask.h"
+
+using namespace std;
+
+#define CHECK_ROW_COL(rowc, colc) \
+	if(rowc >= 0 && colc >= 0) { \
+		if(!sawcell_[colc].insert(rowc)) { \
+			/* was already in there */ \
+			abort = true; \
+			return; \
+		} \
+		assert(local || prob_.cper_->debugCell(rowc, colc, hefc)); \
+	}
+
+/**
+ * Fill in a triangle of the DP table and backtrace from the given cell to
+ * a cell in the previous checkpoint, or to the terminal cell.
+ */
+void BtBranchTracer::triangleFill(
+	int64_t rw,          // row of cell to backtrace from
+	int64_t cl,          // column of cell to backtrace from
+	int hef,             // cell to backtrace from is H (0), E (1), or F (2)
+	TAlScore targ,       // score of cell to backtrace from
+	TAlScore targ_final, // score of alignment we're looking for
+	RandomSource& rnd,   // pseudo-random generator
+	int64_t& row_new,    // out: row we ended up in after backtrace
+	int64_t& col_new,    // out: column we ended up in after backtrace
+	int& hef_new,        // out: H/E/F after backtrace
+	TAlScore& targ_new,  // out: score up to cell we ended up in
+	bool& done,          // out: finished tracing out an alignment?
+	bool& abort)         // out: aborted b/c cell was seen before?
+{
+	assert_geq(rw, 0);
+	assert_geq(cl, 0);
+	assert_range(0, 2, hef);
+	assert_lt(rw, (int64_t)prob_.qrylen_);
+	assert_lt(cl, (int64_t)prob_.reflen_);
+	assert(prob_.usecp_ && prob_.fill_);
+	int64_t row = rw, col = cl;
+	const int64_t colmin = 0;
+	const int64_t rowmin = 0;
+	const int64_t colmax = prob_.reflen_ - 1;
+	const int64_t rowmax = prob_.qrylen_ - 1;
+	assert_leq(prob_.reflen_, (TRefOff)sawcell_.size());
+	assert_leq(col, (int64_t)prob_.cper_->hicol());
+	assert_geq(col, (int64_t)prob_.cper_->locol());
+	assert_geq(prob_.cper_->per(), 2);
+	size_t mod = (row + col) & prob_.cper_->lomask();
+	assert_lt(mod, prob_.cper_->per());
+	// Allocate room for diags
+	size_t depth = mod+1;
+	assert_leq(depth, prob_.cper_->per());
+	size_t breadth = depth;
+	tri_.resize(depth);
+	// Allocate room for each diag
+	for(size_t i = 0; i < depth; i++) {
+		tri_[i].resize(breadth - i);
+	}
+	bool upperleft = false;
+	size_t off = (row + col) >> prob_.cper_->perpow2();
+	if(off == 0) {
+		upperleft = true;
+	} else {
+		off--;
+	}
+	const TAlScore sc_rdo = prob_.sc_->readGapOpen();
+	const TAlScore sc_rde = prob_.sc_->readGapExtend();
+	const TAlScore sc_rfo = prob_.sc_->refGapOpen();
+	const TAlScore sc_rfe = prob_.sc_->refGapExtend();
+	const bool local = !prob_.sc_->monotone;
+	int64_t row_lo = row - (int64_t)mod;
+	const CpQuad *prev2 = NULL, *prev1 = NULL;
+	if(!upperleft) {
+		// Read-only pointer to cells in diagonal -2.  Start one row above the
+		// target row.
+		prev2 = prob_.cper_->qdiag1sPtr() + (off * prob_.cper_->nrow() + row_lo - 1);
+		// Read-only pointer to cells in diagonal -1.  Start one row above the
+		// target row
+		prev1 = prob_.cper_->qdiag2sPtr() + (off * prob_.cper_->nrow() + row_lo - 1);
+#ifndef NDEBUG
+		if(row >= (int64_t)mod) {
+			size_t rowc = row - mod, colc = col;
+			if(rowc > 0 && prob_.cper_->isCheckpointed(rowc-1, colc)) {
+				TAlScore al = prev1[0].sc[0];
+				if(al == MIN_I16) al = MIN_I64;
+				assert_eq(prob_.cper_->scoreTriangle(rowc-1, colc, 0), al);
+			}
+			if(rowc > 0 && colc > 0 && prob_.cper_->isCheckpointed(rowc-1, colc-1)) {
+				TAlScore al = prev2[0].sc[0];
+				if(al == MIN_I16) al = MIN_I64;
+				assert_eq(prob_.cper_->scoreTriangle(rowc-1, colc-1, 0), al);
+			}
+		}
+#endif
+	}
+	// Pointer to cells in current diagonal
+	// For each diagonal we need to fill in
+	for(size_t i = 0; i < depth; i++) {
+		CpQuad * cur = tri_[i].ptr();
+		CpQuad * curc = cur;
+		size_t doff = mod - i; // # diagonals we are away from target diag
+		//assert_geq(row, (int64_t)doff);
+		int64_t rowc = row - doff;
+		int64_t colc = col;
+		size_t neval = 0; // # cells evaluated in this diag
+		ASSERT_ONLY(const CpQuad *last = NULL);
+		// Fill this diagonal from upper right to lower left
+		for(size_t j = 0; j < breadth; j++) {
+			if(rowc >= rowmin && rowc <= rowmax &&
+			   colc >= colmin && colc <= colmax)
+			{
+				neval++;
+				int64_t fromend = prob_.qrylen_ - rowc - 1;
+				bool allowGaps = fromend >= prob_.sc_->gapbar && rowc >= prob_.sc_->gapbar;
+				// Fill this cell
+				// Some things we might want to calculate about this cell up front:
+				// 1. How many matches are possible from this cell to the cell in
+				//    row, col, in case this allows us to prune
+				// Get character from read
+				int qc = prob_.qry_[rowc];
+				// Get quality value from read
+				int qq = prob_.qual_[rowc];
+				assert_geq(qq, 33);
+				// Get character from reference
+				int rc = prob_.ref_[colc];
+				assert_range(0, 16, rc);
+				int16_t sc_diag = prob_.sc_->score(qc, rc, qq - 33);
+				int16_t sc_h_up = MIN_I16;
+				int16_t sc_f_up = MIN_I16;
+				int16_t sc_h_lf = MIN_I16;
+				int16_t sc_e_lf = MIN_I16;
+				if(allowGaps) {
+					if(rowc > 0) {
+						assert(local || prev1[j+0].sc[2] < 0);
+						if(prev1[j+0].sc[0] > MIN_I16) {
+							sc_h_up = prev1[j+0].sc[0] - sc_rfo;
+							if(local) sc_h_up = max<int16_t>(sc_h_up, 0);
+						}
+						if(prev1[j+0].sc[2] > MIN_I16) {
+							sc_f_up = prev1[j+0].sc[2] - sc_rfe;
+							if(local) sc_f_up = max<int16_t>(sc_f_up, 0);
+						}
+#ifndef NDEBUG
+						TAlScore hup = prev1[j+0].sc[0];
+						TAlScore fup = prev1[j+0].sc[2];
+						if(hup == MIN_I16) hup = MIN_I64;
+						if(fup == MIN_I16) fup = MIN_I64;
+						if(local) {
+							hup = max<int16_t>(hup, 0);
+							fup = max<int16_t>(fup, 0);
+						}
+						if(prob_.cper_->isCheckpointed(rowc-1, colc)) {
+							assert_eq(hup, prob_.cper_->scoreTriangle(rowc-1, colc, 0));
+							assert_eq(fup, prob_.cper_->scoreTriangle(rowc-1, colc, 2));
+						}
+#endif
+					}
+					if(colc > 0) {
+						assert(local || prev1[j+1].sc[1] < 0);
+						if(prev1[j+1].sc[0] > MIN_I16) {
+							sc_h_lf = prev1[j+1].sc[0] - sc_rdo;
+							if(local) sc_h_lf = max<int16_t>(sc_h_lf, 0);
+						}
+						if(prev1[j+1].sc[1] > MIN_I16) {
+							sc_e_lf = prev1[j+1].sc[1] - sc_rde;
+							if(local) sc_e_lf = max<int16_t>(sc_e_lf, 0);
+						}
+#ifndef NDEBUG
+						TAlScore hlf = prev1[j+1].sc[0];
+						TAlScore elf = prev1[j+1].sc[1];
+						if(hlf == MIN_I16) hlf = MIN_I64;
+						if(elf == MIN_I16) elf = MIN_I64;
+						if(local) {
+							hlf = max<int16_t>(hlf, 0);
+							elf = max<int16_t>(elf, 0);
+						}
+						if(prob_.cper_->isCheckpointed(rowc, colc-1)) {
+							assert_eq(hlf, prob_.cper_->scoreTriangle(rowc, colc-1, 0));
+							assert_eq(elf, prob_.cper_->scoreTriangle(rowc, colc-1, 1));
+						}
+#endif
+					}
+				}
+				assert(rowc <= 1 || colc <= 0 || prev2 != NULL);
+				int16_t sc_h_dg = ((rowc > 0 && colc > 0) ? prev2[j+0].sc[0] : 0);
+				if(colc == 0 && rowc > 0 && !local) {
+					sc_h_dg = MIN_I16;
+				}
+				if(sc_h_dg > MIN_I16) {
+					sc_h_dg += sc_diag;
+				}
+				if(local) sc_h_dg = max<int16_t>(sc_h_dg, 0);
+				// cerr << sc_diag << " " << sc_h_dg << " " << sc_h_up << " " << sc_f_up << " " << sc_h_lf << " " << sc_e_lf << endl;
+				int mask = 0;
+				// Calculate best ways into H, E, F cells starting with H.
+				// Mask bits:
+				// H: 1=diag, 2=hhoriz, 4=ehoriz, 8=hvert, 16=fvert
+				// E: 32=hhoriz, 64=ehoriz
+				// F: 128=hvert, 256=fvert
+				int16_t sc_best = sc_h_dg;
+				if(sc_h_dg > MIN_I64) {
+					mask = 1;
+				}
+				if(colc > 0 && sc_h_lf >= sc_best && sc_h_lf > MIN_I64) {
+					if(sc_h_lf > sc_best) mask = 0;
+					mask |= 2;
+					sc_best = sc_h_lf;
+				}
+				if(colc > 0 && sc_e_lf >= sc_best && sc_e_lf > MIN_I64) {
+					if(sc_e_lf > sc_best) mask = 0;
+					mask |= 4;
+					sc_best = sc_e_lf;
+				}
+				if(rowc > 0 && sc_h_up >= sc_best && sc_h_up > MIN_I64) {
+					if(sc_h_up > sc_best) mask = 0;
+					mask |= 8;
+					sc_best = sc_h_up;
+				}
+				if(rowc > 0 && sc_f_up >= sc_best && sc_f_up > MIN_I64) {
+					if(sc_f_up > sc_best) mask = 0;
+					mask |= 16;
+					sc_best = sc_f_up;
+				}
+				// Calculate best way into E cell
+				int16_t sc_e_best = sc_h_lf;
+				if(colc > 0) {
+					if(sc_h_lf >= sc_e_lf && sc_h_lf > MIN_I64) {
+						if(sc_h_lf == sc_e_lf) {
+							mask |= 64;
+						}
+						mask |= 32;
+					} else if(sc_e_lf > MIN_I64) {
+						sc_e_best = sc_e_lf;
+						mask |= 64;
+					}
+				}
+				if(sc_e_best > sc_best) {
+					sc_best = sc_e_best;
+					mask &= ~31; // don't go diagonal
+				}
+				// Calculate best way into F cell
+				int16_t sc_f_best = sc_h_up;
+				if(rowc > 0) {
+					if(sc_h_up >= sc_f_up && sc_h_up > MIN_I64) {
+						if(sc_h_up == sc_f_up) {
+							mask |= 256;
+						}
+						mask |= 128;
+					} else if(sc_f_up > MIN_I64) {
+						sc_f_best = sc_f_up;
+						mask |= 256;
+					}
+				}
+				if(sc_f_best > sc_best) {
+					sc_best = sc_f_best;
+					mask &= ~127; // don't go horizontal or diagonal
+				}
+				// Install results in cur
+				assert(!prob_.sc_->monotone || sc_best <= 0);
+				assert(!prob_.sc_->monotone || sc_e_best <= 0);
+				assert(!prob_.sc_->monotone || sc_f_best <= 0);
+				curc->sc[0] = sc_best;
+				assert( local || sc_e_best < 0);
+				assert( local || sc_f_best < 0);
+				assert(!local || sc_e_best >= 0 || sc_e_best == MIN_I16);
+				assert(!local || sc_f_best >= 0 || sc_f_best == MIN_I16);
+				curc->sc[1] = sc_e_best;
+				curc->sc[2] = sc_f_best;
+				curc->sc[3] = mask;
+				// cerr << curc->sc[0] << " " << curc->sc[1] << " " << curc->sc[2] << " " << curc->sc[3] << endl;
+				ASSERT_ONLY(last = curc);
+#ifndef NDEBUG
+				if(prob_.cper_->isCheckpointed(rowc, colc)) {
+					if(local) {
+						sc_e_best = max<int16_t>(sc_e_best, 0);
+						sc_f_best = max<int16_t>(sc_f_best, 0);
+					}
+					TAlScore sc_best64   = sc_best;   if(sc_best   == MIN_I16) sc_best64   = MIN_I64;
+					TAlScore sc_e_best64 = sc_e_best; if(sc_e_best == MIN_I16) sc_e_best64 = MIN_I64;
+					TAlScore sc_f_best64 = sc_f_best; if(sc_f_best == MIN_I16) sc_f_best64 = MIN_I64;
+					assert_eq(prob_.cper_->scoreTriangle(rowc, colc, 0), sc_best64);
+					assert_eq(prob_.cper_->scoreTriangle(rowc, colc, 1), sc_e_best64);
+					assert_eq(prob_.cper_->scoreTriangle(rowc, colc, 2), sc_f_best64);
+				}
+#endif
+			}
+			// Update row, col
+			assert_lt(rowc, (int64_t)prob_.qrylen_);
+			rowc++;
+			colc--;
+			curc++;
+		} // for(size_t j = 0; j < breadth; j++)
+		if(i == depth-1) {
+			// Final iteration
+			assert(last != NULL);
+			assert_eq(1, neval);
+			assert_neq(0, last->sc[3]);
+			assert_eq(targ, last->sc[hef]);
+		} else {
+			breadth--;
+			prev2 = prev1 + 1;
+			prev1 = cur;
+		}
+	} // for(size_t i = 0; i < depth; i++)
+	//
+	// Now backtrack through the triangle.  Abort as soon as we enter a cell
+	// that was visited by a previous backtrace.
+	//
+	int64_t rowc = row, colc = col;
+	size_t curid;
+	int hefc = hef;
+	if(bs_.empty()) {
+		// Start an initial branch
+		CHECK_ROW_COL(rowc, colc);
+		curid = bs_.alloc();
+		assert_eq(0, curid);
+		Edit e;
+		bs_[curid].init(
+			prob_,
+			0,      // parent ID
+			0,      // penalty
+			0,      // score_en
+			rowc,   // row
+			colc,   // col
+			e,      // edit
+			0,      // hef
+			true,   // I am the root
+			false); // don't try to extend with exact matches
+		bs_[curid].len_ = 0;
+	} else {
+		curid = bs_.size()-1;
+	}
+	size_t idx_orig = (row + col) >> prob_.cper_->perpow2();
+	while(true) {
+		// What depth are we?
+		size_t mod = (rowc + colc) & prob_.cper_->lomask();
+		assert_lt(mod, prob_.cper_->per());
+		CpQuad * cur = tri_[mod].ptr();
+		int64_t row_off = rowc - row_lo - mod;
+		assert(!local || cur[row_off].sc[0] > 0);
+		assert_geq(row_off, 0);
+		int mask = cur[row_off].sc[3];
+		assert_gt(mask, 0);
+		int sel = -1;
+		// Select what type of move to make, which depends on whether we're
+		// currently in H, E, F:
+		if(hefc == 0) {
+			if(       (mask & 1) != 0) {
+				// diagonal
+				sel = 0;
+			} else if((mask & 8) != 0) {
+				// up to H
+				sel = 3;
+			} else if((mask & 16) != 0) {
+				// up to F
+				sel = 4;
+			} else if((mask & 2) != 0) {
+				// left to H
+				sel = 1;
+			} else if((mask & 4) != 0) {
+				// left to E
+				sel = 2;
+			}
+		} else if(hefc == 1) {
+			if(       (mask & 32) != 0) {
+				// left to H
+				sel = 5;
+			} else if((mask & 64) != 0) {
+				// left to E
+				sel = 6;
+			}
+		} else {
+			assert_eq(2, hefc);
+			if(       (mask & 128) != 0) {
+				// up to H
+				sel = 7;
+			} else if((mask & 256) != 0) {
+				// up to F
+				sel = 8;
+			}
+		}
+		assert_geq(sel, 0);
+		// Get character from read
+		int qc = prob_.qry_[rowc], qq = prob_.qual_[rowc];
+		// Get character from reference
+		int rc = prob_.ref_[colc];
+		assert_range(0, 16, rc);
+		// Now that we know what type of move to make, make it, updating our
+		// row and column and moving updating the branch.
+		if(sel == 0) {
+			assert_geq(rowc, 0);
+			assert_geq(colc, 0);
+			TAlScore scd = prob_.sc_->score(qc, rc, qq - 33);
+			if((rc & (1 << qc)) == 0) {
+				// Mismatch
+				size_t id = curid;
+				// Check if the previous branch was the initial (bottommost)
+				// branch with no matches.  If so, the mismatch should be added
+				// to the initial branch, instead of starting a new branch.
+				bool empty = (bs_[curid].len_ == 0 && curid == 0);
+				if(!empty) {
+					id = bs_.alloc();
+				}
+				Edit e((int)rowc, mask2dna[rc], "ACGTN"[qc], EDIT_TYPE_MM);
+				assert_lt(scd, 0);
+				TAlScore score_en = bs_[curid].score_st_ + scd;
+				bs_[id].init(
+					prob_,
+					curid,    // parent ID
+					-scd,     // penalty
+					score_en, // score_en
+					rowc,     // row
+					colc,     // col
+					e,        // edit
+					hefc,     // hef
+					empty,    // root?
+					false);   // don't try to extend with exact matches
+				//assert(!local || bs_[id].score_st_ >= 0);
+				curid = id;
+			} else {
+				// Match
+				bs_[curid].score_st_ += prob_.sc_->match();
+				bs_[curid].len_++;
+				assert_leq((int64_t)bs_[curid].len_, bs_[curid].row_ + 1);
+			}
+			rowc--;
+			colc--;
+			assert(local || bs_[curid].score_st_ >= targ_final);
+			hefc = 0;
+		} else if((sel >= 1 && sel <= 2) || (sel >= 5 && sel <= 6)) {
+			assert_gt(colc, 0);
+			// Read gap
+			size_t id = bs_.alloc();
+			Edit e((int)rowc+1, mask2dna[rc], '-', EDIT_TYPE_READ_GAP);
+			TAlScore gapp = prob_.sc_->readGapOpen();
+			if(bs_[curid].len_ == 0 && bs_[curid].e_.inited() && bs_[curid].e_.isReadGap()) {
+				gapp = prob_.sc_->readGapExtend();
+			}
+			TAlScore score_en = bs_[curid].score_st_ - gapp;
+			bs_[id].init(
+				prob_,
+				curid,    // parent ID
+				gapp,     // penalty
+				score_en, // score_en
+				rowc,     // row
+				colc-1,   // col
+				e,        // edit
+				hefc,     // hef
+				false,    // root?
+				false);   // don't try to extend with exact matches
+			colc--;
+			curid = id;
+			assert( local || bs_[curid].score_st_ >= targ_final);
+			//assert(!local || bs_[curid].score_st_ >= 0);
+			if(sel == 1 || sel == 5) {
+				hefc = 0;
+			} else {
+				hefc = 1;
+			}
+		} else {
+			assert_gt(rowc, 0);
+			// Reference gap
+			size_t id = bs_.alloc();
+			Edit e((int)rowc, '-', "ACGTN"[qc], EDIT_TYPE_REF_GAP);
+			TAlScore gapp = prob_.sc_->refGapOpen();
+			if(bs_[curid].len_ == 0 && bs_[curid].e_.inited() && bs_[curid].e_.isRefGap()) {
+				gapp = prob_.sc_->refGapExtend();
+			}
+			TAlScore score_en = bs_[curid].score_st_ - gapp;
+			bs_[id].init(
+				prob_,
+				curid,    // parent ID
+				gapp,     // penalty
+				score_en, // score_en
+				rowc-1,   // row
+				colc,     // col
+				e,        // edit
+				hefc,     // hef
+				false,    // root?
+				false);   // don't try to extend with exact matches
+			rowc--;
+			curid = id;
+			//assert(!local || bs_[curid].score_st_ >= 0);
+			if(sel == 3 || sel == 7) {
+				hefc = 0;
+			} else {
+				hefc = 2;
+			}
+		}
+		CHECK_ROW_COL(rowc, colc);
+		size_t mod_new = (rowc + colc) & prob_.cper_->lomask();
+		size_t idx = (rowc + colc) >> prob_.cper_->perpow2();
+		assert_lt(mod_new, prob_.cper_->per());
+		int64_t row_off_new = rowc - row_lo - mod_new;
+		CpQuad * cur_new = NULL;
+		if(colc >= 0 && rowc >= 0 && idx == idx_orig) {
+			cur_new = tri_[mod_new].ptr();
+		}
+		bool hit_new_tri = (idx < idx_orig && colc >= 0 && rowc >= 0);
+		// Check whether we made it to the top row or to a cell with score 0
+		if(colc < 0 || rowc < 0 ||
+		   (cur_new != NULL && (local && cur_new[row_off_new].sc[0] == 0)))
+		{
+			done = true;
+			assert(bs_[curid].isSolution(prob_));
+			addSolution(curid);
+#ifndef NDEBUG
+			// A check to see if any two adjacent branches in the backtrace
+			// overlap.  If they do, the whole alignment will be filtered out
+			// in trySolution(...)
+			size_t cur = curid;
+			if(!bs_[cur].root_) {
+				size_t next = bs_[cur].parentId_;
+				while(!bs_[next].root_) {
+					assert_neq(cur, next);
+					if(bs_[next].len_ != 0 || bs_[cur].len_ == 0) {
+						assert(!bs_[cur].overlap(prob_, bs_[next]));
+					}
+					cur = next;
+					next = bs_[cur].parentId_;
+				}
+			}
+#endif
+			return;
+		}
+		if(hit_new_tri) {
+			assert(rowc < 0 || colc < 0 || prob_.cper_->isCheckpointed(rowc, colc));
+			row_new = rowc; col_new = colc;
+			hef_new = hefc;
+			done = false;
+			if(rowc < 0 || colc < 0) {
+				assert(local);
+				targ_new = 0;
+			} else {
+				targ_new = prob_.cper_->scoreTriangle(rowc, colc, hefc);
+			}
+			if(local && targ_new == 0) {
+				done = true;
+				assert(bs_[curid].isSolution(prob_));
+				addSolution(curid);
+			}
+			assert((row_new >= 0 && col_new >= 0) || done);
+			return;
+		}
+	}
+	assert(false);
+}
+
+#ifndef NDEBUG
+#define DEBUG_CHECK(ss, row, col, hef) { \
+	if(prob_.cper_->debug() && row >= 0 && col >= 0) { \
+		TAlScore s = ss; \
+		if(s == MIN_I16) s = MIN_I64; \
+		if(local && s < 0) s = 0; \
+		TAlScore deb = prob_.cper_->debugCell(row, col, hef); \
+		if(local && deb < 0) deb = 0; \
+		assert_eq(s, deb); \
+	} \
+}
+#else
+#define DEBUG_CHECK(ss, row, col, hef)
+#endif
+
+
+/**
+ * Fill in a square of the DP table and backtrace from the given cell to
+ * a cell in the previous checkpoint, or to the terminal cell.
+ */
+void BtBranchTracer::squareFill(
+	int64_t rw,          // row of cell to backtrace from
+	int64_t cl,          // column of cell to backtrace from
+	int hef,             // cell to backtrace from is H (0), E (1), or F (2)
+	TAlScore targ,       // score of cell to backtrace from
+	TAlScore targ_final, // score of alignment we're looking for
+	RandomSource& rnd,   // pseudo-random generator
+	int64_t& row_new,    // out: row we ended up in after backtrace
+	int64_t& col_new,    // out: column we ended up in after backtrace
+	int& hef_new,        // out: H/E/F after backtrace
+	TAlScore& targ_new,  // out: score up to cell we ended up in
+	bool& done,          // out: finished tracing out an alignment?
+	bool& abort)         // out: aborted b/c cell was seen before?
+{
+	assert_geq(rw, 0);
+	assert_geq(cl, 0);
+	assert_range(0, 2, hef);
+	assert_lt(rw, (int64_t)prob_.qrylen_);
+	assert_lt(cl, (int64_t)prob_.reflen_);
+	assert(prob_.usecp_ && prob_.fill_);
+	const bool is8_ = prob_.cper_->is8_;
+	int64_t row = rw, col = cl;
+	assert_leq(prob_.reflen_, (TRefOff)sawcell_.size());
+	assert_leq(col, (int64_t)prob_.cper_->hicol());
+	assert_geq(col, (int64_t)prob_.cper_->locol());
+	assert_geq(prob_.cper_->per(), 2);
+	size_t xmod = col & prob_.cper_->lomask();
+	size_t ymod = row & prob_.cper_->lomask();
+	size_t xdiv = col >> prob_.cper_->perpow2();
+	size_t ydiv = row >> prob_.cper_->perpow2();
+	size_t sq_ncol = xmod+1, sq_nrow = ymod+1;
+	sq_.resize(sq_ncol * sq_nrow);
+	bool upper = ydiv == 0;
+	bool left  = xdiv == 0;
+	const TAlScore sc_rdo = prob_.sc_->readGapOpen();
+	const TAlScore sc_rde = prob_.sc_->readGapExtend();
+	const TAlScore sc_rfo = prob_.sc_->refGapOpen();
+	const TAlScore sc_rfe = prob_.sc_->refGapExtend();
+	const bool local = !prob_.sc_->monotone;
+	const CpQuad *qup = NULL;
+	const __m128i *qlf = NULL;
+	size_t per = prob_.cper_->per_;
+	ASSERT_ONLY(size_t nrow = prob_.cper_->nrow());
+	size_t ncol = prob_.cper_->ncol();
+	assert_eq(prob_.qrylen_, nrow);
+	assert_eq(prob_.reflen_, (TRefOff)ncol);
+	size_t niter = prob_.cper_->niter_;
+	if(!upper) {
+		qup = prob_.cper_->qrows_.ptr() + (ncol * (ydiv-1)) + xdiv * per;
+	}
+	if(!left) {
+		// Set up the column pointers to point to the first __m128i word in the
+		// relevant column
+		size_t off = (niter << 2) * (xdiv-1);
+		qlf = prob_.cper_->qcols_.ptr() + off;
+	}
+	size_t xedge = xdiv * per; // absolute offset of leftmost cell in square
+	size_t yedge = ydiv * per; // absolute offset of topmost cell in square
+	size_t xi = xedge, yi = yedge; // iterators for columns, rows
+	size_t ii = 0; // iterator into packed square
+	// Iterate over rows, then over columns
+	size_t m128mod = yi % prob_.cper_->niter_;
+	size_t m128div = yi / prob_.cper_->niter_;
+	int16_t sc_h_dg_lastrow = MIN_I16;
+	for(size_t i = 0; i <= ymod; i++, yi++) {
+		assert_lt(yi, nrow);
+ 		xi = xedge;
+		// Handling for first column is done outside the loop
+		size_t fromend = prob_.qrylen_ - yi - 1;
+		bool allowGaps = fromend >= (size_t)prob_.sc_->gapbar && yi >= (size_t)prob_.sc_->gapbar;
+		// Get character, quality from read
+		int qc = prob_.qry_[yi], qq = prob_.qual_[yi];
+		assert_geq(qq, 33);
+		int16_t sc_h_lf_last = MIN_I16;
+		int16_t sc_e_lf_last = MIN_I16;
+		for(size_t j = 0; j <= xmod; j++, xi++) {
+			assert_lt(xi, ncol);
+			// Get character from reference
+			int rc = prob_.ref_[xi];
+			assert_range(0, 16, rc);
+			int16_t sc_diag = prob_.sc_->score(qc, rc, qq - 33);
+			int16_t sc_h_up = MIN_I16, sc_f_up = MIN_I16,
+			        sc_h_lf = MIN_I16, sc_e_lf = MIN_I16,
+					sc_h_dg = MIN_I16;
+			int16_t sc_h_up_c = MIN_I16, sc_f_up_c = MIN_I16,
+			        sc_h_lf_c = MIN_I16, sc_e_lf_c = MIN_I16,
+					sc_h_dg_c = MIN_I16;
+			if(yi == 0) {
+				// If I'm in the first first row or column set it to 0
+				sc_h_dg = 0;
+			} else if(xi == 0) {
+				// Do nothing; leave it at min
+				if(local) {
+					sc_h_dg = 0;
+				}
+			} else if(i == 0 && j == 0) {
+				// Otherwise, if I'm in the upper-left square corner, I can get
+				// it from the checkpoint 
+				sc_h_dg = qup[-1].sc[0];
+			} else if(j == 0) {
+				// Otherwise, if I'm in the leftmost cell of this row, I can
+				// get it from sc_h_lf in first column of previous row
+				sc_h_dg = sc_h_dg_lastrow;
+			} else {
+				// Otherwise, I can get it from qup
+				sc_h_dg = qup[j-1].sc[0];
+			}
+			if(yi > 0 && xi > 0) DEBUG_CHECK(sc_h_dg, yi-1, xi-1, 2);
+			
+			// If we're in the leftmost column, calculate sc_h_lf regardless of
+			// allowGaps.
+			if(j == 0 && xi > 0) {
+				// Get values for left neighbors from the checkpoint
+				if(is8_) {
+					size_t vecoff = (m128mod << 6) + m128div;
+					sc_e_lf = ((uint8_t*)(qlf + 0))[vecoff];
+					sc_h_lf = ((uint8_t*)(qlf + 2))[vecoff];
+					if(local) {
+						// No adjustment
+					} else {
+						if(sc_h_lf == 0) sc_h_lf = MIN_I16;
+						else sc_h_lf -= 0xff;
+						if(sc_e_lf == 0) sc_e_lf = MIN_I16;
+						else sc_e_lf -= 0xff;
+					}
+				} else {
+					size_t vecoff = (m128mod << 5) + m128div;
+					sc_e_lf = ((int16_t*)(qlf + 0))[vecoff];
+					sc_h_lf = ((int16_t*)(qlf + 2))[vecoff];
+					if(local) {
+						sc_h_lf += 0x8000; assert_geq(sc_h_lf, 0);
+						sc_e_lf += 0x8000; assert_geq(sc_e_lf, 0);
+					} else {
+						if(sc_h_lf != MIN_I16) sc_h_lf -= 0x7fff;
+						if(sc_e_lf != MIN_I16) sc_e_lf -= 0x7fff;
+					}
+				}
+				DEBUG_CHECK(sc_e_lf, yi, xi-1, 0);
+				DEBUG_CHECK(sc_h_lf, yi, xi-1, 2);
+				sc_h_dg_lastrow = sc_h_lf;
+			}
+			
+			if(allowGaps) {
+				if(j == 0 /* at left edge */ && xi > 0 /* not extreme */) {
+					sc_h_lf_c = sc_h_lf;
+					sc_e_lf_c = sc_e_lf;
+					if(sc_h_lf_c != MIN_I16) sc_h_lf_c -= sc_rdo;
+					if(sc_e_lf_c != MIN_I16) sc_e_lf_c -= sc_rde;
+					assert_leq(sc_h_lf_c, prob_.cper_->perf_);
+					assert_leq(sc_e_lf_c, prob_.cper_->perf_);
+				} else if(xi > 0) {
+					// Get values for left neighbors from the previous iteration
+					if(sc_h_lf_last != MIN_I16) {
+						sc_h_lf = sc_h_lf_last;
+						sc_h_lf_c = sc_h_lf - sc_rdo;
+					}
+					if(sc_e_lf_last != MIN_I16) {
+						sc_e_lf = sc_e_lf_last;
+						sc_e_lf_c = sc_e_lf - sc_rde;
+					}
+				}
+				if(yi > 0 /* not extreme */) {
+					// Get column values
+					assert(qup != NULL);
+					assert(local || qup[j].sc[2] < 0);
+					if(qup[j].sc[0] > MIN_I16) {
+						DEBUG_CHECK(qup[j].sc[0], yi-1, xi, 2);
+						sc_h_up = qup[j].sc[0];
+						sc_h_up_c = sc_h_up - sc_rfo;
+					}
+					if(qup[j].sc[2] > MIN_I16) {
+						DEBUG_CHECK(qup[j].sc[2], yi-1, xi, 1);
+						sc_f_up = qup[j].sc[2];
+						sc_f_up_c = sc_f_up - sc_rfe;
+					}
+				}
+				if(local) {
+					sc_h_up_c = max<int16_t>(sc_h_up_c, 0);
+					sc_f_up_c = max<int16_t>(sc_f_up_c, 0);
+					sc_h_lf_c = max<int16_t>(sc_h_lf_c, 0);
+					sc_e_lf_c = max<int16_t>(sc_e_lf_c, 0);
+				}
+			}
+			
+			if(sc_h_dg > MIN_I16) {
+				sc_h_dg_c = sc_h_dg + sc_diag;
+			}
+			if(local) sc_h_dg_c = max<int16_t>(sc_h_dg_c, 0);
+			
+			int mask = 0;
+			// Calculate best ways into H, E, F cells starting with H.
+			// Mask bits:
+			// H: 1=diag, 2=hhoriz, 4=ehoriz, 8=hvert, 16=fvert
+			// E: 32=hhoriz, 64=ehoriz
+			// F: 128=hvert, 256=fvert
+			int16_t sc_best = sc_h_dg_c;
+			if(sc_h_dg_c > MIN_I64) {
+				mask = 1;
+			}
+			if(xi > 0 && sc_h_lf_c >= sc_best && sc_h_lf_c > MIN_I64) {
+				if(sc_h_lf_c > sc_best) mask = 0;
+				mask |= 2;
+				sc_best = sc_h_lf_c;
+			}
+			if(xi > 0 && sc_e_lf_c >= sc_best && sc_e_lf_c > MIN_I64) {
+				if(sc_e_lf_c > sc_best) mask = 0;
+				mask |= 4;
+				sc_best = sc_e_lf_c;
+			}
+			if(yi > 0 && sc_h_up_c >= sc_best && sc_h_up_c > MIN_I64) {
+				if(sc_h_up_c > sc_best) mask = 0;
+				mask |= 8;
+				sc_best = sc_h_up_c;
+			}
+			if(yi > 0 && sc_f_up_c >= sc_best && sc_f_up_c > MIN_I64) {
+				if(sc_f_up_c > sc_best) mask = 0;
+				mask |= 16;
+				sc_best = sc_f_up_c;
+			}
+			// Calculate best way into E cell
+			int16_t sc_e_best = sc_h_lf_c;
+			if(xi > 0) {
+				if(sc_h_lf_c >= sc_e_lf_c && sc_h_lf_c > MIN_I64) {
+					if(sc_h_lf_c == sc_e_lf_c) {
+						mask |= 64;
+					}
+					mask |= 32;
+				} else if(sc_e_lf_c > MIN_I64) {
+					sc_e_best = sc_e_lf_c;
+					mask |= 64;
+				}
+			}
+			if(sc_e_best > sc_best) {
+				sc_best = sc_e_best;
+				mask &= ~31; // don't go diagonal
+			}
+			// Calculate best way into F cell
+			int16_t sc_f_best = sc_h_up_c;
+			if(yi > 0) {
+				if(sc_h_up_c >= sc_f_up_c && sc_h_up_c > MIN_I64) {
+					if(sc_h_up_c == sc_f_up_c) {
+						mask |= 256;
+					}
+					mask |= 128;
+				} else if(sc_f_up_c > MIN_I64) {
+					sc_f_best = sc_f_up_c;
+					mask |= 256;
+				}
+			}
+			if(sc_f_best > sc_best) {
+				sc_best = sc_f_best;
+				mask &= ~127; // don't go horizontal or diagonal
+			}
+			// Install results in cur
+			assert( local || sc_best <= 0);
+			sq_[ii+j].sc[0] = sc_best;
+			assert( local || sc_e_best < 0);
+			assert( local || sc_f_best < 0);
+			assert(!local || sc_e_best >= 0 || sc_e_best == MIN_I16);
+			assert(!local || sc_f_best >= 0 || sc_f_best == MIN_I16);
+			sq_[ii+j].sc[1] = sc_e_best;
+			sq_[ii+j].sc[2] = sc_f_best;
+			sq_[ii+j].sc[3] = mask;
+			DEBUG_CHECK(sq_[ii+j].sc[0], yi, xi, 2); // H
+			DEBUG_CHECK(sq_[ii+j].sc[1], yi, xi, 0); // E
+			DEBUG_CHECK(sq_[ii+j].sc[2], yi, xi, 1); // F
+			// Update sc_h_lf_last, sc_e_lf_last
+			sc_h_lf_last = sc_best;
+			sc_e_lf_last = sc_e_best;
+		}
+		// Update m128mod, m128div
+		m128mod++;
+		if(m128mod == prob_.cper_->niter_) {
+			m128mod = 0;
+			m128div++;
+		}
+		// update qup
+		ii += sq_ncol;
+		// dimensions of sq_
+		qup = sq_.ptr() + sq_ncol * i;
+	}
+	assert_eq(targ, sq_[ymod * sq_ncol + xmod].sc[hef]);
+	//
+	// Now backtrack through the triangle.  Abort as soon as we enter a cell
+	// that was visited by a previous backtrace.
+	//
+	int64_t rowc = row, colc = col;
+	size_t curid;
+	int hefc = hef;
+	if(bs_.empty()) {
+		// Start an initial branch
+		CHECK_ROW_COL(rowc, colc);
+		curid = bs_.alloc();
+		assert_eq(0, curid);
+		Edit e;
+		bs_[curid].init(
+			prob_,
+			0,      // parent ID
+			0,      // penalty
+			0,      // score_en
+			rowc,   // row
+			colc,   // col
+			e,      // edit
+			0,      // hef
+			true,   // root?
+			false); // don't try to extend with exact matches
+		bs_[curid].len_ = 0;
+	} else {
+		curid = bs_.size()-1;
+	}
+	size_t ymodTimesNcol = ymod * sq_ncol;
+	while(true) {
+		// What depth are we?
+		assert_eq(ymodTimesNcol, ymod * sq_ncol);
+		CpQuad * cur = sq_.ptr() + ymodTimesNcol + xmod;
+		int mask = cur->sc[3];
+		assert_gt(mask, 0);
+		int sel = -1;
+		// Select what type of move to make, which depends on whether we're
+		// currently in H, E, F:
+		if(hefc == 0) {
+			if(       (mask & 1) != 0) {
+				// diagonal
+				sel = 0;
+			} else if((mask & 8) != 0) {
+				// up to H
+				sel = 3;
+			} else if((mask & 16) != 0) {
+				// up to F
+				sel = 4;
+			} else if((mask & 2) != 0) {
+				// left to H
+				sel = 1;
+			} else if((mask & 4) != 0) {
+				// left to E
+				sel = 2;
+			}
+		} else if(hefc == 1) {
+			if(       (mask & 32) != 0) {
+				// left to H
+				sel = 5;
+			} else if((mask & 64) != 0) {
+				// left to E
+				sel = 6;
+			}
+		} else {
+			assert_eq(2, hefc);
+			if(       (mask & 128) != 0) {
+				// up to H
+				sel = 7;
+			} else if((mask & 256) != 0) {
+				// up to F
+				sel = 8;
+			}
+		}
+		assert_geq(sel, 0);
+		// Get character from read
+		int qc = prob_.qry_[rowc], qq = prob_.qual_[rowc];
+		// Get character from reference
+		int rc = prob_.ref_[colc];
+		assert_range(0, 16, rc);
+		bool xexit = false, yexit = false;
+		// Now that we know what type of move to make, make it, updating our
+		// row and column and moving updating the branch.
+		if(sel == 0) {
+			assert_geq(rowc, 0);
+			assert_geq(colc, 0);
+			TAlScore scd = prob_.sc_->score(qc, rc, qq - 33);
+			if((rc & (1 << qc)) == 0) {
+				// Mismatch
+				size_t id = curid;
+				// Check if the previous branch was the initial (bottommost)
+				// branch with no matches.  If so, the mismatch should be added
+				// to the initial branch, instead of starting a new branch.
+				bool empty = (bs_[curid].len_ == 0 && curid == 0);
+				if(!empty) {
+					id = bs_.alloc();
+				}
+				Edit e((int)rowc, mask2dna[rc], "ACGTN"[qc], EDIT_TYPE_MM);
+				assert_lt(scd, 0);
+				TAlScore score_en = bs_[curid].score_st_ + scd;
+				bs_[id].init(
+					prob_,
+					curid,    // parent ID
+					-scd,     // penalty
+					score_en, // score_en
+					rowc,     // row
+					colc,     // col
+					e,        // edit
+					hefc,     // hef
+					empty,    // root?
+					false);   // don't try to extend with exact matches
+				curid = id;
+				//assert(!local || bs_[curid].score_st_ >= 0);
+			} else {
+				// Match
+				bs_[curid].score_st_ += prob_.sc_->match();
+				bs_[curid].len_++;
+				assert_leq((int64_t)bs_[curid].len_, bs_[curid].row_ + 1);
+			}
+			if(xmod == 0) xexit = true;
+			if(ymod == 0) yexit = true;
+			rowc--; ymod--; ymodTimesNcol -= sq_ncol;
+			colc--; xmod--;
+			assert(local || bs_[curid].score_st_ >= targ_final);
+			hefc = 0;
+		} else if((sel >= 1 && sel <= 2) || (sel >= 5 && sel <= 6)) {
+			assert_gt(colc, 0);
+			// Read gap
+			size_t id = bs_.alloc();
+			Edit e((int)rowc+1, mask2dna[rc], '-', EDIT_TYPE_READ_GAP);
+			TAlScore gapp = prob_.sc_->readGapOpen();
+			if(bs_[curid].len_ == 0 && bs_[curid].e_.inited() && bs_[curid].e_.isReadGap()) {
+				gapp = prob_.sc_->readGapExtend();
+			}
+			//assert(!local || bs_[curid].score_st_ >= gapp);
+			TAlScore score_en = bs_[curid].score_st_ - gapp;
+			bs_[id].init(
+				prob_,
+				curid,    // parent ID
+				gapp,     // penalty
+				score_en, // score_en
+				rowc,     // row
+				colc-1,   // col
+				e,        // edit
+				hefc,     // hef
+				false,    // root?
+				false);   // don't try to extend with exact matches
+			if(xmod == 0) xexit = true;
+			colc--; xmod--;
+			curid = id;
+			assert( local || bs_[curid].score_st_ >= targ_final);
+			//assert(!local || bs_[curid].score_st_ >= 0);
+			if(sel == 1 || sel == 5) {
+				hefc = 0;
+			} else {
+				hefc = 1;
+			}
+		} else {
+			assert_gt(rowc, 0);
+			// Reference gap
+			size_t id = bs_.alloc();
+			Edit e((int)rowc, '-', "ACGTN"[qc], EDIT_TYPE_REF_GAP);
+			TAlScore gapp = prob_.sc_->refGapOpen();
+			if(bs_[curid].len_ == 0 && bs_[curid].e_.inited() && bs_[curid].e_.isRefGap()) {
+				gapp = prob_.sc_->refGapExtend();
+			}
+			//assert(!local || bs_[curid].score_st_ >= gapp);
+			TAlScore score_en = bs_[curid].score_st_ - gapp;
+			bs_[id].init(
+				prob_,
+				curid,    // parent ID
+				gapp,     // penalty
+				score_en, // score_en
+				rowc-1,   // row
+				colc,     // col
+				e,        // edit
+				hefc,     // hef
+				false,    // root?
+				false);   // don't try to extend with exact matches
+			if(ymod == 0) yexit = true;
+			rowc--; ymod--; ymodTimesNcol -= sq_ncol;
+			curid = id;
+			assert( local || bs_[curid].score_st_ >= targ_final);
+			//assert(!local || bs_[curid].score_st_ >= 0);
+			if(sel == 3 || sel == 7) {
+				hefc = 0;
+			} else {
+				hefc = 2;
+			}
+		}
+		CHECK_ROW_COL(rowc, colc);
+		CpQuad * cur_new = NULL;
+		if(!xexit && !yexit) {
+			cur_new = sq_.ptr() + ymodTimesNcol + xmod;
+		}
+		// Check whether we made it to the top row or to a cell with score 0
+		if(colc < 0 || rowc < 0 ||
+		   (cur_new != NULL && local && cur_new->sc[0] == 0))
+		{
+			done = true;
+			assert(bs_[curid].isSolution(prob_));
+			addSolution(curid);
+#ifndef NDEBUG
+			// A check to see if any two adjacent branches in the backtrace
+			// overlap.  If they do, the whole alignment will be filtered out
+			// in trySolution(...)
+			size_t cur = curid;
+			if(!bs_[cur].root_) {
+				size_t next = bs_[cur].parentId_;
+				while(!bs_[next].root_) {
+					assert_neq(cur, next);
+					if(bs_[next].len_ != 0 || bs_[cur].len_ == 0) {
+						assert(!bs_[cur].overlap(prob_, bs_[next]));
+					}
+					cur = next;
+					next = bs_[cur].parentId_;
+				}
+			}
+#endif
+			return;
+		}
+		assert(!xexit || hefc == 0 || hefc == 1);
+		assert(!yexit || hefc == 0 || hefc == 2);
+		if(xexit || yexit) {
+			//assert(rowc < 0 || colc < 0 || prob_.cper_->isCheckpointed(rowc, colc));
+			row_new = rowc; col_new = colc;
+			hef_new = hefc;
+			done = false;
+			if(rowc < 0 || colc < 0) {
+				assert(local);
+				targ_new = 0;
+			} else {
+				// TODO: Don't use scoreSquare
+				targ_new = prob_.cper_->scoreSquare(rowc, colc, hefc);
+				assert(local || targ_new >= targ);
+				assert(local || targ_new >= targ_final);
+			}
+			if(local && targ_new == 0) {
+				assert_eq(0, hefc);
+				done = true;
+				assert(bs_[curid].isSolution(prob_));
+				addSolution(curid);
+			}
+			assert((row_new >= 0 && col_new >= 0) || done);
+			return;
+		}
+	}
+	assert(false);
+}
+
+/**
+ * Caller gives us score_en, row and col.  We figure out score_st and len_
+ * by comparing characters from the strings.
+ *
+ * If this branch comes after a mismatch, (row, col) describe the cell that the
+ * mismatch occurs in.  len_ is initially set to 1, and the next cell we test
+ * is the next cell up and to the left (row-1, col-1).
+ *
+ * If this branch comes after a read gap, (row, col) describe the leftmost cell
+ * involved in the gap.  len_ is initially set to 0, and the next cell we test
+ * is the current cell (row, col).
+ *
+ * If this branch comes after a reference gap, (row, col) describe the upper
+ * cell involved in the gap.  len_ is initially set to 0, and the next cell we
+ * test is the current cell (row, col).
+ *
+ * If 'extend' is true, the branch is extended up and to the left through as
+ * many exact matches as possible, adding the match bonus to score_st_ for
+ * each.  The extension may stop early (curtailed_ = true) when a
+ * checkpointed score shows the path cannot reach the target prob.targ_.
+ */
+void BtBranch::init(
+	const BtBranchProblem& prob,
+	size_t parentId,
+	TAlScore penalty,
+	TAlScore score_en,
+	int64_t row,
+	int64_t col,
+	Edit e,
+	int hef,
+	bool root,
+	bool extend)
+{
+	score_en_ = score_en;
+	penalty_ = penalty;
+	// score_st_ starts at score_en_ and grows by one match bonus for every
+	// exact match walked through by the extension loop below
+	score_st_ = score_en_;
+	row_ = row;
+	col_ = col;
+	parentId_ = parentId;
+	e_ = e;
+	root_ = root;
+	assert(!root_ || parentId == 0);
+	assert_lt(row, (int64_t)prob.qrylen_);
+	assert_lt(col, (int64_t)prob.reflen_);
+	// First match to check is diagonally above and to the left of the cell
+	// where the edit occurs
+	int64_t rowc = row;
+	int64_t colc = col;
+	len_ = 0;
+	if(e.inited() && e.isMismatch()) {
+		rowc--; colc--;
+		len_ = 1;
+	}
+	int64_t match = prob.sc_->match();
+	bool cp = prob.usecp_;
+	size_t iters = 0;
+	curtailed_ = false;
+	if(extend) {
+		// Walk up the diagonal while the read and reference characters agree
+		while(rowc >= 0 && colc >= 0) {
+			int rfm = prob.ref_[colc];
+			assert_range(0, 16, rfm);
+			int rdc = prob.qry_[rowc];
+			// Reference characters are bitmasks; the cell matches when the
+			// read character's bit is set in the mask
+			bool matches = (rfm & (1 << rdc)) != 0;
+			if(!matches) {
+				// What's the mismatch penalty?
+				break;
+			}
+			// Get score from checkpointer
+			score_st_ += match;
+			if(cp && rowc - 1 >= 0 && colc - 1 >= 0 &&
+			   prob.cper_->isCheckpointed(rowc - 1, colc - 1))
+			{
+				// Possibly prune
+				int16_t cpsc;
+				cpsc = prob.cper_->scoreTriangle(rowc - 1, colc - 1, hef);
+				if(cpsc + score_st_ < prob.targ_) {
+					// Even the checkpointed best-case score cannot reach the
+					// target, so stop extending and mark the branch curtailed
+					curtailed_ = true;
+					break;
+				}
+			}
+			iters++;
+			rowc--; colc--;
+		}
+	}
+	assert_geq(rowc, -1);
+	assert_geq(colc, -1);
+	// Branch length = diagonal cells traversed, including the mismatch cell
+	// (if any) plus all extension matches
+	len_ = (int64_t)row - rowc;
+	assert_leq((int64_t)len_, row_+1);
+	assert_leq((int64_t)len_, col_+1);
+	assert_leq((int64_t)score_st_, (int64_t)prob.qrylen_ * match);
+}
+
+/**
+ * Given a potential branch to add to the queue, see if we can follow the
+ * branch a little further first.  If it's still valid, or if we reach a
+ * choice between valid outgoing paths, go ahead and add it to the queue.
+ */
+void BtBranchTracer::examineBranch(
+	int64_t row,
+	int64_t col,
+	const Edit& e,
+	TAlScore pen,  // penalty associated with edit
+	TAlScore sc,
+	size_t parentId)
+{
+	// Allocate a fresh branch and initialize it, extending it up the
+	// diagonal through exact matches (final argument = true)
+	size_t id = bs_.alloc();
+	bs_[id].init(prob_, parentId, pen, sc, row, col, e, 0, false, true);
+	if(bs_[id].isSolution(prob_)) {
+		// The extended branch already completes a backtrace
+		assert(bs_[id].isValid(prob_));
+		addSolution(id);
+	} else {
+		// Check if this branch is legit
+		if(bs_[id].isValid(prob_)) {
+			add(id);
+		} else {
+			// Not viable; give the branch back to the pool
+			bs_.pop();
+		}
+	}
+}
+
+/**
+ * Take all possible ways of leaving the given branch and add them to the
+ * branch queue.
+ */
+void BtBranchTracer::addOffshoots(size_t bid) {
+	BtBranch& b = bs_[bid];
+	TAlScore sc = b.score_en_;
+	int64_t match = prob_.sc_->match();
+	// In non-monotone (local) mode scores may not dip below 0; in monotone
+	// (end-to-end) mode there is no floor
+	int64_t scoreFloor = prob_.sc_->monotone ? MIN_I64 : 0;
+	bool cp = prob_.usecp_; // Are there any checkpoints?
+	ASSERT_ONLY(TAlScore perfectScore = prob_.sc_->perfectScore(prob_.qrylen_));
+	assert_leq(prob_.targ_, perfectScore);
+	// For each cell in the branch
+	for(size_t i = 0 ; i < b.len_; i++) {
+		assert_leq((int64_t)i, b.row_+1);
+		assert_leq((int64_t)i, b.col_+1);
+		int64_t row = b.row_ - i, col = b.col_ - i;
+		// Best score still obtainable by matching every remaining read
+		// character at or above this row
+		int64_t bonusLeft = (row + 1) * match;
+		int64_t fromend = prob_.qrylen_ - row - 1;
+		// Gaps are disallowed within 'gapbar' positions of either read end
+		bool allowGaps = fromend >= prob_.sc_->gapbar && row >= prob_.sc_->gapbar;
+		if(allowGaps && row >= 0 && col >= 0) {
+			if(col > 0) {
+				// Try a read gap - it's either an extension or an open
+				bool extend = b.e_.inited() && b.e_.isReadGap() && i == 0;
+				TAlScore rdgapPen = extend ?
+					prob_.sc_->readGapExtend() : prob_.sc_->readGapOpen();
+				bool prune = false;
+				assert_gt(rdgapPen, 0);
+				if(cp && prob_.cper_->isCheckpointed(row, col - 1)) {
+					// Possibly prune
+					int16_t cpsc = (int16_t)prob_.cper_->scoreTriangle(row, col - 1, 0);
+					assert_leq(cpsc, perfectScore);
+					assert_geq(prob_.sc_->readGapOpen(), prob_.sc_->readGapExtend());
+					TAlScore bonus = prob_.sc_->readGapOpen() - prob_.sc_->readGapExtend();
+					assert_geq(bonus, 0);
+					if(cpsc + bonus + sc - rdgapPen < prob_.targ_) {
+						prune = true;
+					}
+				}
+				if(prune) {
+					if(extend) { nrdexPrune_++; } else { nrdopPrune_++; }
+				} else if(sc - rdgapPen >= scoreFloor && sc - rdgapPen + bonusLeft >= prob_.targ_) {
+					// Yes, we can introduce a read gap here
+					Edit e((int)row + 1, mask2dna[(int)prob_.ref_[col]], '-', EDIT_TYPE_READ_GAP);
+					assert(e.isReadGap());
+					examineBranch(row, col - 1, e, rdgapPen, sc - rdgapPen, bid);
+					if(extend) { nrdex_++; } else { nrdop_++; }
+				}
+			}
+			if(row > 0) {
+				// Try a reference gap - it's either an extension or an open
+				bool extend = b.e_.inited() && b.e_.isRefGap() && i == 0;
+				// NOTE(review): unlike the read-gap case above, the penalty
+				// choice below does not also require i == 0, so a ref gap
+				// taken above the base of the branch is still charged the
+				// extension penalty — confirm this asymmetry is intentional
+				TAlScore rfgapPen = (b.e_.inited() && b.e_.isRefGap()) ?
+					prob_.sc_->refGapExtend() : prob_.sc_->refGapOpen();
+				bool prune = false;
+				assert_gt(rfgapPen, 0);
+				if(cp && prob_.cper_->isCheckpointed(row - 1, col)) {
+					// Possibly prune
+					int16_t cpsc = (int16_t)prob_.cper_->scoreTriangle(row - 1, col, 0);
+					assert_leq(cpsc, perfectScore);
+					assert_geq(prob_.sc_->refGapOpen(), prob_.sc_->refGapExtend());
+					TAlScore bonus = prob_.sc_->refGapOpen() - prob_.sc_->refGapExtend();
+					assert_geq(bonus, 0);
+					if(cpsc + bonus + sc - rfgapPen < prob_.targ_) {
+						prune = true;
+					}
+				}
+				if(prune) {
+					if(extend) { nrfexPrune_++; } else { nrfopPrune_++; }
+				} else if(sc - rfgapPen >= scoreFloor && sc - rfgapPen + bonusLeft >= prob_.targ_) {
+					// Yes, we can introduce a ref gap here
+					Edit e((int)row, '-', "ACGTN"[(int)prob_.qry_[row]], EDIT_TYPE_REF_GAP);
+					assert(e.isRefGap());
+					examineBranch(row - 1, col, e, rfgapPen, sc - rfgapPen, bid);
+					if(extend) { nrfex_++; } else { nrfop_++; }
+				}
+			}
+		}
+		// If we're at the top of the branch but not yet at the top of
+		// the DP table, a mismatch branch is also possible.
+		// NOTE(review): the loop condition is i < b.len_, so i == b.len_
+		// can never hold here and this block appears unreachable — confirm
+		// whether the loop was meant to run to i <= b.len_.
+		if(i == b.len_ && !b.curtailed_ && row >= 0 && col >= 0) {
+			int rfm = prob_.ref_[col];
+			assert_lt(row, (int64_t)prob_.qrylen_);
+			int rdc = prob_.qry_[row];
+			int rdq = prob_.qual_[row];
+			int scdiff = prob_.sc_->score(rdc, rfm, rdq - 33);
+			assert_lt(scdiff, 0); // at end of branch, so can't match
+			bool prune = false;
+			if(cp && row > 0 && col > 0 && prob_.cper_->isCheckpointed(row - 1, col - 1)) {
+				// Possibly prune
+				int16_t cpsc = prob_.cper_->scoreTriangle(row - 1, col - 1, 0);
+				assert_leq(cpsc, perfectScore);
+				assert_leq(cpsc + scdiff + sc, perfectScore);
+				if(cpsc + scdiff + sc < prob_.targ_) {
+					prune = true;
+				}
+			}
+			if(prune) {
+				nmm_++;
+			} else  {
+				// Yes, we can introduce a mismatch here
+				if(sc + scdiff >= scoreFloor && sc + scdiff + bonusLeft >= prob_.targ_) {
+					Edit e((int)row, mask2dna[rfm], "ACGTN"[rdc], EDIT_TYPE_MM);
+					bool nmm = (mask2dna[rfm] == 'N' || rdc > 4);
+					assert_neq(e.chr, e.qchr);
+					assert_lt(scdiff, 0);
+					examineBranch(row - 1, col - 1, e, -scdiff, sc + scdiff, bid);
+					if(nmm) { nnmm_++; } else { nmm_++; }
+				}
+			}
+		}
+		// Moving one cell up the branch regains one match bonus
+		sc += match;
+	}
+}
+
+/**
+ * Sort unsorted branches, merge them with master sorted list.
+ */
+void BtBranchTracer::flushUnsorted() {
+	if(unsorted_.empty()) {
+		return;
+	}
+	// Sort ascending, then reverse so the branches with the highest
+	// score_st_ come first
+	unsorted_.sort();
+	unsorted_.reverse();
+#ifndef NDEBUG
+	for(size_t i = 1; i < unsorted_.size(); i++) {
+		assert_leq(bs_[unsorted_[i].second].score_st_, bs_[unsorted_[i-1].second].score_st_);
+	}
+#endif
+	// Double-buffered sorted lists: merge the newly sorted branches with the
+	// unconsumed tail of the currently selected list into the other buffer
+	EList<size_t> *src2 = sortedSel_ ? &sorted1_ : &sorted2_;
+	EList<size_t> *dest = sortedSel_ ? &sorted2_ : &sorted1_;
+	// Merge src1 and src2 into dest
+	dest->clear();
+	// cur_ marks how far into the sorted list we've already consumed
+	size_t cur1 = 0, cur2 = cur_;
+	while(cur1 < unsorted_.size() || cur2 < src2->size()) {
+		// Take from 1 or 2 next?
+		bool take1 = true;
+		if(cur1 == unsorted_.size()) {
+			take1 = false;
+		} else if(cur2 == src2->size()) {
+			take1 = true;
+		} else {
+			assert_neq(unsorted_[cur1].second, (*src2)[cur2]);
+			take1 = bs_[unsorted_[cur1].second] < bs_[(*src2)[cur2]];
+		}
+		if(take1) {
+			dest->push_back(unsorted_[cur1++].second); // Take from list 1
+		} else {
+			dest->push_back((*src2)[cur2++]); // Take from list 2
+		}
+	}
+	assert_eq(cur1, unsorted_.size());
+	assert_eq(cur2, src2->size());
+	// Flip the buffer selector and reset the consumption cursor
+	sortedSel_ = !sortedSel_;
+	cur_ = 0;
+	unsorted_.clear();
+}
+
+/**
+ * Try all the solutions accumulated so far.  Solutions might be rejected
+ * if they, for instance, overlap a previous solution, have too many Ns,
+ * fail to overlap a core diagonal, etc.
+ *
+ * Returns true iff there was at least one solution to try; 'success' is
+ * set to indicate whether one of them was accepted (BT_FOUND).
+ */
+bool BtBranchTracer::trySolutions(
+	bool lookForOlap,
+	SwResult& res,
+	size_t& off,
+	size_t& nrej,
+	RandomSource& rnd,
+	bool& success)
+{
+	if(solutions_.size() > 0) {
+		for(size_t i = 0; i < solutions_.size(); i++) {
+			int ret = trySolution(solutions_[i], lookForOlap, res, off, nrej, rnd);
+			if(ret == BT_FOUND) {
+				success = true;
+				return true; // there were solutions and one was good
+			}
+		}
+		// Every candidate was rejected; discard them all
+		solutions_.clear();
+		success = false;
+		return true; // there were solutions but none were good
+	}
+	return false; // there were no solutions to check
+}
+
+/**
+ * Given the id of a branch that completes a successful backtrace, turn the
+ * chain of branches into an alignment result (res) and its leftmost
+ * reference offset (off), or reject it.
+ *
+ * NOTE(review): the entire body below is compiled out with #if 0, so as
+ * written this function always returns BT_FOUND without populating res or
+ * off — presumably a deliberate Centrifuge modification of the Bowtie 2
+ * code; confirm callers do not depend on the rejection paths
+ * (BT_REJECTED_N, BT_NOT_FOUND, BT_REJECTED_CORE_DIAG).
+ */
+int BtBranchTracer::trySolution(
+	size_t id,
+	bool lookForOlap,
+	SwResult& res,
+	size_t& off,
+	size_t& nrej,
+	RandomSource& rnd)
+{
+#if 0
+	AlnScore score;
+	BtBranch *br = &bs_[id];
+	// 'br' corresponds to the leftmost edit in a right-to-left
+	// chain of edits.  
+	EList<Edit>& ned = res.alres.ned();
+	const BtBranch *cur = br, *prev = NULL;
+	size_t ns = 0, nrefns = 0;
+	size_t ngap = 0;
+	// First pass: walk the chain from leftmost branch to root, collecting
+	// the edits and counting Ns and gaps
+	while(true) {
+		if(cur->e_.inited()) {
+			if(cur->e_.isMismatch()) {
+				if(cur->e_.qchr == 'N' || cur->e_.chr == 'N') {
+					ns++;
+				}
+			} else if(cur->e_.isGap()) {
+				ngap++;
+			}
+			if(cur->e_.chr == 'N') {
+				nrefns++;
+			}
+			ned.push_back(cur->e_);
+		}
+		if(cur->root_) {
+			break;
+		}
+		cur = &bs_[cur->parentId_];
+	}
+	if(ns > prob_.nceil_) {
+		// Alignment has too many Ns in it!
+		res.reset();
+		assert(res.alres.ned().empty());
+		nrej++;
+		return BT_REJECTED_N;
+	}
+	// Update 'seenPaths_'
+	cur = br;
+	bool rejSeen = false; // set =true if we overlap prev path
+	bool rejCore = true; // set =true if we don't touch core diag
+	// Second pass: record the diagonal intervals this path covers and reject
+	// if it overlaps a previously reported path or misses the core diagonals
+	while(true) {
+		// Consider row, col, len, then do something
+		int64_t row = cur->row_, col = cur->col_;
+		assert_lt(row, (int64_t)prob_.qrylen_);
+		size_t fromend = prob_.qrylen_ - row - 1;
+		size_t diag = fromend + col;
+		// Calculate the diagonal within the *trimmed* rectangle,
+		// i.e. the rectangle we dealt with in align, gather and
+		// backtrack.
+		int64_t diagi = col - row;
+		// Now adjust to the diagonal within the *untrimmed*
+		// rectangle by adding on the amount trimmed from the left.
+		diagi += prob_.rect_->triml;
+		assert_lt(diag, seenPaths_.size());
+		// Does it overlap a core diagonal?
+		if(diagi >= 0) {
+			size_t diag = (size_t)diagi;
+			if(diag >= prob_.rect_->corel &&
+			   diag <= prob_.rect_->corer)
+			{
+				// Yes it does - it's OK
+				rejCore = false;
+			}
+		}
+		if(lookForOlap) {
+			int64_t newlo, newhi;
+			if(cur->len_ == 0) {
+				if(prev != NULL && prev->len_ > 0) {
+					// If there's a gap at the base of a non-0 length branch, the
+					// gap will appear to overlap the branch if we give it length 1.
+					newhi = newlo = 0;
+				} else {
+					// Read or ref gap with no matches coming off of it
+					newlo = row;
+					newhi = row + 1;
+				}
+			} else {
+				// Diagonal with matches
+				newlo = row - (cur->len_ - 1);
+				newhi = row + 1;
+			}
+			assert_geq(newlo, 0);
+			assert_geq(newhi, 0);
+			// Does the diagonal cover cells?
+			if(newhi > newlo) {
+				// Check whether there is any overlap with previously traversed
+				// cells
+				bool added = false;
+				const size_t sz = seenPaths_[diag].size();
+				for(size_t i = 0; i < sz; i++) {
+					// Does the new interval overlap this already-seen
+					// interval?  Also of interest: does it abut this
+					// already-seen interval?  If so, we should merge them.
+					size_t lo = seenPaths_[diag][i].first;
+					size_t hi = seenPaths_[diag][i].second;
+					assert_lt(lo, hi);
+					size_t lo_sm = newlo, hi_sm = newhi;
+					// Ensure (lo, hi) is the longer interval
+					if(hi - lo < hi_sm - lo_sm) {
+						swap(lo, lo_sm);
+						swap(hi, hi_sm);
+					}
+					if((lo <= lo_sm && hi > lo_sm) ||
+					   (lo <  hi_sm && hi >= hi_sm))
+					{
+						// One or both of the shorter interval's end points
+						// are contained in the longer interval - so they
+						// overlap.
+						rejSeen = true;
+						// Merge them into one longer interval
+						seenPaths_[diag][i].first = min(lo, lo_sm);
+						seenPaths_[diag][i].second = max(hi, hi_sm);
+#ifndef NDEBUG
+						for(int64_t ii = seenPaths_[diag][i].first;
+							ii < (int64_t)seenPaths_[diag][i].second;
+							ii++)
+						{
+							//cerr << "trySolution rejected (" << ii << ", " << (ii + col - row) << ")" << endl;
+						}
+#endif
+						added = true;
+						break;
+					} else if(hi == lo_sm || lo == hi_sm) {
+						// Intervals abut; merge them into one longer interval
+						seenPaths_[diag][i].first = min(lo, lo_sm);
+						seenPaths_[diag][i].second = max(hi, hi_sm);
+#ifndef NDEBUG
+						for(int64_t ii = seenPaths_[diag][i].first;
+							ii < (int64_t)seenPaths_[diag][i].second;
+							ii++)
+						{
+							//cerr << "trySolution rejected (" << ii << ", " << (ii + col - row) << ")" << endl;
+						}
+#endif
+						added = true;
+						// Keep going in case it overlaps one of the other
+						// intervals
+					}
+				}
+				if(!added) {
+					seenPaths_[diag].push_back(make_pair(newlo, newhi));
+				}
+			}
+		}
+		// After the merging that may have occurred above, it's no
+		// longer guaranteed that all the overlapping intervals in
+		// the list have been merged.  That's OK though.  We'll
+		// still get correct answers to overlap queries.
+		if(cur->root_) {
+			assert_eq(0, cur->parentId_);
+			break;
+		}
+		prev = cur;
+		cur = &bs_[cur->parentId_];
+	} // while(cur->e_.inited())
+	if(rejSeen) {
+		res.reset();
+		assert(res.alres.ned().empty());
+		nrej++;
+		return BT_NOT_FOUND;
+	}
+	if(rejCore) {
+		res.reset();
+		assert(res.alres.ned().empty());
+		nrej++;
+		return BT_REJECTED_CORE_DIAG;
+	}
+	// Accepted: install the score, trimming and shape into the result
+	off = br->leftmostCol();
+	score.score_ = prob_.targ_;
+	score.ns_    = ns;
+	score.gaps_  = ngap;
+	res.alres.setScore(score);
+	res.alres.setRefNs(nrefns);
+	size_t trimBeg = br->uppermostRow();
+	size_t trimEnd = prob_.qrylen_ - prob_.row_ - 1;
+	assert_leq(trimBeg, prob_.qrylen_);
+	assert_leq(trimEnd, prob_.qrylen_);
+	TRefOff refoff = off + prob_.refoff_ + prob_.rect_->refl;
+	res.alres.setShape(
+		prob_.refid_,                   // ref id
+		refoff,                         // 0-based ref offset
+		prob_.treflen(),                // ref length
+		prob_.fw_,                      // aligned to Watson?
+		prob_.qrylen_,                  // read length
+		true,                           // pretrim soft?
+		0,                              // pretrim 5' end
+		0,                              // pretrim 3' end
+		true,                           // alignment trim soft?
+		prob_.fw_ ? trimBeg : trimEnd,  // alignment trim 5' end
+		prob_.fw_ ? trimEnd : trimBeg); // alignment trim 3' end
+#endif
+	return BT_FOUND;
+}
+
+/**
+ * Get the next valid alignment given a backtrace problem.  Return false
+ * if there is no valid solution.  Use a backtracking search to find the
+ * solution.  This can be very slow.
+ */
+bool BtBranchTracer::nextAlignmentBacktrace(
+	size_t maxiter,
+	SwResult& res,
+	size_t& off,
+	size_t& nrej,
+	size_t& niter,
+	RandomSource& rnd)
+{
+	assert(!empty() || !emptySolution());
+	assert(prob_.inited());
+	// There's a subtle case where we might fail to backtrace in
+	// local-alignment mode.  The basic fact to remember is that when we're
+	// backtracing from the highest-scoring cell in the table, we're guaranteed
+	// to be able to backtrace without ever dipping below 0.  But if we're
+	// backtracing from a cell other than the highest-scoring cell in the
+	// table, we might dip below 0.  Dipping below 0 implies that there's a
+	// shorter local alignment with a better score.  In which case, it's
+	// perfectly fair for us to abandon any path that dips below the floor, and
+	// this might result in the queue becoming empty before we finish.
+	bool result = false;
+	niter = 0;
+	// Best-first search: repeatedly take the best branch off the queue and
+	// expand its offshoots until a solution is accepted or we give up
+	while(!empty()) {
+		if(trySolutions(true, res, off, nrej, rnd, result)) {
+			return result;
+		}
+		if(niter++ >= maxiter) {
+			// Iteration budget exhausted
+			break;
+		}
+		size_t brid = best(rnd); // put best branch in 'br'
+		assert(!seen_.contains(brid));
+		ASSERT_ONLY(seen_.insert(brid));
+#if 0
+		// Diagnostic dump of search-statistics counters (disabled)
+		BtBranch *br = &bs_[brid];
+		cerr << brid
+		     << ": targ:" << prob_.targ_
+		     << ", sc:" << br->score_st_
+		     << ", row:" << br->uppermostRow()
+			 << ", nmm:" << nmm_
+			 << ", nnmm:" << nnmm_
+			 << ", nrdop:" << nrdop_
+			 << ", nrfop:" << nrfop_
+			 << ", nrdex:" << nrdex_
+			 << ", nrfex:" << nrfex_
+			 << ", nrdop_pr: " << nrdopPrune_
+			 << ", nrfop_pr: " << nrfopPrune_
+			 << ", nrdex_pr: " << nrdexPrune_
+			 << ", nrfex_pr: " << nrfexPrune_
+			 << endl;
+#endif
+		addOffshoots(brid);
+	}
+	// Queue drained or budget hit; give any remaining solutions a chance
+	if(trySolutions(true, res, off, nrej, rnd, result)) {
+		return result;
+	}
+	return false;
+}
+
+/**
+ * Get the next valid alignment given a backtrace problem.  Return false
+ * if there is no valid solution.  Use a triangle-fill backtrace to find
+ * the solution.  This is usually fast (it's O(m + n)).
+ *
+ * NOTE(review): maxiter, niter and rnd are unused in this variant (the
+ * solutions are expected to have been accumulated already); the parameters
+ * are kept for signature parity with nextAlignmentBacktrace.
+ */
+bool BtBranchTracer::nextAlignmentFill(
+	size_t maxiter,
+	SwResult& res,
+	size_t& off,
+	size_t& nrej,
+	size_t& niter,
+	RandomSource& rnd)
+{
+	assert(prob_.inited());
+	assert(!emptySolution());
+	bool result = false;
+	// Just vet the accumulated solutions; no overlap check (first arg false)
+	if(trySolutions(false, res, off, nrej, rnd, result)) {
+		return result;
+	}
+	return false;
+}
+
+/**
+ * Get the next valid alignment given the backtrace problem.  Return false
+ * if there is no valid solution, e.g., if every candidate solution is
+ * rejected or the search budget is exhausted.
+ *
+ * Dispatches to the triangle-fill strategy when the problem was set up
+ * with fill enabled, otherwise to the best-first backtracking search.
+ */
+bool BtBranchTracer::nextAlignment(
+	size_t maxiter,
+	SwResult& res,
+	size_t& off,
+	size_t& nrej,
+	size_t& niter,
+	RandomSource& rnd)
+{
+	if(prob_.fill_) {
+		return nextAlignmentFill(
+			maxiter,
+			res,
+			off,
+			nrej,
+			niter,
+			rnd);
+	} else {
+		return nextAlignmentBacktrace(
+			maxiter,
+			res,
+			off,
+			nrej,
+			niter,
+			rnd);
+	}
+}
+
+#ifdef MAIN_ALIGNER_BT
+
+#include <iostream>
+
+/**
+ * Smoke-test driver: backtrace an 8-mer read against an identical 8-mer
+ * reference and request one alignment from the tracer.
+ */
+int main(int argc, char **argv) {
+	size_t off = 0;
+	RandomSource rnd(77);
+	BtBranchTracer tr;
+	Scoring sc = Scoring::base1();
+	SwResult res;
+	tr.init(
+		"ACGTACGT", // in: read sequence
+		"IIIIIIII", // in: quality sequence
+		8,          // in: read sequence length
+		"ACGTACGT", // in: reference sequence
+		8,          // in: reference sequence length
+		0,          // in: reference id
+		0,          // in: reference offset
+		true,       // in: orientation
+		sc,         // in: scoring scheme
+		0,          // in: N ceiling
+		8,          // in: alignment score
+		7,          // start in this row
+		7,          // start in this column
+		rnd);       // random gen, to choose among equal paths
+	size_t nrej = 0;
+	size_t niter = 0;
+	// BUG FIX: nextAlignment takes (maxiter, res, off, nrej, niter, rnd);
+	// the old call passed only (res, off, nrej, rnd) and failed to compile
+	// whenever MAIN_ALIGNER_BT was defined.
+	tr.nextAlignment(
+		100000, // in: max backtrace iterations before giving up
+		res,    // out: alignment result
+		off,    // out: leftmost reference offset of the alignment
+		nrej,   // out: number of rejected candidate solutions
+		niter,  // out: number of iterations actually performed
+		rnd);   // random gen, to choose among equal paths
+}
+
+#endif /*def MAIN_ALIGNER_BT*/
diff --git a/aligner_bt.h b/aligner_bt.h
new file mode 100644
index 0000000..8056b7a
--- /dev/null
+++ b/aligner_bt.h
@@ -0,0 +1,947 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef ALIGNER_BT_H_
+#define ALIGNER_BT_H_
+
+#include <utility>
+#include <stdint.h>
+#include "aligner_sw_common.h"
+#include "aligner_result.h"
+#include "scoring.h"
+#include "edit.h"
+#include "limit.h"
+#include "dp_framer.h"
+#include "sse_util.h"
+
+/* Say we've filled in a DP matrix in a cost-only manner, not saving the scores
+ * for each of the cells.  At the end, we obtain a list of candidate cells and
+ * we'd like to backtrace from them.  The per-cell scores are gone, but we have
+ * to re-create the correct path somehow.  Hopefully we can do this without
+ * recreating most or all of the score matrix, since this takes too much memory.
+ *
+ * Approach 1: Naively refill the matrix.
+ *
+ *  Just refill the matrix, perhaps backwards starting from the backtrace cell.
+ *  Since this involves recreating all or most of the score matrix, this is not
+ *  a good approach.
+ *
+ * Approach 2: Naive backtracking.
+ *
+ *  Conduct a search through the space of possible backtraces, rooted at the
+ *  candidate cell.  To speed things along, we can prioritize paths that have a
+ *  high score and that align more characters from the read.
+ *
+ *  The approach is simple, but it's neither fast nor memory-efficient in
+ *  general.
+ *
+ * Approach 3: Refilling with checkpoints.
+ *
+ *  Refill the matrix "backwards" starting from the candidate cell, but use
+ *  checkpoints to ensure that only a series of relatively small triangles or
+ *  rectangles need to be refilled.  The checkpoints must include elements from
+ *  the H, E and F matrices; not just H.  After each refill, we backtrace
+ *  through the refilled area, then discard/reuse the fill memory.  I call each
+ *  such fill/backtrace a mini-fill/backtrace.
+ *
+ *  If there's only one path to be found, then this is O(m+n).  But what if
+ *  there are many?  And what if we would like to avoid paths that overlap in
+ *  one or more cells?  There are two ways we can make this more efficient:
+ *
+ *   1. Remember the re-calculated E/F/H values and try to retrieve them
+ *   2. Keep a record of cells that have already been traversed
+ *
+ *  Legend:
+ *
+ *  1: Candidate cell
+ *  2: Final cell from first mini-fill/backtrace
+ *  3: Final cell from second mini-fill/backtrace (third not shown)
+ *  +: Checkpointed cell
+ *  *: Cell filled from first or second mini-fill/backtrace
+ *  -: Unfilled cell
+ *
+ *        ---++--------++--------++----
+ *        --++--------++*-------++-----
+ *        -++--(etc)-++**------++------
+ *        ++--------+3***-----++-------
+ *        +--------++****----++--------
+ *        --------++*****---++--------+
+ *        -------++******--++--------++
+ *        ------++*******-++*-------++-
+ *        -----++********++**------++--
+ *        ----++********2+***-----++---
+ *        ---++--------++****----++----
+ *        --++--------++*****---++-----
+ *        -++--------++*****1--++------
+ *        ++--------++--------++-------
+ *
+ * Approach 4: Backtracking with checkpoints.
+ *
+ *  Conduct a search through the space of possible backtraces, rooted at the
+ *  candidate cell.  Use "checkpoints" to prune.  That is, when a backtrace
+ *  moves through a cell with a checkpointed score, consider the score
+ *  accumulated so far and the cell's saved score; abort if those two scores
+ *  add to something less than a valid score.  Note we're only checkpointing H
+ *  in this case (possibly; see "subtle point"), not E or F.
+ *
+ *  Subtle point: checkpoint scores are a result of moving forward through
+ *  the matrix whereas backtracking scores result from moving backward.  This
+ *  matters becuase the two paths that meet up at a cell might have both
+ *  factored in a gap open penalty for the same gap, in which case we will
+ *  underestimate the overall score and prune a good path.  Here are two ideas
+ *  for how to resolve this:
+ *
+ *   Idea 1: when we combine the forward and backward scores to find an overall
+ *   score, and our backtrack procedure *just* made a horizontal or vertical
+ *   move, add in a "bonus" equal to the gap open penalty of the appropriate
+ *   type (read gap open for horizontal, ref gap open for vertical). This might
+ *   overcompensate, since the forward score may not have included a gap open
+ *   penalty for that gap in the first place.
+ *
+ *   Idea 2: keep the E and F values for the checkpoints around, in addition to
+ *   the H values.  When it comes time to combine the score from the forward
+ *   and backward paths, we consider the last move we made in the backward
+ *   backtrace.  If it's a read gap (horizontal move), then we calculate the
+ *   overall score as:
+ *
+ *     max(Score-backward + H-forward, Score-backward + E-forward + read-open)
+ *
+ *   If it's a reference gap (vertical move), then we calculate the overall
+ *   score as:
+ *
+ *     max(Score-backward + H-forward, Score-backward + F-forward + ref-open)
+ *
+ *   What does it mean to abort a backtrack?  If we're starting a new branch
+ *   and there is a checkpoint in the bottommost cell of the branch, and the
+ *   overall score is less than the target, then we can simply ignore the
+ *   branch.  If the checkpoint occurs in the middle of a string of matches, we
+ *   need to curtail the branch such that it doesn't include the checkpointed
+ *   cell and we won't ever try to enter the checkpointed cell, e.g., on a
+ *   mismatch.
+ *
+ * Approaches 3 and 4 seem reasonable, and could be combined.  For simplicity,
+ * we implement only approach 4 for now.
+ *
+ * Checkpoint information is propagated from the fill process to the backtracer
+ * via a Checkpointer object (see the cper_ field of BtBranchProblem).
+ */
+
+enum {
+	BT_NOT_FOUND = 1,      // could not obtain the backtrace because it
+	                       // overlapped a previous solution
+	BT_FOUND,              // obtained a valid backtrace
+	BT_REJECTED_N,         // backtrace rejected because it had too many Ns
+	BT_REJECTED_CORE_DIAG  // backtrace rejected because it failed to overlap a
+	                       // core diagonal
+};
+
+/**
+ * Parameters for a matrix of potential backtrace problems to solve.
+ * Encapsulates information about:
+ *
+ * The problem given a particular reference substring:
+ *
+ * - The query string (nucleotides and qualities)
+ * - The reference substring (incl. orientation, offset into overall sequence)
+ * - Checkpoints (i.e. values of matrix cells)
+ * - Scoring scheme and other thresholds
+ *
+ * The problem given a particular reference substring AND a particular row and
+ * column from which to backtrace:
+ *
+ * - The row and column
+ * - The target score
+ */
+class BtBranchProblem {
+
+public:
+
+	/**
+	 * Create new uninitialized problem.
+	 */
+	BtBranchProblem() { reset(); }
+
+	/**
+	 * Initialize a new problem.
+	 */
+	void initRef(
+		const char          *qry,    // query string (along rows)
+		const char          *qual,   // query quality string (along rows)
+		size_t               qrylen, // query string (along rows) length
+		const char          *ref,    // reference string (along columns)
+		TRefOff              reflen, // in-rectangle reference string length
+		TRefOff              treflen,// total reference string length
+		TRefId               refid,  // reference id
+		TRefOff              refoff, // reference offset
+		bool                 fw,     // orientation of problem
+		const DPRect*        rect,   // dynamic programming rectangle filled out
+		const Checkpointer*  cper,   // checkpointer
+		const Scoring       *sc,     // scoring scheme
+		size_t               nceil)  // max # Ns allowed in alignment
+	{
+		qry_     = qry;
+		qual_    = qual;
+		qrylen_  = qrylen;
+		ref_     = ref;
+		reflen_  = reflen;
+		treflen_ = treflen;
+		refid_   = refid;
+		refoff_  = refoff;
+		fw_      = fw;
+		rect_    = rect;
+		cper_    = cper;
+		sc_      = sc;
+		nceil_   = nceil;
+	}
+
+	/**
+	 * Initialize a new problem.
+	 */
+	void initBt(
+		size_t   row,   // row
+		size_t   col,   // column
+		bool     fill,  // use a filling rather than a backtracking strategy
+		bool     usecp, // use checkpoints to short-circuit while backtracking
+		TAlScore targ)  // target score
+	{
+		row_    = row;
+		col_    = col;
+		targ_   = targ;
+		fill_   = fill;
+		usecp_  = usecp;
+		if(fill) {
+			assert(usecp_);
+		}
+	}
+
+	/**
+	 * Reset to uninitialized state.
+	 */
+	void reset() {
+		qry_ = qual_ = ref_ = NULL;
+		cper_ = NULL;
+		rect_ = NULL;
+		sc_ = NULL;
+		qrylen_ = reflen_ = treflen_ = refid_ = refoff_ = row_ = col_ = targ_ = nceil_ = 0;
+		fill_ = fw_ = usecp_ = false;
+	}
+	
+	/**
+	 * Return true iff the BtBranchProblem has been initialized.
+	 */
+	bool inited() const {
+		return qry_ != NULL;
+	}
+	
+#ifndef NDEBUG
+	/**
+	 * Sanity-check the problem.
+	 */
+	bool repOk() const {
+		assert_gt(qrylen_, 0);
+		assert_gt(reflen_, 0);
+		assert_gt(treflen_, 0);
+		assert_lt(row_, qrylen_);
+		assert_lt((TRefOff)col_, reflen_);
+		return true;
+	}
+#endif
+	
+	size_t reflen() const { return reflen_; }
+	size_t treflen() const { return treflen_; }
+
+protected:
+
+	const char         *qry_;    // query string (along rows)
+	const char         *qual_;   // query quality string (along rows)
+	size_t              qrylen_; // query string (along rows) length
+	const char         *ref_;    // reference string (along columns)
+	TRefOff             reflen_; // in-rectangle reference string length
+	TRefOff             treflen_;// total reference string length
+	TRefId              refid_;  // reference id
+	TRefOff             refoff_; // reference offset
+	bool                fw_;     // orientation of problem
+	const DPRect*       rect_;   // dynamic programming rectangle filled out
+	size_t              row_;    // starting row
+	size_t              col_;    // starting column
+	TAlScore            targ_;   // target score
+	const Checkpointer *cper_;   // checkpointer
+	bool                fill_;   // use mini-fills
+	bool                usecp_;  // use checkpointing?
+	const Scoring      *sc_;     // scoring scheme
+	size_t              nceil_;  // max # Ns allowed in alignment
+	
+	friend class BtBranch;
+	friend class BtBranchQ;
+	friend class BtBranchTracer;
+};
+
+/**
+ * Encapsulates a "branch" which is a diagonal of cells (possibly of length 0)
+ * in the matrix where all the cells are matches.  These stretches are linked
+ * together by edits to form a full backtrace path through the matrix.  Lengths
+ * are measured w/r/t to the number of rows traversed by the path, so a branch
+ * that represents a read gap extension could have length = 0.
+ *
+ * At the end of the day, the full backtrace path is represented as a list of
+ * BtBranch's where each BtBranch represents a stretch of matching cells (and
+ * up to one mismatching cell at its bottom extreme) ending in an edit (or in
+ * the bottommost row, in which case the edit is uninitialized).  Each
+ * BtBranch's row and col fields indicate the bottommost cell involved in the
+ * diagonal stretch of matches, and the len_ field indicates the length of the
+ * stretch of matches.  Note that the edits themselves also correspond to
+ * movement through the matrix.
+ *
+ * A related issue is how we record which cells have been visited so that we
+ * never report a pair of paths both traversing the same (row, col) of the
+ * overall DP matrix.  This gets a little tricky because we have to take into
+ * account the cells covered by *edits* in addition to the cells covered by the
+ * stretches of matches.  For instance: imagine a mismatch.  That takes up a
+ * cell of the DP matrix, but it may or may not be preceded by a string of
+ * matches.  It's hard to imagine how to represent this unless we let the
+ * mismatch "count toward" the len_ of the branch and let (row, col) refer to
+ * the cell where the mismatch occurs.
+ *
+ * We need BtBranches to "live forever" so that we can make some BtBranches
+ * parents of others using parent pointers.  For this reason, BtBranch's are
+ * stored in an EFactory object in the BtBranchTracer class.
+ */
+class BtBranch {
+
+public:
+
+	BtBranch() { reset(); }
+
+	BtBranch(
+		const BtBranchProblem& prob,
+		size_t parentId,
+		TAlScore penalty,
+		TAlScore score_en,
+		int64_t row,
+		int64_t col,
+		Edit e,
+		int hef,
+		bool root,
+		bool extend)
+	{
+		init(prob, parentId, penalty, score_en, row, col, e, hef, root, extend);
+	}
+	
+	/**
+	 * Reset to uninitialized state.
+	 */
+	void reset() {
+		parentId_ = 0;
+		score_st_ = score_en_ = len_ = row_ = col_ = 0;
+		curtailed_ = false;
+		e_.reset();
+	}
+	
+	/**
+	 * Caller gives us score_en, row and col.  We figure out score_st and len_
+	 * by comparing characters from the strings.
+	 */
+	void init(
+		const BtBranchProblem& prob,
+		size_t parentId,
+		TAlScore penalty,
+		TAlScore score_en,
+		int64_t row,
+		int64_t col,
+		Edit e,
+		int hef,
+		bool root,
+		bool extend);
+	
+	/**
+	 * Return true iff this branch ends in a solution to the backtrace problem.
+	 */
+	bool isSolution(const BtBranchProblem& prob) const {
+		const bool end2end = prob.sc_->monotone;
+		return score_st_ == prob.targ_ && (!end2end || endsInFirstRow());
+	}
+	
+	/**
+	 * Return true iff this branch could potentially lead to a valid alignment.
+	 */
+	bool isValid(const BtBranchProblem& prob) const {
+		int64_t scoreFloor = prob.sc_->monotone ? MIN_I64 : 0;
+		if(score_st_ < scoreFloor) {
+			// Dipped below the score floor
+			return false;
+		}
+		if(isSolution(prob)) {
+			// It's a solution, so it's also valid
+			return true;
+		}
+		if((int64_t)len_ > row_) {
+			// Went all the way to the top row
+			//assert_leq(score_st_, prob.targ_);
+			return score_st_ == prob.targ_;
+		} else {
+			int64_t match = prob.sc_->match();
+			int64_t bonusLeft = (row_ + 1 - len_) * match;
+			return score_st_ + bonusLeft >= prob.targ_;
+		}
+	}
+	
+	/**
+	 * Return true iff this branch overlaps with the given branch.
+	 */
+	bool overlap(const BtBranchProblem& prob, const BtBranch& bt) const {
+		// Calculate this branch's diagonal
+		assert_lt(row_, (int64_t)prob.qrylen_);
+		size_t fromend = prob.qrylen_ - row_ - 1;
+		size_t diag = fromend + col_;
+		int64_t lo = 0, hi = row_ + 1;
+		if(len_ == 0) {
+			lo = row_;
+		} else {
+			lo = row_ - (len_ - 1);
+		}
+		// Calculate other branch's diagonal
+		assert_lt(bt.row_, (int64_t)prob.qrylen_);
+		size_t ofromend = prob.qrylen_ - bt.row_ - 1;
+		size_t odiag = ofromend + bt.col_;
+		if(diag != odiag) {
+			return false;
+		}
+		int64_t olo = 0, ohi = bt.row_ + 1;
+		if(bt.len_ == 0) {
+			olo = bt.row_;
+		} else {
+			olo = bt.row_ - (bt.len_ - 1);
+		}
+		int64_t losm = olo, hism = ohi;
+		if(hi - lo < ohi - olo) {
+			swap(lo, losm);
+			swap(hi, hism);
+		}
+		if((lo <= losm && hi > losm) || (lo <  hism && hi >= hism)) {
+			return true;
+		}
+		return false;
+	}
+	
+	/**
+	 * Return true iff this branch is higher priority than the branch 'o'.
+	 */
+	bool operator<(const BtBranch& o) const {
+		// Prioritize uppermost above score
+		if(uppermostRow() != o.uppermostRow()) {
+			return uppermostRow() < o.uppermostRow();
+		}
+		if(score_st_ != o.score_st_) return score_st_ > o.score_st_;
+		if(row_      != o.row_)      return row_ < o.row_;
+		if(col_      != o.col_)      return col_ > o.col_;
+		if(parentId_ != o.parentId_) return parentId_ > o.parentId_;
+		assert(false);
+		return false;
+	}
+	
+	/**
+	 * Return true iff the topmost cell involved in this branch is in the top
+	 * row.
+	 */
+	bool endsInFirstRow() const {
+		assert_leq((int64_t)len_, row_ + 1);
+		return (int64_t)len_ == row_+1;
+	}
+	
+	/**
+	 * Return the uppermost row covered by this branch.
+	 */
+	size_t uppermostRow() const {
+		assert_geq(row_ + 1, (int64_t)len_);
+		return row_ + 1 - (int64_t)len_;
+	}
+
+	/**
+	 * Return the leftmost column covered by this branch.
+	 */
+	size_t leftmostCol() const {
+		assert_geq(col_ + 1, (int64_t)len_);
+		return col_ + 1 - (int64_t)len_;
+	}
+	
+#ifndef NDEBUG
+	/**
+	 * Sanity-check this BtBranch.
+	 */
+	bool repOk() const {
+		assert(root_ || e_.inited());
+		assert_gt(len_, 0);
+		assert_geq(col_ + 1, (int64_t)len_);
+		assert_geq(row_ + 1, (int64_t)len_);
+		return true;
+	}
+#endif
+
+protected:
+
+	// ID of the parent branch.
+	size_t   parentId_;
+
+	// Penalty associated with the edit at the bottom of this branch (0 if
+	// there is no edit)
+	TAlScore penalty_;
+	
+	// Score at the beginning of the branch
+	TAlScore score_st_;
+	
+	// Score at the end of the branch (taking the edit into account)
+	TAlScore score_en_;
+	
+	// Length of the branch.  That is, the total number of diagonal cells
+	// involved in all the matches and in the edit (if any).  Should always be
+	// > 0.
+	size_t   len_;
+	
+	// The row of the final (bottommost) cell in the branch.  This might be the
+	// bottommost match if the branch has no associated edit.  Otherwise, it's
+	// the cell occupied by the edit.
+	int64_t  row_;
+	
+	// The column of the final (bottommost) cell in the branch.
+	int64_t  col_;
+	
+	// The edit at the bottom of the branch.  If this is the bottommost branch
+	// in the alignment and it does not end in an edit, then this remains
+	// uninitialized.
+	Edit     e_;
+	
+	// True iff this is the bottommost branch in the alignment.  We can't just
+	// use row_ to tell us this because local alignments don't necessarily end
+	// in the last row.
+	bool     root_;
+	
+	bool     curtailed_;  // true -> pruned at a checkpoint where we otherwise
+	                      // would have had a match
+
+friend class BtBranchQ;
+friend class BtBranchTracer;
+
+};
+
+/**
+ * Instantiate and solve best-first branch-based backtraces.
+ */
+class BtBranchTracer {
+
+public:
+
+	explicit BtBranchTracer() :
+		prob_(), bs_(), seenPaths_(DP_CAT), sawcell_(DP_CAT), doTri_() { }
+
+	/**
+	 * Add a branch to the queue.
+	 */
+	void add(size_t id) {
+		assert(!bs_[id].isSolution(prob_));
+		unsorted_.push_back(make_pair(bs_[id].score_st_, id));
+	}
+	
+	/**
+	 * Add a branch to the list of solutions.
+	 */
+	void addSolution(size_t id) {
+		assert(bs_[id].isSolution(prob_));
+		solutions_.push_back(id);
+	}
+
+	/**
+	 * Given a potential branch to add to the queue, see if we can follow the
+	 * branch a little further first.  If it's still valid, or if we reach a
+	 * choice between valid outgoing paths, go ahead and add it to the queue.
+	 */
+	void examineBranch(
+		int64_t row,
+		int64_t col,
+		const Edit& e,
+		TAlScore pen,
+		TAlScore sc,
+		size_t parentId);
+
+	/**
+	 * Take all possible ways of leaving the given branch and add them to the
+	 * branch queue.
+	 */
+	void addOffshoots(size_t bid);
+	
+	/**
+	 * Get the best branch and remove it from the priority queue.
+	 */
+	size_t best(RandomSource& rnd) {
+		assert(!empty());
+		flushUnsorted();
+		assert_gt(sortedSel_ ? sorted1_.size() : sorted2_.size(), cur_);
+		// Perhaps shuffle everyone who's tied for first?
+		size_t id = sortedSel_ ? sorted1_[cur_] : sorted2_[cur_];
+		cur_++;
+		return id;
+	}
+	
+	/**
+	 * Return true iff there are no branches left to try.
+	 */
+	bool empty() const {
+		return size() == 0;
+	}
+	
+	/**
+	 * Return the size, i.e. the total number of branches contained.
+	 */
+	size_t size() const {
+		return unsorted_.size() +
+		       (sortedSel_ ? sorted1_.size() : sorted2_.size()) - cur_;
+	}
+
+	/**
+	 * Return true iff there are no solutions left to try.
+	 */
+	bool emptySolution() const {
+		return sizeSolution() == 0;
+	}
+	
+	/**
+	 * Return the size of the solution set so far.
+	 */
+	size_t sizeSolution() const {
+		return solutions_.size();
+	}
+	
+	/**
+	 * Sort unsorted branches, merge them with master sorted list.
+	 */
+	void flushUnsorted();
+	
+#ifndef NDEBUG
+	/**
+	 * Sanity-check the queue.
+	 */
+	bool repOk() const {
+		assert_lt(cur_, (sortedSel_ ? sorted1_.size() : sorted2_.size()));
+		return true;
+	}
+#endif
+	
+	/**
+	 * Initialize the tracer with respect to a new read.  This involves
+	 * resetting all the state relating to the set of cells already visited
+	 */
+	void initRef(
+		const char*         rd,     // in: read sequence
+		const char*         qu,     // in: quality sequence
+		size_t              rdlen,  // in: read sequence length
+		const char*         rf,     // in: reference sequence
+		size_t              rflen,  // in: in-rectangle reference sequence length
+		TRefOff             trflen, // in: total reference sequence length
+		TRefId              refid,  // in: reference id
+		TRefOff             refoff, // in: reference offset
+		bool                fw,     // in: orientation
+		const DPRect       *rect,   // in: DP rectangle
+		const Checkpointer *cper,   // in: checkpointer
+		const Scoring&      sc,     // in: scoring scheme
+		size_t              nceil)  // in: N ceiling
+	{
+		prob_.initRef(rd, qu, rdlen, rf, rflen, trflen, refid, refoff, fw, rect, cper, &sc, nceil);
+		const size_t ndiag = rflen + rdlen - 1;
+		seenPaths_.resize(ndiag);
+		for(size_t i = 0; i < ndiag; i++) {
+			seenPaths_[i].clear();
+		}
+		// clear each of the per-column sets
+		if(sawcell_.size() < rflen) {
+			size_t isz = sawcell_.size();
+			sawcell_.resize(rflen);
+			for(size_t i = isz; i < rflen; i++) {
+				sawcell_[i].setCat(DP_CAT);
+			}
+		}
+		for(size_t i = 0; i < rflen; i++) {
+			sawcell_[i].setCat(DP_CAT);
+			sawcell_[i].clear(); // clear the set
+		}
+	}
+	
+	/**
+	 * Initialize with a new backtrace.
+	 */
+	void initBt(
+		TAlScore       escore, // in: alignment score
+		size_t         row,    // in: start in this row
+		size_t         col,    // in: start in this column
+		bool           fill,   // in: use mini-filling?
+		bool           usecp,  // in: use checkpointing?
+		bool           doTri,  // in: triangle-shaped mini-fills?
+		RandomSource&  rnd)    // in: random gen, to choose among equal paths
+	{
+		prob_.initBt(row, col, fill, usecp, escore);
+		Edit e; e.reset();
+		unsorted_.clear();
+		solutions_.clear();
+		sorted1_.clear();
+		sorted2_.clear();
+		cur_ = 0;
+		nmm_ = 0;         // number of mismatches attempted
+		nnmm_ = 0;        // number of mismatches involving N attempted
+		nrdop_ = 0;       // number of read gap opens attempted
+		nrfop_ = 0;       // number of ref gap opens attempted
+		nrdex_ = 0;       // number of read gap extensions attempted
+		nrfex_ = 0;       // number of ref gap extensions attempted
+		nmmPrune_ = 0;    // number of mismatches attempted
+		nnmmPrune_ = 0;   // number of mismatches involving N attempted
+		nrdopPrune_ = 0;  // number of read gap opens attempted
+		nrfopPrune_ = 0;  // number of ref gap opens attempted
+		nrdexPrune_ = 0;  // number of read gap extensions attempted
+		nrfexPrune_ = 0;  // number of ref gap extensions attempted
+		row_ = row;
+		col_ = col;
+		doTri_ = doTri;
+		bs_.clear();
+		if(!prob_.fill_) {
+			size_t id = bs_.alloc();
+			bs_[id].init(
+				prob_,
+				0,     // parent id
+				0,     // penalty
+				0,     // starting score
+				row,   // row
+				col,   // column
+				e,
+				0,
+			    true,  // this is the root
+				true); // this should be extend with exact matches
+			if(bs_[id].isSolution(prob_)) {
+				addSolution(id);
+			} else {
+				add(id);
+			}
+		} else {
+			int64_t row = row_, col = col_;
+			TAlScore targsc = prob_.targ_;
+			int hef = 0;
+			bool done = false, abort = false;
+			size_t depth = 0;
+			while(!done && !abort) {
+				// Accumulate edits as we go.  We can do this by adding
+				// BtBranches to the bs_ structure.  Each step of the backtrace
+				// either involves an edit (thereby starting a new branch) or
+				// extends the previous branch by one more position.
+				//
+				// Note: if the BtBranches are in line, then trySolution can be
+				// used to populate the SwResult and check for various
+				// situations where we might reject the alignment (i.e. due to
+				// a cell having been visited previously).
+				if(doTri_) {
+					triangleFill(
+						row,          // row of cell to backtrace from
+						col,          // column of cell to backtrace from
+						hef,          // cell to bt from: H (0), E (1), or F (2)
+						targsc,       // score of cell to backtrace from
+						prob_.targ_,  // score of alignment we're looking for
+						rnd,          // pseudo-random generator
+						row,          // out: row we ended up in after bt
+						col,          // out: column we ended up in after bt
+						hef,          // out: H/E/F after backtrace
+						targsc,       // out: score up to cell we ended up in
+						done,         // out: finished tracing out an alignment?
+						abort);       // out: aborted b/c cell was seen before?
+				} else {
+					squareFill(
+						row,          // row of cell to backtrace from
+						col,          // column of cell to backtrace from
+						hef,          // cell to bt from: H (0), E (1), or F (2)
+						targsc,       // score of cell to backtrace from
+						prob_.targ_,  // score of alignment we're looking for
+						rnd,          // pseudo-random generator
+						row,          // out: row we ended up in after bt
+						col,          // out: column we ended up in after bt
+						hef,          // out: H/E/F after backtrace
+						targsc,       // out: score up to cell we ended up in
+						done,         // out: finished tracing out an alignment?
+						abort);       // out: aborted b/c cell was seen before?
+				}
+				if(depth >= ndep_.size()) {
+					ndep_.resize(depth+1);
+					ndep_[depth] = 1;
+				} else {
+					ndep_[depth]++;
+				}
+				depth++;
+				assert((row >= 0 && col >= 0) || done);
+			}
+		}
+		ASSERT_ONLY(seen_.clear());
+	}
+	
+	/**
+	 * Get the next valid alignment given the backtrace problem.  Return false
+	 * if there is no valid solution, e.g., if the set of candidate solutions is empty.
+	 */
+	bool nextAlignment(
+		size_t maxiter,
+		SwResult& res,
+		size_t& off,
+		size_t& nrej,
+		size_t& niter,
+		RandomSource& rnd);
+	
+	/**
+	 * Return true iff this tracer has been initialized
+	 */
+	bool inited() const {
+		return prob_.inited();
+	}
+	
+	/**
+	 * Return true iff the mini-fills are triangle-shaped.
+	 */
+	bool doTri() const { return doTri_; }
+
+	/**
+	 * Fill in a triangle of the DP table and backtrace from the given cell to
+	 * a cell in the previous checkpoint, or to the terminal cell.
+	 */
+	void triangleFill(
+		int64_t rw,          // row of cell to backtrace from
+		int64_t cl,          // column of cell to backtrace from
+		int hef,             // cell to backtrace from is H (0), E (1), or F (2)
+		TAlScore targ,       // score of cell to backtrace from
+		TAlScore targ_final, // score of alignment we're looking for
+		RandomSource& rnd,   // pseudo-random generator
+		int64_t& row_new,    // out: row we ended up in after backtrace
+		int64_t& col_new,    // out: column we ended up in after backtrace
+		int& hef_new,        // out: H/E/F after backtrace
+		TAlScore& targ_new,  // out: score up to cell we ended up in
+		bool& done,          // out: finished tracing out an alignment?
+		bool& abort);        // out: aborted b/c cell was seen before?
+
+	/**
+	 * Fill in a square of the DP table and backtrace from the given cell to
+	 * a cell in the previous checkpoint, or to the terminal cell.
+	 */
+	void squareFill(
+		int64_t rw,          // row of cell to backtrace from
+		int64_t cl,          // column of cell to backtrace from
+		int hef,             // cell to backtrace from is H (0), E (1), or F (2)
+		TAlScore targ,       // score of cell to backtrace from
+		TAlScore targ_final, // score of alignment we're looking for
+		RandomSource& rnd,   // pseudo-random generator
+		int64_t& row_new,    // out: row we ended up in after backtrace
+		int64_t& col_new,    // out: column we ended up in after backtrace
+		int& hef_new,        // out: H/E/F after backtrace
+		TAlScore& targ_new,  // out: score up to cell we ended up in
+		bool& done,          // out: finished tracing out an alignment?
+		bool& abort);        // out: aborted b/c cell was seen before?
+
+protected:
+
+	/**
+	 * Get the next valid alignment given a backtrace problem.  Return false
+	 * if there is no valid solution.  Use a backtracking search to find the
+	 * solution.  This can be very slow.
+	 */
+	bool nextAlignmentBacktrace(
+		size_t maxiter,
+		SwResult& res,
+		size_t& off,
+		size_t& nrej,
+		size_t& niter,
+		RandomSource& rnd);
+
+	/**
+	 * Get the next valid alignment given a backtrace problem.  Return false
+	 * if there is no valid solution.  Use a triangle-fill backtrace to find
+	 * the solution.  This is usually fast (it's O(m + n)).
+	 */
+	bool nextAlignmentFill(
+		size_t maxiter,
+		SwResult& res,
+		size_t& off,
+		size_t& nrej,
+		size_t& niter,
+		RandomSource& rnd);
+
+	/**
+	 * Try all the solutions accumulated so far.  Solutions might be rejected
+	 * if they, for instance, overlap a previous solution, have too many Ns,
+	 * fail to overlap a core diagonal, etc.
+	 */
+	bool trySolutions(
+		bool lookForOlap,
+		SwResult& res,
+		size_t& off,
+		size_t& nrej,
+		RandomSource& rnd,
+		bool& success);
+	
+	/**
+	 * See if a given solution branch works as a solution (i.e. doesn't overlap
+	 * another one, have too many Ns, fail to overlap a core diagonal, etc.)
+	 */
+	int trySolution(
+		size_t id,
+		bool lookForOlap,
+		SwResult& res,
+		size_t& off,
+		size_t& nrej,
+		RandomSource& rnd);
+
+	BtBranchProblem    prob_; // problem configuration
+	EFactory<BtBranch> bs_;   // global BtBranch factory
+	
+	// already reported alignments going through these diagonal segments
+	ELList<std::pair<size_t, size_t> > seenPaths_;
+	ELSet<size_t> sawcell_; // cells already backtraced through
+	
+	EList<std::pair<TAlScore, size_t> > unsorted_;  // unsorted list of as-yet-unflushed BtBranches
+	EList<size_t> sorted1_;   // list of BtBranch, sorted by score
+	EList<size_t> sorted2_;   // list of BtBranch, sorted by score
+	EList<size_t> solutions_; // list of solution branches
+	bool          sortedSel_; // true -> 1, false -> 2
+	size_t        cur_;       // cursor into sorted list to start from
+	
+	size_t        nmm_;         // number of mismatches attempted
+	size_t        nnmm_;        // number of mismatches involving N attempted
+	size_t        nrdop_;       // number of read gap opens attempted
+	size_t        nrfop_;       // number of ref gap opens attempted
+	size_t        nrdex_;       // number of read gap extensions attempted
+	size_t        nrfex_;       // number of ref gap extensions attempted
+	
+	size_t        nmmPrune_;    // 
+	size_t        nnmmPrune_;   // 
+	size_t        nrdopPrune_;  // 
+	size_t        nrfopPrune_;  // 
+	size_t        nrdexPrune_;  // 
+	size_t        nrfexPrune_;  // 
+	
+	size_t        row_;         // row
+	size_t        col_;         // column
+
+	bool           doTri_;      // true -> fill in triangles; false -> squares
+	EList<CpQuad>  sq_;         // square to fill when doing mini-fills
+	ELList<CpQuad> tri_;        // triangle to fill when doing mini-fills
+	EList<size_t>  ndep_;       // # triangles mini-filled at various depths
+
+#ifndef NDEBUG
+	ESet<size_t>  seen_;        // seen branch ids; should never see same twice
+#endif
+};
+
+#endif /*ndef ALIGNER_BT_H_*/
diff --git a/aligner_cache.cpp b/aligner_cache.cpp
new file mode 100644
index 0000000..7a8de26
--- /dev/null
+++ b/aligner_cache.cpp
@@ -0,0 +1,181 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "aligner_cache.h"
+#include "tinythread.h"
+
+#ifdef ALIGNER_CACHE_MAIN
+
+#include <iostream>
+#include <getopt.h>
+#include <string>
+#include "random_source.h"
+
+using namespace std;
+
+enum {
+	ARG_TESTS = 256
+};
+
+// Option tables for getopt_long().  NOTE: 'C' and 't' appear in short_opts
+// but are not handled in main()'s switch; they fall through to the
+// unknown-option error path.
+static const char *short_opts = "vCt";
+static struct option long_opts[] = {
+	{(char*)"verbose",  no_argument, 0, 'v'},
+	{(char*)"tests",    no_argument, 0, ARG_TESTS},
+	{0, 0, 0, 0}, // terminator: getopt_long requires the array to end with an all-zero element
+};
+
+/**
+ * Print a usage summary for this standalone test driver to stream 'os'.
+ */
+static void printUsage(ostream& os) {
+	os << "Usage: sawhi-cache [options]*" << endl;
+	os << "Options:" << endl;
+	os << "  --tests       run unit tests" << endl;
+	os << "  -v/--verbose  talkative mode" << endl;
+}
+
+int gVerbose = 0;
+
+/**
+ * Test helper: wrap the DNA string 'dna' in a QKey and insert it into the
+ * red-black tree 't', allocating nodes from pool 'p'.  The payload pointer
+ * passed to RedBlack::add is NULL (unused by these tests).
+ *
+ * NOTE(review): this uses non-template QVal and a one-argument QKey::init,
+ * while the header declares QVal<index_t> and (in assert builds) a
+ * two-argument init via ASSERT_ONLY — this #ifdef'ed driver appears stale;
+ * confirm it still compiles when ALIGNER_CACHE_MAIN is defined.
+ */
+static void add(
+	RedBlack<QKey, QVal>& t,
+	Pool& p,
+	const char *dna)
+{
+	QKey qk;
+	qk.init(BTDnaString(dna, true));
+	t.add(p, qk, NULL);
+}
+
+/**
+ * Small tests for the AlignmentCache: insert batches of keys into a
+ * RedBlack<QKey, QVal> backed by a Pool, check sizes, then stress the
+ * tree by inserting all 256 4-mers in 100 different random orders.
+ */
+static void aligner_cache_tests() {
+	RedBlack<QKey, QVal> rb(1024);
+	Pool p(64 * 1024, 1024);
+	// Small test
+	add(rb, p, "ACGTCGATCGT");
+	add(rb, p, "ACATCGATCGT");
+	add(rb, p, "ACGACGATCGT");
+	add(rb, p, "ACGTAGATCGT");
+	add(rb, p, "ACGTCAATCGT");
+	add(rb, p, "ACGTCGCTCGT");
+	add(rb, p, "ACGTCGAACGT");
+	assert_eq(7, rb.size());
+	rb.clear();
+	p.clear();
+	// Another small test
+	add(rb, p, "ACGTCGATCGT");
+	add(rb, p, "CCGTCGATCGT");
+	add(rb, p, "TCGTCGATCGT");
+	add(rb, p, "GCGTCGATCGT");
+	add(rb, p, "AAGTCGATCGT");
+	assert_eq(5, rb.size());
+	rb.clear();
+	p.clear();
+	// Regression test (attempt to make it smaller)
+	add(rb, p, "CCTA");
+	add(rb, p, "AGAA");
+	add(rb, p, "TCTA");
+	add(rb, p, "GATC");
+	add(rb, p, "CTGC");
+	add(rb, p, "TTGC");
+	add(rb, p, "GCCG");
+	add(rb, p, "GGAT");
+	rb.clear();
+	p.clear();
+	// Regression test
+	add(rb, p, "CCTA");
+	add(rb, p, "AGAA");
+	add(rb, p, "TCTA");
+	add(rb, p, "GATC");
+	add(rb, p, "CTGC");
+	add(rb, p, "CATC");
+	add(rb, p, "CAAA");
+	add(rb, p, "CTAT");
+	add(rb, p, "CTCA");
+	add(rb, p, "TTGC");
+	add(rb, p, "GCCG");
+	add(rb, p, "GGAT");
+	assert_eq(12, rb.size());
+	rb.clear();
+	p.clear();
+	// Larger random test
+	// Enumerate all 4^4 = 256 possible 4-mers into 'strs'
+	EList<BTDnaString> strs;
+	char buf[5];
+	for(int i = 0; i < 4; i++) {
+		for(int j = 0; j < 4; j++) {
+			for(int k = 0; k < 4; k++) {
+				for(int m = 0; m < 4; m++) {
+					buf[0] = "ACGT"[i];
+					buf[1] = "ACGT"[j];
+					buf[2] = "ACGT"[k];
+					buf[3] = "ACGT"[m];
+					buf[4] = '\0';
+					strs.push_back(BTDnaString(buf, true));
+				}
+			}
+		}
+	}
+	// Add all of the 4-mers in several different random orders
+	RandomSource rand;
+	for(uint32_t runs = 0; runs < 100; runs++) {
+		rb.clear();
+		p.clear();
+		assert_eq(0, rb.size());
+		rand.init(runs); // seed with run number so each run is a distinct, reproducible order
+		EList<bool> used;
+		used.resize(256);
+		for(int i = 0; i < 256; i++) used[i] = false;
+		for(int i = 0; i < 256; i++) {
+			// Pick the r-th not-yet-used 4-mer (uniform over the remainder)
+			int r = rand.nextU32() % (256-i);
+			int unused = 0;
+			bool added = false;
+			for(int j = 0; j < 256; j++) {
+				if(!used[j] && unused == r) {
+					used[j] = true;
+					QKey qk;
+					qk.init(strs[j]);
+					rb.add(p, qk, NULL);
+					added = true;
+					break;
+				}
+				if(!used[j]) unused++;
+			}
+			assert(added);
+		}
+	}
+}
+
+/**
+ * A way of feeding simply tests to the seed alignment infrastructure.
+ *
+ * Recognized options: -v/--verbose (talkative mode) and --tests (run the
+ * unit tests and exit).  NOTE(review): 'C' and 't' are listed in
+ * short_opts but have no case here, so they hit the unknown-option path.
+ * Falls off the end of main (implicit return 0) when no --tests given.
+ */
+int main(int argc, char **argv) {
+	int option_index = 0;
+	int next_option;
+	do {
+		next_option = getopt_long(argc, argv, short_opts, long_opts, &option_index);
+		switch (next_option) {
+			case 'v':       gVerbose = true; break;
+			case ARG_TESTS: aligner_cache_tests(); return 0;
+			case -1: break; // end of options
+			default: {
+				cerr << "Unknown option: " << (char)next_option << endl;
+				printUsage(cerr);
+				exit(1);
+			}
+		}
+	} while(next_option != -1);
+}
+#endif
diff --git a/aligner_cache.h b/aligner_cache.h
new file mode 100644
index 0000000..2237071
--- /dev/null
+++ b/aligner_cache.h
@@ -0,0 +1,1013 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef ALIGNER_CACHE_H_
+#define ALIGNER_CACHE_H_
+
+/**
+ * CACHEING
+ *
+ * By caching the results of some alignment sub-problems, we hope to
+ * enable a "fast path" for read alignment whereby answers are mostly
+ * looked up rather than calculated from scratch.  This is particularly
+ * effective when the input is sorted or otherwise grouped in a way
+ * that brings together reads with (at least some) seed sequences in
+ * common.
+ *
+ * But the cache is also where results are held, regardless of whether
+ * the results are maintained & re-used across reads.
+ *
+ * The cache consists of two linked portions:
+ *
+ * 1. A multimap from seed strings (i.e. read substrings) to reference strings
+ *    that are within some edit distance (roughly speaking).  This is the "seed
+ *    multimap".
+ *
+ *    Key:   Read substring (2-bit-per-base encoded + length)
+ *    Value: Set of reference substrings (i.e. keys into the suffix
+ *           array multimap).
+ *
+ * 2. A multimap from reference strings to the corresponding elements of the
+ *    suffix array.  Elements are filled in with reference-offset info as it's
+ *    calculated.  This is the "suffix array multimap"
+ *
+ *    Key:   Reference substring (2-bit-per-base encoded + length)
+ *    Value: (a) top from BWT, (b) length of range, (c) offset of first
+ *           range element in the suffix-array list (salist)
+ *
+ * For both multimaps, we use a combo Red-Black tree and EList.  The payload in
+ * the Red-Black tree nodes points to a range in the EList.
+ */
+
+#include <iostream>
+#include "ds.h"
+#include "read.h"
+#include "threading.h"
+#include "mem_ids.h"
+#include "simple_func.h"
+#include "btypes.h"
+
+#define CACHE_PAGE_SZ (16 * 1024)
+
+typedef PListSlice<TIndexOffU, CACHE_PAGE_SZ> TSlice;
+
+/**
+ * Key for the query multimap: the read substring and its length.
+ *
+ * The substring is packed 2 bits per base into the 64-bit 'seq', so only
+ * strings of length <= 32 with no Ns are cacheable.  len == 0xffffffff is
+ * the sentinel for "invalid / not cacheable".
+ */
+struct QKey {
+
+	/**
+	 * Initialize invalid QKey.
+	 */
+	QKey() { reset(); }
+
+	/**
+	 * Initialize QKey from DNA string.
+	 */
+	QKey(const BTDnaString& s ASSERT_ONLY(, BTDnaString& tmp)) {
+		init(s ASSERT_ONLY(, tmp));
+	}
+	
+	/**
+	 * Initialize QKey from DNA string.  Rightmost character is placed in the
+	 * least significant bitpair.
+	 *
+	 * Returns true iff the string was cacheable (length <= 32 and no
+	 * ambiguous bases); otherwise len is set to the 0xffffffff sentinel.
+	 */
+	bool init(
+		const BTDnaString& s
+		ASSERT_ONLY(, BTDnaString& tmp))
+	{
+		seq = 0;
+		len = (uint32_t)s.length();
+		ASSERT_ONLY(tmp.clear());
+		if(len > 32) {
+			len = 0xffffffff;
+			return false; // wasn't cacheable
+		} else {
+			// Rightmost char of 's' goes in the least significant bitpair
+			for(size_t i = 0; i < 32 && i < s.length(); i++) {
+				int c = (int)s.get(i);
+				assert_range(0, 4, c);
+				if(c == 4) {
+					// 'N' (ambiguous base) can't be packed into 2 bits
+					len = 0xffffffff;
+					return false;
+				}
+				seq = (seq << 2) | s.get(i);
+			}
+			ASSERT_ONLY(toString(tmp));
+			assert(sstr_eq(tmp, s));
+			assert_leq(len, 32);
+			return true; // was cacheable
+		}
+	}
+	
+	/**
+	 * Convert this key to a DNA string (inverse of init).
+	 */
+	void toString(BTDnaString& s) {
+		s.resize(len);
+		uint64_t sq = seq;
+		// Decode from the least significant bitpair backwards
+		for(int i = (len)-1; i >= 0; i--) {
+			s.set((uint32_t)(sq & 3), i);
+			sq >>= 2;
+		}
+	}
+	
+	/**
+	 * Return true iff the read substring is cacheable.
+	 */
+	bool cacheable() const { return len != 0xffffffff; }
+	
+	/**
+	 * Reset to uninitialized state.
+	 */
+	void reset() { seq = 0; len = 0xffffffff; }
+
+	/**
+	 * True -> my key is less than the given key.  Ordered by seq first,
+	 * then by len.
+	 */
+	bool operator<(const QKey& o) const {
+		return seq < o.seq || (seq == o.seq && len < o.len);
+	}
+
+	/**
+	 * True -> my key is greater than the given key.
+	 */
+	bool operator>(const QKey& o) const {
+		return !(*this < o || *this == o);
+	}
+
+	/**
+	 * True -> my key is equal to the given key.
+	 */
+	bool operator==(const QKey& o) const {
+		return seq == o.seq && len == o.len;
+	}
+
+
+	/**
+	 * True -> my key is not equal to the given key.
+	 */
+	bool operator!=(const QKey& o) const {
+		return !(*this == o);
+	}
+	
+#ifndef NDEBUG
+	/**
+	 * Check that this is a valid, initialized QKey.
+	 */
+	bool repOk() const {
+		return len != 0xffffffff;
+	}
+#endif
+
+	uint64_t seq; // sequence, packed 2 bits per base
+	uint32_t len; // length of sequence; 0xffffffff = invalid
+};
+
+template <typename index_t>
+class AlignmentCache;
+
+/**
+ * Payload for the query multimap: a range of elements in the reference
+ * string list.  The range is [i_, i_ + rangen_) in the qlist, with eltn_
+ * total SA elements across those ranges.  rangen_ == OFF_MASK marks an
+ * invalid/uninitialized QVal.
+ */
+template <typename index_t>
+class QVal {
+
+public:
+
+	QVal() { reset(); }
+
+	/**
+	 * Return the offset of the first reference substring in the qlist.
+	 */
+	index_t offset() const { return i_; }
+
+	/**
+	 * Return the number of reference substrings associated with a read
+	 * substring.
+	 */
+	index_t numRanges() const {
+		assert(valid());
+		return rangen_;
+	}
+
+	/**
+	 * Return the number of elements associated with all associated
+	 * reference substrings.
+	 */
+	index_t numElts() const {
+		assert(valid());
+		return eltn_;
+	}
+	
+	/**
+	 * Return true iff the read substring is not associated with any
+	 * reference substrings.
+	 */
+	bool empty() const {
+		assert(valid());
+		return numRanges() == 0;
+	}
+
+	/**
+	 * Return true iff the QVal is valid.
+	 */
+	bool valid() const { return rangen_ != (index_t)OFF_MASK; }
+	
+	/**
+	 * Reset to invalid state (rangen_/eltn_ set to the OFF_MASK sentinel).
+	 */
+	void reset() {
+		i_ = 0; rangen_ = eltn_ = (index_t)OFF_MASK;
+	}
+	
+	/**
+	 * Initialize Qval.
+	 */
+	void init(index_t i, index_t ranges, index_t elts) {
+		i_ = i; rangen_ = ranges; eltn_ = elts;
+	}
+	
+	/**
+	 * Tally another range with given number of elements.
+	 */
+	void addRange(index_t numElts) {
+		rangen_++;
+		eltn_ += numElts;
+	}
+	
+#ifndef NDEBUG
+	/**
+	 * Check that this QVal is internally consistent and consistent
+	 * with the contents of the given cache.
+	 */
+	bool repOk(const AlignmentCache<index_t>& ac) const;
+#endif
+
+protected:
+
+	index_t i_;      // idx of first elt in qlist
+	index_t rangen_; // # ranges (= # associated reference substrings); OFF_MASK = invalid
+	index_t eltn_;   // # elements (total)
+};
+
+/**
+ * Key for the suffix array multimap: the reference substring and its
+ * length.  Same as QKey so I typedef it.
+ */
+typedef QKey SAKey;
+
+/**
+ * Payload for the suffix array multimap: (a) the top element of the
+ * range in BWT, (b) the offset of the first elt in the salist, (c)
+ * length of the range.  len == OFF_MASK marks an invalid SAVal.
+ */
+template <typename index_t>
+struct SAVal {
+
+	SAVal() : topf(), topb(), i(), len(OFF_MASK) { }
+
+	/**
+	 * Return true iff the SAVal is valid.
+	 */
+	bool valid() { return len != (index_t)OFF_MASK; }
+
+#ifndef NDEBUG
+	/**
+	 * Check that this SAVal is internally consistent and consistent
+	 * with the contents of the given cache.
+	 */
+	bool repOk(const AlignmentCache<index_t>& ac) const;
+#endif
+	
+	/**
+	 * Initialize the SAVal with tops in both BWT indexes, the salist
+	 * offset, and the range length.
+	 */
+	void init(
+		index_t tf,
+		index_t tb,
+		index_t ii,
+		index_t ln)
+	{
+		topf = tf;
+		topb = tb;
+		i = ii;
+		len = ln;
+	}
+
+	index_t topf;  // top in BWT
+	index_t topb;  // top in BWT'
+	index_t i;     // idx of first elt in salist
+	index_t len;   // length of range; OFF_MASK = invalid
+};
+
+/**
+ * One data structure that encapsulates all of the cached information
+ * associated with a particular reference substring.  This is useful
+ * for summarizing what info should be added to the cache for a partial
+ * alignment.
+ */
+template <typename index_t>
+class SATuple {
+
+public:
+
+	SATuple() { reset(); };
+
+	SATuple(SAKey k, index_t tf, index_t tb, TSlice o) {
+		init(k, tf, tb, o);
+	}
+	
+	void init(SAKey k, index_t tf, index_t tb, TSlice o) {
+		key = k; topf = tf; topb = tb; offs = o;
+	}
+
+	/**
+	 * Initialize this SATuple from a subrange of the SATuple 'src'.
+	 * The BWT' top (topb) of a subrange cannot be derived from the
+	 * source, so it is deliberately set to the OFF_MASK sentinel.
+	 */
+	void init(const SATuple& src, index_t first, index_t last) {
+		assert_neq((index_t)OFF_MASK, src.topb);
+		key = src.key;
+		topf = (index_t)(src.topf + first);
+		topb = (index_t)OFF_MASK; // unknown!
+		offs.init(src.offs, first, last);
+	}
+	
+#ifndef NDEBUG
+	/**
+	 * Check that this SATuple is internally consistent and that its
+	 * PListSlice is consistent with its backing PList.
+	 */
+	bool repOk() const {
+		assert(offs.repOk());
+		return true;
+	}
+#endif
+
+	/**
+	 * Function for ordering SATuples.  This is used when prioritizing which to
+	 * explore first when extending seed hits into full alignments.  Smaller
+	 * ranges get higher priority and we use 'top' to break ties, though any
+	 * way of breaking a tie would be fine.
+	 */
+	bool operator<(const SATuple& o) const {
+		if(offs.size() < o.offs.size()) {
+			return true;
+		}
+		if(offs.size() > o.offs.size()) {
+			return false;
+		}
+		return topf < o.topf;
+	}
+	bool operator>(const SATuple& o) const {
+		if(offs.size() < o.offs.size()) {
+			return false;
+		}
+		if(offs.size() > o.offs.size()) {
+			return true;
+		}
+		return topf > o.topf;
+	}
+	
+	bool operator==(const SATuple& o) const {
+		return key == o.key && topf == o.topf && topb == o.topb && offs == o.offs;
+	}
+
+	void reset() { topf = topb = (index_t)OFF_MASK; offs.reset(); }
+	
+	/**
+	 * Set the length to be at most the original length.
+	 */
+	void setLength(index_t nlen) {
+		assert_leq(nlen, offs.size());
+		offs.setLength(nlen);
+	}
+	
+	/**
+	 * Return the number of times this reference substring occurs in the
+	 * reference, which is also the size of the 'offs' TSlice.
+	 */
+	index_t size() const { return (index_t)offs.size(); }
+
+	// bot/length of SA range equals offs.size()
+	SAKey   key;  // sequence key
+	index_t topf;  // top in BWT index
+	index_t topb;  // top in BWT' index; OFF_MASK when unknown (subrange init)
+	TSlice  offs; // offsets
+};
+
+/**
+ * Encapsulate the data structures and routines that constitute a
+ * particular cache, i.e., a particular stratum of the cache system,
+ * which might comprise many strata.
+ *
+ * Each thread has a "current-read" AlignmentCache which is used to
+ * build and store subproblem results as alignment is performed.  When
+ * we're finished with a read, we might copy the cached results for
+ * that read (and perhaps a bundle of other recently-aligned reads) to
+ * a higher-level "across-read" cache.  Higher-level caches may or may
+ * not be shared among threads.
+ *
+ * A cache consists chiefly of two multimaps, each implemented as a
+ * Red-Black tree map backed by an EList.  A 'version' counter is
+ * incremented every time the cache is cleared.
+ */
+template <typename index_t>
+class AlignmentCache {
+
+	typedef RedBlackNode<QKey,  QVal<index_t> >  QNode;
+	typedef RedBlackNode<SAKey, SAVal<index_t> > SANode;
+
+	typedef PList<SAKey, CACHE_PAGE_SZ> TQList;
+	typedef PList<index_t, CACHE_PAGE_SZ> TSAList;
+
+public:
+
+	AlignmentCache(
+		uint64_t bytes,
+		bool shared) :
+		pool_(bytes, CACHE_PAGE_SZ, CA_CAT),
+		qmap_(CACHE_PAGE_SZ, CA_CAT),
+		qlist_(CA_CAT),
+		samap_(CACHE_PAGE_SZ, CA_CAT),
+		salist_(CA_CAT),
+		shared_(shared),
+        mutex_m(),
+		version_(0)
+	{
+	}
+
+	/**
+	 * Given a QVal, populate the given EList of SATuples with records
+	 * describing all of the cached information about the QVal's
+	 * reference substrings.
+	 *
+	 * NOTE: nrange/nelt are accumulated into without being reset here;
+	 * the caller is responsible for initializing them.  The mutex is
+	 * taken only when the cache is shared and getLock is true.
+	 */
+	template <int S>
+	void queryQval(
+		const QVal<index_t>& qv,
+		EList<SATuple<index_t>, S>& satups,
+		index_t& nrange,
+		index_t& nelt,
+		bool getLock = true)
+	{
+        ThreadSafe ts(lockPtr(), shared_ && getLock);
+		assert(qv.repOk(*this));
+		const index_t refi = qv.offset();
+		const index_t reff = refi + qv.numRanges();
+		// For each reference sequence sufficiently similar to the
+		// query sequence in the QKey...
+		for(index_t i = refi; i < reff; i++) {
+			// Get corresponding SAKey, containing similar reference
+			// sequence & length
+			SAKey sak = qlist_.get(i);
+			// Shouldn't have identical keys in qlist_
+			assert(i == refi || qlist_.get(i) != qlist_.get(i-1));
+			// Get corresponding SANode
+			SANode *n = samap_.lookup(sak);
+			assert(n != NULL);
+			const SAVal<index_t>& sav = n->payload;
+			assert(sav.repOk(*this));
+			if(sav.len > 0) {
+				nrange++;
+				satups.expand();
+				satups.back().init(sak, sav.topf, sav.topb, TSlice(salist_, sav.i, sav.len));
+				nelt += sav.len;
+#ifndef NDEBUG
+				// Shouldn't add consecutive identical entries to satups
+				if(i > refi) {
+					const SATuple<index_t> b1 = satups.back();
+					const SATuple<index_t> b2 = satups[satups.size()-2];
+					assert(b1.key != b2.key || b1.topf != b2.topf || b1.offs != b2.offs);
+				}
+#endif
+			}
+		}
+	}
+
+	/**
+	 * Return true iff the cache has no entries in it.
+	 */
+	bool empty() const {
+		bool ret = qmap_.empty();
+		assert(!ret || qlist_.empty());
+		assert(!ret || samap_.empty());
+		assert(!ret || salist_.empty());
+		return ret;
+	}
+	
+	/**
+	 * Add a new query key ('qk'), usually a 2-bit encoded substring of
+	 * the read) as the key in a new Red-Black node in the qmap and
+	 * return a pointer to the node's QVal.  Returns NULL if pool
+	 * memory was exhausted.
+	 *
+	 * The expectation is that the caller is about to set about finding
+	 * associated reference substrings, and that there will be future
+	 * calls to addOnTheFly to add associations to reference substrings
+	 * found.
+	 */
+	QVal<index_t>* add(
+		const QKey& qk,
+		bool *added,
+		bool getLock = true)
+	{
+        ThreadSafe ts(lockPtr(), shared_ && getLock);
+		assert(qk.cacheable());
+		QNode *n = qmap_.add(pool(), qk, added);
+		return (n != NULL ? &n->payload : NULL);
+	}
+
+	/**
+	 * Add a new association between a read sequence and a reference
+	 * sequence (defined out-of-line below).
+	 */
+	bool addOnTheFly(
+		QVal<index_t>& qv, // qval that points to the range of reference substrings
+		const SAKey& sak,  // the key holding the reference substring
+		index_t topf,      // top range elt in BWT index
+		index_t botf,      // bottom range elt in BWT index
+		index_t topb,      // top range elt in BWT' index
+		index_t botb,      // bottom range elt in BWT' index
+		bool getLock = true);
+
+	/**
+	 * Clear the cache, i.e. turn it over.  All HitGens referring to
+	 * ranges in this cache will become invalid and the corresponding
+	 * reads will have to be re-aligned.
+	 */
+	void clear(bool getLock = true) {
+        ThreadSafe ts(lockPtr(), shared_ && getLock);
+		pool_.clear();
+		qmap_.clear();
+		qlist_.clear();
+		samap_.clear();
+		salist_.clear();
+		version_++; // invalidates anything pointing into the old contents
+	}
+
+	/**
+	 * Return the number of keys in the query multimap.
+	 */
+	index_t qNumKeys() const { return (index_t)qmap_.size(); }
+
+	/**
+	 * Return the number of keys in the suffix array multimap.
+	 */
+	index_t saNumKeys() const { return (index_t)samap_.size(); }
+
+	/**
+	 * Return the number of elements in the reference substring list.
+	 */
+	index_t qSize() const { return (index_t)qlist_.size(); }
+
+	/**
+	 * Return the number of elements in the SA range list.
+	 */
+	index_t saSize() const { return (index_t)salist_.size(); }
+
+	/**
+	 * Return the pool.
+	 */
+	Pool& pool() { return pool_; }
+	
+	/**
+	 * Return the lock object.
+	 */
+	MUTEX_T& lock() {
+	    return mutex_m;
+	}
+
+	/**
+	 * Return a const pointer to the lock object.  This allows us to
+	 * write const member functions that grab the lock.
+	 */
+	MUTEX_T* lockPtr() const {
+	    return const_cast<MUTEX_T*>(&mutex_m);
+	}
+	
+	/**
+	 * Return true iff this cache is shared among threads.
+	 */
+	bool shared() const { return shared_; }
+	
+	/**
+	 * Return the current "version" of the cache, i.e. the total number
+	 * of times it has turned over since its creation.
+	 */
+	uint32_t version() const { return version_; }
+
+protected:
+
+	Pool                   pool_;   // dispenses memory pages
+	RedBlack<QKey, QVal<index_t> >   qmap_;   // map from query substrings to reference substrings
+	TQList                 qlist_;  // list of reference substrings
+	RedBlack<SAKey, SAVal<index_t> > samap_;  // map from reference substrings to SA ranges
+	TSAList                salist_; // list of SA ranges
+	
+	bool     shared_;  // true -> this cache is global
+	MUTEX_T mutex_m;    // mutex used for synchronization in case the cache is shared
+	uint32_t version_; // cache version
+};
+
+/**
+ * Interface used to query and update a pair of caches: one thread-
+ * local and unsynchronized, another shared and synchronized.  One or
+ * both can be NULL.
+ */
+template <typename index_t>
+class AlignmentCacheIface {
+
+public:
+
+	AlignmentCacheIface(
+		AlignmentCache<index_t> *current,
+		AlignmentCache<index_t> *local,
+		AlignmentCache<index_t> *shared) :
+		qk_(),
+		qv_(NULL),
+		cacheable_(false),
+		rangen_(0),
+		eltsn_(0),
+		current_(current),
+		local_(local),
+		shared_(shared)
+	{
+		assert(current_ != NULL);
+	}
+
+#if 0
+	/**
+	 * Query the relevant set of caches, looking for a QVal to go with
+	 * the provided QKey.  If the QVal is found in a cache other than
+	 * the current-read cache, it is copied into the current-read cache
+	 * first and the QVal pointer for the current-read cache is
+	 * returned.  This function never returns a pointer from any cache
+	 * other than the current-read cache.  If the QVal could not be
+	 * found in any cache OR if the QVal was found in a cache other
+	 * than the current-read cache but could not be copied into the
+	 * current-read cache, NULL is returned.
+	 */
+	QVal* queryCopy(const QKey& qk, bool getLock = true) {
+		assert(qk.cacheable());
+		AlignmentCache* caches[3] = { current_, local_, shared_ };
+		for(int i = 0; i < 3; i++) {
+			if(caches[i] == NULL) continue;
+			QVal* qv = caches[i]->query(qk, getLock);
+			if(qv != NULL) {
+				if(i == 0) return qv;
+				if(!current_->copy(qk, *qv, *caches[i], getLock)) {
+					// Exhausted memory in the current cache while
+					// attempting to copy in the qk
+					return NULL;
+				}
+				QVal* curqv = current_->query(qk, getLock);
+				assert(curqv != NULL);
+				return curqv;
+			}
+		}
+		return NULL;
+	}
+
+	/**
+	 * Query the relevant set of caches, looking for a QVal to go with
+	 * the provided QKey.  If a QVal is found and which is non-NULL,
+	 * *which is set to 0 if the qval was found in the current-read
+	 * cache, 1 if it was found in the local across-read cache, and 2
+	 * if it was found in the shared across-read cache.
+	 */
+	inline QVal* query(
+		const QKey& qk,
+		AlignmentCache** which,
+		bool getLock = true)
+	{
+		assert(qk.cacheable());
+		AlignmentCache* caches[3] = { current_, local_, shared_ };
+		for(int i = 0; i < 3; i++) {
+			if(caches[i] == NULL) continue;
+			QVal* qv = caches[i]->query(qk, getLock);
+			if(qv != NULL) {
+				if(which != NULL) *which = caches[i];
+				return qv;
+			}
+		}
+		return NULL;
+	}
+#endif
+
+	/**
+	 * This function is called whenever we start to align a new read or
+	 * read substring.  We make key for it and store the key in qk_.
+	 * If the sequence is uncacheable, we don't actually add it to the
+	 * map but the corresponding reference substrings are still added
+	 * to the qlist_.
+	 *
+	 * Returns:
+	 *  -1 if out of memory
+	 *  0 if key was found in cache
+	 *  1 if key was not found in cache (and there's enough memory to
+	 *    add a new key)
+	 *
+	 * NOTE(review): the cache-query fast path is commented out below,
+	 * so in this version the function only ever returns 0 (search
+	 * needed) or -1 (out of memory); the documented return of 1 is
+	 * currently unreachable.
+	 */
+	int beginAlign(
+		const BTDnaString& seq,
+		const BTString& qual,
+		QVal<index_t>& qv,              // out: filled in if we find it in the cache
+		bool getLock = true)
+	{
+		assert(repOk());
+		qk_.init(seq ASSERT_ONLY(, tmpdnastr_));
+		//if(qk_.cacheable() && (qv_ = current_->query(qk_, getLock)) != NULL) {
+		//	// qv_ holds the answer
+		//	assert(qv_->valid());
+		//	qv = *qv_;
+		//	resetRead();
+		//	return 1; // found in cache
+		//} else
+		if(qk_.cacheable()) {
+			// Make a QNode for this key and possibly add the QNode to the
+			// Red-Black map; but if 'seq' isn't cacheable, just create the
+			// QNode (without adding it to the map).
+			qv_ = current_->add(qk_, &cacheable_, getLock);
+		} else {
+			// Uncacheable key: use the local buffer as scratch QVal
+			qv_ = &qvbuf_;
+		}
+		if(qv_ == NULL) {
+			resetRead();
+ 			return -1; // Not in memory
+		}
+		qv_->reset();
+		return 0; // Need to search for it
+	}
+	ASSERT_ONLY(BTDnaString tmpdnastr_);
+	
+	/**
+	 * Called when the caller is finished aligning a read (and so is
+	 * finished adding associated reference strings).  Returns a copy
+	 * of the final QVal object and resets the alignment state of the
+	 * current-read cache.
+	 *
+	 * Also, if the alignment is cacheable, it commits it to the next
+	 * cache up in the cache hierarchy.  (NOTE(review): the commit is
+	 * currently disabled via #if 0.)
+	 */
+	QVal<index_t> finishAlign(bool getLock = true) {
+		if(!qv_->valid()) {
+			qv_->init(0, 0, 0);
+		}
+		// Copy this pointer because we're about to reset the qv_ field
+		// to NULL
+		QVal<index_t>* qv = qv_;
+		// Commit the contents of the current-read cache to the next
+		// cache up in the hierarchy.
+		// If qk is cacheable, then it must be in the cache
+#if 0
+		if(qk_.cacheable()) {
+			AlignmentCache* caches[3] = { current_, local_, shared_ };
+			ASSERT_ONLY(AlignmentCache* which);
+			ASSERT_ONLY(QVal* qv2 = query(qk_, &which, true));
+			assert(qv2 == qv);
+			assert(which == current_);
+			for(int i = 1; i < 3; i++) {
+				if(caches[i] != NULL) {
+					// Copy this key/value pair to the to the higher
+					// level cache and, if its memory is exhausted,
+					// clear the cache and try again.
+					caches[i]->clearCopy(qk_, *qv_, *current_, getLock);
+					break;
+				}
+			}
+		}
+#endif
+		// Reset the state in this iface in preparation for the next
+		// alignment.
+		resetRead();
+		assert(repOk());
+		return *qv;
+	}
+
+	/**
+	 * A call to this member indicates that the caller has finished
+	 * with the last read (if any) and is ready to work on the next.
+	 * This gives the cache a chance to reset some of its state if
+	 * necessary.
+	 */
+	void nextRead() {
+		current_->clear();
+		resetRead();
+		assert(!aligning());
+	}
+	
+	/**
+	 * Return true iff we're in the middle of aligning a sequence.
+	 */
+	bool aligning() const {
+		return qv_ != NULL;
+	}
+	
+	/**
+	 * Clears both the local and shared caches.
+	 */
+	void clear() {
+		if(current_ != NULL) current_->clear();
+		if(local_   != NULL) local_->clear();
+		if(shared_  != NULL) shared_->clear();
+	}
+	
+	/**
+	 * Add an alignment to the running list of alignments being
+	 * compiled for the current read in the local cache.  Updates the
+	 * per-read range/element tallies on success.
+	 */
+	bool addOnTheFly(
+		const BTDnaString& rfseq, // reference sequence close to read seq
+		index_t topf,            // top in BWT index
+		index_t botf,            // bot in BWT index
+		index_t topb,            // top in BWT' index
+		index_t botb,            // bot in BWT' index
+		bool getLock = true)      // true -> lock is not held by caller
+	{
+		
+		assert(aligning());
+		assert(repOk());
+		ASSERT_ONLY(BTDnaString tmp);
+		SAKey sak(rfseq ASSERT_ONLY(, tmp));
+		//assert(sak.cacheable());
+		if(current_->addOnTheFly((*qv_), sak, topf, botf, topb, botb, getLock)) {
+			rangen_++;
+			eltsn_ += (botf-topf);
+			return true;
+		}
+		return false;
+	}
+
+	/**
+	 * Given a QVal, populate the given EList of SATuples with records
+	 * describing all of the cached information about the QVal's
+	 * reference substrings.  Delegates to the current-read cache.
+	 */
+	template<int S>
+	void queryQval(
+		const QVal<index_t>& qv,
+		EList<SATuple<index_t>, S>& satups,
+		index_t& nrange,
+		index_t& nelt,
+		bool getLock = true)
+	{
+		current_->queryQval(qv, satups, nrange, nelt, getLock);
+	}
+
+	/**
+	 * Return a pointer to the current-read cache object.
+	 */
+	const AlignmentCache<index_t>* currentCache() const { return current_; }
+	
+	index_t curNumRanges() const { return rangen_; }
+	index_t curNumElts()   const { return eltsn_;  }
+	
+#ifndef NDEBUG
+	/**
+	 * Check that AlignmentCacheIface is internally consistent.
+	 */
+	bool repOk() const {
+		assert(current_ != NULL);
+		assert_geq(eltsn_, rangen_);
+		if(qv_ == NULL) {
+			assert_eq(0, rangen_);
+			assert_eq(0, eltsn_);
+		}
+		return true;
+	}
+#endif
+	
+	/**
+	 * Return the alignment cache for the current read.
+	 */
+	const AlignmentCache<index_t>& current() {
+		return *current_;
+	}
+
+protected:
+
+	/**
+	 * Reset fields encoding info about the in-process read.
+	 */
+	void resetRead() {
+		cacheable_ = false;
+		rangen_ = eltsn_ = 0;
+		qv_ = NULL;
+	}
+
+	QKey qk_;  // key representation for current read substring
+	QVal<index_t> *qv_; // pointer to value representation for current read substring
+	QVal<index_t> qvbuf_; // buffer for when key is uncacheable but we need a qv
+	bool cacheable_; // true iff the read substring currently being aligned is cacheable
+	
+	index_t rangen_; // number of ranges since last alignment job began
+	index_t eltsn_;  // number of elements since last alignment job began
+
+	AlignmentCache<index_t> *current_; // cache dedicated to the current read
+	AlignmentCache<index_t> *local_;   // local, unsynchronized cache
+	AlignmentCache<index_t> *shared_;  // shared, synchronized cache
+};
+
+#ifndef NDEBUG
+/**
+ * Check that this QVal is internally consistent and consistent
+ * with the contents of the given cache: its qlist range must lie
+ * within the cache's qlist and the element count must be at least
+ * the range count.
+ */
+template <typename index_t>
+bool QVal<index_t>::repOk(const AlignmentCache<index_t>& ac) const {
+	if(rangen_ > 0) {
+		assert_lt(i_, ac.qSize());
+		assert_leq(i_ + rangen_, ac.qSize());
+	}
+	assert_geq(eltn_, rangen_);
+	return true;
+}
+#endif
+
+#ifndef NDEBUG
+/**
+ * Check that this SAVal is internally consistent and consistent
+ * with the contents of the given cache: its salist range must lie
+ * within the cache's salist.
+ */
+template <typename index_t>
+bool SAVal<index_t>::repOk(const AlignmentCache<index_t>& ac) const {
+	assert(len == 0 || i < ac.saSize());
+	assert_leq(i + len, ac.saSize());
+	return true;
+}
+#endif
+
+/**
+ * Add a new association between a read sequence and a reference
+ * sequence: append the SAKey 'sak' to the qlist (tallied in 'qv') and,
+ * if 'sak' is new to the samap, initialize its SAVal and reserve
+ * botf-topf placeholder slots in the salist.  Returns false if pool
+ * memory was exhausted at any step.
+ */
+template <typename index_t>
+bool AlignmentCache<index_t>::addOnTheFly(
+								 QVal<index_t>& qv, // qval that points to the range of reference substrings
+								 const SAKey& sak,  // the key holding the reference substring
+								 index_t topf,      // top range elt in BWT index
+								 index_t botf,      // bottom range elt in BWT index
+								 index_t topb,      // top range elt in BWT' index
+								 index_t botb,      // bottom range elt in BWT' index
+								 bool getLock)
+{
+    ThreadSafe ts(lockPtr(), shared_ && getLock);
+	bool added = true;
+	// If this is the first reference sequence we're associating with
+	// the query sequence, initialize the QVal.
+	if(!qv.valid()) {
+		qv.init((index_t)qlist_.size(), 0, 0);
+	}
+	// NOTE(review): the tally is updated before the allocations below; if
+	// any allocation fails we return false with qv already incremented —
+	// confirm callers discard the QVal on failure.
+	qv.addRange(botf-topf); // update tally for # ranges and # elts
+	if(!qlist_.add(pool(), sak)) {
+		return false; // Exhausted pool memory
+	}
+#ifndef NDEBUG
+	// No two consecutive qlist entries for this QVal should be identical
+	for(index_t i = qv.offset(); i < qlist_.size(); i++) {
+		if(i > qv.offset()) {
+			assert(qlist_.get(i) != qlist_.get(i-1));
+		}
+	}
+#endif
+	assert_eq(qv.offset() + qv.numRanges(), qlist_.size());
+	SANode *s = samap_.add(pool(), sak, &added);
+	if(s == NULL) {
+		return false; // Exhausted pool memory
+	}
+	assert(s->key.repOk());
+	if(added) {
+		// First time we've seen this reference substring: set up its SAVal
+		s->payload.i = (index_t)salist_.size();
+		s->payload.len = botf - topf;
+		s->payload.topf = topf;
+		s->payload.topb = topb;
+		// Reserve one salist slot per range element; 0xffffffff appears to
+		// be a placeholder filled in later — TODO confirm
+		for(size_t j = 0; j < (botf-topf); j++) {
+			if(!salist_.add(pool(), (index_t)0xffffffff)) {
+				// Change the payload's len field
+				s->payload.len = (uint32_t)j;
+				return false; // Exhausted pool memory
+			}
+		}
+		assert(s->payload.repOk(*this));
+	}
+	// All allocations succeeded (NOTE(review): the QVal/SAVal updates
+	// actually happened above, before the allocations — see note there)
+	
+	return true; 
+}
+
+#endif /*ALIGNER_CACHE_H_*/
diff --git a/aligner_metrics.h b/aligner_metrics.h
new file mode 100644
index 0000000..c0b0182
--- /dev/null
+++ b/aligner_metrics.h
@@ -0,0 +1,352 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef ALIGNER_METRICS_H_
+#define ALIGNER_METRICS_H_
+
+#include <math.h>
+#include <iostream>
+#include "alphabet.h"
+#include "timer.h"
+#include "sstring.h"
+
+using namespace std;
+
/**
 * Running mean/variance accumulator (Welford's recurrence).
 *
 * Borrowed from http://www.johndcook.com/standard_deviation.html,
 * which in turn is borrowed from Knuth, TAOCP vol 2, 3rd ed., p. 232.
 */
class RunningStat {
public:
	// Fix: previously m_oldM/m_newM/m_oldS/m_newS were left
	// uninitialized until the first push().
	RunningStat() :
		m_n(0), m_tot(0.0),
		m_oldM(0.0), m_newM(0.0), m_oldS(0.0), m_newS(0.0) { }

	/**
	 * Discard all observations, returning to the freshly-constructed
	 * state.  (Also resets the recurrence state, previously left stale.)
	 */
	void clear() {
		m_n = 0;
		m_tot = 0.0;
		m_oldM = m_newM = 0.0;
		m_oldS = m_newS = 0.0;
	}

	/**
	 * Fold one observation into the running statistics.
	 */
	void push(float x) {
		m_n++;
		m_tot += x;
		// See Knuth TAOCP vol 2, 3rd edition, page 232
		if (m_n == 1) {
			m_oldM = m_newM = x;
			m_oldS = m_newS = 0.0; // also zero m_newS so variance state is defined
		} else {
			m_newM = m_oldM + (x - m_oldM)/m_n;
			m_newS = m_oldS + (x - m_oldM)*(x - m_newM);
			// set up for next iteration
			m_oldM = m_newM;
			m_oldS = m_newS;
		}
	}

	// Number of observations pushed so far
	int num() const {
		return m_n;
	}

	// Sum of all observations
	double tot() const {
		return m_tot;
	}

	// Sample mean; 0 if no observations yet
	double mean() const {
		return (m_n > 0) ? m_newM : 0.0;
	}

	// Unbiased sample variance; 0 with fewer than two observations
	double variance() const {
		return ( (m_n > 1) ? m_newS/(m_n - 1) : 0.0 );
	}

	// Sample standard deviation
	double stddev() const {
		return sqrt(variance());
	}

private:
	int m_n;      // # observations
	double m_tot; // running sum of observations
	double m_oldM, m_newM, m_oldS, m_newS; // Welford recurrence state
};
+
+/**
+ * Encapsulates a set of metrics that we would like an aligner to keep
+ * track of, so that we can possibly use it to diagnose performance
+ * issues.
+ */
+class AlignerMetrics {
+
+public:
+
+	// Zero every counter/distribution; 'timer_' begins timing at
+	// construction (used for the per-second rates in printSummary()).
+	AlignerMetrics() :
+		curBacktracks_(0),
+		curBwtOps_(0),
+		first_(true),
+		curIsLowEntropy_(false),
+		curIsHomoPoly_(false),
+		curHadRanges_(false),
+		curNumNs_(0),
+		reads_(0),
+		homoReads_(0),
+		lowEntReads_(0),
+		hiEntReads_(0),
+		alignedReads_(0),
+		unalignedReads_(0),
+		threeOrMoreNReads_(0),
+		lessThanThreeNRreads_(0),
+		bwtOpsPerRead_(),
+		backtracksPerRead_(),
+		bwtOpsPerHomoRead_(),
+		backtracksPerHomoRead_(),
+		bwtOpsPerLoEntRead_(),
+		backtracksPerLoEntRead_(),
+		bwtOpsPerHiEntRead_(),
+		backtracksPerHiEntRead_(),
+		bwtOpsPerAlignedRead_(),
+		backtracksPerAlignedRead_(),
+		bwtOpsPerUnalignedRead_(),
+		backtracksPerUnalignedRead_(),
+		bwtOpsPer0nRead_(),
+		backtracksPer0nRead_(),
+		bwtOpsPer1nRead_(),
+		backtracksPer1nRead_(),
+		bwtOpsPer2nRead_(),
+		backtracksPer2nRead_(),
+		bwtOpsPer3orMoreNRead_(),
+		backtracksPer3orMoreNRead_(),
+		timer_(cout, "", false)
+		{ }
+
+	// Print a human-readable summary of all collected metrics to cout,
+	// first committing the in-progress read (if any).
+	void printSummary() {
+		if(!first_) {
+			finishRead();
+		}
+		cout << "AlignerMetrics:" << endl;
+		cout << "  # Reads:             " << reads_ << endl;
+		float hopct = (reads_ > 0) ? (((float)homoReads_)/((float)reads_)) : (0.0f);
+		hopct *= 100.0f;
+		cout << "  % homo-polymeric:    " << (hopct) << endl;
+		float lopct = (reads_ > 0) ? ((float)lowEntReads_/(float)(reads_)) : (0.0f);
+		lopct *= 100.0f;
+		cout << "  % low-entropy:       " << (lopct) << endl;
+		float unpct = (reads_ > 0) ? ((float)unalignedReads_/(float)(reads_)) : (0.0f);
+		unpct *= 100.0f;
+		cout << "  % unaligned:         " << (unpct) << endl;
+		float npct = (reads_ > 0) ? ((float)threeOrMoreNReads_/(float)(reads_)) : (0.0f);
+		npct *= 100.0f;
+		cout << "  % with 3 or more Ns: " << (npct) << endl;
+		cout << endl;
+		cout << "  Total BWT ops:    avg: " << bwtOpsPerRead_.mean() << ", stddev: " << bwtOpsPerRead_.stddev() << endl;
+		cout << "  Total Backtracks: avg: " << backtracksPerRead_.mean() << ", stddev: " << backtracksPerRead_.stddev() << endl;
+		// NOTE(review): if under one second has elapsed, 'elapsed' may be
+		// 0 and the two rate lines below divide by zero -- confirm this
+		// is acceptable for a diagnostics-only path.
+		time_t elapsed = timer_.elapsed();
+		cout << "  BWT ops per second:    " << (bwtOpsPerRead_.tot()/elapsed) << endl;
+		cout << "  Backtracks per second: " << (backtracksPerRead_.tot()/elapsed) << endl;
+		cout << endl;
+		cout << "  Homo-poly:" << endl;
+		cout << "    BWT ops:    avg: " << bwtOpsPerHomoRead_.mean() << ", stddev: " << bwtOpsPerHomoRead_.stddev() << endl;
+		cout << "    Backtracks: avg: " << backtracksPerHomoRead_.mean() << ", stddev: " << backtracksPerHomoRead_.stddev() << endl;
+		cout << "  Low-entropy:" << endl;
+		cout << "    BWT ops:    avg: " << bwtOpsPerLoEntRead_.mean() << ", stddev: " << bwtOpsPerLoEntRead_.stddev() << endl;
+		cout << "    Backtracks: avg: " << backtracksPerLoEntRead_.mean() << ", stddev: " << backtracksPerLoEntRead_.stddev() << endl;
+		cout << "  High-entropy:" << endl;
+		cout << "    BWT ops:    avg: " << bwtOpsPerHiEntRead_.mean() << ", stddev: " << bwtOpsPerHiEntRead_.stddev() << endl;
+		cout << "    Backtracks: avg: " << backtracksPerHiEntRead_.mean() << ", stddev: " << backtracksPerHiEntRead_.stddev() << endl;
+		cout << endl;
+		cout << "  Unaligned:" << endl;
+		cout << "    BWT ops:    avg: " << bwtOpsPerUnalignedRead_.mean() << ", stddev: " << bwtOpsPerUnalignedRead_.stddev() << endl;
+		cout << "    Backtracks: avg: " << backtracksPerUnalignedRead_.mean() << ", stddev: " << backtracksPerUnalignedRead_.stddev() << endl;
+		cout << "  Aligned:" << endl;
+		cout << "    BWT ops:    avg: " << bwtOpsPerAlignedRead_.mean() << ", stddev: " << bwtOpsPerAlignedRead_.stddev() << endl;
+		cout << "    Backtracks: avg: " << backtracksPerAlignedRead_.mean() << ", stddev: " << backtracksPerAlignedRead_.stddev() << endl;
+		cout << endl;
+		cout << "  0 Ns:" << endl;
+		cout << "    BWT ops:    avg: " << bwtOpsPer0nRead_.mean() << ", stddev: " << bwtOpsPer0nRead_.stddev() << endl;
+		cout << "    Backtracks: avg: " << backtracksPer0nRead_.mean() << ", stddev: " << backtracksPer0nRead_.stddev() << endl;
+		cout << "  1 N:" << endl;
+		cout << "    BWT ops:    avg: " << bwtOpsPer1nRead_.mean() << ", stddev: " << bwtOpsPer1nRead_.stddev() << endl;
+		cout << "    Backtracks: avg: " << backtracksPer1nRead_.mean() << ", stddev: " << backtracksPer1nRead_.stddev() << endl;
+		cout << "  2 Ns:" << endl;
+		cout << "    BWT ops:    avg: " << bwtOpsPer2nRead_.mean() << ", stddev: " << bwtOpsPer2nRead_.stddev() << endl;
+		cout << "    Backtracks: avg: " << backtracksPer2nRead_.mean() << ", stddev: " << backtracksPer2nRead_.stddev() << endl;
+		cout << "  >2 Ns:" << endl;
+		cout << "    BWT ops:    avg: " << bwtOpsPer3orMoreNRead_.mean() << ", stddev: " << bwtOpsPer3orMoreNRead_.stddev() << endl;
+		cout << "    Backtracks: avg: " << backtracksPer3orMoreNRead_.mean() << ", stddev: " << backtracksPer3orMoreNRead_.stddev() << endl;
+		cout << endl;
+	}
+
+	/**
+	 * Begin tracking a new read: commit the previous read's stats (if
+	 * any), classify the read's entropy, and count its Ns.
+	 */
+	void nextRead(const BTDnaString& read) {
+		if(!first_) {
+			finishRead();
+		}
+		first_ = false;
+		//float ent = entropyDna5(read);
+		float ent = 0.0f;
+		// NOTE(review): the entropy calculation above is disabled (ent is
+		// fixed at 0.0), so every read currently classifies as
+		// homo-polymeric below -- confirm this is intended.
+		curIsLowEntropy_ = (ent < 0.75f);
+		curIsHomoPoly_ = (ent < 0.001f);
+		curHadRanges_ = false;
+		curBwtOps_ = 0;
+		curBacktracks_ = 0;
+		// Count Ns
+		curNumNs_ = 0;
+		const size_t len = read.length();
+		for(size_t i = 0; i < len; i++) {
+			if((int)read[i] == 4) curNumNs_++;
+		}
+	}
+
+	/**
+	 * Note that at least one range has been reported for the current
+	 * read; it will be tallied as "aligned" in finishRead().
+	 */
+	void setReadHasRange() {
+		curHadRanges_ = true;
+	}
+
+	/**
+	 * Commit the running statistics for this read to the appropriate
+	 * per-category distributions (overall, by entropy class, by
+	 * aligned/unaligned, and by N count).
+	 */
+	void finishRead() {
+		reads_++;
+		if(curIsHomoPoly_) homoReads_++;
+		else if(curIsLowEntropy_) lowEntReads_++;
+		else hiEntReads_++;
+		if(curHadRanges_) alignedReads_++;
+		else unalignedReads_++;
+		bwtOpsPerRead_.push((float)curBwtOps_);
+		backtracksPerRead_.push((float)curBacktracks_);
+		// Drill down by entropy
+		if(curIsHomoPoly_) {
+			bwtOpsPerHomoRead_.push((float)curBwtOps_);
+			backtracksPerHomoRead_.push((float)curBacktracks_);
+		} else if(curIsLowEntropy_) {
+			bwtOpsPerLoEntRead_.push((float)curBwtOps_);
+			backtracksPerLoEntRead_.push((float)curBacktracks_);
+		} else {
+			bwtOpsPerHiEntRead_.push((float)curBwtOps_);
+			backtracksPerHiEntRead_.push((float)curBacktracks_);
+		}
+		// Drill down by whether it aligned
+		if(curHadRanges_) {
+			bwtOpsPerAlignedRead_.push((float)curBwtOps_);
+			backtracksPerAlignedRead_.push((float)curBacktracks_);
+		} else {
+			bwtOpsPerUnalignedRead_.push((float)curBwtOps_);
+			backtracksPerUnalignedRead_.push((float)curBacktracks_);
+		}
+		if(curNumNs_ == 0) {
+			lessThanThreeNRreads_++;
+			bwtOpsPer0nRead_.push((float)curBwtOps_);
+			backtracksPer0nRead_.push((float)curBacktracks_);
+		} else if(curNumNs_ == 1) {
+			lessThanThreeNRreads_++;
+			bwtOpsPer1nRead_.push((float)curBwtOps_);
+			backtracksPer1nRead_.push((float)curBacktracks_);
+		} else if(curNumNs_ == 2) {
+			lessThanThreeNRreads_++;
+			bwtOpsPer2nRead_.push((float)curBwtOps_);
+			backtracksPer2nRead_.push((float)curBacktracks_);
+		} else {
+			threeOrMoreNReads_++;
+			bwtOpsPer3orMoreNRead_.push((float)curBwtOps_);
+			backtracksPer3orMoreNRead_.push((float)curBacktracks_);
+		}
+	}
+
+	// Running-total of the number of backtracks and BWT ops for the
+	// current read
+	uint32_t curBacktracks_;
+	uint32_t curBwtOps_;
+
+protected:
+
+	// true until the first nextRead() call
+	bool first_;
+
+	// true iff the current read is low entropy
+	bool curIsLowEntropy_;
+	// true if current read is all 1 char (or very close)
+	bool curIsHomoPoly_;
+	// true iff the current read has had one or more ranges reported
+	bool curHadRanges_;
+	// number of Ns in current read
+	int curNumNs_;
+
+	// # reads
+	uint32_t reads_;
+	// # homo-poly reads
+	uint32_t homoReads_;
+	// # low-entropy reads
+	uint32_t lowEntReads_;
+	// # high-entropy reads
+	uint32_t hiEntReads_;
+	// # reads with alignments
+	uint32_t alignedReads_;
+	// # reads without alignments
+	uint32_t unalignedReads_;
+	// # reads with 3 or more Ns
+	uint32_t threeOrMoreNReads_;
+	// # reads with < 3 Ns  (name typo "NRreads" kept for compatibility)
+	uint32_t lessThanThreeNRreads_;
+
+	// Distribution of BWT operations per read
+	RunningStat bwtOpsPerRead_;
+	RunningStat backtracksPerRead_;
+
+	// Distribution of BWT operations per homo-poly read
+	RunningStat bwtOpsPerHomoRead_;
+	RunningStat backtracksPerHomoRead_;
+
+	// Distribution of BWT operations per low-entropy read
+	RunningStat bwtOpsPerLoEntRead_;
+	RunningStat backtracksPerLoEntRead_;
+
+	// Distribution of BWT operations per high-entropy read
+	RunningStat bwtOpsPerHiEntRead_;
+	RunningStat backtracksPerHiEntRead_;
+
+	// Distribution of BWT operations per read that "aligned" (for
+	// which a range was arrived at - range may not have necessarily
+	// lead to an alignment)
+	RunningStat bwtOpsPerAlignedRead_;
+	RunningStat backtracksPerAlignedRead_;
+
+	// Distribution of BWT operations per read that didn't align
+	RunningStat bwtOpsPerUnalignedRead_;
+	RunningStat backtracksPerUnalignedRead_;
+
+	// Distribution of BWT operations/backtracks per read with no Ns
+	RunningStat bwtOpsPer0nRead_;
+	RunningStat backtracksPer0nRead_;
+
+	// Distribution of BWT operations/backtracks per read with one N
+	RunningStat bwtOpsPer1nRead_;
+	RunningStat backtracksPer1nRead_;
+
+	// Distribution of BWT operations/backtracks per read with two Ns
+	RunningStat bwtOpsPer2nRead_;
+	RunningStat backtracksPer2nRead_;
+
+	// Distribution of BWT operations/backtracks per read with three or
+	// more Ns
+	RunningStat bwtOpsPer3orMoreNRead_;
+	RunningStat backtracksPer3orMoreNRead_;
+
+	// Wall-clock timer started at construction
+	Timer timer_;
+};
+
+#endif /* ALIGNER_METRICS_H_ */
diff --git a/aligner_result.h b/aligner_result.h
new file mode 100644
index 0000000..2ce4c2f
--- /dev/null
+++ b/aligner_result.h
@@ -0,0 +1,469 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef ALIGNER_RESULT_H_
+#define ALIGNER_RESULT_H_
+
+#include <utility>
+#include <limits>
+#include <vector>
+#include "mem_ids.h"
+#include "ref_coord.h"
+#include "read.h"
+#include "filebuf.h"
+#include "ds.h"
+#include "edit.h"
+#include "limit.h"
+
+// Alignment scores are signed 64-bit integers; MIN_I64 (from limit.h)
+// is reserved as the "invalid score" sentinel, tested/set by the
+// macros below.
+typedef int64_t TAlScore;
+
+#define VALID_AL_SCORE(x)   ((x).score_ > MIN_I64)
+#define VALID_SCORE(x)      ((x) > MIN_I64)
+#define INVALIDATE_SCORE(x) ((x) = MIN_I64)
+
+/**
+ * A generic score object for an alignment.  Used for accounting during
+ * SW and elsewhere.  Encapsulates the score, the number of N positions
+ * and the number gaps in the alignment.
+ *
+ * The scale for 'score' is such that a perfect alignment score is 0
+ * and a score with non-zero penalty is less than 0.  So differences
+ * between scores work as expected, but interpreting an individual
+ * score (larger is better) as a penalty (smaller is better) requires
+ * taking the absolute value.
+ */
+class AlnScore {
+
+public:
+
+	/**
+	 * Gapped scores are invalid until proven valid.
+	 */
+	inline AlnScore() {
+		reset();
+		invalidate();
+		assert(!valid());
+	}
+
+	/**
+	 * Gapped scores are invalid until proven valid.
+	 */
+	inline AlnScore(TAlScore score) {
+		score_ = score;
+	}
+	
+	/**
+	 * Reset the score.
+	 */
+	void reset() {
+		score_ = 0;
+	}
+
+	/**
+	 * Return an invalid SwScore.
+	 */
+	inline static AlnScore INVALID() {
+		AlnScore s;
+		s.invalidate();
+		assert(!s.valid());
+		return s;
+	}
+	
+	/**
+	 * Return true iff this score has a valid value.
+	 */
+	inline bool valid() const {
+		return score_ != MIN_I64;
+	}
+
+	/**
+	 * Make this score invalid (and therefore <= all other scores).
+	 */
+	inline void invalidate() {
+		score_ = MIN_I64;
+		assert(!valid());
+	}
+	
+
+	/**
+	 * Scores are equal iff they're bitwise equal.
+	 */
+	inline bool operator==(const AlnScore& o) const {
+		// Profiling shows cache misses on following line
+		return VALID_AL_SCORE(*this) && VALID_AL_SCORE(o) && score_ == o.score_;
+	}
+
+	/**
+	 * Return true iff the two scores are unequal.
+	 */
+	inline bool operator!=(const AlnScore& o) const {
+		return !(*this == o);
+	}
+
+	/**
+	 * Return true iff this score is >= score o.
+	 */
+	inline bool operator>=(const AlnScore& o) const {
+		if(!VALID_AL_SCORE(o)) {
+			if(!VALID_AL_SCORE(*this)) {
+				// both invalid
+				return false;
+			} else {
+				// I'm valid, other is invalid
+				return true;
+			}
+		} else if(!VALID_AL_SCORE(*this)) {
+			// I'm invalid, other is valid
+			return false;
+		}
+		return score_ >= o.score_;
+	}
+
+	/**
+	 * Return true iff this score is < score o.
+	 */
+	inline bool operator<(const AlnScore& o) const {
+		return !operator>=(o);
+	}
+
+	/**
+	 * Return true iff this score is <= score o.
+	 */
+	inline bool operator<=(const AlnScore& o) const {
+		return operator<(o) || operator==(o);
+	}
+    
+    /**
+     * Return true iff this score is < score o.
+     */
+    inline bool operator>(const AlnScore& o) const {
+        return !operator<=(o);
+    }
+
+	TAlScore score()   const { return  score_; }
+
+    // Score accumulated so far (penalties are subtracted starting at 0)
+	TAlScore score_;
+};
+
+static inline ostream& operator<<(ostream& os, const AlnScore& o) {
+	os << o.score();
+	return os;
+}
+
+// Forward declaration
+class BitPairReference;
+
+
+/**
+ * Encapsulates an alignment result.  The result comprises:
+ *
+ * 1. All the nucleotide edits for both mates ('ned').
+ * 2. All "edits" where an ambiguous reference char is resolved to an
+ *    unambiguous char ('aed').
+ * 3. The score for the alignment, including summary information about the
+ *    number of gaps and Ns involved.
+ * 4. The reference id, strand, and 0-based offset of the leftmost character
+ *    involved in the alignment.
+ * 5. Information about trimming prior to alignment and whether it was hard or
+ *    soft.
+ * 6. Information about trimming during alignment and whether it was hard or
+ *    soft.  Local-alignment trimming is usually soft when aligning nucleotide
+ *    reads.
+ *
+ * Note that the AlnRes, together with the Read and an AlnSetSumm (*and* the
+ * opposite mate's AlnRes and Read in the case of a paired-end alignment),
+ * should contain enough information to print an entire alignment record.
+ *
+ * TRIMMING
+ *
+ * Accounting for trimming is tricky.  Trimming affects:
+ *
+ * 1. The values of the trim* and pretrim* fields.
+ * 2. The offsets of the Edits in the EList<Edit>s.
+ * 3. The read extent, if the trimming is soft.
+ * 4. The read extent and the read sequence and length, if trimming is hard.
+ *
+ * Handling 1. is not too difficult.  2., 3., and 4. are handled in setShape().
+ */
+class AlnRes {
+
+public:
+
+	AlnRes()
+	{
+		reset();
+	}
+    
+    AlnRes(const AlnRes& other)
+    {
+        score_ = other.score_;
+        max_score_ = other.max_score_;
+        uid_ = other.uid_;
+        tid_ = other.tid_;
+        taxRank_ = other.taxRank_;
+        summedHitLen_ = other.summedHitLen_;
+		readPositions_ = other.readPositions_;
+		isFw_ = other.isFw_;
+    }
+    
+    AlnRes& operator=(const AlnRes& other) {
+        if(this == &other) return *this;
+        score_ = other.score_;
+        max_score_ = other.max_score_;
+        uid_ = other.uid_;
+        tid_ = other.tid_;
+        taxRank_ = other.taxRank_;
+        summedHitLen_ = other.summedHitLen_;
+		readPositions_ = other.readPositions_;
+		isFw_ = other.isFw_;
+        return *this;
+    }
+    
+    ~AlnRes() {}
+
+	/**
+	 * Clear all contents.
+	 */
+	void reset() {
+        score_ = 0;
+        max_score_ = 0;
+        uid_ = "";
+        tid_ = 0;
+        taxRank_ = RANK_UNKNOWN;
+        summedHitLen_ = 0.0;
+		readPositions_.clear();
+    }
+    
+	/**
+	 * Set alignment score for this alignment.
+	 */
+	void setScore(TAlScore score) {
+		score_ = score;
+	}
+
+	TAlScore           score()          const { return score_;     }
+    TAlScore           max_score()      const { return max_score_; }
+    string             uid()            const { return uid_;   }
+    uint64_t           taxID()          const { return tid_;   }
+    uint8_t            taxRank()        const { return taxRank_; }
+    double             summedHitLen()   const { return summedHitLen_; }
+
+	const EList<pair<uint32_t,uint32_t> >& readPositionsPtr() const { return readPositions_; }
+
+	const pair<uint32_t,uint32_t> readPositions(size_t i) const { return readPositions_[i]; }
+	size_t nReadPositions() const { return readPositions_.size(); }
+
+	bool               isFw()           const { return isFw_;      }
+    
+   /**
+	 * Print the sequence for the read that aligned using A, C, G and
+	 * T.  This will simply print the read sequence (or its reverse
+	 * complement).
+	 */
+ 	void printSeq(
+		const Read& rd,
+		const BTDnaString* dns,
+		BTString& o) const
+    {
+        assert(!rd.patFw.empty());
+        ASSERT_ONLY(size_t written = 0);
+        // Print decoded nucleotides
+        assert(dns != NULL);
+        size_t len = dns->length();
+        size_t st = 0;
+        size_t en = len;
+        for(size_t i = st; i < en; i++) {
+            int c = dns->get(i);
+            assert_range(0, 3, c);
+            o.append("ACGT"[c]);
+            ASSERT_ONLY(written++);
+        }
+    }
+
+	/**
+	 * Print the quality string for the read that aligned.  This will
+	 * simply print the read qualities (or their reverse).
+	 */
+ 	void printQuals(
+		const Read& rd,
+		const BTString* dqs,
+		BTString& o) const
+    {
+        assert(dqs != NULL);
+        size_t len = dqs->length();
+        // Print decoded qualities from upstream to downstream Watson
+        for(size_t i = 1; i < len-1; i++) {
+            o.append(dqs->get(i));
+        }
+    }
+	
+
+	/**
+	 * Initialize new AlnRes.
+	 */
+	void init(
+              TAlScore score,           // alignment score
+              TAlScore max_score,
+              const string& uniqueID,
+              uint64_t taxID,
+              uint8_t taxRank,
+			  double summedHitLen,
+			  const EList<pair<uint32_t, uint32_t> >& readPositions,
+			  bool isFw)
+    {
+        score_  = score;
+        max_score_ = max_score;
+        uid_ = uniqueID;
+        tid_ = taxID;
+        taxRank_ = taxRank;
+        summedHitLen_ = summedHitLen;
+		readPositions_ = readPositions;
+		isFw_ = isFw;
+    }
+
+protected:
+	TAlScore     score_;        //
+    TAlScore     max_score_;
+    string       uid_;
+    uint64_t     tid_;
+    uint8_t      taxRank_;
+    double       summedHitLen_; // sum of the length of all partial hits, divided by the number of genome matches
+	bool         isFw_;
+  
+	EList<pair<uint32_t, uint32_t> > readPositions_;
+};
+
+typedef uint64_t TNumAlns;
+
+/**
+ * Encapsulates a concise summary of a set of alignment results for a
+ * given pair or mate.  Referring to the fields of this object should
+ * provide enough information to print output records for the read.
+ */
+class AlnSetSumm {
+
+public:
+
+	AlnSetSumm() { reset(); }
+
+	/**
+	 * Given an unpaired read (in either rd1 or rd2) or a read pair
+	 * (mate 1 in rd1, mate 2 in rd2).
+	 */
+	explicit AlnSetSumm(
+		const Read* rd1,
+		const Read* rd2,
+		const EList<AlnRes>* rs)
+	{
+		init(rd1, rd2, rs);
+	}
+
+	explicit AlnSetSumm(
+		AlnScore best,
+		AlnScore secbest)
+	{
+		init(best, secbest);
+	}
+	
+	/**
+	 * Set to uninitialized state.
+	 */
+	void reset() {
+		best_.invalidate();
+		secbest_.invalidate();
+	}
+	
+    /**
+     * Given all the paired and unpaired results involving mates #1 and #2,
+     * calculate best and second-best scores for both mates.  These are
+     * used for future MAPQ calculations.
+     */
+	void init(
+		const Read* rd1,
+		const Read* rd2,
+		const EList<AlnRes>* rs)
+    {
+        assert(rd1 != NULL || rd2 != NULL);
+        assert(rs != NULL);
+        AlnScore best, secbest;
+        size_t szs = 0;
+        best.invalidate();    secbest.invalidate();
+        szs = rs->size();
+        //assert_gt(szs[j], 0);
+        for(size_t i = 0; i < rs->size(); i++) {
+            AlnScore sc = (*rs)[i].score();
+            if(sc > best) {
+                secbest = best;
+                best = sc;
+                assert(VALID_AL_SCORE(best));
+            } else if(sc > secbest) {
+                secbest = sc;
+                assert(VALID_AL_SCORE(best));
+                assert(VALID_AL_SCORE(secbest));
+            }
+        }
+        if(szs > 0) {
+            init(best, secbest);
+        } else {
+            reset();
+        }
+    }
+
+	
+	/**
+	 * Initialize given fields.  See constructor for how fields are set.
+	 */
+	void init(
+		AlnScore best,
+		AlnScore secbest)
+	{
+		best_         = best;
+		secbest_      = secbest;
+		assert(repOk());
+	}
+	
+	/**
+	 * Return true iff there is at least a best alignment
+	 */
+	bool empty() const {
+		assert(repOk());
+		return !VALID_AL_SCORE(best_);
+	}
+	
+#ifndef NDEBUG
+	/**
+	 * Check that the summary is internally consistent.
+	 */
+	bool repOk() const {
+		return true;
+	}
+#endif
+	
+	AlnScore best()         const { return best_;         }
+	AlnScore secbest()      const { return secbest_;      }
+
+
+protected:
+	
+	AlnScore best_;         // best full-alignment score found for this read
+	AlnScore secbest_;      // second-best
+};
+
+#endif
diff --git a/aligner_seed.cpp b/aligner_seed.cpp
new file mode 100644
index 0000000..d417e32
--- /dev/null
+++ b/aligner_seed.cpp
@@ -0,0 +1,532 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "aligner_cache.h"
+#include "aligner_seed.h"
+#include "search_globals.h"
+#include "bt2_idx.h"
+
+using namespace std;
+
+/**
+ * Construct a constraint with no edits of any kind allowed.
+ */
+Constraint Constraint::exact() {
+	Constraint c;
+	c.edits = c.mms = c.ins = c.dels = c.penalty = 0;
+	return c;
+}
+
+/**
+ * Construct a constraint where the only constraint is a total
+ * penalty constraint.
+ */
+Constraint Constraint::penaltyBased(int pen) {
+	Constraint c;
+	c.penalty = pen;
+	return c;
+}
+
+/**
+ * Construct a constraint where the only constraint is a total
+ * penalty constraint related to the length of the read.
+ */
+Constraint Constraint::penaltyFuncBased(const SimpleFunc& f) {
+	Constraint c;
+	c.penFunc = f;
+	return c;
+}
+
+/**
+ * Construct a constraint where the only constraint is a total
+ * penalty constraint.
+ */
+Constraint Constraint::mmBased(int mms) {
+	Constraint c;
+	c.mms = mms;
+	c.edits = c.dels = c.ins = 0;
+	return c;
+}
+
+/**
+ * Construct a constraint where the only constraint is a total
+ * penalty constraint.
+ */
+Constraint Constraint::editBased(int edits) {
+	Constraint c;
+	c.edits = edits;
+	c.dels = c.ins = c.mms = 0;
+	return c;
+}
+
+//
+// Some static methods for constructing some standard SeedPolicies
+//
+
+/**
+ * Given a read, depth and orientation, extract a seed data structure
+ * from the read and fill in the steps & zones arrays.  The Seed
+ * contains the sequence and quality values.
+ */
+bool
+Seed::instantiate(
+	const Read& read,
+	const BTDnaString& seq, // seed read sequence
+	const BTString& qual,   // seed quality sequence
+	const Scoring& pens,
+	int depth,
+	int seedoffidx,
+	int seedtypeidx,        // (unused in this function)
+	bool fw,
+	InstantiatedSeed& is) const
+{
+	// Caller must have set the seed-wide 'overall' constraint
+	assert(overall != NULL);
+	int seedlen = len;
+	if((int)read.length() < seedlen) {
+		// Shrink seed length to fit read if necessary
+		seedlen = (int)read.length();
+	}
+	assert_gt(seedlen, 0);
+	is.steps.resize(seedlen);
+	is.zones.resize(seedlen);
+	// Fill in 'steps' and 'zones'
+	//
+	// The 'steps' list indicates which read character should be
+	// incorporated at each step of the search process.  Often we will
+	// simply proceed from one end to the other, in which case the
+	// 'steps' list is ascending or descending.  In some cases (e.g.
+	// the 2mm case), we might want to switch directions at least once
+	// during the search, in which case 'steps' will jump in the
+	// middle.  When an element of the 'steps' list is negative, this
+	// indicates that the next
+	//
+	// The 'zones' list indicates which zone constraint is active at
+	// each step.  Each element of the 'zones' list is a pair; the
+	// first pair element indicates the applicable zone when
+	// considering either mismatch or delete (ref gap) events, while
+	// the second pair element indicates the applicable zone when
+	// considering insertion (read gap) events.  When either pair
+	// element is a negative number, that indicates that we are about
+	// to leave the zone for good, at which point we may need to
+	// evaluate whether we have reached the zone's budget.
+	//
+	switch(type) {
+		case SEED_TYPE_EXACT: {
+			for(int k = 0; k < seedlen; k++) {
+				is.steps[k] = -(seedlen - k);
+				// Zone 0 all the way
+				is.zones[k].first = is.zones[k].second = 0;
+			}
+			break;
+		}
+		case SEED_TYPE_LEFT_TO_RIGHT: {
+			for(int k = 0; k < seedlen; k++) {
+				is.steps[k] = k+1;
+				// Zone 0 from 0 up to ceil(len/2), then 1
+				is.zones[k].first = is.zones[k].second = ((k < (seedlen+1)/2) ? 0 : 1);
+			}
+			// Zone 1 ends at the RHS
+			is.zones[seedlen-1].first = is.zones[seedlen-1].second = -1;
+			break;
+		}
+		case SEED_TYPE_RIGHT_TO_LEFT: {
+			for(int k = 0; k < seedlen; k++) {
+				is.steps[k] = -(seedlen - k);
+				// Zone 0 from 0 up to floor(len/2), then 1
+				is.zones[k].first  = ((k < seedlen/2) ? 0 : 1);
+				// Inserts: Zone 0 from 0 up to ceil(len/2)-1, then 1
+				is.zones[k].second = ((k < (seedlen+1)/2+1) ? 0 : 1);
+			}
+			is.zones[seedlen-1].first = is.zones[seedlen-1].second = -1;
+			break;
+		}
+		case SEED_TYPE_INSIDE_OUT: {
+			// Zone 0 from ceil(N/4) up to N-floor(N/4)
+			int step = 0;
+			for(int k = (seedlen+3)/4; k < seedlen - (seedlen/4); k++) {
+				is.zones[step].first = is.zones[step].second = 0;
+				is.steps[step++] = k+1;
+			}
+			// Zone 1 from N-floor(N/4) up
+			for(int k = seedlen - (seedlen/4); k < seedlen; k++) {
+				is.zones[step].first = is.zones[step].second = 1;
+				is.steps[step++] = k+1;
+			}
+			// No Zone 1 if seedlen is short (like 2)
+			//assert_eq(1, is.zones[step-1].first);
+			is.zones[step-1].first = is.zones[step-1].second = -1;
+			// Zone 2 from ((seedlen+3)/4)-1 down to 0
+			for(int k = ((seedlen+3)/4)-1; k >= 0; k--) {
+				is.zones[step].first = is.zones[step].second = 2;
+				is.steps[step++] = -(k+1);
+			}
+			assert_eq(2, is.zones[step-1].first);
+			is.zones[step-1].first = is.zones[step-1].second = -2;
+			assert_eq(seedlen, step);
+			break;
+		}
+		default:
+			throw 1; // unknown seed type
+	}
+	// Instantiate constraints
+	for(int i = 0; i < 3; i++) {
+		is.cons[i] = zones[i];
+		is.cons[i].instantiate(read.length());
+	}
+	is.overall = *overall;
+	is.overall.instantiate(read.length());
+	// Take a sweep through the seed sequence.  Consider where the Ns
+	// occur and how zones are laid out.  Calculate the maximum number
+	// of positions we can jump over initially (e.g. with the ftab) and
+	// perhaps set this function's return value to false, indicating
+	// that the arrangements of Ns prevents the seed from aligning.
+	bool streak = true;
+	is.maxjump = 0;
+	bool ret = true;
+	bool ltr = (is.steps[0] > 0); // true -> left-to-right
+	for(size_t i = 0; i < is.steps.size(); i++) {
+		assert_neq(0, is.steps[i]);
+		int off = is.steps[i];
+		off = abs(off)-1;
+		Constraint& cons = is.cons[abs(is.zones[i].first)];
+		int c = seq[off];  assert_range(0, 4, c);
+		int q = qual[off];
+		if(ltr != (is.steps[i] > 0) || // changed direction
+		   is.zones[i].first != 0 ||   // changed zone
+		   is.zones[i].second != 0)    // changed zone
+		{
+			streak = false;
+		}
+		if(c == 4) {
+			// Induced mismatch
+			if(cons.canN(q, pens)) {
+				cons.chargeN(q, pens);
+			} else {
+				// Seed disqualified due to arrangement of Ns
+				return false;
+			}
+		}
+		// 'maxjump' counts the initial run of same-direction, zone-0
+		// steps that can be consumed at once (e.g. via the ftab)
+		if(streak) is.maxjump++;
+	}
+	// Record where this seed sits in the read and how it was generated
+	is.seedoff = depth;
+	is.seedoffidx = seedoffidx;
+	is.fw = fw;
+	is.s = *this;
+	// NOTE(review): 'ret' is never set to false above; disqualifying N
+	// arrangements return early, so this is always true here.
+	return ret;
+}
+
+/**
+ * Return a set consisting of 1 seed encapsulating an exact matching
+ * strategy.
+ */
+void
+Seed::zeroMmSeeds(int ln, EList<Seed>& pols, Constraint& oall) {
+	oall.init();
+	// Seed policy 1: left-to-right search
+	pols.expand();
+	pols.back().len = ln;
+	pols.back().type = SEED_TYPE_EXACT;
+	pols.back().zones[0] = Constraint::exact();
+	pols.back().zones[1] = Constraint::exact();
+	pols.back().zones[2] = Constraint::exact(); // not used
+	pols.back().overall = &oall;
+}
+
+/**
+ * Append to 'pols' a set of 2 seeds encapsulating a half-and-half
+ * 1-mismatch strategy: a left-to-right seed allowing the mismatch in
+ * the far (right) half, plus a right-to-left seed allowing it in the
+ * far (left) half.
+ */
+void
+Seed::oneMmSeeds(int ln, EList<Seed>& pols, Constraint& oall) {
+	oall.init();
+	// Seed policy 1: left-to-right search
+	pols.expand();
+	pols.back().len = ln;
+	pols.back().type = SEED_TYPE_LEFT_TO_RIGHT;
+	pols.back().zones[0] = Constraint::exact();
+	pols.back().zones[1] = Constraint::mmBased(1);
+	pols.back().zones[2] = Constraint::exact(); // not used
+	pols.back().overall = &oall;
+	// Seed policy 2: right-to-left search
+	pols.expand();
+	pols.back().len = ln;
+	pols.back().type = SEED_TYPE_RIGHT_TO_LEFT;
+	pols.back().zones[0] = Constraint::exact();
+	pols.back().zones[1] = Constraint::mmBased(1);
+	// mmsCeil = 0: no unused mismatch may be left over in this zone at
+	// the end, i.e. the mismatch must actually occur here.  This keeps
+	// policy 2 from re-finding alignments already found by policy 1.
+	pols.back().zones[1].mmsCeil = 0;
+	pols.back().zones[2] = Constraint::exact(); // not used
+	pols.back().overall = &oall;
+}
+
+/**
+ * Append to 'pols' a set of 3 seeds encapsulating search roots for:
+ *
+ * 1. Starting from the left-hand side and searching toward the
+ *    right-hand side allowing 2 mismatches in the right half.
+ * 2. Starting from the right-hand side and searching toward the
+ *    left-hand side allowing 2 mismatches in the left half.
+ * 3. Starting (effectively) from the center and searching out toward
+ *    both the left and right-hand sides, allowing one mismatch on
+ *    either side.
+ *
+ * This is not exhaustive.  There are 2 mismatch cases missed; if you
+ * imagine the seed as divided into four successive quarters A, B, C
+ * and D, the cases we miss are when mismatches occur in A and C or B
+ * and D.
+ */
+void
+Seed::twoMmSeeds(int ln, EList<Seed>& pols, Constraint& oall) {
+	oall.init();
+	// Seed policy 1: left-to-right search
+	pols.expand();
+	pols.back().len = ln;
+	pols.back().type = SEED_TYPE_LEFT_TO_RIGHT;
+	pols.back().zones[0] = Constraint::exact();
+	pols.back().zones[1] = Constraint::mmBased(2);
+	pols.back().zones[2] = Constraint::exact(); // not used
+	pols.back().overall = &oall;
+	// Seed policy 2: right-to-left search
+	pols.expand();
+	pols.back().len = ln;
+	pols.back().type = SEED_TYPE_RIGHT_TO_LEFT;
+	pols.back().zones[0] = Constraint::exact();
+	pols.back().zones[1] = Constraint::mmBased(2);
+	pols.back().zones[1].mmsCeil = 1; // Must have used at least 1 mismatch
+	pols.back().zones[2] = Constraint::exact(); // not used
+	pols.back().overall = &oall;
+	// Seed policy 3: inside-out search
+	pols.expand();
+	pols.back().len = ln;
+	pols.back().type = SEED_TYPE_INSIDE_OUT;
+	pols.back().zones[0] = Constraint::exact();
+	pols.back().zones[1] = Constraint::mmBased(1);
+	pols.back().zones[1].mmsCeil = 0; // Must have used at least 1 mismatch
+	pols.back().zones[2] = Constraint::mmBased(1);
+	pols.back().zones[2].mmsCeil = 0; // Must have used at least 1 mismatch
+	pols.back().overall = &oall;
+}
+
+/**
+ * Types of actions that can be taken by the SeedAligner.  Values start
+ * at 1 (0 presumably means "no action" — TODO confirm at use sites).
+ */
+enum {
+	SA_ACTION_TYPE_RESET = 1,
+	SA_ACTION_TYPE_SEARCH_SEED, // 2
+	SA_ACTION_TYPE_FTAB,        // 3
+	SA_ACTION_TYPE_FCHR,        // 4
+	SA_ACTION_TYPE_MATCH,       // 5
+	SA_ACTION_TYPE_EDIT         // 6
+};
+
+// Fully parenthesize the macro arguments and the whole expansion;
+// without the parentheses, invocations such as MIN(a & b, c) or
+// x * MIN(a, b) expand incorrectly due to operator precedence.
+// (Arguments are still evaluated twice; avoid side effects.)
+#define MIN(x, y) (((x) < (y)) ? (x) : (y))
+
+#ifdef ALIGNER_SEED_MAIN
+
+#include <getopt.h>
+#include <string>
+
+/**
+ * Parse a base-10 integer out of 'arg'.  On parse failure, print
+ * 'errmsg' to stderr and throw 1 (this driver's fatal-error
+ * convention).
+ *
+ * @param errmsg message printed if 'arg' is not a valid integer
+ * @param arg    NUL-terminated string to parse
+ * @return parsed value, truncated to 32 bits
+ */
+static int parseInt(const char *errmsg, const char *arg) {
+	long l;
+	char *endPtr = NULL;
+	l = strtol(arg, &endPtr, 10);
+	// strtol always stores a non-NULL address in endPtr, so the old
+	// test (endPtr != NULL) was always true and invalid input such as
+	// "abc" silently parsed as 0.  A successful parse is one that
+	// consumed at least one character, i.e. endPtr advanced past arg.
+	if (endPtr != arg) {
+		return (int32_t)l;
+	}
+	cerr << errmsg << endl;
+	throw 1;
+	return -1;
+}
+
+/**
+ * Identifiers for long options.  Values begin at 256 so they can never
+ * collide with single-character (ASCII) short-option codes returned by
+ * getopt_long.
+ */
+enum {
+	ARG_NOFW = 256,
+	ARG_NORC,
+	ARG_MM,
+	ARG_SHMEM,
+	ARG_TESTS,
+	ARG_RANDOM_TESTS,
+	ARG_SEED
+};
+
+// Short options: -v (verbose), -C (colorspace), -t (timing).
+static const char *short_opts = "vCt";
+// Long-option table for getopt_long.  Note: getopt_long requires the
+// array to be terminated by an element containing all zeros; without
+// it, scanning an unrecognized option reads past the end of the array.
+static struct option long_opts[] = {
+	{(char*)"verbose",  no_argument,       0, 'v'},
+	{(char*)"color",    no_argument,       0, 'C'},
+	{(char*)"timing",   no_argument,       0, 't'},
+	{(char*)"nofw",     no_argument,       0, ARG_NOFW},
+	{(char*)"norc",     no_argument,       0, ARG_NORC},
+	{(char*)"mm",       no_argument,       0, ARG_MM},
+	{(char*)"shmem",    no_argument,       0, ARG_SHMEM},
+	{(char*)"tests",    no_argument,       0, ARG_TESTS},
+	{(char*)"random",   required_argument, 0, ARG_RANDOM_TESTS},
+	{(char*)"seed",     required_argument, 0, ARG_SEED},
+	{(char*)0,          0,                 0, 0} // required terminator
+};
+
+/**
+ * Print a usage summary for this ALIGNER_SEED_MAIN test driver to the
+ * given output stream.
+ */
+static void printUsage(ostream& os) {
+	os << "Usage: ac [options]* <index> <patterns>" << endl;
+	os << "Options:" << endl;
+	os << "  --mm                memory-mapped mode" << endl;
+	os << "  --shmem             shared memory mode" << endl;
+	os << "  --nofw              don't align forward-oriented read" << endl;
+	os << "  --norc              don't align reverse-complemented read" << endl;
+	os << "  -t/--timing         show timing information" << endl;
+	os << "  -C/--color          colorspace mode" << endl;
+	os << "  -v/--verbose        talkative mode" << endl;
+}
+
+bool gNorc = false;           // don't align reverse-complemented read
+bool gNofw = false;           // don't align forward-oriented read
+bool gColor = false;          // colorspace mode
+int gVerbose = 0;             // talkative mode
+int gGapBarrier = 1;          // positions near ends where gaps disallowed
+bool gColorExEnds = true;     // exclude nucleotides at extreme ends (colorspace)
+int gSnpPhred = 30;           // Phred penalty for SNP (colorspace decoding)
+bool gReportOverhangs = true; // allow alignments to overhang reference ends
+
+// Test entry points defined elsewhere (see aligner_seed test code)
+extern void aligner_seed_tests();
+extern void aligner_random_seed_tests(
+	int num_tests,
+	uint32_t qslo,
+	uint32_t qshi,
+	bool color,
+	uint32_t seed);
+
+/**
+ * A way of feeding simple tests to the seed alignment infrastructure.
+ * Parses options, optionally runs the built-in test suites, then loads
+ * the bit-pair reference plus forward and reverse Ebwt indexes.
+ */
+int main(int argc, char **argv) {
+	bool useMm = false;      // use memory-mapped index files
+	bool useShmem = false;   // use shared-memory index files
+	bool mmSweep = false;    // touch all pages after memory-mapping
+	bool noRefNames = false; // suppress loading of reference names
+	bool sanity = false;     // sanity-check indexes as they load
+	bool timing = false;     // print timing info
+	int option_index = 0;
+	int seed = 777;          // pseudo-random seed default
+	int next_option;
+	// Standard getopt_long loop; --tests/--random run the test suites
+	// and return immediately without loading any index.
+	do {
+		next_option = getopt_long(
+			argc, argv, short_opts, long_opts, &option_index);
+		switch (next_option) {
+			case 'v':       gVerbose = true; break;
+			case 'C':       gColor   = true; break;
+			case 't':       timing   = true; break;
+			case ARG_NOFW:  gNofw    = true; break;
+			case ARG_NORC:  gNorc    = true; break;
+			case ARG_MM:    useMm    = true; break;
+			case ARG_SHMEM: useShmem = true; break;
+			case ARG_SEED:  seed = parseInt("", optarg); break;
+			case ARG_TESTS: {
+				aligner_seed_tests();
+				aligner_random_seed_tests(
+					100,     // num references
+					100,   // queries per reference lo
+					400,   // queries per reference hi
+					false, // true -> generate colorspace reference/reads
+					18);   // pseudo-random seed
+				return 0;
+			}
+			case ARG_RANDOM_TESTS: {
+				seed = parseInt("", optarg);
+				aligner_random_seed_tests(
+					100,   // num references
+					100,   // queries per reference lo
+					400,   // queries per reference hi
+					false, // true -> generate colorspace reference/reads
+					seed); // pseudo-random seed
+				return 0;
+			}
+			case -1: break;
+			default: {
+				cerr << "Unknown option: " << (char)next_option << endl;
+				printUsage(cerr);
+				exit(1);
+			}
+		}
+	} while(next_option != -1);
+	char *reffn;
+	// First positional arg: index base name; second: reads file(s)
+	if(optind >= argc) {
+		cerr << "No reference; quitting..." << endl;
+		return 1;
+	}
+	reffn = argv[optind++];
+	if(optind >= argc) {
+		cerr << "No reads; quitting..." << endl;
+		return 1;
+	}
+	string ebwtBase(reffn);
+	BitPairReference ref(
+		ebwtBase,    // base path
+		gColor,      // whether we expect it to be colorspace
+		sanity,      // whether to sanity-check reference as it's loaded
+		NULL,        // fasta files to sanity check reference against
+		NULL,        // another way of specifying original sequences
+		false,       // true -> infiles (2 args ago) contains raw seqs
+		useMm,       // use memory mapping to load index?
+		useShmem,    // use shared memory (not memory mapping)
+		mmSweep,     // touch all the pages after memory-mapping the index
+		gVerbose,    // verbose
+		gVerbose);   // verbose but just for startup messages
+	Timer *t = new Timer(cerr, "Time loading fw index: ", timing);
+	Ebwt ebwtFw(
+		ebwtBase,
+		gColor,      // index is colorspace
+		0,           // don't need entireReverse for fw index
+		true,        // index is for the forward direction
+		-1,          // offrate (irrelevant)
+		useMm,       // whether to use memory-mapped files
+		useShmem,    // whether to use shared memory
+		mmSweep,     // sweep memory-mapped files
+		!noRefNames, // load names?
+		false,       // load SA sample?
+		true,        // load ftab?
+		true,        // load rstarts?
+		NULL,        // reference map, or NULL if none is needed
+		gVerbose,    // whether to be talkative
+		gVerbose,    // talkative during initialization
+		false,       // handle memory exceptions, don't pass them up
+		sanity);
+	delete t;
+	t = new Timer(cerr, "Time loading bw index: ", timing);
+	Ebwt ebwtBw(
+		ebwtBase + ".rev",
+		gColor,      // index is colorspace
+		1,           // need entireReverse
+		false,       // index is for the backward direction
+		-1,          // offrate (irrelevant)
+		useMm,       // whether to use memory-mapped files
+		useShmem,    // whether to use shared memory
+		mmSweep,     // sweep memory-mapped files
+		!noRefNames, // load names?
+		false,       // load SA sample?
+		true,        // load ftab?
+		false,       // load rstarts?
+		NULL,        // reference map, or NULL if none is needed
+		gVerbose,    // whether to be talkative
+		gVerbose,    // talkative during initialization
+		false,       // handle memory exceptions, don't pass them up
+		sanity);
+	delete t;
+	// NOTE(review): per-read processing is not implemented; this loop
+	// over the read-file arguments is an empty stub.
+	for(int i = optind; i < argc; i++) {
+	}
+}
+#endif
diff --git a/aligner_seed.h b/aligner_seed.h
new file mode 100644
index 0000000..a5d4874
--- /dev/null
+++ b/aligner_seed.h
@@ -0,0 +1,2866 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef ALIGNER_SEED_H_
+#define ALIGNER_SEED_H_
+
+#include <iostream>
+#include <utility>
+#include <limits>
+#include "qual.h"
+#include "ds.h"
+#include "sstring.h"
+#include "alphabet.h"
+#include "edit.h"
+#include "read.h"
+// Threading is necessary to synchronize the classes that dump
+// intermediate alignment results to files.  Otherwise, all data herein
+// is constant and shared, or per-thread.
+#include "threading.h"
+#include "aligner_result.h"
+#include "aligner_cache.h"
+#include "scoring.h"
+#include "mem_ids.h"
+#include "simple_func.h"
+#include "btypes.h"
+
+/**
+ * A constraint to apply to an alignment zone, or to an overall
+ * alignment.
+ *
+ * The constraint can put both caps and ceilings on the number and
+ * types of edits allowed.  Fields initialized to MAX_I are effectively
+ * unbounded.
+ */
+struct Constraint {
+	
+	Constraint() { init(); }
+	
+	/**
+	 * Initialize Constraint to be fully permissive: all budgets set to
+	 * MAX_I (unbounded), no penalty function, not yet instantiated.
+	 */
+	void init() {
+		edits = mms = ins = dels = penalty = editsCeil = mmsCeil =
+		insCeil = delsCeil = penaltyCeil = MAX_I;
+		penFunc.reset();
+		instantiated = false;
+	}
+	
+	/**
+	 * Return true iff penalities and constraints prevent us from
+	 * adding any edits.
+	 */
+	bool mustMatch() {
+		assert(instantiated);
+		return (mms == 0 && edits == 0) ||
+		        penalty == 0 ||
+		       (mms == 0 && dels == 0 && ins == 0);
+	}
+	
+	/**
+	 * Return true iff a mismatch of the given quality is permitted.
+	 */
+	bool canMismatch(int q, const Scoring& cm) {
+		assert(instantiated);
+		return (mms > 0 || edits > 0) &&
+		       penalty >= cm.mm(q);
+	}
+
+	/**
+	 * Return true iff an N mismatch of the given quality is permitted.
+	 */
+	bool canN(int q, const Scoring& cm) {
+		assert(instantiated);
+		return (mms > 0 || edits > 0) &&
+		       penalty >= cm.n(q);
+	}
+	
+	/**
+	 * Return true iff a mismatch of *any* quality (even qual=1) is
+	 * permitted.
+	 */
+	bool canMismatch() {
+		assert(instantiated);
+		return (mms > 0 || edits > 0) && penalty > 0;
+	}
+
+	/**
+	 * Return true iff an N mismatch of *any* quality (even qual=1) is
+	 * permitted.
+	 */
+	bool canN() {
+		assert(instantiated);
+		return (mms > 0 || edits > 0);
+	}
+	
+	/**
+	 * Return true iff a deletion of the given extension (0=open, 1=1st
+	 * extension, etc) is permitted.
+	 *
+	 * NOTE(review): uses (dels > 0 && edits > 0) whereas the parallel
+	 * canInsert(int, Scoring) uses (ins > 0 || edits > 0); confirm the
+	 * asymmetry is intentional and not a &&/|| typo.
+	 */
+	bool canDelete(int ex, const Scoring& cm) {
+		assert(instantiated);
+		return (dels > 0 && edits > 0) &&
+		       penalty >= cm.del(ex);
+	}
+
+	/**
+	 * Return true iff a deletion of any extension is permitted.
+	 */
+	bool canDelete() {
+		assert(instantiated);
+		return (dels > 0 || edits > 0) &&
+		       penalty > 0;
+	}
+	
+	/**
+	 * Return true iff an insertion of the given extension (0=open,
+	 * 1=1st extension, etc) is permitted.
+	 */
+	bool canInsert(int ex, const Scoring& cm) {
+		assert(instantiated);
+		return (ins > 0 || edits > 0) &&
+		       penalty >= cm.ins(ex);
+	}
+
+	/**
+	 * Return true iff an insertion of any extension is permitted.
+	 */
+	bool canInsert() {
+		assert(instantiated);
+		return (ins > 0 || edits > 0) &&
+		       penalty > 0;
+	}
+	
+	/**
+	 * Return true iff a gap of any extension is permitted
+	 */
+	bool canGap() {
+		assert(instantiated);
+		return ((ins > 0 || dels > 0) || edits > 0) && penalty > 0;
+	}
+	
+	/**
+	 * Charge a mismatch of the given quality: decrement the mismatch
+	 * budget (or the generic edit budget when mismatches are spent)
+	 * and subtract the scoring scheme's mismatch penalty.
+	 */
+	void chargeMismatch(int q, const Scoring& cm) {
+		assert(instantiated);
+		if(mms == 0) { assert_gt(edits, 0); edits--; }
+		else mms--;
+		penalty -= cm.mm(q);
+		assert_geq(mms, 0);
+		assert_geq(edits, 0);
+		assert_geq(penalty, 0);
+	}
+	
+	/**
+	 * Charge an N mismatch of the given quality.
+	 */
+	void chargeN(int q, const Scoring& cm) {
+		assert(instantiated);
+		if(mms == 0) { assert_gt(edits, 0); edits--; }
+		else mms--;
+		penalty -= cm.n(q);
+		assert_geq(mms, 0);
+		assert_geq(edits, 0);
+		assert_geq(penalty, 0);
+	}
+	
+	/**
+	 * Charge a deletion of the given extension.  Unlike mismatches,
+	 * this decrements both the deletion and the generic edit budgets.
+	 */
+	void chargeDelete(int ex, const Scoring& cm) {
+		assert(instantiated);
+		dels--;
+		edits--;
+		penalty -= cm.del(ex);
+		assert_geq(dels, 0);
+		assert_geq(edits, 0);
+		assert_geq(penalty, 0);
+	}
+
+	/**
+	 * Charge an insertion of the given extension.  Decrements both the
+	 * insertion and the generic edit budgets.
+	 */
+	void chargeInsert(int ex, const Scoring& cm) {
+		assert(instantiated);
+		ins--;
+		edits--;
+		penalty -= cm.ins(ex);
+		assert_geq(ins, 0);
+		assert_geq(edits, 0);
+		assert_geq(penalty, 0);
+	}
+	
+	/**
+	 * Once the constrained area is completely explored, call this
+	 * function to check whether there were *at least* as many
+	 * dissimilarities as required by the constraint.  Bounds like this
+	 * are helpful to resolve instances where two search roots would
+	 * otherwise overlap in what alignments they can find.  (Budgets
+	 * count down, so "remaining <= ceiling" means "enough were used".)
+	 */
+	bool acceptable() {
+		assert(instantiated);
+		return edits   <= editsCeil &&
+		       mms     <= mmsCeil   &&
+		       ins     <= insCeil   &&
+		       dels    <= delsCeil  &&
+		       penalty <= penaltyCeil;
+	}
+	
+	/**
+	 * Instantiate a constraint w/r/t the read length and the constant
+	 * and linear coefficients for the penalty function.
+	 */
+	static int instantiate(size_t rdlen, const SimpleFunc& func) {
+		return func.f<int>((double)rdlen);
+	}
+	
+	/**
+	 * Instantiate this constraint w/r/t the read length: if a penalty
+	 * function is set, evaluate it at the read length to fix 'penalty'.
+	 */
+	void instantiate(size_t rdlen) {
+		assert(!instantiated);
+		if(penFunc.initialized()) {
+			penalty = Constraint::instantiate(rdlen, penFunc);
+		}
+		instantiated = true;
+	}
+	
+	int edits;      // # edits permitted
+	int mms;        // # mismatches permitted
+	int ins;        // # insertions permitted
+	int dels;       // # deletions permitted
+	int penalty;    // penalty total permitted
+	int editsCeil;  // <= this many edits can be left at the end
+	int mmsCeil;    // <= this many mismatches can be left at the end
+	int insCeil;    // <= this many inserts can be left at the end
+	int delsCeil;   // <= this many deletions can be left at the end
+	int penaltyCeil;// <= this much leftover penalty can be left at the end
+	SimpleFunc penFunc;// penalty function; function of read len
+	bool instantiated; // whether constraint is instantiated w/r/t read len
+	
+	//
+	// Some static methods for constructing some standard Constraints
+	//
+
+	/**
+	 * Construct a constraint with no edits of any kind allowed.
+	 */
+	static Constraint exact();
+	
+	/**
+	 * Construct a constraint where the only constraint is a total
+	 * penalty constraint.
+	 */
+	static Constraint penaltyBased(int pen);
+
+	/**
+	 * Construct a constraint where the only constraint is a total
+	 * penalty constraint related to the length of the read.
+	 */
+	static Constraint penaltyFuncBased(const SimpleFunc& func);
+
+	/**
+	 * Construct a constraint where the only constraint is a cap on the
+	 * number of mismatches.
+	 */
+	static Constraint mmBased(int mms);
+
+	/**
+	 * Construct a constraint where the only constraint is a cap on the
+	 * number of edits.
+	 */
+	static Constraint editBased(int edits);
+};
+
+/**
+ * We divide seed search strategies into three categories:
+ *
+ * 1. A left-to-right search where the left half of the read is
+ *    constrained to match exactly and the right half is subject to
+ *    some looser constraint (e.g. 1mm or 2mm)
+ * 2. Same as 1, but going right to left with the exact matching half
+ *    on the right.
+ * 3. Inside-out search where the center half of the read is
+ *    constrained to match exactly, and the extreme quarters of the
+ *    read are subject to a looser constraint.
+ *
+ * Values start at 1 so that 0 is never a valid seed type.
+ */
+enum {
+	SEED_TYPE_EXACT = 1,
+	SEED_TYPE_LEFT_TO_RIGHT,
+	SEED_TYPE_RIGHT_TO_LEFT,
+	SEED_TYPE_INSIDE_OUT
+};
+
+struct InstantiatedSeed;
+
+/**
+ * Policy dictating how to size and arrange seeds along the length of
+ * the read, and what constraints to force on the zones of the seed.
+ * We assume that seeds are plopped down at regular intervals from the
+ * 5' to 3' ends, with the first seed flush to the 5' end.
+ *
+ * If the read is shorter than a single seed, one seed is used and it
+ * is shrunk to accommodate the read.
+ */
+struct Seed {
+
+	int len;             // length of a seed
+	int type;            // dictates anchor portion, direction of search
+	Constraint *overall; // for the overall alignment
+
+	Seed() { init(0, 0, NULL); }
+
+	/**
+	 * Construct and initialize this seed with given length and type.
+	 */
+	Seed(int ln, int ty, Constraint* oc) {
+		init(ln, ty, oc);
+	}
+
+	/**
+	 * Initialize this seed with given length, type and overall
+	 * constraint (pointer is stored, not copied).
+	 */
+	void init(int ln, int ty, Constraint* oc) {
+		len = ln;
+		type = ty;
+		overall = oc;
+	}
+	
+	// If the seed is split into halves, we just use zones[0] and
+	// zones[1]; 0 is the near half and 1 is the far half.  If the seed
+	// is split into thirds (i.e. inside-out) then 0 is the center, 1
+	// is the far portion on the left, and 2 is the far portion on the
+	// right.
+	Constraint zones[3];
+
+	/**
+	 * Once the constrained seed is completely explored, call this
+	 * function to check whether there were *at least* as many
+	 * dissimilarities as required by all constraints.  Bounds like this
+	 * are helpful to resolve instances where two search roots would
+	 * otherwise overlap in what alignments they can find.
+	 */
+	bool acceptable() {
+		assert(overall != NULL);
+		return zones[0].acceptable() &&
+		       zones[1].acceptable() &&
+		       zones[2].acceptable() &&
+		       overall->acceptable();
+	}
+
+	/**
+	 * Given a read, depth and orientation, extract a seed data structure
+	 * from the read and fill in the steps & zones arrays.  The Seed
+	 * contains the sequence and quality values.  Returns false if the
+	 * arrangement of Ns disqualifies the seed (see .cpp).
+	 */
+	bool instantiate(
+		const Read& read,
+		const BTDnaString& seq, // already-extracted seed sequence
+		const BTString& qual,   // already-extracted seed quality sequence
+		const Scoring& pens,
+		int depth,
+		int seedoffidx,
+		int seedtypeidx,
+		bool fw,
+		InstantiatedSeed& si) const;
+
+	/**
+	 * Append to 'pols' the Seed objects for an 'mms'-mismatch strategy.
+	 * Only 0, 1 and 2 mismatches are supported; anything else throws 1
+	 * (the fatal-error convention).
+	 */
+	static void mmSeeds(
+		int mms,
+		int ln,
+		EList<Seed>& pols,
+		Constraint& oall)
+	{
+		if(mms == 0) {
+			zeroMmSeeds(ln, pols, oall);
+		} else if(mms == 1) {
+			oneMmSeeds(ln, pols, oall);
+		} else if(mms == 2) {
+			twoMmSeeds(ln, pols, oall);
+		} else throw 1;
+	}
+	
+	static void zeroMmSeeds(int ln, EList<Seed>&, Constraint&);
+	static void oneMmSeeds (int ln, EList<Seed>&, Constraint&);
+	static void twoMmSeeds (int ln, EList<Seed>&, Constraint&);
+};
+
+/**
+ * An instantiated seed is a seed (perhaps modified to fit the read)
+ * plus all data needed to conduct a search of the seed.
+ */
+struct InstantiatedSeed {
+
+	InstantiatedSeed() : steps(AL_CAT), zones(AL_CAT) { }
+
+	// Steps map.  There are as many steps as there are positions in
+	// the seed.  The map is a helpful abstraction because we sometimes
+	// visit seed positions in an irregular order (e.g. inside-out
+	// search).
+	EList<int> steps;
+
+	// Zones map.  For each step, records what constraint to charge an
+	// edit to.  The first entry in each pair gives the constraint for
+	// non-insert edits and the second entry in each pair gives the
+	// constraint for insert edits.  If the value stored is negative,
+	// this indicates that the zone is "closed out" after this
+	// position, so zone acceptability should be checked.
+	EList<pair<int, int> > zones;
+
+	// Nucleotide sequence covering the seed, extracted from read
+	BTDnaString *seq;
+	
+	// Quality sequence covering the seed, extracted from read
+	BTString *qual;
+	
+	// Initial constraints governing zones 0, 1, 2.  We precalculate
+	// the effect of Ns on these.
+	Constraint cons[3];
+	
+	// Overall constraint, tailored to the read length.
+	Constraint overall;
+	
+	// Maximum number of positions that the aligner may advance before
+	// its first step.  This lets the aligner know whether it can use
+	// the ftab or not.
+	int maxjump;
+	
+	// Offset of seed from 5' end of read
+	int seedoff;
+
+	// Id for seed offset; ids are such that the smallest index is the
+	// closest to the 5' end and consecutive ids are adjacent (i.e.
+	// there are no intervening offsets with seeds)
+	int seedoffidx;
+	
+	// Type of seed (left-to-right, etc)
+	int seedtypeidx;
+	
+	// Seed comes from forward-oriented read?
+	bool fw;
+	
+	// Filtered out due to the pattern of Ns present.  If true, this
+	// seed should be ignored by searchAllSeeds().
+	bool nfiltered;
+	
+	// Seed this was instantiated from
+	Seed s;
+	
+#ifndef NDEBUG
+	/**
+	 * Check that InstantiatedSeed is internally consistent.
+	 * (Currently a no-op placeholder.)
+	 */
+	bool repOk() const {
+		return true;
+	}
+#endif
+};
+
+/**
+ * Simple struct for holding a end-to-end alignments for the read with at most
+ * 2 edits.  'top'/'bot' delimit the hit's BW (suffix-array) range; 'e1'
+ * and 'e2' hold up to two edits; 'score' orders hits (higher = better).
+ */
+template <typename index_t>
+struct EEHit {
+	
+	EEHit() { reset(); }
+	
+	void reset() {
+		top = bot = 0;
+		fw = false;
+		e1.reset();
+		e2.reset();
+		score = MIN_I64;
+	}
+	
+	void init(
+		index_t top_,
+		index_t bot_,
+		const Edit* e1_,
+		const Edit* e2_,
+		bool fw_,
+		int64_t score_)
+	{
+		top = top_; bot = bot_;
+		if(e1_ != NULL) {
+			e1 = *e1_;
+		} else {
+			e1.reset();
+		}
+		if(e2_ != NULL) {
+			e2 = *e2_;
+		} else {
+			e2.reset();
+		}
+		fw = fw_;
+		score = score_;
+	}
+	
+	/**
+	 * Return number of mismatches in the alignment.  Edits are filled
+	 * in order, so e2 inited implies e1 inited.
+	 */
+	int mms() const {
+		if     (e2.inited()) return 2;
+		else if(e1.inited()) return 1;
+		else                 return 0;
+	}
+	
+	/**
+	 * Return the number of Ns involved in the alignment.
+	 *
+	 * NOTE(review): because the e2 check is nested inside the e1 check,
+	 * an N in e2 is counted only when e1 also has an N; confirm this is
+	 * intended (refns() below has the same shape).
+	 */
+	int ns() const {
+		int ns = 0;
+		if(e1.inited() && e1.hasN()) {
+			ns++;
+			if(e2.inited() && e2.hasN()) {
+				ns++;
+			}
+		}
+		return ns;
+	}
+
+	/**
+	 * Return the number of reference Ns involved in the alignment.
+	 */
+	int refns() const {
+		int ns = 0;
+		if(e1.inited() && e1.chr == 'N') {
+			ns++;
+			if(e2.inited() && e2.chr == 'N') {
+				ns++;
+			}
+		}
+		return ns;
+	}
+	
+	/**
+	 * Return true iff there is no hit.
+	 */
+	bool empty() const {
+		return bot <= top;
+	}
+	
+	/**
+	 * Higher score = higher priority.
+	 */
+	bool operator<(const EEHit& o) const {
+		return score > o.score;
+	}
+	
+	/**
+	 * Return the size of the alignment's SA range.
+	 */
+	index_t size() const { return bot - top; }
+	
+#ifndef NDEBUG
+	/**
+	 * Check that hit is sane w/r/t read.
+	 */
+	bool repOk(const Read& rd) const {
+		assert_gt(bot, top);
+		if(e1.inited()) {
+			assert_lt(e1.pos, rd.length());
+			if(e2.inited()) {
+				assert_lt(e2.pos, rd.length());
+			}
+		}
+		return true;
+	}
+#endif
+	
+	index_t top;
+	index_t bot;
+	Edit     e1;
+	Edit     e2;
+	bool     fw;
+	int64_t  score;
+};
+
+/**
+ * Data structure for holding all of the seed hits associated with a read.  All
+ * the seed hits for a given read are encapsulated in a single QVal object.  A
+ * QVal refers to a range of values in the qlist, where each qlist value is a 
+ * BW range and a slot to hold the hit's suffix array offset.  QVals are kept
+ * in two lists (hitsFw_ and hitsRc_), one for seeds on the forward read strand,
+ * one for seeds on the reverse read strand.  The list is indexed by read
+ * offset index (e.g. 0=closest-to-5', 1=second-closest, etc).
+ *
+ * An assumption behind this data structure is that all the seeds are found
+ * first, then downstream analyses try to extend them.  In between finding the
+ * seed hits and extending them, the sort() member function is called, which
+ * ranks QVals according to the order they should be extended.  Right now the
+ * policy is that QVals with fewer elements (hits) should be tried first.
+ */
+template <typename index_t>
+class SeedResults {
+
+public:
+	/**
+	 * Construct an empty SeedResults: tag all lists with the AL_CAT
+	 * memory category, then put the object in the cleared state.
+	 */
+	SeedResults() :
+		seqFw_(AL_CAT),
+		seqRc_(AL_CAT),
+		qualFw_(AL_CAT),
+		qualRc_(AL_CAT),
+		hitsFw_(AL_CAT),
+		hitsRc_(AL_CAT),
+		isFw_(AL_CAT),
+		isRc_(AL_CAT),
+		sortedFw_(AL_CAT),
+		sortedRc_(AL_CAT),
+		offIdx2off_(AL_CAT),
+		rankOffs_(AL_CAT),
+		rankFws_(AL_CAT),
+		mm1Hit_(AL_CAT)
+	{
+		clear();
+	}
+	
+	/**
+	 * Set the current read (pointer is stored; caller keeps ownership).
+	 */
+	void nextRead(const Read& read) {
+		read_ = &read;
+	}
+
+	/**
+	 * Set the appropriate element of either hitsFw_ or hitsRc_ to the given
+	 * QVal.  A QVal encapsulates all the BW ranges for reference substrings 
+	 * that are within some distance of the seed string.  An empty QVal is
+	 * dropped without touching any counters.  Each (strand, seedIdx) slot
+	 * may be set at most once (asserted).
+	 */
+	void add(
+		const   QVal<index_t>& qv,  // range of ranges in cache
+		const   AlignmentCache<index_t>& ac, // cache
+		index_t seedIdx,            // seed index (from 5' end)
+		bool    seedFw)             // whether seed is from forward read
+	{
+		assert(qv.repOk(ac));
+		assert(repOk(&ac));
+		assert_lt(seedIdx, hitsFw_.size());
+		assert_gt(numOffs_, 0); // if this fails, probably failed to call reset
+		if(qv.empty()) return;
+		if(seedFw) {
+			assert(!hitsFw_[seedIdx].valid());
+			hitsFw_[seedIdx] = qv;
+			numEltsFw_ += qv.numElts();
+			numRangesFw_ += qv.numRanges();
+			if(qv.numRanges() > 0) nonzFw_++;
+		} else {
+			assert(!hitsRc_[seedIdx].valid());
+			hitsRc_[seedIdx] = qv;
+			numEltsRc_ += qv.numElts();
+			numRangesRc_ += qv.numRanges();
+			if(qv.numRanges() > 0) nonzRc_++;
+		}
+		// Update strand-agnostic totals
+		numElts_ += qv.numElts();
+		numRanges_ += qv.numRanges();
+		if(qv.numRanges() > 0) {
+			nonzTot_++;
+		}
+		assert(repOk(&ac));
+	}
+
+	/**
+	 * Clear buffered seed hits and state.  Set the number of seed
+	 * offsets and the read.
+	 *
+	 * read:       read the seeds were extracted from
+	 * offIdx2off: map from seed-offset index to offset from 5' end
+	 * numOffs:    number of seed offsets (must be > 0)
+	 */
+	void reset(
+		const Read& read,
+		const EList<index_t>& offIdx2off,
+		size_t numOffs)
+	{
+		assert_gt(numOffs, 0);
+		clearSeeds();
+		numOffs_ = numOffs;
+		seqFw_.resize(numOffs_);
+		seqRc_.resize(numOffs_);
+		qualFw_.resize(numOffs_);
+		qualRc_.resize(numOffs_);
+		hitsFw_.resize(numOffs_);
+		hitsRc_.resize(numOffs_);
+		isFw_.resize(numOffs_);
+		isRc_.resize(numOffs_);
+		sortedFw_.resize(numOffs_);
+		sortedRc_.resize(numOffs_);
+		offIdx2off_ = offIdx2off;
+		for(size_t i = 0; i < numOffs_; i++) {
+			sortedFw_[i] = sortedRc_[i] = false;
+			hitsFw_[i].reset();
+			hitsRc_[i].reset();
+			isFw_[i].clear();
+			isRc_[i].clear();
+		}
+		read_ = &read;
+		sorted_ = false;
+	}
+	
+	/**
+	 * Clear seed-hit state: empty all per-offset lists and zero all
+	 * counters.
+	 *
+	 * NOTE(review): qualFw_/qualRc_ are not cleared here even though
+	 * seqFw_/seqRc_ are; presumably harmless because reset() resizes
+	 * them anyway — confirm.
+	 */
+	void clearSeeds() {
+		sortedFw_.clear();
+		sortedRc_.clear();
+		rankOffs_.clear();
+		rankFws_.clear();
+		offIdx2off_.clear();
+		hitsFw_.clear();
+		hitsRc_.clear();
+		isFw_.clear();
+		isRc_.clear();
+		seqFw_.clear();
+		seqRc_.clear();
+		nonzTot_ = 0;
+		nonzFw_ = 0;
+		nonzRc_ = 0;
+		numOffs_ = 0;
+		numRanges_ = 0;
+		numElts_ = 0;
+		numRangesFw_ = 0;
+		numEltsFw_ = 0;
+		numRangesRc_ = 0;
+		numEltsRc_ = 0;
+	}
+	
+	/**
+	 * Clear seed-hit state and end-to-end alignment state (exact and
+	 * 1-mismatch hits), leaving the object as if just constructed.
+	 */
+	void clear() {
+		clearSeeds();
+		read_ = NULL;
+		exactFwHit_.reset();
+		exactRcHit_.reset();
+		mm1Hit_.clear();
+		mm1Sorted_ = false;
+		mm1Elt_ = 0;
+		assert(empty());
+	}
+	
+	/**
+	 * Return average number of hits per seed (total elements divided by
+	 * the number of seeds with at least one range).
+	 *
+	 * NOTE(review): no guard for nonzTot_ == 0; that case yields 0/0
+	 * (NaN).  Presumably callers check empty() first — confirm.
+	 */
+	float averageHitsPerSeed() const {
+		return (float)numElts_ / (float)nonzTot_;
+	}
+	
+	/**
+	 * Return the median of the non-zero per-seed hit counts, i.e. the
+	 * median number of BW elements over all valid, non-empty QVals on
+	 * either strand.  Returns 0 if there are no such QVals.
+	 */
+	float medianHitsPerSeed() const {
+		// tmpMedian_ is scratch space; const_cast lets this const
+		// accessor reuse it without reallocating.
+		EList<size_t>& median = const_cast<EList<size_t>&>(tmpMedian_);
+		median.clear();
+		for(size_t i = 0; i < numOffs_; i++) {
+			if(hitsFw_[i].valid() && hitsFw_[i].numElts() > 0) {
+				median.push_back(hitsFw_[i].numElts());
+			}
+			if(hitsRc_[i].valid() && hitsRc_[i].numElts() > 0) {
+				median.push_back(hitsRc_[i].numElts());
+			}
+		}
+		if(median.empty()) {
+			return 0.0f;
+		}
+		median.sort();
+		float med1 = (float)median[median.size() >> 1];
+		float med2 = med1;
+		if((median.size() & 1) == 0) {
+			// Even-sized list: average the two middle elements
+			med2 = (float)median[(median.size() >> 1) - 1];
+		}
+		// BUG FIX: was 'med1 + med2 * 0.5f', which (by precedence)
+		// returns 1.5x the median for odd-sized lists instead of the
+		// median itself.  The intended value is the mean of the two
+		// middle elements.
+		return (med1 + med2) * 0.5f;
+	}
+	
+	/**
+	 * Return a number that's meant to quantify how hopeful we are that this
+	 * set of seed hits will lead to good alignments.  Each valid QVal
+	 * contributes 1/nelt^2, so rarer (more unique) seed hits dominate.
+	 *
+	 * NOTE(review): a valid QVal with numElts() == 0 would cause a
+	 * divide-by-zero here — confirm valid implies non-empty.
+	 */
+	double uniquenessFactor() const {
+		double result = 0.0;
+		for(size_t i = 0; i < numOffs_; i++) {
+			if(hitsFw_[i].valid()) {
+				size_t nelt = hitsFw_[i].numElts();
+				result += (1.0 / (double)(nelt * nelt));
+			}
+			if(hitsRc_[i].valid()) {
+				size_t nelt = hitsRc_[i].numElts();
+				result += (1.0 / (double)(nelt * nelt));
+			}
+		}
+		return result;
+	}
+
+	/**
+	 * Return the number of ranges being held.
+	 */
+	index_t numRanges() const { return numRanges_; }
+
+	/**
+	 * Return the number of elements being held.
+	 */
+	index_t numElts() const { return numElts_; }
+
+	/**
+	 * Return the number of ranges being held for seeds on the forward
+	 * read strand.
+	 */
+	index_t numRangesFw() const { return numRangesFw_; }
+
+	/**
+	 * Return the number of elements being held for seeds on the
+	 * forward read strand.
+	 */
+	index_t numEltsFw() const { return numEltsFw_; }
+
+	/**
+	 * Return the number of ranges being held for seeds on the
+	 * reverse-complement read strand.
+	 */
+	index_t numRangesRc() const { return numRangesRc_; }
+
+	/**
+	 * Return the number of elements being held for seeds on the
+	 * reverse-complement read strand.
+	 */
+	index_t numEltsRc() const { return numEltsRc_; }
+	
+	/**
+	 * Given an offset index, return the offset that has that index.
+	 * (Note: the parameter is an offset *index*, despite its name.)
+	 */
+	index_t idx2off(size_t off) const {
+		return offIdx2off_[off];
+	}
+	
+	/**
+	 * Return true iff there are 0 hits being held.
+	 */
+	bool empty() const { return numRanges() == 0; }
+	
+	/**
+	 * Get the QVal representing all the reference hits for the given
+	 * orientation and seed offset index.
+	 */
+	const QVal<index_t>& hitsAtOffIdx(bool fw, size_t seedoffidx) const {
+		assert_lt(seedoffidx, numOffs_);
+		assert(repOk(NULL));
+		return fw ? hitsFw_[seedoffidx] : hitsRc_[seedoffidx];
+	}
+
+	/**
+	 * Get the Instantiated seeds for the given orientation and offset.
+	 */
+	EList<InstantiatedSeed>& instantiatedSeeds(bool fw, size_t seedoffidx) {
+		assert_lt(seedoffidx, numOffs_);
+		assert(repOk(NULL));
+		return fw ? isFw_[seedoffidx] : isRc_[seedoffidx];
+	}
+	
+	/**
+	 * Return the number of different seed offsets possible.
+	 */
+	index_t numOffs() const { return numOffs_; }
+	
+	/**
+	 * Return the read from which seeds were extracted, aligned.
+	 */
+	const Read& read() const { return *read_; }
+	
+#ifndef NDEBUG
+	/**
+	 * Check that this SeedResults is internally consistent: per-offset
+	 * list sizes match numOffs_, the cached tallies agree with a fresh
+	 * recount, and (when a cache is given) each valid QVal is
+	 * consistent with it.
+	 */
+	bool repOk(
+		const AlignmentCache<index_t>* ac,
+		bool requireInited = false) const
+	{
+		if(requireInited) {
+			assert(read_ != NULL);
+		}
+		if(numOffs_ > 0) {
+			assert_eq(numOffs_, hitsFw_.size());
+			assert_eq(numOffs_, hitsRc_.size());
+			assert_leq(numRanges_, numElts_);
+			assert_leq(nonzTot_, numRanges_);
+			// Recount offsets with >= 1 range and compare to nonzTot_
+			size_t nonzs = 0;
+			for(int fw = 0; fw <= 1; fw++) {
+				const EList<QVal<index_t> >& rrs = (fw ? hitsFw_ : hitsRc_);
+				for(size_t i = 0; i < numOffs_; i++) {
+					if(rrs[i].valid()) {
+						if(rrs[i].numRanges() > 0) nonzs++;
+						if(ac != NULL) {
+							assert(rrs[i].repOk(*ac));
+						}
+					}
+				}
+			}
+			assert_eq(nonzs, nonzTot_);
+			// After ranking, the rank lists must cover every non-empty offset
+			assert(!sorted_ || nonzTot_ == rankFws_.size());
+			assert(!sorted_ || nonzTot_ == rankOffs_.size());
+		}
+		return true;
+	}
+#endif
+	
+	/**
+	 * Populate rankOffs_ and rankFws_ with the list of QVals that need to be
+	 * examined for this SeedResults, in order.  The order is ascending by
+	 * number of elements, so QVals with fewer elements (i.e. seed sequences
+	 * that are more unique) will be tried first and QVals with more elements
+	 * (i.e. seed sequences that are more repetitive) will be tried later.
+	 * Ties are broken pseudo-randomly via 'rnd': the strand scan order and
+	 * the per-pass starting offset are both randomized.
+	 */
+	void rankSeedHits(RandomSource& rnd) {
+		// Selection sort: each pass appends the not-yet-ranked non-empty
+		// QVal with the fewest BW elements
+		while(rankOffs_.size() < nonzTot_) {
+			index_t minsz = (index_t)0xffffffff;
+			index_t minidx = 0;
+			bool minfw = true;
+			// Rank seed-hit positions in ascending order by number of elements
+			// in all BW ranges
+			bool rb = rnd.nextBool();
+			assert(rb == 0 || rb == 1);
+			for(int fwi = 0; fwi <= 1; fwi++) {
+				bool fw = (fwi == (rb ? 1 : 0));
+				EList<QVal<index_t> >& rrs = (fw ? hitsFw_ : hitsRc_);
+				EList<bool>& sorted = (fw ? sortedFw_ : sortedRc_);
+				// Start scanning at a random offset so equal-size QVals
+				// are selected in random order
+				index_t i = (rnd.nextU32() % (index_t)numOffs_);
+				for(index_t ii = 0; ii < numOffs_; ii++) {
+					if(rrs[i].valid() &&         // valid QVal
+					   rrs[i].numElts() > 0 &&   // non-empty
+					   !sorted[i] &&             // not already sorted
+					   rrs[i].numElts() < minsz) // least elts so far?
+					{
+						minsz = rrs[i].numElts();
+						minidx = i;
+						minfw = (fw == 1);
+					}
+					if((++i) == numOffs_) {
+						i = 0; // wrap around
+					}
+				}
+			}
+			assert_neq((index_t)0xffffffff, minsz);
+			// Mark the winner as ranked so later passes skip it
+			if(minfw) {
+				sortedFw_[minidx] = true;
+			} else {
+				sortedRc_[minidx] = true;
+			}
+			rankOffs_.push_back(minidx);
+			rankFws_.push_back(minfw);
+		}
+		assert_eq(rankOffs_.size(), rankFws_.size());
+		sorted_ = true;
+	}
+
+	/**
+	 * Return the number of orientation/offsets into the read that have
+	 * at least one seed hit.
+	 */
+	size_t nonzeroOffsets() const {
+		assert(!sorted_ || nonzTot_ == rankFws_.size());
+		assert(!sorted_ || nonzTot_ == rankOffs_.size());
+		return nonzTot_;
+	}
+	
+	/**
+	 * Return true iff all seeds hit for forward read.
+	 */
+	bool allFwSeedsHit() const {
+		return nonzFw_ == numOffs();
+	}
+
+	/**
+	 * Return true iff all seeds hit for revcomp read.
+	 */
+	bool allRcSeedsHit() const {
+		return nonzRc_ == numOffs();
+	}
+	
+	/**
+	 * Return the minimum number of edits that an end-to-end alignment of the
+	 * fw read could have.  Uses knowledge of how many seeds have exact hits
+	 * and how the seeds overlap.
+	 */
+	index_t fewestEditsEE(bool fw, int seedlen, int per) const {
+		assert_gt(seedlen, 0);
+		assert_gt(per, 0);
+		index_t nonz = fw ? nonzFw_ : nonzRc_;
+		if(nonz < numOffs()) {
+			// A single edit can knock out at most 'maxdepth' overlapping
+			// seeds, so 'missing' empty offsets imply >= ceil(missing /
+			// maxdepth) edits
+			int maxdepth = (seedlen + per - 1) / per;
+			int missing = (int)(numOffs() - nonz);
+			return (missing + maxdepth - 1) / maxdepth;
+		} else {
+			// Exact hit is possible (not guaranteed)
+			return 0;
+		}
+	}
+
+	/**
+	 * Return the number of offsets into the forward read that have at
+	 * least one seed hit.
+	 */
+	index_t nonzeroOffsetsFw() const {
+		return nonzFw_;
+	}
+	
+	/**
+	 * Return the number of offsets into the reverse-complement read
+	 * that have at least one seed hit.
+	 */
+	index_t nonzeroOffsetsRc() const {
+		return nonzRc_;
+	}
+
+	/**
+	 * Return a QVal of seed hits of the given rank 'r'.  'offidx' gets the id
+	 * of the offset from 5' from which it was extracted (0 for the 5'-most
+	 * offset, 1 for the next closest to 5', etc).  'off' gets the offset from
+	 * the 5' end.  'fw' gets true iff the seed was extracted from the forward
+	 * read.  'seedlen' gets the length of the seed sequence.  Requires that
+	 * rankSeedHits() has already been called (sorted_ == true).
+	 */
+	const QVal<index_t>& hitsByRank(
+		index_t  r,       // in
+		index_t& offidx,  // out
+		index_t& off,     // out
+		bool&    fw,      // out
+		index_t& seedlen) // out
+	{
+		assert(sorted_);
+		assert_lt(r, nonzTot_);
+		if(rankFws_[r]) {
+			fw = true;
+			offidx = rankOffs_[r];
+			assert_lt(offidx, offIdx2off_.size());
+			off = offIdx2off_[offidx];
+			seedlen = (index_t)seqFw_[rankOffs_[r]].length();
+			return hitsFw_[rankOffs_[r]];
+		} else {
+			fw = false;
+			offidx = rankOffs_[r];
+			assert_lt(offidx, offIdx2off_.size());
+			off = offIdx2off_[offidx];
+			seedlen = (index_t)seqRc_[rankOffs_[r]].length();
+			return hitsRc_[rankOffs_[r]];
+		}
+	}
+
+	/**
+	 * Return the seed sequence of the given rank.
+	 */
+	const BTDnaString& seqByRank(index_t r) {
+		assert(sorted_);
+		assert_lt(r, nonzTot_);
+		return rankFws_[r] ? seqFw_[rankOffs_[r]] : seqRc_[rankOffs_[r]];
+	}
+
+	/**
+	 * Return the seed quality string of the given rank.
+	 */
+	const BTString& qualByRank(index_t r) {
+		assert(sorted_);
+		assert_lt(r, nonzTot_);
+		return rankFws_[r] ? qualFw_[rankOffs_[r]] : qualRc_[rankOffs_[r]];
+	}
+	
+	/**
+	 * Return the list of extracted seed sequences for seeds on either
+	 * the forward or reverse strand.
+	 */
+	EList<BTDnaString>& seqs(bool fw) { return fw ? seqFw_ : seqRc_; }
+
+	/**
+	 * Return the list of extracted quality sequences for seeds on
+	 * either the forward or reverse strand.
+	 */
+	EList<BTString>& quals(bool fw) { return fw ? qualFw_ : qualRc_; }
+
+	/**
+	 * Return exact end-to-end alignment of fw read.
+	 */
+	EEHit<index_t> exactFwEEHit() const { return exactFwHit_; }
+
+	/**
+	 * Return exact end-to-end alignment of rc read.
+	 */
+	EEHit<index_t> exactRcEEHit() const { return exactRcHit_; }
+	
+	/**
+	 * Return const ref to list of 1-mismatch end-to-end alignments.
+	 */
+	const EList<EEHit<index_t> >& mm1EEHits() const { return mm1Hit_; }
+    
+	/**
+	 * Sort the end-to-end 1-mismatch alignments, prioritizing by score (higher
+	 * score = higher priority).  Runs of equal-scoring hits are shuffled
+	 * pseudo-randomly so that ties are broken without bias.
+	 */
+	void sort1mmEe(RandomSource& rnd) {
+		assert(!mm1Sorted_);
+		mm1Hit_.sort();
+		// Walk the sorted list tracking the length of the current run of
+		// equal scores; shuffle each run of length > 1 when it ends
+		size_t streak = 0;
+		for(size_t i = 1; i < mm1Hit_.size(); i++) {
+			if(mm1Hit_[i].score == mm1Hit_[i-1].score) {
+				if(streak == 0) { streak = 1; }
+				streak++;
+			} else {
+				if(streak > 1) {
+					assert_geq(i, streak);
+					mm1Hit_.shufflePortion(i-streak, streak, rnd);
+				}
+				streak = 0;
+			}
+		}
+		// Handle a run that extends to the end of the list
+		if(streak > 1) {
+			mm1Hit_.shufflePortion(mm1Hit_.size() - streak, streak, rnd);
+		}
+		mm1Sorted_ = true;
+	}
+	
+	/**
+	 * Add an end-to-end 1-mismatch alignment.  The hit covers BW rows
+	 * [top, bot); mm1Elt_ is bumped by the number of rows.
+	 */
+	void add1mmEe(
+		index_t top,
+		index_t bot,
+		const Edit* e1,
+		const Edit* e2,
+		bool fw,
+		int64_t score)
+	{
+		mm1Hit_.expand();
+		mm1Hit_.back().init(top, bot, e1, e2, fw, score);
+		mm1Elt_ += (bot - top);
+	}
+
+	/**
+	 * Add an end-to-end exact alignment for the forward read.
+	 */
+	void addExactEeFw(
+		index_t top,
+		index_t bot,
+		const Edit* e1,
+		const Edit* e2,
+		bool fw,
+		int64_t score)
+	{
+		exactFwHit_.init(top, bot, e1, e2, fw, score);
+	}
+
+	/**
+	 * Add an end-to-end exact alignment for the reverse-complement read.
+	 */
+	void addExactEeRc(
+		index_t top,
+		index_t bot,
+		const Edit* e1,
+		const Edit* e2,
+		bool fw,
+		int64_t score)
+	{
+		exactRcHit_.init(top, bot, e1, e2, fw, score);
+	}
+	
+	/**
+	 * Clear out the end-to-end exact alignments.
+	 */
+	void clearExactE2eHits() {
+		exactFwHit_.reset();
+		exactRcHit_.reset();
+	}
+	
+	/**
+	 * Clear out the end-to-end 1-mismatch alignments.
+	 */
+	void clear1mmE2eHits() {
+		mm1Hit_.clear();     // 1-mismatch end-to-end hits
+		mm1Elt_ = 0;         // number of 1-mismatch hit rows
+		mm1Sorted_ = false;  // true iff we've sorted the mm1Hit_ list
+	}
+
+	/**
+	 * Return the number of distinct exact and 1-mismatch end-to-end hits
+	 * found.
+	 */
+	index_t numE2eHits() const {
+		return (index_t)(exactFwHit_.size() + exactRcHit_.size() + mm1Elt_);
+	}
+
+	/**
+	 * Return the number of distinct exact end-to-end hits found.
+	 */
+	index_t numExactE2eHits() const {
+		return (index_t)(exactFwHit_.size() + exactRcHit_.size());
+	}
+
+	/**
+	 * Return the number of distinct 1-mismatch end-to-end hits found.
+	 */
+	index_t num1mmE2eHits() const {
+		return mm1Elt_;
+	}
+	
+	/**
+	 * Return the length of the read that yielded the seed hits.
+	 */
+	index_t readLength() const {
+		assert(read_ != NULL);
+		return read_->length();
+	}
+
+protected:
+
+	// As seed hits and edits are added they're sorted into these
+	// containers
+	EList<BTDnaString>  seqFw_;       // seqs for seeds from forward read
+	EList<BTDnaString>  seqRc_;       // seqs for seeds from revcomp read
+	EList<BTString>     qualFw_;      // quals for seeds from forward read
+	EList<BTString>     qualRc_;      // quals for seeds from revcomp read
+	EList<QVal<index_t> >         hitsFw_;      // hits for forward read
+	EList<QVal<index_t> >         hitsRc_;      // hits for revcomp read
+	EList<EList<InstantiatedSeed> > isFw_; // instantiated seeds for forward read
+	EList<EList<InstantiatedSeed> > isRc_; // instantiated seeds for revcomp read
+	EList<bool>         sortedFw_;    // true iff fw QVal was sorted/ranked
+	EList<bool>         sortedRc_;    // true iff rc QVal was sorted/ranked
+	index_t             nonzTot_;     // # offsets with non-zero size
+	index_t             nonzFw_;      // # offsets into fw read with non-0 size
+	index_t             nonzRc_;      // # offsets into rc read with non-0 size
+	index_t             numRanges_;   // # ranges added
+	index_t             numElts_;     // # elements added
+	index_t             numRangesFw_; // # ranges added for fw seeds
+	index_t             numEltsFw_;   // # elements added for fw seeds
+	index_t             numRangesRc_; // # ranges added for rc seeds
+	index_t             numEltsRc_;   // # elements added for rc seeds
+
+	EList<index_t>      offIdx2off_;// map from offset indexes to offsets from 5' end
+
+	// When the sort routine is called, the seed hits collected so far
+	// are sorted into another set of containers that allow easy access
+	// to hits from the lowest-ranked offset (the one with the fewest
+	// BW elements) to the greatest-ranked offset.  Offsets with 0 hits
+	// are ignored.
+	EList<index_t>      rankOffs_;  // sorted offsets of seeds to try
+	EList<bool>         rankFws_;   // sorted orientations assoc. with rankOffs_
+	bool                sorted_;    // true if sort() called since last reset
+	
+	// These fields set once per read
+	index_t             numOffs_;   // # different seed offsets possible
+	const Read*         read_;      // read from which seeds were extracted
+	
+	EEHit<index_t>      exactFwHit_; // end-to-end exact hit for fw read
+	EEHit<index_t>      exactRcHit_; // end-to-end exact hit for rc read
+	EList<EEHit<index_t> > mm1Hit_;     // 1-mismatch end-to-end hits
+	index_t             mm1Elt_;     // number of 1-mismatch hit rows
+	bool                mm1Sorted_;  // true iff we've sorted the mm1Hit_ list
+    
+	EList<size_t> tmpMedian_; // temporary storage for calculating median
+};
+
+
+// Forward decl
+template <typename index_t> class Ebwt;
+template <typename index_t> struct SideLocus;
+
+/**
+ * Encapsulates a summary of what the searchAllSeeds aligner did.
+ */
+struct SeedSearchMetrics {
+
+	SeedSearchMetrics() : mutex_m() {
+	    reset();
+	}
+
+	/**
+	 * Merge this metrics object with the given object, i.e., sum each
+	 * category.  This is the only safe way to update a
+	 * SeedSearchMetrics object shared by multiple threads.
+	 */
+	void merge(const SeedSearchMetrics& m, bool getLock = false) {
+        ThreadSafe ts(&mutex_m, getLock);
+		seedsearch   += m.seedsearch;
+		possearch    += m.possearch;
+		intrahit     += m.intrahit;
+		interhit     += m.interhit;
+		filteredseed += m.filteredseed;
+		ooms         += m.ooms;
+		bwops        += m.bwops;
+		bweds        += m.bweds;
+		bestmin0     += m.bestmin0;
+		bestmin1     += m.bestmin1;
+		bestmin2     += m.bestmin2;
+	}
+	
+	/**
+	 * Set all counters to 0.  Not synchronized; call only when no other
+	 * thread is merging into this object.
+	 */
+	void reset() {
+		seedsearch =
+		possearch =
+		intrahit =
+		interhit =
+		filteredseed =
+		ooms =
+		bwops =
+		bweds =
+		bestmin0 =
+		bestmin1 =
+		bestmin2 = 0;
+	}
+
+	uint64_t seedsearch;   // # times we executed strategy in InstantiatedSeed
+	uint64_t possearch;    // # offsets where aligner executed >= 1 strategy
+	uint64_t intrahit;     // # offsets where current-read cache gave answer
+	uint64_t interhit;     // # offsets where across-read cache gave answer
+	uint64_t filteredseed; // # seed instantiations skipped due to Ns
+	uint64_t ooms;         // out-of-memory errors
+	uint64_t bwops;        // Burrows-Wheeler operations
+	uint64_t bweds;        // Burrows-Wheeler edits
+	uint64_t bestmin0;     // # times the best min # edits was 0
+	uint64_t bestmin1;     // # times the best min # edits was 1
+	uint64_t bestmin2;     // # times the best min # edits was 2
+	MUTEX_T  mutex_m;      // guards merge()
+};
+
+/**
+ * Given an index and a seeding scheme, searches for seed hits.
+ */
+template <typename index_t>
+class SeedAligner {
+
+public:
+	
+	/**
+	 * Initialize with index.
+	 */
+	SeedAligner() : edits_(AL_CAT), offIdx2off_(AL_CAT) { }
+
+	/**
+	 * Given a read and a few coordinates that describe a substring of the
+	 * read (or its reverse complement), fill in 'seq' and 'qual' objects
+	 * with the seed sequence and qualities.
+	 */
+	void instantiateSeq(
+		const Read& read, // input read
+		BTDnaString& seq, // output sequence
+		BTString& qual,   // output qualities
+		int len,          // seed length
+		int depth,        // seed's 0-based offset from 5' end
+		bool fw) const;   // seed's orientation
+
+	/**
+	 * Iterate through the seeds that cover the read and initiate a
+	 * search for each seed.  Returns (# seeds needing alignment,
+	 * # seeds answered from the cache).
+	 */
+	std::pair<int, int> instantiateSeeds(
+		const EList<Seed>& seeds,   // search seeds
+		index_t off,                // offset into read to start extracting
+		int per,                    // interval between seeds
+		const Read& read,           // read to align
+		const Scoring& pens,        // scoring scheme
+		bool nofw,                  // don't align forward read
+		bool norc,                  // don't align revcomp read
+		AlignmentCacheIface<index_t>& cache, // holds some seed hits from previous reads
+		SeedResults<index_t>& sr,   // holds all the seed hits
+		SeedSearchMetrics& met);    // metrics
+
+	/**
+	 * Iterate through the seeds that cover the read and initiate a
+	 * search for each seed.
+	 */
+	void searchAllSeeds(
+		const EList<Seed>& seeds,     // search seeds
+		const Ebwt<index_t>* ebwtFw,  // BWT index
+		const Ebwt<index_t>* ebwtBw,  // BWT' index
+		const Read& read,             // read to align
+		const Scoring& pens,          // scoring scheme
+		AlignmentCacheIface<index_t>& cache,   // local seed alignment cache
+		SeedResults<index_t>& hits,   // holds all the seed hits
+		SeedSearchMetrics& met,       // metrics
+		PerReadMetrics& prm);         // per-read metrics
+
+	/**
+	 * Sanity-check a partial alignment produced during oneMmSearch.
+	 */
+	bool sanityPartial(
+		const Ebwt<index_t>* ebwtFw, // BWT index
+		const Ebwt<index_t>* ebwtBw, // BWT' index
+		const BTDnaString&   seq,
+		index_t              dep,
+		index_t              len,
+		bool                 do1mm,
+		index_t              topfw,
+		index_t              botfw,
+		index_t              topbw,
+		index_t              botbw);
+
+	/**
+	 * Do an exact-matching sweep to establish a lower bound on number of edits
+	 * and to find exact alignments.
+	 */
+	size_t exactSweep(
+		const Ebwt<index_t>&  ebwt,    // BWT index
+		const Read&           read,    // read to align
+		const Scoring&        sc,      // scoring scheme
+		bool                  nofw,    // don't align forward read
+		bool                  norc,    // don't align revcomp read
+		size_t                mineMax, // don't care about edit bounds > this
+		size_t&               mineFw,  // minimum # edits for forward read
+		size_t&               mineRc,  // minimum # edits for revcomp read
+		bool                  repex,   // report 0mm hits?
+		SeedResults<index_t>& hits,    // holds all the seed hits (and exact hit)
+		SeedSearchMetrics&    met);    // metrics
+
+	/**
+	 * Search for end-to-end alignments with up to 1 mismatch.
+	 */
+	bool oneMmSearch(
+		const Ebwt<index_t>*  ebwtFw, // BWT index
+		const Ebwt<index_t>*  ebwtBw, // BWT' index
+		const Read&           read,   // read to align
+		const Scoring&        sc,     // scoring
+		int64_t               minsc,  // minimum score
+		bool                  nofw,   // don't align forward read
+		bool                  norc,   // don't align revcomp read
+		bool                  local,  // 1mm hits must be legal local alignments
+		bool                  repex,  // report 0mm hits?
+		bool                  rep1mm, // report 1mm hits?
+		SeedResults<index_t>& hits,   // holds all the seed hits (and exact hit)
+		SeedSearchMetrics&    met);   // metrics
+    
+protected:
+
+	/**
+	 * Report a seed hit found by searchSeedBi(), but first try to extend it out in
+	 * either direction as far as possible without hitting any edits.  This will
+	 * allow us to prioritize the seed hits better later on.  Call reportHit() when
+	 * we're done, which actually adds the hit to the cache.  Returns result from
+	 * calling reportHit().
+	 */
+	bool extendAndReportHit(
+		index_t topf,                      // top in BWT
+		index_t botf,                      // bot in BWT
+		index_t topb,                      // top in BWT'
+		index_t botb,                      // bot in BWT'
+		index_t len,                       // length of hit
+		DoublyLinkedList<Edit> *prevEdit); // previous edit
+
+	/**
+	 * Report a seed hit found by searchSeedBi() by adding it to the cache.  Return
+	 * false if the hit could not be reported because of, e.g., cache exhaustion.
+	 */
+	bool reportHit(
+		index_t topf,         // top in BWT
+		index_t botf,         // bot in BWT
+		index_t topb,         // top in BWT'
+		index_t botb,         // bot in BWT'
+		index_t len,          // length of hit
+		DoublyLinkedList<Edit> *prevEdit);  // previous edit
+	
+	/**
+	 * Given an instantiated seed (in s_ and other fields), search
+	 */
+	bool searchSeedBi();
+	
+	/**
+	 * Main, recursive implementation of the seed search.
+	 */
+	bool searchSeedBi(
+		int step,                // depth into steps_[] array
+		int depth,               // recursion depth
+		index_t topf,            // top in BWT
+		index_t botf,            // bot in BWT
+		index_t topb,            // top in BWT'
+		index_t botb,            // bot in BWT'
+		SideLocus<index_t> tloc, // locus for top (perhaps uninitialized)
+		SideLocus<index_t> bloc, // locus for bot (perhaps uninitialized)
+		Constraint c0,           // constraints to enforce in seed zone 0
+		Constraint c1,           // constraints to enforce in seed zone 1
+		Constraint c2,           // constraints to enforce in seed zone 2
+		Constraint overall,      // overall constraints
+		DoublyLinkedList<Edit> *prevEdit);  // previous edit
+	
+	/**
+	 * Get tloc and bloc ready for the next step.
+	 */
+	inline void nextLocsBi(
+		SideLocus<index_t>& tloc,  // top locus
+		SideLocus<index_t>& bloc,  // bot locus
+		index_t topf,              // top in BWT
+		index_t botf,              // bot in BWT
+		index_t topb,              // top in BWT'
+		index_t botb,              // bot in BWT'
+		int step);                 // step to get ready for
+	
+	// Following are set in searchAllSeeds then used by searchSeed()
+	// and other protected members.
+	const Ebwt<index_t>* ebwtFw_;       // forward index (BWT)
+	const Ebwt<index_t>* ebwtBw_;       // backward/mirror index (BWT')
+	const Scoring* sc_;                 // scoring scheme
+	const InstantiatedSeed* s_;         // current instantiated seed
+	
+	const Read* read_;                  // read whose seeds are currently being aligned
+	
+	// The following are set just before a call to searchSeedBi()
+	const BTDnaString* seq_;            // sequence of current seed
+	const BTString* qual_;              // quality string for current seed
+	index_t off_;                       // offset of seed currently being searched
+	bool fw_;                           // orientation of seed currently being searched
+	
+	EList<Edit> edits_;                 // temporary place to sort edits
+	AlignmentCacheIface<index_t> *ca_;  // local alignment cache for seed alignments
+	EList<index_t> offIdx2off_;         // offset idx to read offset map, set up in instantiateSeeds()
+	uint64_t bwops_;                    // Burrows-Wheeler operations
+	uint64_t bwedits_;                  // Burrows-Wheeler edits
+	BTDnaString tmprfdnastr_;           // used in reportHit
+	
+	ASSERT_ONLY(ESet<BTDnaString> hits_); // Ref hits so far for seed being aligned
+	BTDnaString tmpdnastr_;             // scratch buffer (see sanityPartial)
+};
+
+// Initialize SideLocus objects tloc/bloc from the BW range [top, bot).
+// When the range has exactly one row, only tloc is initialized and bloc
+// is invalidated; downstream code checks bloc.valid() to pick mapLF1 vs
+// mapLF.
+#define INIT_LOCS(top, bot, tloc, bloc, e) { \
+	if(bot - top == 1) { \
+		tloc.initFromRow(top, (e).eh(), (e).ebwt()); \
+		bloc.invalidate(); \
+	} else { \
+		SideLocus<index_t>::initFromTopBot(top, bot, (e).eh(), (e).ebwt(), tloc, bloc); \
+		assert(bloc.valid()); \
+	} \
+}
+
+// Debug-only check that the total size of the four per-character ranges
+// is the same in the forward (t/b) and mirror (tp/bp) indexes.
+#define SANITY_CHECK_4TUP(t, b, tp, bp) { \
+	ASSERT_ONLY(index_t tot = (b[0]-t[0])+(b[1]-t[1])+(b[2]-t[2])+(b[3]-t[3])); \
+	ASSERT_ONLY(index_t totp = (bp[0]-tp[0])+(bp[1]-tp[1])+(bp[2]-tp[2])+(bp[3]-tp[3])); \
+	assert_eq(tot, totp); \
+}
+
+/**
+ * Given a read and a few coordinates that describe a substring of the read (or
+ * its reverse complement), fill in 'seq' and 'qual' objects with the seed
+ * sequence and qualities.
+ *
+ * The seq field is filled with the sequence as it would align to the Watson
+ * reference strand.  I.e. if fw is false, then the sequence that appears in
+ * 'seq' is the reverse complement of the raw read substring.
+ */
+template <typename index_t>
+void SeedAligner<index_t>::instantiateSeq(
+										  const Read& read, // input read
+										  BTDnaString& seq, // output sequence
+										  BTString& qual,   // output qualities
+										  int len,          // seed length
+										  int depth,        // seed's 0-based offset from 5' end
+										  bool fw) const    // seed's orientation
+{
+	// Fill in 'seq' and 'qual'
+	// NOTE(review): 'seedlen' is clamped to the read length but never used
+	// below; the loop and resize use 'len' directly.  Callers (see
+	// instantiateSeeds) already pass len = min(seed length, read length),
+	// so this appears to be dead code — confirm before relying on len >
+	// read length behavior.
+	int seedlen = len;
+	if((int)read.length() < seedlen) seedlen = (int)read.length();
+	seq.resize(len);
+	qual.resize(len);
+	// If fw is false, we take characters starting at the 3' end of the
+	// reverse complement of the read.
+	for(int i = 0; i < len; i++) {
+		seq.set(read.patFw.windowGetDna(i, fw, read.color, depth, len), i);
+		qual.set(read.qual.windowGet(i, fw, depth, len), i);
+	}
+}
+
+/**
+ * We assume that all seeds are the same length.
+ *
+ * For each seed, instantiate the seed, retracting if necessary.
+ *
+ * Returns a pair: first = # seeds that require alignment, second = # seeds
+ * that hit in the cache with non-empty results (never incremented here;
+ * left 0).
+ */
+template <typename index_t>
+pair<int, int> SeedAligner<index_t>::instantiateSeeds(
+													  const EList<Seed>& seeds,  // search seeds
+													  index_t off,                // offset into read to start extracting
+													  int per,                   // interval between seeds
+													  const Read& read,          // read to align
+													  const Scoring& pens,       // scoring scheme
+													  bool nofw,                 // don't align forward read
+													  bool norc,                 // don't align revcomp read
+													  AlignmentCacheIface<index_t>& cache,// holds some seed hits from previous reads
+													  SeedResults<index_t>& sr,  // holds all the seed hits
+													  SeedSearchMetrics& met)    // metrics
+{
+	assert(!seeds.empty());
+	assert_gt(read.length(), 0);
+	// Check whether read has too many Ns
+	offIdx2off_.clear();
+	int len = seeds[0].len; // assume they're all the same length
+#ifndef NDEBUG
+	// Debug-only: verify the equal-length assumption
+	for(size_t i = 1; i < seeds.size(); i++) {
+		assert_eq(len, seeds[i].len);
+	}
+#endif
+	// Calc # seeds within read interval
+	int nseeds = 1;
+	if((int)read.length() - (int)off > len) {
+		nseeds += ((int)read.length() - (int)off - len) / per;
+	}
+	// Record the 5' offset of each seed position
+	for(int i = 0; i < nseeds; i++) {
+		offIdx2off_.push_back(per * i + (int)off);
+	}
+	pair<int, int> ret;
+	ret.first = 0;  // # seeds that require alignment
+	ret.second = 0; // # seeds that hit in cache with non-empty results
+	sr.reset(read, offIdx2off_, nseeds);
+	assert(sr.repOk(&cache.current(), true)); // require that SeedResult be initialized
+	// For each orientation
+	for(int fwi = 0; fwi < 2; fwi++) {
+		bool fw = (fwi == 0);
+		if((fw && nofw) || (!fw && norc)) {
+			// Skip this orientation b/c user specified --nofw or --norc
+			continue;
+		}
+		// For each seed position
+		for(int i = 0; i < nseeds; i++) {
+			int depth = i * per + (int)off;
+			int seedlen = seeds[0].len;
+			// Extract the seed sequence at this offset
+			// If fw == true, we extract the characters from i*per to
+			// i*(per-1) (exclusive).  If fw == false, 
+			instantiateSeq(
+						   read,
+						   sr.seqs(fw)[i],
+						   sr.quals(fw)[i],
+						   std::min<int>((int)seedlen, (int)read.length()),
+						   depth,
+						   fw);
+			//QKey qk(sr.seqs(fw)[i] ASSERT_ONLY(, tmpdnastr_));
+			// For each search strategy
+			EList<InstantiatedSeed>& iss = sr.instantiatedSeeds(fw, i);
+			for(int j = 0; j < (int)seeds.size(); j++) {
+				iss.expand();
+				assert_eq(seedlen, seeds[j].len);
+				InstantiatedSeed* is = &iss.back();
+				if(seeds[j].instantiate(
+										read,
+										sr.seqs(fw)[i],
+										sr.quals(fw)[i],
+										pens,
+										depth,
+										i,
+										j,
+										fw,
+										*is))
+				{
+					// Can we fill this seed hit in from the cache?
+					ret.first++;
+				} else {
+					// Seed may fail to instantiate if there are Ns
+					// that prevent it from matching
+					met.filteredseed++;
+					iss.pop_back();
+				}
+			}
+		}
+	}
+	return ret;
+}
+
+/**
+ * We assume that all seeds are the same length.
+ *
+ * For each seed:
+ *
+ * 1. Instantiate all seeds, retracting them if necessary.
+ * 2. Calculate zone boundaries for each seed
+ *
+ * Results accumulate in 'sr'; counters accumulate in 'met' and 'prm'.
+ */
+template <typename index_t>
+void SeedAligner<index_t>::searchAllSeeds(
+										  const EList<Seed>& seeds,    // search seeds
+										  const Ebwt<index_t>* ebwtFw, // BWT index
+										  const Ebwt<index_t>* ebwtBw, // BWT' index
+										  const Read& read,            // read to align
+										  const Scoring& pens,         // scoring scheme
+										  AlignmentCacheIface<index_t>& cache,  // local cache for seed alignments
+										  SeedResults<index_t>& sr,    // holds all the seed hits
+										  SeedSearchMetrics& met,      // metrics
+										  PerReadMetrics& prm)         // per-read metrics
+{
+	assert(!seeds.empty());
+	assert(ebwtFw != NULL);
+	assert(ebwtFw->isInMemory());
+	assert(sr.repOk(&cache.current()));
+	// Stash search context in members for searchSeedBi() and friends
+	ebwtFw_ = ebwtFw;
+	ebwtBw_ = ebwtBw;
+	sc_ = &pens;
+	read_ = &read;
+	ca_ = &cache;
+	bwops_ = bwedits_ = 0;
+	// NOTE(review): 'interhits' is tallied into met.interhit below but is
+	// never incremented in this function.
+	uint64_t possearches = 0, seedsearches = 0, intrahits = 0, interhits = 0, ooms = 0;
+	// For each instantiated seed
+	for(int i = 0; i < (int)sr.numOffs(); i++) {
+		size_t off = sr.idx2off(i);
+		for(int fwi = 0; fwi < 2; fwi++) {
+			bool fw = (fwi == 0);
+			assert(sr.repOk(&cache.current()));
+			EList<InstantiatedSeed>& iss = sr.instantiatedSeeds(fw, i);
+			if(iss.empty()) {
+				// Cache hit in an across-read cache
+				continue;
+			}
+			QVal<index_t> qv;
+			seq_  = &sr.seqs(fw)[i];  // seed sequence
+			qual_ = &sr.quals(fw)[i]; // seed qualities
+			off_  = off;              // seed offset (from 5')
+			fw_   = fw;               // seed orientation
+			// Tell the cache that we've started aligning, so the cache can
+			// expect a series of on-the-fly updates
+			int ret = cache.beginAlign(*seq_, *qual_, qv);
+			ASSERT_ONLY(hits_.clear());
+			if(ret == -1) {
+				// Out of memory when we tried to add key to map
+				ooms++;
+				continue;
+			}
+			bool abort = false;
+			if(ret == 0) {
+				// Not already in cache
+				assert(cache.aligning());
+				possearches++;
+				for(size_t j = 0; j < iss.size(); j++) {
+					// Set seq_ and qual_ appropriately, using the seed sequences
+					// and qualities already installed in SeedResults
+					assert_eq(fw, iss[j].fw);
+					assert_eq(i, (int)iss[j].seedoffidx);
+					s_ = &iss[j];
+					// Do the search with respect to seq_, qual_ and s_.
+					if(!searchSeedBi()) {
+						// Memory exhausted during search
+						ooms++;
+						abort = true;
+						break;
+					}
+					seedsearches++;
+					assert(cache.aligning());
+				}
+				if(!abort) {
+					qv = cache.finishAlign();
+				}
+			} else {
+				// Already in cache
+				assert_eq(1, ret);
+				assert(qv.valid());
+				intrahits++;
+			}
+			assert(abort || !cache.aligning());
+			if(qv.valid()) {
+				sr.add(
+					   qv,    // range of ranges in cache
+					   cache.current(), // cache
+					   i,     // seed index (from 5' end)
+					   fw);   // whether seed is from forward read
+			}
+		}
+	}
+	// Copy per-read tallies out to the metrics objects
+	prm.nSeedRanges = sr.numRanges();
+	prm.nSeedElts = sr.numElts();
+	prm.nSeedRangesFw = sr.numRangesFw();
+	prm.nSeedRangesRc = sr.numRangesRc();
+	prm.nSeedEltsFw = sr.numEltsFw();
+	prm.nSeedEltsRc = sr.numEltsRc();
+	prm.seedMedian = (uint64_t)(sr.medianHitsPerSeed() + 0.5);
+	prm.seedMean = (uint64_t)sr.averageHitsPerSeed();
+	
+	prm.nSdFmops += bwops_;
+	met.seedsearch += seedsearches;
+	met.possearch += possearches;
+	met.intrahit += intrahits;
+	met.interhit += interhits;
+	met.ooms += ooms;
+	met.bwops += bwops_;
+	met.bweds += bwedits_;
+}
+
+/**
+ * Debug helper: re-derive the BW range for seq[dep..len) by direct lookup
+ * and assert it matches the ranges (topfw/botfw, and topbw/botbw in the
+ * mirror index when do1mm) computed by the caller.  Always returns true so
+ * it can be used inside assert().
+ * NOTE(review): 'len' is used as an end index into 'seq' here (loop runs
+ * dep..len), not as a length relative to 'dep' — confirm against callers.
+ */
+template <typename index_t>
+bool SeedAligner<index_t>::sanityPartial(
+										 const Ebwt<index_t>*        ebwtFw, // BWT index
+										 const Ebwt<index_t>*        ebwtBw, // BWT' index
+										 const BTDnaString& seq,
+										 index_t dep,
+										 index_t len,
+										 bool do1mm,
+										 index_t topfw,
+										 index_t botfw,
+										 index_t topbw,
+										 index_t botbw)
+{
+	// Copy the substring under test into scratch storage
+	tmpdnastr_.clear();
+	for(size_t i = dep; i < len; i++) {
+		tmpdnastr_.append(seq[i]);
+	}
+	index_t top_fw = 0, bot_fw = 0;
+	ebwtFw->contains(tmpdnastr_, &top_fw, &bot_fw);
+	assert_eq(top_fw, topfw);
+	assert_eq(bot_fw, botfw);
+	if(do1mm && ebwtBw != NULL) {
+		// Check the reversed substring against the mirror index
+		tmpdnastr_.reverse();
+		index_t top_bw = 0, bot_bw = 0;
+		ebwtBw->contains(tmpdnastr_, &top_bw, &bot_bw);
+		assert_eq(top_bw, topbw);
+		assert_eq(bot_bw, botbw);
+	}
+	return true;
+}
+
+/**
+ * Sweep right-to-left and left-to-right using exact matching.  Remember all
+ * the SA ranges encountered along the way.  Report exact matches if there are
+ * any.  Calculate a lower bound on the number of edits in an end-to-end
+ * alignment.
+ */
+template <typename index_t>
+size_t SeedAligner<index_t>::exactSweep(
+										const Ebwt<index_t>&  ebwt,    // BWT index
+										const Read&           read,    // read to align
+										const Scoring&        sc,      // scoring scheme
+										bool                  nofw,    // don't align forward read
+										bool                  norc,    // don't align revcomp read
+										size_t                mineMax, // don't care about edit bounds > this
+										size_t&               mineFw,  // minimum # edits for forward read
+										size_t&               mineRc,  // minimum # edits for revcomp read
+										bool                  repex,   // report 0mm hits?
+										SeedResults<index_t>& hits,    // holds all the seed hits (and exact hit)
+										SeedSearchMetrics&    met)     // metrics
+{
+	// Sweep the read right-to-left with exact matching.  Each time the
+	// BW range empties, charge one edit, skip past the offending position
+	// and resume; the number of restarts is a lower bound on the edit
+	// distance of any end-to-end alignment.
+	assert_gt(mineMax, 0);
+	index_t top = 0, bot = 0;
+	SideLocus<index_t> tloc, bloc;
+	const size_t len = read.length();
+	size_t nelt = 0;
+	// Try the forward orientation, then the reverse-complement
+	for(int fwi = 0; fwi < 2; fwi++) {
+		bool fw = (fwi == 0);
+		if( fw && nofw) continue;
+		if(!fw && norc) continue;
+		const BTDnaString& seq = fw ? read.patFw : read.patRc;
+		assert(!seq.empty());
+		int ftabLen = ebwt.eh().ftabChars();
+		size_t dep = 0;
+		size_t nedit = 0;
+		bool done = false;
+		while(dep < len && !done) {
+			top = bot = 0;
+			size_t left = len - dep;
+			assert_gt(left, 0);
+			bool doFtab = ftabLen > 1 && left >= (size_t)ftabLen;
+			if(doFtab) {
+				// Does N interfere with use of Ftab?
+				for(size_t i = 0; i < (size_t)ftabLen; i++) {
+					int c = seq[len-dep-1-i];
+					if(c > 3) {
+						doFtab = false;
+						break;
+					}
+				}
+			}
+			if(doFtab) {
+				// Use ftab
+				ebwt.ftabLoHi(seq, len - dep - ftabLen, false, top, bot);
+				dep += (size_t)ftabLen;
+			} else {
+				// Use fchr
+				int c = seq[len-dep-1];
+				if(c < 4) {
+					top = ebwt.fchr()[c];
+					bot = ebwt.fchr()[c+1];
+				}
+				dep++;
+			}
+			// Range emptied on the initial jump: charge one edit
+			if(bot <= top) {
+				nedit++;
+				if(nedit >= mineMax) {
+					if(fw) { mineFw = nedit; } else { mineRc = nedit; }
+					break;
+				}
+				continue;
+			}
+			INIT_LOCS(top, bot, tloc, bloc, ebwt);
+			// Keep going
+			while(dep < len) {
+				int c = seq[len-dep-1];
+				if(c > 3) {
+					top = bot = 0;
+				} else {
+					if(bloc.valid()) {
+						bwops_ += 2;
+						top = ebwt.mapLF(tloc, c);
+						bot = ebwt.mapLF(bloc, c);
+					} else {
+						bwops_++;
+						top = ebwt.mapLF1(top, tloc, c);
+						if(top == (index_t)OFF_MASK) {
+							top = bot = 0;
+						} else {
+							bot = top+1;
+						}
+					}
+				}
+				if(bot <= top) {
+					nedit++;
+					if(nedit >= mineMax) {
+						if(fw) { mineFw = nedit; } else { mineRc = nedit; }
+						done = true;
+					}
+					break;
+				}
+				INIT_LOCS(top, bot, tloc, bloc, ebwt);
+				dep++;
+			}
+			if(done) {
+				break;
+			}
+			if(dep == len) {
+				// Set the minimum # edits
+				if(fw) { mineFw = nedit; } else { mineRc = nedit; }
+				// Done
+				if(nedit == 0 && bot > top) {
+					if(repex) {
+						// This is an exact hit
+						int64_t score = len * sc.match();
+						if(fw) {
+							hits.addExactEeFw(top, bot, NULL, NULL, fw, score);
+							assert(ebwt.contains(seq, NULL, NULL));
+						} else {
+							hits.addExactEeRc(top, bot, NULL, NULL, fw, score);
+							assert(ebwt.contains(seq, NULL, NULL));
+						}
+					}
+					nelt += (bot - top);
+				}
+				break;
+			}
+			// Move past the mismatched position before resuming the sweep
+			dep++;
+		}
+	}
+	return nelt;
+}
+
+/**
+ * Search for end-to-end alignments of the read with 0 mismatches and,
+ * if requested, 1 mismatch.  Return true iff at least one is found.
+ */
+template <typename index_t>
+bool SeedAligner<index_t>::oneMmSearch(
+									   const Ebwt<index_t>*  ebwtFw, // BWT index
+									   const Ebwt<index_t>*  ebwtBw, // BWT' index
+									   const Read&           read,   // read to align
+									   const Scoring&        sc,     // scoring
+									   int64_t               minsc,  // minimum score
+									   bool                  nofw,   // don't align forward read
+									   bool                  norc,   // don't align revcomp read
+									   bool                  local,  // 1mm hits must be legal local alignments
+									   bool                  repex,  // report 0mm hits?
+									   bool                  rep1mm, // report 1mm hits?
+									   SeedResults<index_t>& hits,   // holds all the seed hits (and exact hit)
+									   SeedSearchMetrics&    met)    // metrics
+{
+	// Strategy: for each orientation (fw/rc) and each search direction
+	// (right-to-left over BWT, left-to-right over BWT'), match the "near"
+	// half of the read exactly, then scan the "far" half allowing at most
+	// one mismatch.  Splitting the read in half guarantees that any
+	// 1-mismatch alignment is found by at least one of the two directions.
+	assert(!rep1mm || ebwtBw != NULL);
+	const size_t len = read.length();
+	int nceil = sc.nCeil.f<int>((double)len);
+	size_t ns = read.ns();
+	if(ns > 1) {
+		// Can't align this with <= 1 mismatches
+		return false;
+	} else if(ns == 1 && !rep1mm) {
+		// Can't align this with 0 mismatches
+		return false;
+	}
+	assert_geq(len, 2);
+	assert(!rep1mm || ebwtBw->eh().ftabChars() == ebwtFw->eh().ftabChars());
+#ifndef NDEBUG
+	if(ebwtBw != NULL) {
+		for(int i = 0; i < 4; i++) {
+			assert_eq(ebwtBw->fchr()[i], ebwtFw->fchr()[i]);
+		}
+	}
+#endif
+	// Near-half lengths for each direction; the backward half takes the
+	// extra base when the read length is odd
+	size_t halfFw = len >> 1;
+	size_t halfBw = len >> 1;
+	if((len & 1) != 0) {
+		halfBw++;
+	}
+	assert_geq(halfFw, 1);
+	assert_geq(halfBw, 1);
+	SideLocus<index_t> tloc, bloc;
+	index_t t[4], b[4];   // dest BW ranges for BWT
+	t[0] = t[1] = t[2] = t[3] = 0;
+	b[0] = b[1] = b[2] = b[3] = 0;
+	index_t tp[4], bp[4]; // dest BW ranges for BWT'
+	tp[0] = tp[1] = tp[2] = tp[3] = 0;
+	bp[0] = bp[1] = bp[2] = bp[3] = 0;
+	index_t top = 0, bot = 0, topp = 0, botp = 0;
+	// Align fw read / rc read
+	bool results = false;
+	for(int fwi = 0; fwi < 2; fwi++) {
+		bool fw = (fwi == 0);
+		if( fw && nofw) continue;
+		if(!fw && norc) continue;
+		// Align going right-to-left, left-to-right
+		int lim = rep1mm ? 2 : 1;
+		for(int ebwtfwi = 0; ebwtfwi < lim; ebwtfwi++) {
+			bool ebwtfw = (ebwtfwi == 0);
+			const Ebwt<index_t>* ebwt  = (ebwtfw ? ebwtFw : ebwtBw);
+			const Ebwt<index_t>* ebwtp = (ebwtfw ? ebwtBw : ebwtFw);
+			assert(rep1mm || ebwt->fw());
+			const BTDnaString& seq =
+			(fw ? (ebwtfw ? read.patFw : read.patFwRev) :
+			 (ebwtfw ? read.patRc : read.patRcRev));
+			assert(!seq.empty());
+			const BTString& qual =
+			(fw ? (ebwtfw ? read.qual    : read.qualRev) :
+			 (ebwtfw ? read.qualRev : read.qual));
+			int ftabLen = ebwt->eh().ftabChars();
+			size_t nea = ebwtfw ? halfFw : halfBw;
+			// Check if there's an N in the near portion
+			bool skip = false;
+			for(size_t dep = 0; dep < nea; dep++) {
+				if(seq[len-dep-1] > 3) {
+					skip = true;
+					break;
+				}
+			}
+			if(skip) {
+				continue;
+			}
+			size_t dep = 0;
+			// Align near half
+			if(ftabLen > 1 && (size_t)ftabLen <= nea) {
+				// Use ftab to jump partway into near half
+				bool rev = !ebwtfw;
+				ebwt->ftabLoHi(seq, len - ftabLen, rev, top, bot);
+				if(rep1mm) {
+					ebwtp->ftabLoHi(seq, len - ftabLen, rev, topp, botp);
+					assert_eq(bot - top, botp - topp);
+				}
+				if(bot - top == 0) {
+					continue;
+				}
+				int c = seq[len - ftabLen];
+				t[c] = top; b[c] = bot;
+				tp[c] = topp; bp[c] = botp;
+				dep = ftabLen;
+				// initialize tloc, bloc??
+			} else {
+				// Use fchr to jump in by 1 pos
+				int c = seq[len-1];
+				assert_range(0, 3, c);
+				top = topp = tp[c] = ebwt->fchr()[c];
+				bot = botp = bp[c] = ebwt->fchr()[c+1];
+				if(bot - top == 0) {
+					continue;
+				}
+				dep = 1;
+				// initialize tloc, bloc??
+			}
+			INIT_LOCS(top, bot, tloc, bloc, *ebwt);
+			assert(sanityPartial(ebwt, ebwtp, seq, len-dep, len, rep1mm, top, bot, topp, botp));
+			bool do_continue = false;
+			for(; dep < nea; dep++) {
+				assert_lt(dep, len);
+				int rdc = seq[len - dep - 1];
+				tp[0] = tp[1] = tp[2] = tp[3] = topp;
+				bp[0] = bp[1] = bp[2] = bp[3] = botp;
+				if(bloc.valid()) {
+					bwops_++;
+					t[0] = t[1] = t[2] = t[3] = b[0] = b[1] = b[2] = b[3] = 0;
+					ebwt->mapBiLFEx(tloc, bloc, t, b, tp, bp);
+					SANITY_CHECK_4TUP(t, b, tp, bp);
+					top = t[rdc]; bot = b[rdc];
+					if(bot <= top) {
+						do_continue = true;
+						break;
+					}
+					topp = tp[rdc]; botp = bp[rdc];
+					assert(!rep1mm || bot - top == botp - topp);
+				} else {
+					assert_eq(bot, top+1);
+					assert(!rep1mm || botp == topp+1);
+					bwops_++;
+					top = ebwt->mapLF1(top, tloc, rdc);
+					if(top == (index_t)OFF_MASK) {
+						do_continue = true;
+						break;
+					}
+					bot = top + 1;
+					t[rdc] = top; b[rdc] = bot;
+					tp[rdc] = topp; bp[rdc] = botp;
+					assert(!rep1mm || b[rdc] - t[rdc] == bp[rdc] - tp[rdc]);
+					// topp/botp stay the same
+				}
+				INIT_LOCS(top, bot, tloc, bloc, *ebwt);
+				assert(sanityPartial(ebwt, ebwtp, seq, len - dep - 1, len, rep1mm, top, bot, topp, botp));
+			}
+			if(do_continue) {
+				continue;
+			}
+			// Align far half
+			for(; dep < len; dep++) {
+				int rdc = seq[len-dep-1];
+				int quc = qual[len-dep-1];
+				if(rdc > 3 && nceil == 0) {
+					break;
+				}
+				tp[0] = tp[1] = tp[2] = tp[3] = topp;
+				bp[0] = bp[1] = bp[2] = bp[3] = botp;
+				int clo = 0, chi = 3;
+				bool match = true;
+				if(bloc.valid()) {
+					bwops_++;
+					t[0] = t[1] = t[2] = t[3] = b[0] = b[1] = b[2] = b[3] = 0;
+					ebwt->mapBiLFEx(tloc, bloc, t, b, tp, bp);
+					SANITY_CHECK_4TUP(t, b, tp, bp);
+					match = rdc < 4;
+					top = t[rdc]; bot = b[rdc];
+					topp = tp[rdc]; botp = bp[rdc];
+				} else {
+					assert_eq(bot, top+1);
+					assert(!rep1mm || botp == topp+1);
+					bwops_++;
+					clo = ebwt->mapLF1(top, tloc);
+					match = (clo == rdc);
+					assert_range(-1, 3, clo);
+					if(clo < 0) {
+						break; // Hit the $
+					} else {
+						t[clo] = top;
+						b[clo] = bot = top + 1;
+					}
+					bp[clo] = botp;
+					tp[clo] = topp;
+					assert(!rep1mm || bot - top == botp - topp);
+					assert(!rep1mm || b[clo] - t[clo] == bp[clo] - tp[clo]);
+					chi = clo;
+				}
+				//assert(sanityPartial(ebwt, ebwtp, seq, len - dep - 1, len, rep1mm, top, bot, topp, botp));
+				if(rep1mm && (ns == 0 || rdc > 3)) {
+					for(int j = clo; j <= chi; j++) {
+						if(j == rdc || b[j] == t[j]) {
+							// Either matches read or isn't a possibility
+							continue;
+						}
+						// Potential mismatch - next, try
+						size_t depm = dep + 1;
+						index_t topm = t[j], botm = b[j];
+						index_t topmp = tp[j], botmp = bp[j];
+						assert_eq(botm - topm, botmp - topmp);
+						index_t tm[4], bm[4];   // dest BW ranges for BWT
+						tm[0] = t[0]; tm[1] = t[1];
+						tm[2] = t[2]; tm[3] = t[3];
+						// BUGFIX: bm[1]/bm[3] previously copied from t[],
+						// not b[] (copy-paste slip); same for bmp[] below
+						bm[0] = b[0]; bm[1] = b[1];
+						bm[2] = b[2]; bm[3] = b[3];
+						index_t tmp[4], bmp[4]; // dest BW ranges for BWT'
+						tmp[0] = tp[0]; tmp[1] = tp[1];
+						tmp[2] = tp[2]; tmp[3] = tp[3];
+						bmp[0] = bp[0]; bmp[1] = bp[1];
+						bmp[2] = bp[2]; bmp[3] = bp[3];
+						SideLocus<index_t> tlocm, blocm;
+						INIT_LOCS(topm, botm, tlocm, blocm, *ebwt);
+						// Continue matching the rest of the read exactly
+						// after taking mismatch character j at position dep
+						for(; depm < len; depm++) {
+							int rdcm = seq[len - depm - 1];
+							tmp[0] = tmp[1] = tmp[2] = tmp[3] = topmp;
+							bmp[0] = bmp[1] = bmp[2] = bmp[3] = botmp;
+							if(blocm.valid()) {
+								bwops_++;
+								tm[0] = tm[1] = tm[2] = tm[3] =
+								bm[0] = bm[1] = bm[2] = bm[3] = 0;
+								ebwt->mapBiLFEx(tlocm, blocm, tm, bm, tmp, bmp);
+								SANITY_CHECK_4TUP(tm, bm, tmp, bmp);
+								topm = tm[rdcm]; botm = bm[rdcm];
+								topmp = tmp[rdcm]; botmp = bmp[rdcm];
+								if(botm <= topm) {
+									break;
+								}
+							} else {
+								assert_eq(botm, topm+1);
+								assert_eq(botmp, topmp+1);
+								bwops_++;
+								topm = ebwt->mapLF1(topm, tlocm, rdcm);
+								// BUGFIX: use OFF_MASK (as elsewhere in this
+								// file) instead of 0xffffffff, which is the
+								// wrong sentinel when index_t is 64-bit
+								if(topm == (index_t)OFF_MASK) {
+									break;
+								}
+								botm = topm + 1;
+								// topp/botp stay the same
+							}
+							INIT_LOCS(topm, botm, tlocm, blocm, *ebwt);
+						}
+						if(depm == len) {
+							// Success; this is a 1MM hit
+							size_t off5p = dep;  // offset from 5' end of read
+							size_t offstr = dep; // offset into patFw/patRc
+							if(fw == ebwtfw) {
+								off5p = len - off5p - 1;
+							}
+							if(!ebwtfw) {
+								offstr = len - offstr - 1;
+							}
+							Edit e((uint32_t)off5p, j, rdc, EDIT_TYPE_MM, false);
+							results = true;
+							int64_t score = (len - 1) * sc.match();
+							// In --local mode, need to double-check that
+							// end-to-end alignment doesn't violate  local
+							// alignment principles.  Specifically, it
+							// shouldn't drop to or below 0 anywhere in the middle.
+							int pen = sc.score(rdc, (int)(1 << j), quc - 33);
+							score += pen;
+							bool valid = true;
+							if(local) {
+								int64_t locscore_fw = 0, locscore_bw = 0;
+								for(size_t i = 0; i < len; i++) {
+									if(i == dep) {
+										if(locscore_fw + pen <= 0) {
+											valid = false;
+											break;
+										}
+										locscore_fw += pen;
+									} else {
+										locscore_fw += sc.match();
+									}
+									if(len-i-1 == dep) {
+										if(locscore_bw + pen <= 0) {
+											valid = false;
+											break;
+										}
+										locscore_bw += pen;
+									} else {
+										locscore_bw += sc.match();
+									}
+								}
+							}
+							if(valid) {
+								valid = score >= minsc;
+							}
+							if(valid) {
+#ifndef NDEBUG
+								BTDnaString& rf = tmprfdnastr_;
+								rf.clear();
+								edits_.clear();
+								edits_.push_back(e);
+								if(!fw) Edit::invertPoss(edits_, len, false);
+								Edit::toRef(fw ? read.patFw : read.patRc, edits_, rf);
+								if(!fw) Edit::invertPoss(edits_, len, false);
+								assert_eq(len, rf.length());
+								for(size_t i = 0; i < len; i++) {
+									assert_lt((int)rf[i], 4);
+								}
+								ASSERT_ONLY(index_t toptmp = 0);
+								ASSERT_ONLY(index_t bottmp = 0);
+								assert(ebwtFw->contains(rf, &toptmp, &bottmp));
+#endif
+								index_t toprep = ebwtfw ? topm : topmp;
+								index_t botrep = ebwtfw ? botm : botmp;
+								assert_eq(toprep, toptmp);
+								assert_eq(botrep, bottmp);
+								hits.add1mmEe(toprep, botrep, &e, NULL, fw, score);
+							}
+						}
+					}
+				}
+				if(bot > top && match) {
+					assert_lt(rdc, 4);
+					if(dep == len-1) {
+						// Success; this is an exact hit
+						if(ebwtfw && repex) {
+							if(fw) {
+								results = true;
+								int64_t score = len * sc.match();
+								hits.addExactEeFw(
+												  ebwtfw ? top : topp,
+												  ebwtfw ? bot : botp,
+												  NULL, NULL, fw, score);
+								assert(ebwtFw->contains(seq, NULL, NULL));
+							} else {
+								results = true;
+								int64_t score = len * sc.match();
+								hits.addExactEeRc(
+												  ebwtfw ? top : topp,
+												  ebwtfw ? bot : botp,
+												  NULL, NULL, fw, score);
+								assert(ebwtFw->contains(seq, NULL, NULL));
+							}
+						}
+						break; // End of far loop
+					} else {
+						INIT_LOCS(top, bot, tloc, bloc, *ebwt);
+						assert(sanityPartial(ebwt, ebwtp, seq, len - dep - 1, len, rep1mm, top, bot, topp, botp));
+					}
+				} else {
+					break; // End of far loop
+				}
+			} // for(; dep < len; dep++)
+		} // for(int ebwtfw = 0; ebwtfw < 2; ebwtfw++)
+	} // for(int fw = 0; fw < 2; fw++)
+	return results;
+}
+
+/**
+ * Wrapper for the initial invocation of searchSeedBi.
+ */
+template <typename index_t>
+bool SeedAligner<index_t>::searchSeedBi() {
+	// Kick off the recursive bidirectional search from the very first
+	// step: empty BW ranges, uninitialized loci, the instantiated seed's
+	// zone constraints, and no edits accumulated so far.
+	return searchSeedBi(
+						0,                    // step into steps_[] array
+						0,                    // recursion depth
+						0, 0,                 // topf, botf (BWT)
+						0, 0,                 // topb, botb (BWT')
+						SideLocus<index_t>(), // tloc (not yet valid)
+						SideLocus<index_t>(), // bloc (not yet valid)
+						s_->cons[0],          // zone 0 constraint
+						s_->cons[1],          // zone 1 constraint
+						s_->cons[2],          // zone 2 constraint
+						s_->overall,          // overall constraint
+						NULL);                // no previous edit
+}
+
+/**
+ * Get tloc and bloc ready for the next step of the bidirectional search,
+ * initializing them from the index and range appropriate for that step's
+ * direction.
+ */
+template <typename index_t>
+inline void SeedAligner<index_t>::nextLocsBi(
+											 SideLocus<index_t>& tloc, // top locus
+											 SideLocus<index_t>& bloc, // bot locus
+											 index_t topf,             // top in BWT
+											 index_t botf,             // bot in BWT
+											 index_t topb,             // top in BWT'
+											 index_t botb,             // bot in BWT'
+											 int step)                 // step to get ready for
+{
+	assert_gt(botf, 0);
+	assert(ebwtBw_ == NULL || botb > 0);
+	assert_geq(step, 0); // next step can't be first one
+	assert(ebwtBw_ == NULL || botf-topf == botb-topb);
+	if(step == (int)s_->steps.size()) return; // no more steps!
+	// A positive step extends left-to-right against BWT'; a negative one
+	// extends right-to-left against BWT.  Select the index and the
+	// matching range once, then initialize the loci from them.
+	const bool ltr = s_->steps[step] > 0;
+	const Ebwt<index_t>* ebwt = ltr ? ebwtBw_ : ebwtFw_;
+	const index_t tp = ltr ? topb : topf;
+	const index_t bt = ltr ? botb : botf;
+	if(bt - tp == 1) {
+		// Range is a single row; only the top locus is needed
+		tloc.initFromRow(tp, ebwt->eh(), ebwt->ebwt());
+		bloc.invalidate();
+	} else {
+		SideLocus<index_t>::initFromTopBot(
+										   tp, bt, ebwt->eh(), ebwt->ebwt(), tloc, bloc);
+		assert(bloc.valid());
+	}
+	assert(botf - topf == 1 ||  bloc.valid());
+	assert(botf - topf > 1  || !bloc.valid());
+}
+
+/**
+ * Report a seed hit found by searchSeedBi(), but first try to extend it out in
+ * either direction as far as possible without hitting any edits.  This will
+ * allow us to prioritize the seed hits better later on.  Call reportHit() when
+ * we're done, which actually adds the hit to the cache.  Returns result from
+ * calling reportHit().
+ */
+template <typename index_t>
+bool SeedAligner<index_t>::extendAndReportHit(
+											  index_t topf,                      // top in BWT
+											  index_t botf,                      // bot in BWT
+											  index_t topb,                      // top in BWT'
+											  index_t botb,                      // bot in BWT'
+											  index_t len,                       // length of hit
+											  DoublyLinkedList<Edit> *prevEdit)  // previous edit
+{
+	index_t nlex = 0, nrex = 0;
+	index_t t[4], b[4];
+	index_t tp[4], bp[4];
+	SideLocus<index_t> tloc, bloc;
+	if(off_ > 0) {
+		const Ebwt<index_t> *ebwt = ebwtFw_;
+		assert(ebwt != NULL);
+		// Extend left using forward index
+		const BTDnaString& seq = fw_ ? read_->patFw : read_->patRc;
+		// See what we get by extending
+		index_t top = topf, bot = botf;
+		t[0] = t[1] = t[2] = t[3] = 0;
+		b[0] = b[1] = b[2] = b[3] = 0;
+		tp[0] = tp[1] = tp[2] = tp[3] = topb;
+		bp[0] = bp[1] = bp[2] = bp[3] = botb;
+		// (A redundant local tloc/bloc declaration shadowing the outer pair
+		// was removed here; the outer pair is used throughout.)
+		INIT_LOCS(top, bot, tloc, bloc, *ebwt);
+		for(size_t ii = off_; ii > 0; ii--) {
+			size_t i = ii-1;
+			// Get char from read
+			int rdc = seq.get(i);
+			// See what we get by extending
+			if(bloc.valid()) {
+				bwops_++;
+				t[0] = t[1] = t[2] = t[3] =
+				b[0] = b[1] = b[2] = b[3] = 0;
+				ebwt->mapBiLFEx(tloc, bloc, t, b, tp, bp);
+				SANITY_CHECK_4TUP(t, b, tp, bp);
+				int nonz = -1;
+				bool abort = false;
+				// BUGFIX: scan the four outgoing ranges with the loop index
+				// j; the old code indexed t[]/b[] with the read position i,
+				// an out-of-bounds read whenever i >= 4
+				for(int j = 0; j < 4; j++) {
+					if(b[j] > t[j]) {
+						if(nonz >= 0) {
+							abort = true;
+							break;
+						}
+						nonz = j;
+						top = t[j]; bot = b[j];
+					}
+				}
+				if(abort || nonz != rdc) {
+					break;
+				}
+			} else {
+				assert_eq(bot, top+1);
+				bwops_++;
+				int c = ebwt->mapLF1(top, tloc);
+				if(c != rdc) {
+					break;
+				}
+				bot = top + 1;
+			}
+			if(++nlex == 255) {
+				break;
+			}
+			INIT_LOCS(top, bot, tloc, bloc, *ebwt);
+		}
+	}
+	size_t rdlen = read_->length();
+	size_t nright = rdlen - off_ - len;
+	if(nright > 0 && ebwtBw_ != NULL) {
+		const Ebwt<index_t> *ebwt = ebwtBw_;
+		assert(ebwt != NULL);
+		// Extend right using backward index
+		const BTDnaString& seq = fw_ ? read_->patFw : read_->patRc;
+		// See what we get by extending
+		index_t top = topb, bot = botb;
+		t[0] = t[1] = t[2] = t[3] = 0;
+		b[0] = b[1] = b[2] = b[3] = 0;
+		tp[0] = tp[1] = tp[2] = tp[3] = topb;
+		bp[0] = bp[1] = bp[2] = bp[3] = botb;
+		INIT_LOCS(top, bot, tloc, bloc, *ebwt);
+		for(size_t i = off_ + len; i < rdlen; i++) {
+			// Get char from read
+			int rdc = seq.get(i);
+			// See what we get by extending
+			if(bloc.valid()) {
+				bwops_++;
+				t[0] = t[1] = t[2] = t[3] =
+				b[0] = b[1] = b[2] = b[3] = 0;
+				ebwt->mapBiLFEx(tloc, bloc, t, b, tp, bp);
+				SANITY_CHECK_4TUP(t, b, tp, bp);
+				int nonz = -1;
+				bool abort = false;
+				// BUGFIX: as above, index with j rather than the read
+				// position i (which exceeds the 4-element arrays here)
+				for(int j = 0; j < 4; j++) {
+					if(b[j] > t[j]) {
+						if(nonz >= 0) {
+							abort = true;
+							break;
+						}
+						nonz = j;
+						top = t[j]; bot = b[j];
+					}
+				}
+				if(abort || nonz != rdc) {
+					break;
+				}
+			} else {
+				assert_eq(bot, top+1);
+				bwops_++;
+				int c = ebwt->mapLF1(top, tloc);
+				if(c != rdc) {
+					break;
+				}
+				bot = top + 1;
+			}
+			if(++nrex == 255) {
+				break;
+			}
+			INIT_LOCS(top, bot, tloc, bloc, *ebwt);
+		}
+	}
+	assert_lt(nlex, rdlen);
+	assert_leq(nlex, off_);
+	assert_lt(nrex, rdlen);
+	// NOTE(review): nlex/nrex (the extension lengths) are computed but not
+	// currently forwarded; reportHit records only the original hit range.
+	return reportHit(topf, botf, topb, botb, len, prevEdit);
+}
+
+/**
+ * Report a seed hit found by searchSeedBi() by adding it to the cache.  Return
+ * false if the hit could not be reported because of, e.g., cache exhaustion.
+ */
+template <typename index_t>
+bool SeedAligner<index_t>::reportHit(
+									 index_t topf,                      // top in BWT
+									 index_t botf,                      // bot in BWT
+									 index_t topb,                      // top in BWT'
+									 index_t botb,                      // bot in BWT'
+									 index_t len,                       // length of hit
+									 DoublyLinkedList<Edit> *prevEdit)  // previous edit
+{
+	// Add information about the seed hit to AlignmentCache.  This
+	// information eventually makes its way back to the SeedResults
+	// object when we call finishAlign(...).
+	BTDnaString& rf = tmprfdnastr_;
+	rf.clear();
+	edits_.clear();
+	// Reconstruct the reference string for this hit: apply the chain of
+	// edits (if any) to the seed sequence
+	if(prevEdit != NULL) {
+		prevEdit->toList(edits_);
+		Edit::sort(edits_);
+		assert(Edit::repOk(edits_, *seq_));
+		Edit::toRef(*seq_, edits_, rf);
+	} else {
+		rf = *seq_;
+	}
+	// Sanity check: shouldn't add the same hit twice.  If this
+	// happens, it may be because our zone Constraints are not set up
+	// properly and erroneously return true from acceptable() when they
+	// should return false in some cases.
+	assert_eq(hits_.size(), ca_->curNumRanges());
+	// NOTE(review): hits_.insert(rf) executes inside assert(), so the
+	// duplicate-tracking set is only populated in debug builds; the cache
+	// add below is what takes effect in release builds.
+	assert(hits_.insert(rf));
+	// Install the hit in the cache; fails when cache memory is exhausted
+	if(!ca_->addOnTheFly(rf, topf, botf, topb, botb)) {
+		return false;
+	}
+	assert_eq(hits_.size(), ca_->curNumRanges());
+#ifndef NDEBUG
+	// Sanity check that the topf/botf and topb/botb ranges really
+	// correspond to the reference sequence aligned to
+	{
+		BTDnaString rfr;
+		index_t tpf, btf, tpb, btb;
+		tpf = btf = tpb = btb = 0;
+		assert(ebwtFw_->contains(rf, &tpf, &btf));
+		if(ebwtBw_ != NULL) {
+			rfr = rf;
+			rfr.reverse();
+			assert(ebwtBw_->contains(rfr, &tpb, &btb));
+			assert_eq(tpf, topf);
+			assert_eq(btf, botf);
+			assert_eq(tpb, topb);
+			assert_eq(btb, botb);
+		}
+	}
+#endif
+	return true;
+}
+
+/**
+ * Given a seed, search.  Assumes zone 0 = no backtracking.
+ *
+ * Return a list of Seed hits.
+ * 1. Edits
+ * 2. Bidirectional BWT range(s) on either end
+ */
+template <typename index_t>
+bool SeedAligner<index_t>::searchSeedBi(
+										int step,                // depth into steps_[] array
+										int depth,               // recursion depth
+										index_t topf,            // top in BWT
+										index_t botf,            // bot in BWT
+										index_t topb,            // top in BWT'
+										index_t botb,            // bot in BWT'
+										SideLocus<index_t> tloc, // locus for top (perhaps unititialized)
+										SideLocus<index_t> bloc, // locus for bot (perhaps unititialized)
+										Constraint c0,           // constraints to enforce in seed zone 0
+										Constraint c1,           // constraints to enforce in seed zone 1
+										Constraint c2,           // constraints to enforce in seed zone 2
+										Constraint overall,      // overall constraints to enforce
+										DoublyLinkedList<Edit> *prevEdit  // previous edit
+#if 0
+										, const SABWOffTrack* prevOt // prev off tracker (if tracking started)
+#endif
+										)
+{
+	assert(s_ != NULL);
+	const InstantiatedSeed& s = *s_;
+	assert_gt(s.steps.size(), 0);
+	assert(ebwtBw_ == NULL || ebwtBw_->eh().ftabChars() == ebwtFw_->eh().ftabChars());
+#ifndef NDEBUG
+	for(int i = 0; i < 4; i++) {
+		assert(ebwtBw_ == NULL || ebwtBw_->fchr()[i] == ebwtFw_->fchr()[i]);
+	}
+#endif
+	if(step == (int)s.steps.size()) {
+		// Finished aligning seed
+		assert(c0.acceptable());
+		assert(c1.acceptable());
+		assert(c2.acceptable());
+		if(!reportHit(topf, botf, topb, botb, seq_->length(), prevEdit)) {
+			return false; // Memory exhausted
+		}
+		return true;
+	}
+#ifndef NDEBUG
+	if(depth > 0) {
+		assert(botf - topf == 1 ||  bloc.valid());
+		assert(botf - topf > 1  || !bloc.valid());
+	}
+#endif
+	int off;
+	index_t tp[4], bp[4]; // dest BW ranges for "prime" index
+	if(step == 0) {
+		// Just starting
+		assert(prevEdit == NULL);
+		assert(!tloc.valid());
+		assert(!bloc.valid());
+		off = s.steps[0];
+		bool ltr = off > 0;
+		off = abs(off)-1;
+		// Check whether/how far we can jump using ftab or fchr
+		int ftabLen = ebwtFw_->eh().ftabChars();
+		if(ftabLen > 1 && ftabLen <= s.maxjump) {
+			if(!ltr) {
+				assert_geq(off+1, ftabLen-1);
+				off = off - ftabLen + 1;
+			}
+			ebwtFw_->ftabLoHi(*seq_, off, false, topf, botf);
+			// Release builds take the cheap path: derive the BWT' range
+			// from ftabHi plus the BWT range width.  Debug builds compute
+			// it fully and cross-check the widths.
+#ifdef NDEBUG
+			if(botf - topf == 0) return true;
+#endif
+#ifdef NDEBUG
+			if(ebwtBw_ != NULL) {
+				topb = ebwtBw_->ftabHi(*seq_, off);
+				botb = topb + (botf-topf);
+			}
+#else
+			if(ebwtBw_ != NULL) {
+				ebwtBw_->ftabLoHi(*seq_, off, false, topb, botb);
+				assert_eq(botf-topf, botb-topb);
+			}
+			if(botf - topf == 0) return true;
+#endif
+			step += ftabLen;
+		} else if(s.maxjump > 0) {
+			// Use fchr
+			int c = (*seq_)[off];
+			assert_range(0, 3, c);
+			topf = topb = ebwtFw_->fchr()[c];
+			botf = botb = ebwtFw_->fchr()[c+1];
+			if(botf - topf == 0) return true;
+			step++;
+		} else {
+			assert_eq(0, s.maxjump);
+			topf = topb = 0;
+			botf = botb = ebwtFw_->fchr()[4];
+		}
+		if(step == (int)s.steps.size()) {
+			// Finished aligning seed
+			assert(c0.acceptable());
+			assert(c1.acceptable());
+			assert(c2.acceptable());
+			if(!reportHit(topf, botf, topb, botb, seq_->length(), prevEdit)) {
+				return false; // Memory exhausted
+			}
+			return true;
+		}
+		nextLocsBi(tloc, bloc, topf, botf, topb, botb, step);
+		assert(tloc.valid());
+	} else assert(prevEdit != NULL);
+	assert(tloc.valid());
+	assert(botf - topf == 1 ||  bloc.valid());
+	assert(botf - topf > 1  || !bloc.valid());
+	assert_geq(step, 0);
+	index_t t[4], b[4]; // dest BW ranges
+	Constraint* zones[3] = { &c0, &c1, &c2 };
+	ASSERT_ONLY(index_t lasttot = botf - topf);
+	// Walk the remaining steps of the seed's search schedule.  Each step
+	// extends by one position, either left-to-right (BWT') or
+	// right-to-left (BWT), possibly branching on mismatches.
+	for(int i = step; i < (int)s.steps.size(); i++) {
+		assert_gt(botf, topf);
+		assert(botf - topf == 1 ||  bloc.valid());
+		assert(botf - topf > 1  || !bloc.valid());
+		assert(ebwtBw_ == NULL || botf-topf == botb-topb);
+		assert(tloc.valid());
+		off = s.steps[i];
+		bool ltr = off > 0;
+		const Ebwt<index_t>* ebwt = ltr ? ebwtBw_ : ebwtFw_;
+		assert(ebwt != NULL);
+		if(ltr) {
+			tp[0] = tp[1] = tp[2] = tp[3] = topf;
+			bp[0] = bp[1] = bp[2] = bp[3] = botf;
+		} else {
+			tp[0] = tp[1] = tp[2] = tp[3] = topb;
+			bp[0] = bp[1] = bp[2] = bp[3] = botb;
+		}
+		t[0] = t[1] = t[2] = t[3] = b[0] = b[1] = b[2] = b[3] = 0;
+		if(bloc.valid()) {
+			// Range delimited by tloc/bloc has size >1.  If size == 1,
+			// we use a simpler query (see if(!bloc.valid()) blocks below)
+			bwops_++;
+			ebwt->mapBiLFEx(tloc, bloc, t, b, tp, bp);
+			ASSERT_ONLY(index_t tot = (b[0]-t[0])+(b[1]-t[1])+(b[2]-t[2])+(b[3]-t[3]));
+			ASSERT_ONLY(index_t totp = (bp[0]-tp[0])+(bp[1]-tp[1])+(bp[2]-tp[2])+(bp[3]-tp[3]));
+			assert_eq(tot, totp);
+			assert_leq(tot, lasttot);
+			ASSERT_ONLY(lasttot = tot);
+		}
+		// t/b refer to the index queried this step; tp/bp to the other
+		// ("prime") index.  Alias them back to fw/bw for the recursion.
+		index_t *tf = ltr ? tp : t, *tb = ltr ? t : tp;
+		index_t *bf = ltr ? bp : b, *bb = ltr ? b : bp;
+		off = abs(off)-1;
+		//
+		bool leaveZone = s.zones[i].first < 0;
+		//bool leaveZoneIns = zones_[i].second < 0;
+		Constraint& cons    = *zones[abs(s.zones[i].first)];
+		Constraint& insCons = *zones[abs(s.zones[i].second)];
+		int c = (*seq_)[off];  assert_range(0, 4, c);
+		int q = (*qual_)[off];
+		// Is it legal for us to advance on characters other than 'c'?
+		if(!(cons.mustMatch() && !overall.mustMatch()) || c == 4) {
+			// There may be legal edits
+			bool bail = false;
+			if(!bloc.valid()) {
+				// Range delimited by tloc/bloc has size 1
+				index_t ntop = ltr ? topb : topf;
+				bwops_++;
+				int cc = ebwt->mapLF1(ntop, tloc);
+				assert_range(-1, 3, cc);
+				if(cc < 0) bail = true;
+				else { t[cc] = ntop; b[cc] = ntop+1; }
+			}
+			if(!bail) {
+				if((cons.canMismatch(q, *sc_) && overall.canMismatch(q, *sc_)) || c == 4) {
+					// Save constraint/locus state so it can be restored
+					// after exploring the mismatch branches
+					Constraint oldCons = cons, oldOvCons = overall;
+					SideLocus<index_t> oldTloc = tloc, oldBloc = bloc;
+					if(c != 4) {
+						cons.chargeMismatch(q, *sc_);
+						overall.chargeMismatch(q, *sc_);
+					}
+					// Can leave the zone as-is
+					if(!leaveZone || (cons.acceptable() && overall.acceptable())) {
+						for(int j = 0; j < 4; j++) {
+							if(j == c || b[j] == t[j]) continue;
+							// Potential mismatch
+							nextLocsBi(tloc, bloc, tf[j], bf[j], tb[j], bb[j], i+1);
+							int loff = off;
+							if(!ltr) loff = (int)(s.steps.size() - loff - 1);
+							assert(prevEdit == NULL || prevEdit->next == NULL);
+							Edit edit(off, j, c, EDIT_TYPE_MM, false);
+							DoublyLinkedList<Edit> editl;
+							editl.payload = edit;
+							if(prevEdit != NULL) {
+								prevEdit->next = &editl;
+								editl.prev = prevEdit;
+							}
+							assert(editl.next == NULL);
+							bwedits_++;
+							if(!searchSeedBi(
+											 i+1,     // depth into steps_[] array
+											 depth+1, // recursion depth
+											 tf[j],   // top in BWT
+											 bf[j],   // bot in BWT
+											 tb[j],   // top in BWT'
+											 bb[j],   // bot in BWT'
+											 tloc,    // locus for top (perhaps unititialized)
+											 bloc,    // locus for bot (perhaps unititialized)
+											 c0,      // constraints to enforce in seed zone 0
+											 c1,      // constraints to enforce in seed zone 1
+											 c2,      // constraints to enforce in seed zone 2
+											 overall, // overall constraints to enforce
+											 &editl))  // latest edit
+							{
+								return false;
+							}
+							if(prevEdit != NULL) prevEdit->next = NULL;
+						}
+					} else {
+						// Not enough edits to make this path
+						// non-redundant with other seeds
+					}
+					cons = oldCons;
+					overall = oldOvCons;
+					tloc = oldTloc;
+					bloc = oldBloc;
+				}
+				if(cons.canGap() && overall.canGap()) {
+					throw 1; // TODO
+					int delEx = 0;
+					if(cons.canDelete(delEx, *sc_) && overall.canDelete(delEx, *sc_)) {
+						// Try delete
+					}
+					int insEx = 0;
+					if(insCons.canInsert(insEx, *sc_) && overall.canInsert(insEx, *sc_)) {
+						// Try insert
+					}
+				}
+			} // if(!bail)
+		}
+		if(c == 4) {
+			return true; // couldn't handle the N
+		}
+		if(leaveZone && (!cons.acceptable() || !overall.acceptable())) {
+			// Not enough edits to make this path non-redundant with
+			// other seeds
+			return true;
+		}
+		if(!bloc.valid()) {
+			assert(ebwtBw_ == NULL || bp[c] == tp[c]+1);
+			// Range delimited by tloc/bloc has size 1
+			index_t top = ltr ? topb : topf;
+			bwops_++;
+			t[c] = ebwt->mapLF1(top, tloc, c);
+			if(t[c] == (index_t)OFF_MASK) {
+				return true;
+			}
+			assert_geq(t[c], ebwt->fchr()[c]);
+			assert_lt(t[c],  ebwt->fchr()[c+1]);
+			b[c] = t[c]+1;
+			assert_gt(b[c], 0);
+		}
+		assert(ebwtBw_ == NULL || bf[c]-tf[c] == bb[c]-tb[c]);
+		assert_leq(bf[c]-tf[c], lasttot);
+		ASSERT_ONLY(lasttot = bf[c]-tf[c]);
+		if(b[c] == t[c]) {
+			return true;
+		}
+		// Commit to the matching character and advance to the next step
+		topf = tf[c]; botf = bf[c];
+		topb = tb[c]; botb = bb[c];
+		if(i+1 == (int)s.steps.size()) {
+			// Finished aligning seed
+			assert(c0.acceptable());
+			assert(c1.acceptable());
+			assert(c2.acceptable());
+			if(!reportHit(topf, botf, topb, botb, seq_->length(), prevEdit)) {
+				return false; // Memory exhausted
+			}
+			return true;
+		}
+		nextLocsBi(tloc, bloc, tf[c], bf[c], tb[c], bb[c], i+1);
+	}
+	return true;
+}
+
+#endif /*ALIGNER_SEED_H_*/
diff --git a/aligner_seed_policy.cpp b/aligner_seed_policy.cpp
new file mode 100644
index 0000000..b50980b
--- /dev/null
+++ b/aligner_seed_policy.cpp
@@ -0,0 +1,888 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <string>
+#include <iostream>
+#include <sstream>
+#include <limits>
+#include "ds.h"
+#include "aligner_seed_policy.h"
+#include "mem_ids.h"
+
+using namespace std;
+
+/**
+ * Translate a one-letter (or long-form) function-type code into the
+ * corresponding SIMPLE_FUNC_* constant.  Prints a diagnostic and throws
+ * on any unrecognized code.
+ */
+static int parseFuncType(const std::string& otype) {
+	const string& type = otype;
+	if(type == "C" || type == "Constant") return SIMPLE_FUNC_CONST;
+	if(type == "L" || type == "Linear")   return SIMPLE_FUNC_LINEAR;
+	if(type == "S" || type == "Sqrt")     return SIMPLE_FUNC_SQRT;
+	if(type == "G" || type == "Log")      return SIMPLE_FUNC_LOG;
+	std::cerr << "Error: Bad function type '" << otype.c_str()
+	          << "'.  Should be C (constant), L (linear), "
+	          << "S (square root) or G (natural log)." << std::endl;
+	throw 1;
+}
+
+/*
+ * Parse up to 5 comma-separated tokens (held in 'ctoks') into the given
+ * SimpleFunc 'fv': type, constant term, coefficient, minimum, maximum.
+ * Tokens beyond those supplied leave the corresponding field untouched.
+ */
+#define PARSE_FUNC(fv) { \
+	if(ctoks.size() >= 1) { \
+		fv.setType(parseFuncType(ctoks[0])); \
+	} \
+	if(ctoks.size() >= 2) { \
+		double co; \
+		istringstream tmpss(ctoks[1]); \
+		tmpss >> co; \
+		fv.setConst(co); \
+	} \
+	if(ctoks.size() >= 3) { \
+		double ce; \
+		istringstream tmpss(ctoks[2]); \
+		tmpss >> ce; \
+		fv.setCoeff(ce); \
+	} \
+	if(ctoks.size() >= 4) { \
+		double mn; \
+		istringstream tmpss(ctoks[3]); \
+		tmpss >> mn; \
+		fv.setMin(mn); \
+	} \
+	if(ctoks.size() >= 5) { \
+		double mx; \
+		istringstream tmpss(ctoks[4]); \
+		tmpss >> mx; \
+		/* BUGFIX: was fv.setMin(mx), which clobbered the minimum set by */ \
+		/* token 4 and left the maximum unset. */ \
+		fv.setMax(mx); \
+	} \
+}
+
+/**
+ * Parse alignment policy when provided in this format:
+ * <lab>=<val>;<lab>=<val>;<lab>=<val>...
+ *
+ * And label=value possibilities are:
+ *
+ * Bonus for a match
+ * -----------------
+ *
+ * MA=xx (default: MA=0, or MA=2 if --local is set)
+ *
+ *    xx = Each position where equal read and reference characters match up
+ *         in the alignment contributes this amount to the total score.
+ *
+ * Penalty for a mismatch
+ * ----------------------
+ *
+ * MMP={Cxx|Q|RQ} (default: MMP=C6)
+ *
+ *   Cxx = Each mismatch costs xx.  If MMP=Cxx is specified, quality
+ *         values are ignored when assessing penalties for mismatches.
+ *   Q   = Each mismatch incurs a penalty equal to the mismatched base's
+ *         value.
+ *   R   = Each mismatch incurs a penalty equal to the mismatched base's
+ *         rounded quality value.  Qualities are rounded off to the
+ *         nearest 10, and qualities greater than 30 are rounded to 30.
+ *
+ * Penalty for position with N (in either read or reference)
+ * ---------------------------------------------------------
+ *
+ * NP={Cxx|Q|RQ} (default: NP=C1)
+ *
+ *   Cxx = Each alignment position with an N in either the read or the
+ *         reference costs xx.  If NP=Cxx is specified, quality values are
+ *         ignored when assessing penalties for Ns.
+ *   Q   = Each alignment position with an N in either the read or the
+ *         reference incurs a penalty equal to the read base's quality
+ *         value.
+ *   R   = Each alignment position with an N in either the read or the
+ *         reference incurs a penalty equal to the read base's rounded
+ *         quality value.  Qualities are rounded off to the nearest 10,
+ *         and qualities greater than 30 are rounded to 30.
+ *
+ * Penalty for a read gap
+ * ----------------------
+ *
+ * RDG=xx,yy (default: RDG=5,3)
+ *
+ *   xx    = Read gap open penalty.
+ *   yy    = Read gap extension penalty.
+ *
+ * Total cost incurred by a read gap = xx + (yy * gap length)
+ *
+ * Penalty for a reference gap
+ * ---------------------------
+ *
+ * RFG=xx,yy (default: RFG=5,3)
+ *
+ *   xx    = Reference gap open penalty.
+ *   yy    = Reference gap extension penalty.
+ *
+ * Total cost incurred by a reference gap = xx + (yy * gap length)
+ *
+ * Minimum score for valid alignment
+ * ---------------------------------
+ *
+ * MIN=xx,yy (defaults: MIN=-0.6,-0.6, or MIN=0.0,0.66 if --local is set)
+ *
+ *   xx,yy = For a read of length N, the total score must be at least
+ *           xx + (read length * yy) for the alignment to be valid.  The
+ *           total score is the sum of all negative penalties (from
+ *           mismatches and gaps) and all positive bonuses.  The minimum
+ *           can be negative (and is by default in global alignment mode).
+ *
+ * Score floor for local alignment
+ * -------------------------------
+ *
+ * FL=xx,yy (defaults: FL=-Infinity,0.0, or FL=0.0,0.0 if --local is set)
+ *
+ *   xx,yy = If a cell in the dynamic programming table has a score less
+ *           than xx + (read length * yy), then no valid alignment can go
+ *           through it.  Defaults are highly recommended.
+ *
+ * N ceiling
+ * ---------
+ *
+ * NCEIL=xx,yy (default: NCEIL=0.0,0.15)
+ *
+ *   xx,yy = For a read of length N, the number of alignment
+ *           positions with an N in either the read or the
+ *           reference cannot exceed
+ *           ceiling = xx + (read length * yy).  If the ceiling is
+ *           exceeded, the alignment is considered invalid.
+ *
+ * Seeds
+ * -----
+ *
+ * SEED=mm,len,ival (default: SEED=0,22)
+ *
+ *   mm   = Maximum number of mismatches allowed within a seed.
+ *          Must be >= 0 and <= 2.  Note that 2-mismatch mode is
+ *          not fully sensitive; i.e. some 2-mismatch seed
+ *          alignments may be missed.
+ *   len  = Length of seed.
+ *   ival = Interval between seeds.  If not specified, seed
+ *          interval is determined by IVAL.
+ *
+ * Seed interval
+ * -------------
+ *
+ * IVAL={L|S|C},xx,yy (default: IVAL=S,1.0,0.0)
+ *
+ *   L  = let interval between seeds be a linear function of the
+ *        read length.  xx and yy are the constant and linear
+ *        coefficients respectively.  In other words, the interval
+ *        equals a * len + b, where len is the read length.
+ *        Intervals less than 1 are rounded up to 1.
+ *   S  = let interval between seeds be a function of the square
+ *        root of the  read length.  xx and yy are the
+ *        coefficients.  In other words, the interval equals
+ *        a * sqrt(len) + b, where len is the read length.
+ *        Intervals less than 1 are rounded up to 1.
+ *   C  = Like S but uses cube root of length instead of square
+ *        root.
+ *
+ * Example 1:
+ *
+ *  SEED=1,10,5 and read sequence is TGCTATCGTACGATCGTAC:
+ *
+ *  The following seeds are extracted from the forward
+ *  representation of the read and aligned to the reference
+ *  allowing up to 1 mismatch:
+ *
+ *  Read:    TGCTATCGTACGATCGTACA
+ *
+ *  Seed 1+: TGCTATCGTA
+ *  Seed 2+:      TCGTACGATC
+ *  Seed 3+:           CGATCGTACA
+ *
+ *  ...and the following are extracted from the reverse-complement
+ *  representation of the read and align to the reference allowing
+ *  up to 1 mismatch:
+ *
+ *  Seed 1-: TACGATAGCA
+ *  Seed 2-:      GATCGTACGA
+ *  Seed 3-:           TGTACGATCG
+ *
+ * Example 2:
+ *
+ *  SEED=1,20,20 and read sequence is TGCTATCGTACGATC.  The seed
+ *  length is 20 but the read is only 15 characters long.  In this
+ *  case, Bowtie2 automatically shrinks the seed length to be equal
+ *  to the read length.
+ *
+ *  Read:    TGCTATCGTACGATC
+ *
+ *  Seed 1+: TGCTATCGTACGATC
+ *  Seed 1-: GATCGTACGATAGCA
+ *
+ * Example 3:
+ *
+ *  SEED=1,10,10 and read sequence is TGCTATCGTACGATC.  Only one seed
+ *  fits on the read; a second seed would overhang the end of the read
+ *  by 5 positions.  In this case, Bowtie2 extracts one seed.
+ *
+ *  Read:    TGCTATCGTACGATC
+ *
+ *  Seed 1+: TGCTATCGTA
+ *  Seed 1-: TACGATAGCA
+ */
+void SeedAlignmentPolicy::parseString(
+	const       std::string& s,
+	bool        local,
+	bool        noisyHpolymer,
+	bool        ignoreQuals,
+	int&        bonusMatchType,
+	int&        bonusMatch,
+	int&        penMmcType,
+	int&        penMmcMax,
+	int&        penMmcMin,
+	int&        penNType,
+	int&        penN,
+	int&        penRdExConst,
+	int&        penRfExConst,
+	int&        penRdExLinear,
+	int&        penRfExLinear,
+	SimpleFunc& costMin,
+	SimpleFunc& nCeil,
+	bool&       nCatPair,
+	int&        multiseedMms,
+	int&        multiseedLen,
+	SimpleFunc& multiseedIval,
+	size_t&     failStreak,
+	size_t&     seedRounds,
+    SimpleFunc* penCanSplice,
+    SimpleFunc* penNoncanSplice,
+    SimpleFunc* penIntronLen)
+{
+	// Install defaults first; each ;-separated <lab>=<val> setting in 's'
+	// then overrides the corresponding output.
+	// NOTE: failStreak and seedRounds are only assigned when DPS=/ROUNDS=
+	// appear in the policy string, so callers must pre-initialize them.
+	// penCanSplice and penNoncanSplice are not consumed in this function;
+	// only INTRONLEN (-> penIntronLen) is parsed here.
+	bonusMatchType    = local ? DEFAULT_MATCH_BONUS_TYPE_LOCAL : DEFAULT_MATCH_BONUS_TYPE;
+	bonusMatch        = local ? DEFAULT_MATCH_BONUS_LOCAL : DEFAULT_MATCH_BONUS;
+	penMmcType        = ignoreQuals ? DEFAULT_MM_PENALTY_TYPE_IGNORE_QUALS :
+	                                  DEFAULT_MM_PENALTY_TYPE;
+	penMmcMax         = DEFAULT_MM_PENALTY_MAX;
+	penMmcMin         = DEFAULT_MM_PENALTY_MIN;
+	penNType          = DEFAULT_N_PENALTY_TYPE;
+	penN              = DEFAULT_N_PENALTY;
+	
+	const double DMAX = std::numeric_limits<double>::max();
+    /*
+	costMin.init(
+		local ? SIMPLE_FUNC_LOG : SIMPLE_FUNC_LINEAR,
+		local ? DEFAULT_MIN_CONST_LOCAL  : DEFAULT_MIN_CONST,
+		local ? DEFAULT_MIN_LINEAR_LOCAL : DEFAULT_MIN_LINEAR);
+    */
+    costMin.init(
+                 local ? SIMPLE_FUNC_LOG : SIMPLE_FUNC_CONST,
+                 local ? DEFAULT_MIN_CONST_LOCAL  : -18,
+                 local ? DEFAULT_MIN_LINEAR_LOCAL : 0);
+	nCeil.init(
+		SIMPLE_FUNC_LINEAR, 0.0f, DMAX,
+		DEFAULT_N_CEIL_CONST, DEFAULT_N_CEIL_LINEAR);
+	multiseedIval.init(
+		DEFAULT_IVAL, 1.0f, DMAX,
+		DEFAULT_IVAL_B, DEFAULT_IVAL_A);
+	nCatPair          = DEFAULT_N_CAT_PAIR;
+
+	if(!noisyHpolymer) {
+		penRdExConst  = DEFAULT_READ_GAP_CONST;
+		penRdExLinear = DEFAULT_READ_GAP_LINEAR;
+		penRfExConst  = DEFAULT_REF_GAP_CONST;
+		penRfExLinear = DEFAULT_REF_GAP_LINEAR;
+	} else {
+		penRdExConst  = DEFAULT_READ_GAP_CONST_BADHPOLY;
+		penRdExLinear = DEFAULT_READ_GAP_LINEAR_BADHPOLY;
+		penRfExConst  = DEFAULT_REF_GAP_CONST_BADHPOLY;
+		penRfExLinear = DEFAULT_REF_GAP_LINEAR_BADHPOLY;
+	}
+	
+	multiseedMms      = DEFAULT_SEEDMMS;
+	multiseedLen      = DEFAULT_SEEDLEN;
+	
+	EList<string> toks(MISC_CAT);
+	string tok;
+	istringstream ss(s);
+	int setting = 0;
+	// Get each ;-separated token
+	while(getline(ss, tok, ';')) {
+		setting++;
+		EList<string> etoks(MISC_CAT);
+		string etok;
+		// Divide into tokens on either side of =
+		istringstream ess(tok);
+		while(getline(ess, etok, '=')) {
+			etoks.push_back(etok);
+		}
+		// Must be exactly 1 =
+		if(etoks.size() != 2) {
+			cerr << "Error parsing alignment policy setting " << setting
+			     << "; must be bisected by = sign" << endl
+				 << "Policy: " << s.c_str() << endl;
+			assert(false); throw 1;
+		}
+		// LHS is tag, RHS value
+		string tag = etoks[0], val = etoks[1];
+		// Separate value into comma-separated tokens
+		EList<string> ctoks(MISC_CAT);
+		string ctok;
+		istringstream css(val);
+		while(getline(css, ctok, ',')) {
+			ctoks.push_back(ctok);
+		}
+		if(ctoks.size() == 0) {
+			cerr << "Error parsing alignment policy setting " << setting
+			     << "; RHS must have at least 1 token" << endl
+				 << "Policy: " << s.c_str() << endl;
+			assert(false); throw 1;
+		}
+		for(size_t i = 0; i < ctoks.size(); i++) {
+			if(ctoks[i].length() == 0) {
+				cerr << "Error parsing alignment policy setting " << setting
+				     << "; token " << i+1 << " on RHS had length=0" << endl
+					 << "Policy: " << s.c_str() << endl;
+				assert(false); throw 1;
+			}
+		}
+		// Bonus for a match
+		// MA=xx (default: MA=0, or MA=10 if --local is set)
+		if(tag == "MA") {
+			if(ctoks.size() != 1) {
+				cerr << "Error parsing alignment policy setting " << setting
+				     << "; RHS must have 1 token" << endl
+					 << "Policy: " << s.c_str() << endl;
+				assert(false); throw 1;
+			}
+			string tmp = ctoks[0];
+			istringstream tmpss(tmp);
+			tmpss >> bonusMatch;
+		}
+		// Scoring for mismatches
+		// MMP={Cxx|Q|RQ}
+		//        Cxx = constant, where constant is integer xx
+		//        Qxx = equal to quality, scaled
+		//        R   = equal to maq-rounded quality value (rounded to nearest
+		//              10, can't be greater than 30)
+		else if(tag == "MMP") {
+			if(ctoks.size() > 3) {
+				cerr << "Error parsing alignment policy setting "
+				     << "'" << tag.c_str() << "'"
+				     << "; RHS must have at most 3 tokens" << endl
+					 << "Policy: '" << s.c_str() << "'" << endl;
+				assert(false); throw 1;
+			}
+			if(ctoks[0][0] == 'C') {
+				string tmp = ctoks[0].substr(1);
+				// Parse constant penalty
+				istringstream tmpss(tmp);
+				tmpss >> penMmcMax;
+				// Constant penalty: min == max, qualities ignored
+				penMmcMin = penMmcMax;
+				penMmcType = COST_MODEL_CONSTANT;
+			} else if(ctoks[0][0] == 'Q') {
+				if(ctoks.size() >= 2) {
+					string tmp = ctoks[1];
+					istringstream tmpss(tmp);
+					tmpss >> penMmcMax;
+				} else {
+					penMmcMax = DEFAULT_MM_PENALTY_MAX;
+				}
+				if(ctoks.size() >= 3) {
+					string tmp = ctoks[2];
+					istringstream tmpss(tmp);
+					tmpss >> penMmcMin;
+				} else {
+					penMmcMin = DEFAULT_MM_PENALTY_MIN;
+				}
+				if(penMmcMin > penMmcMax) {
+					// BUGFIX: error message previously omitted the closing
+					// parenthesis after the minimum penalty value.
+					cerr << "Error: Maximum mismatch penalty (" << penMmcMax
+					     << ") is less than minimum penalty (" << penMmcMin
+						 << ")" << endl;
+					throw 1;
+				}
+				// Set type to =quality
+				penMmcType = COST_MODEL_QUAL;
+			} else if(ctoks[0][0] == 'R') {
+				// Set type to=Maq-quality
+				penMmcType = COST_MODEL_ROUNDED_QUAL;
+			} else {
+				cerr << "Error parsing alignment policy setting "
+				     << "'" << tag.c_str() << "'"
+				     << "; RHS must start with C, Q or R" << endl
+					 << "Policy: '" << s.c_str() << "'" << endl;
+				assert(false); throw 1;
+			}
+		}
+		// Scoring for mismatches where read char=N
+		// NP={Cxx|Q|RQ}
+		//        Cxx = constant, where constant is integer xx
+		//        Q   = equal to quality
+		//        R   = equal to maq-rounded quality value (rounded to nearest
+		//              10, can't be greater than 30)
+		else if(tag == "NP") {
+			if(ctoks.size() != 1) {
+				cerr << "Error parsing alignment policy setting "
+				     << "'" << tag.c_str() << "'"
+				     << "; RHS must have 1 token" << endl
+					 << "Policy: '" << s.c_str() << "'" << endl;
+				assert(false); throw 1;
+			}
+			if(ctoks[0][0] == 'C') {
+				string tmp = ctoks[0].substr(1);
+				// Parse constant penalty
+				istringstream tmpss(tmp);
+				tmpss >> penN;
+				// Parse constant penalty
+				penNType = COST_MODEL_CONSTANT;
+			} else if(ctoks[0][0] == 'Q') {
+				// Set type to =quality
+				penNType = COST_MODEL_QUAL;
+			} else if(ctoks[0][0] == 'R') {
+				// Set type to=Maq-quality
+				penNType = COST_MODEL_ROUNDED_QUAL;
+			} else {
+				cerr << "Error parsing alignment policy setting "
+				     << "'" << tag.c_str() << "'"
+				     << "; RHS must start with C, Q or R" << endl
+					 << "Policy: '" << s.c_str() << "'" << endl;
+				assert(false); throw 1;
+			}
+		}
+		// Scoring for read gaps
+		// RDG=xx,yy,zz
+		//        xx = read gap open penalty
+		//        yy = read gap extension penalty constant coefficient
+		//             (defaults to open penalty)
+		//        zz = read gap extension penalty linear coefficient
+		//             (defaults to 0)
+		else if(tag == "RDG") {
+			if(ctoks.size() >= 1) {
+				istringstream tmpss(ctoks[0]);
+				tmpss >> penRdExConst;
+			} else {
+				penRdExConst = noisyHpolymer ?
+					DEFAULT_READ_GAP_CONST_BADHPOLY :
+					DEFAULT_READ_GAP_CONST;
+			}
+			if(ctoks.size() >= 2) {
+				istringstream tmpss(ctoks[1]);
+				tmpss >> penRdExLinear;
+			} else {
+				penRdExLinear = noisyHpolymer ?
+					DEFAULT_READ_GAP_LINEAR_BADHPOLY :
+					DEFAULT_READ_GAP_LINEAR;
+			}
+		}
+		// Scoring for reference gaps
+		// RFG=xx,yy,zz
+		//        xx = ref gap open penalty
+		//        yy = ref gap extension penalty constant coefficient
+		//             (defaults to open penalty)
+		//        zz = ref gap extension penalty linear coefficient
+		//             (defaults to 0)
+		else if(tag == "RFG") {
+			if(ctoks.size() >= 1) {
+				istringstream tmpss(ctoks[0]);
+				tmpss >> penRfExConst;
+			} else {
+				penRfExConst = noisyHpolymer ?
+					DEFAULT_REF_GAP_CONST_BADHPOLY :
+					DEFAULT_REF_GAP_CONST;
+			}
+			if(ctoks.size() >= 2) {
+				istringstream tmpss(ctoks[1]);
+				tmpss >> penRfExLinear;
+			} else {
+				penRfExLinear = noisyHpolymer ?
+					DEFAULT_REF_GAP_LINEAR_BADHPOLY :
+					DEFAULT_REF_GAP_LINEAR;
+			}
+		}
+		// Minimum score as a function of read length
+		// MIN=xx,yy
+		//        xx = constant coefficient
+		//        yy = linear coefficient
+		else if(tag == "MIN") {
+			PARSE_FUNC(costMin);
+		}
+		// Per-read N ceiling as a function of read length
+		// NCEIL=xx,yy
+		//        xx = N ceiling constant coefficient
+		//        yy = N ceiling linear coefficient (set to 0 if unspecified)
+		else if(tag == "NCEIL") {
+			PARSE_FUNC(nCeil);
+		}
+		/*
+		 * Seeds
+		 * -----
+		 *
+		 * SEED=mm,len,ival (default: SEED=0,22)
+		 *
+		 *   mm   = Maximum number of mismatches allowed within a seed.
+		 *          Must be >= 0 and <= 2.  Note that 2-mismatch mode is
+		 *          not fully sensitive; i.e. some 2-mismatch seed
+		 *          alignments may be missed.
+		 *   len  = Length of seed.
+		 *   ival = Interval between seeds.  If not specified, seed
+		 *          interval is determined by IVAL.
+		 */
+		else if(tag == "SEED") {
+			if(ctoks.size() > 2) {
+				cerr << "Error parsing alignment policy setting "
+				     << "'" << tag.c_str() << "'; RHS must have 1 or 2 tokens, "
+					 << "had " << ctoks.size() << ".  "
+					 << "Policy: '" << s.c_str() << "'" << endl;
+				assert(false); throw 1;
+			}
+			if(ctoks.size() >= 1) {
+				istringstream tmpss(ctoks[0]);
+				tmpss >> multiseedMms;
+				// Despite the comment above, this build only supports 0 or 1
+				// seed mismatches.
+				if(multiseedMms > 1) {
+					cerr << "Error: -N was set to " << multiseedMms << ", but cannot be set greater than 1" << endl;
+					throw 1;
+				}
+				if(multiseedMms < 0) {
+					cerr << "Error: -N was set to a number less than 0 (" << multiseedMms << ")" << endl;
+					throw 1;
+				}
+			}
+			if(ctoks.size() >= 2) {
+				istringstream tmpss(ctoks[1]);
+				tmpss >> multiseedLen;
+			} else {
+				multiseedLen = DEFAULT_SEEDLEN;
+			}
+		}
+		else if(tag == "SEEDLEN") {
+			if(ctoks.size() > 1) {
+				cerr << "Error parsing alignment policy setting "
+				     << "'" << tag.c_str() << "'; RHS must have 1 token, "
+					 << "had " << ctoks.size() << ".  "
+					 << "Policy: '" << s.c_str() << "'" << endl;
+				assert(false); throw 1;
+			}
+			if(ctoks.size() >= 1) {
+				istringstream tmpss(ctoks[0]);
+				tmpss >> multiseedLen;
+			}
+		}
+		// DPS=xx: maximum length of a streak of failed DP attempts
+		else if(tag == "DPS") {
+			if(ctoks.size() > 1) {
+				cerr << "Error parsing alignment policy setting "
+				     << "'" << tag.c_str() << "'; RHS must have 1 token, "
+					 << "had " << ctoks.size() << ".  "
+					 << "Policy: '" << s.c_str() << "'" << endl;
+				assert(false); throw 1;
+			}
+			if(ctoks.size() >= 1) {
+				istringstream tmpss(ctoks[0]);
+				tmpss >> failStreak;
+			}
+		}
+		// ROUNDS=xx: maximum number of seed-extension rounds
+		else if(tag == "ROUNDS") {
+			if(ctoks.size() > 1) {
+				cerr << "Error parsing alignment policy setting "
+				     << "'" << tag.c_str() << "'; RHS must have 1 token, "
+					 << "had " << ctoks.size() << ".  "
+					 << "Policy: '" << s.c_str() << "'" << endl;
+				assert(false); throw 1;
+			}
+			if(ctoks.size() >= 1) {
+				istringstream tmpss(ctoks[0]);
+				tmpss >> seedRounds;
+			}
+		}
+		/*
+		 * Seed interval
+		 * -------------
+		 *
+		 * IVAL={L|S|C},a,b (default: IVAL=S,1.0,0.0)
+		 *
+		 *   L  = let interval between seeds be a linear function of the
+		 *        read length.  xx and yy are the constant and linear
+		 *        coefficients respectively.  In other words, the interval
+		 *        equals a * len + b, where len is the read length.
+		 *        Intervals less than 1 are rounded up to 1.
+		 *   S  = let interval between seeds be a function of the square
+		 *        root of the read length.  xx and yy are the
+		 *        coefficients.  In other words, the interval equals
+		 *        a * sqrt(len) + b, where len is the read length.
+		 *        Intervals less than 1 are rounded up to 1.
+		 *   C  = Like S but uses cube root of length instead of square
+		 *        root.
+		 */
+		else if(tag == "IVAL") {
+			PARSE_FUNC(multiseedIval);
+		}
+        else if(tag == "INTRONLEN") {
+            assert(penIntronLen != NULL);
+			PARSE_FUNC((*penIntronLen));
+		}
+		else {
+			// Unknown tag
+			cerr << "Unexpected alignment policy setting "
+				 << "'" << tag.c_str() << "'" << endl
+				 << "Policy: '" << s.c_str() << "'" << endl;
+			assert(false); throw 1;
+		}
+	}
+}
+
+#ifdef ALIGNER_SEED_POLICY_MAIN
+// NOTE(review): this self-test harness appears stale relative to the
+// parseString() definition above.  The calls below pass 'costFloor' and a
+// single 'penMmc'/'mhits', while the current signature takes
+// penMmcMax/penMmcMin, failStreak, seedRounds and splice-penalty pointers
+// (and no costFloor).  Cases 1 and 3 also assert on penMmcMax/penMmcMin,
+// which are never declared in this main().  Consequently this TU will not
+// compile with ALIGNER_SEED_POLICY_MAIN defined.  TODO: bring the call
+// sites and assertions in line with the current signature.
+int main() {
+
+	int bonusMatchType;
+	int bonusMatch;
+	int penMmcType;
+	int penMmc;
+	int penNType;
+	int penN;
+	int penRdExConst;
+	int penRfExConst;
+	int penRdExLinear;
+	int penRfExLinear;
+	SimpleFunc costMin;
+	SimpleFunc costFloor;
+	SimpleFunc nCeil;
+	bool nCatPair;
+	int multiseedMms;
+	int multiseedLen;
+	SimpleFunc msIval;
+	SimpleFunc posfrac;
+	SimpleFunc rowmult;
+	uint32_t mhits;
+
+	// Case 1: empty policy string, global alignment, clean homopolymers —
+	// every output should land on its non-local default.
+	{
+		cout << "Case 1: Defaults 1 ... ";
+		const char *pol = "";
+		SeedAlignmentPolicy::parseString(
+			string(pol),
+			false,              // --local?
+			false,              // noisy homopolymers a la 454?
+			false,              // ignore qualities?
+			bonusMatchType,
+			bonusMatch,
+			penMmcType,
+			penMmc,
+			penNType,
+			penN,
+			penRdExConst,
+			penRfExConst,
+			penRdExLinear,
+			penRfExLinear,
+			costMin,
+			costFloor,
+			nCeil,
+			nCatPair,
+			multiseedMms,
+			multiseedLen,
+			msIval,
+			mhits);
+		
+		assert_eq(DEFAULT_MATCH_BONUS_TYPE,   bonusMatchType);
+		assert_eq(DEFAULT_MATCH_BONUS,        bonusMatch);
+		assert_eq(DEFAULT_MM_PENALTY_TYPE,    penMmcType);
+		assert_eq(DEFAULT_MM_PENALTY_MAX,     penMmcMax);
+		assert_eq(DEFAULT_MM_PENALTY_MIN,     penMmcMin);
+		assert_eq(DEFAULT_N_PENALTY_TYPE,     penNType);
+		assert_eq(DEFAULT_N_PENALTY,          penN);
+		assert_eq(DEFAULT_MIN_CONST,          costMin.getConst());
+		assert_eq(DEFAULT_MIN_LINEAR,         costMin.getCoeff());
+		assert_eq(DEFAULT_FLOOR_CONST,        costFloor.getConst());
+		assert_eq(DEFAULT_FLOOR_LINEAR,       costFloor.getCoeff());
+		assert_eq(DEFAULT_N_CEIL_CONST,       nCeil.getConst());
+		assert_eq(DEFAULT_N_CAT_PAIR,         nCatPair);
+
+		assert_eq(DEFAULT_READ_GAP_CONST,     penRdExConst);
+		assert_eq(DEFAULT_READ_GAP_LINEAR,    penRdExLinear);
+		assert_eq(DEFAULT_REF_GAP_CONST,      penRfExConst);
+		assert_eq(DEFAULT_REF_GAP_LINEAR,     penRfExLinear);
+		assert_eq(DEFAULT_SEEDMMS,            multiseedMms);
+		assert_eq(DEFAULT_SEEDLEN,            multiseedLen);
+		assert_eq(DEFAULT_IVAL,               msIval.getType());
+		assert_eq(DEFAULT_IVAL_A,             msIval.getCoeff());
+		assert_eq(DEFAULT_IVAL_B,             msIval.getConst());
+		
+		cout << "PASSED" << endl;
+	}
+
+	// Case 2: same as case 1 but with noisy homopolymers (454-style), so
+	// the gap penalties should take their *_BADHPOLY defaults.
+	{
+		cout << "Case 2: Defaults 2 ... ";
+		const char *pol = "";
+		SeedAlignmentPolicy::parseString(
+			string(pol),
+			false,              // --local?
+			true,               // noisy homopolymers a la 454?
+			false,              // ignore qualities?
+			bonusMatchType,
+			bonusMatch,
+			penMmcType,
+			penMmc,
+			penNType,
+			penN,
+			penRdExConst,
+			penRfExConst,
+			penRdExLinear,
+			penRfExLinear,
+			costMin,
+			costFloor,
+			nCeil,
+			nCatPair,
+			multiseedMms,
+			multiseedLen,
+			msIval,
+			mhits);
+		
+		assert_eq(DEFAULT_MATCH_BONUS_TYPE,   bonusMatchType);
+		assert_eq(DEFAULT_MATCH_BONUS,        bonusMatch);
+		assert_eq(DEFAULT_MM_PENALTY_TYPE,    penMmcType);
+		assert_eq(DEFAULT_MM_PENALTY_MAX,     penMmc);
+		assert_eq(DEFAULT_MM_PENALTY_MIN,     penMmc);
+		assert_eq(DEFAULT_N_PENALTY_TYPE,     penNType);
+		assert_eq(DEFAULT_N_PENALTY,          penN);
+		assert_eq(DEFAULT_MIN_CONST,          costMin.getConst());
+		assert_eq(DEFAULT_MIN_LINEAR,         costMin.getCoeff());
+		assert_eq(DEFAULT_FLOOR_CONST,        costFloor.getConst());
+		assert_eq(DEFAULT_FLOOR_LINEAR,       costFloor.getCoeff());
+		assert_eq(DEFAULT_N_CEIL_CONST,       nCeil.getConst());
+		assert_eq(DEFAULT_N_CAT_PAIR,         nCatPair);
+
+		assert_eq(DEFAULT_READ_GAP_CONST_BADHPOLY,  penRdExConst);
+		assert_eq(DEFAULT_READ_GAP_LINEAR_BADHPOLY, penRdExLinear);
+		assert_eq(DEFAULT_REF_GAP_CONST_BADHPOLY,   penRfExConst);
+		assert_eq(DEFAULT_REF_GAP_LINEAR_BADHPOLY,  penRfExLinear);
+		assert_eq(DEFAULT_SEEDMMS,            multiseedMms);
+		assert_eq(DEFAULT_SEEDLEN,            multiseedLen);
+		assert_eq(DEFAULT_IVAL,               msIval.getType());
+		assert_eq(DEFAULT_IVAL_A,             msIval.getCoeff());
+		assert_eq(DEFAULT_IVAL_B,             msIval.getConst());
+		
+		cout << "PASSED" << endl;
+	}
+
+	// Case 3: empty policy string with --local, so match bonus and
+	// min-score defaults should switch to their *_LOCAL variants.
+	{
+		cout << "Case 3: Defaults 3 ... ";
+		const char *pol = "";
+		SeedAlignmentPolicy::parseString(
+			string(pol),
+			true,               // --local?
+			false,              // noisy homopolymers a la 454?
+			false,              // ignore qualities?
+			bonusMatchType,
+			bonusMatch,
+			penMmcType,
+			penMmc,
+			penNType,
+			penN,
+			penRdExConst,
+			penRfExConst,
+			penRdExLinear,
+			penRfExLinear,
+			costMin,
+			costFloor,
+			nCeil,
+			nCatPair,
+			multiseedMms,
+			multiseedLen,
+			msIval,
+			mhits);
+		
+		assert_eq(DEFAULT_MATCH_BONUS_TYPE_LOCAL,   bonusMatchType);
+		assert_eq(DEFAULT_MATCH_BONUS_LOCAL,        bonusMatch);
+		assert_eq(DEFAULT_MM_PENALTY_TYPE,    penMmcType);
+		assert_eq(DEFAULT_MM_PENALTY_MAX,     penMmcMax);
+		assert_eq(DEFAULT_MM_PENALTY_MIN,     penMmcMin);
+		assert_eq(DEFAULT_N_PENALTY_TYPE,     penNType);
+		assert_eq(DEFAULT_N_PENALTY,          penN);
+		assert_eq(DEFAULT_MIN_CONST_LOCAL,    costMin.getConst());
+		assert_eq(DEFAULT_MIN_LINEAR_LOCAL,   costMin.getCoeff());
+		assert_eq(DEFAULT_FLOOR_CONST_LOCAL,  costFloor.getConst());
+		assert_eq(DEFAULT_FLOOR_LINEAR_LOCAL, costFloor.getCoeff());
+		assert_eq(DEFAULT_N_CEIL_CONST,       nCeil.getConst());
+		assert_eq(DEFAULT_N_CEIL_LINEAR,      nCeil.getCoeff());
+		assert_eq(DEFAULT_N_CAT_PAIR,         nCatPair);
+
+		assert_eq(DEFAULT_READ_GAP_CONST,     penRdExConst);
+		assert_eq(DEFAULT_READ_GAP_LINEAR,    penRdExLinear);
+		assert_eq(DEFAULT_REF_GAP_CONST,      penRfExConst);
+		assert_eq(DEFAULT_REF_GAP_LINEAR,     penRfExLinear);
+		assert_eq(DEFAULT_SEEDMMS,            multiseedMms);
+		assert_eq(DEFAULT_SEEDLEN,            multiseedLen);
+		assert_eq(DEFAULT_IVAL,               msIval.getType());
+		assert_eq(DEFAULT_IVAL_A,             msIval.getCoeff());
+		assert_eq(DEFAULT_IVAL_B,             msIval.getConst());
+
+		cout << "PASSED" << endl;
+	}
+
+	// Case 4: explicit settings for every major tag; everything not set
+	// by the string should retain its --local default.
+	{
+		cout << "Case 4: Simple string 1 ... ";
+		const char *pol = "MMP=C44;MA=4;RFG=24,12;FL=C,8;RDG=2;NP=C4;MIN=C,7";
+		SeedAlignmentPolicy::parseString(
+			string(pol),
+			true,               // --local?
+			false,              // noisy homopolymers a la 454?
+			false,              // ignore qualities?
+			bonusMatchType,
+			bonusMatch,
+			penMmcType,
+			penMmc,
+			penNType,
+			penN,
+			penRdExConst,
+			penRfExConst,
+			penRdExLinear,
+			penRfExLinear,
+			costMin,
+			costFloor,
+			nCeil,
+			nCatPair,
+			multiseedMms,
+			multiseedLen,
+			msIval,
+			mhits);
+		
+		assert_eq(COST_MODEL_CONSTANT,        bonusMatchType);
+		assert_eq(4,                          bonusMatch);
+		assert_eq(COST_MODEL_CONSTANT,        penMmcType);
+		assert_eq(44,                         penMmc);
+		assert_eq(COST_MODEL_CONSTANT,        penNType);
+		assert_eq(4.0f,                       penN);
+		assert_eq(7,                          costMin.getConst());
+		assert_eq(DEFAULT_MIN_LINEAR_LOCAL,   costMin.getCoeff());
+		assert_eq(8,                          costFloor.getConst());
+		assert_eq(DEFAULT_FLOOR_LINEAR_LOCAL, costFloor.getCoeff());
+		assert_eq(DEFAULT_N_CEIL_CONST,       nCeil.getConst());
+		assert_eq(DEFAULT_N_CEIL_LINEAR,      nCeil.getCoeff());
+		assert_eq(DEFAULT_N_CAT_PAIR,         nCatPair);
+
+		assert_eq(2.0f,                       penRdExConst);
+		assert_eq(DEFAULT_READ_GAP_LINEAR,    penRdExLinear);
+		assert_eq(24.0f,                      penRfExConst);
+		assert_eq(12.0f,                      penRfExLinear);
+		assert_eq(DEFAULT_SEEDMMS,            multiseedMms);
+		assert_eq(DEFAULT_SEEDLEN,            multiseedLen);
+		assert_eq(DEFAULT_IVAL,               msIval.getType());
+		assert_eq(DEFAULT_IVAL_A,             msIval.getCoeff());
+		assert_eq(DEFAULT_IVAL_B,             msIval.getConst());
+
+		cout << "PASSED" << endl;
+	}
+}
+#endif /*def ALIGNER_SEED_POLICY_MAIN*/
diff --git a/aligner_seed_policy.h b/aligner_seed_policy.h
new file mode 100644
index 0000000..e6ed8a6
--- /dev/null
+++ b/aligner_seed_policy.h
@@ -0,0 +1,233 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef ALIGNER_SEED_POLICY_H_
+#define ALIGNER_SEED_POLICY_H_
+
+#include "scoring.h"
+#include "simple_func.h"
+
+#define DEFAULT_SEEDMMS 0
+#define DEFAULT_SEEDLEN 22
+
+#define DEFAULT_IVAL SIMPLE_FUNC_SQRT
+#define DEFAULT_IVAL_A 1.15f
+#define DEFAULT_IVAL_B 0.0f
+
+#define DEFAULT_UNGAPPED_HITS 6
+
+/**
+ * Encapsulates the set of all parameters that affect what the
+ * SeedAligner does with reads.
+ */
+class SeedAlignmentPolicy {
+
+public:
+
+	/**
+	 * Parse alignment policy when provided in this format:
+	 * <lab>=<val>;<lab>=<val>;<lab>=<val>...
+	 *
+	 * And label=value possibilities are:
+	 *
+	 * Bonus for a match
+	 * -----------------
+	 *
+	 * MA=xx (default: MA=0, or MA=2 if --local is set)
+	 *
+	 *    xx = Each position where equal read and reference characters match up
+	 *         in the alignment contributes this amount to the total score.
+	 *
+	 * Penalty for a mismatch
+	 * ----------------------
+	 *
+	 * MMP={Cxx|Q|RQ} (default: MMP=C6)
+	 *
+	 *   Cxx = Each mismatch costs xx.  If MMP=Cxx is specified, quality
+	 *         values are ignored when assessing penalties for mismatches.
+	 *   Q   = Each mismatch incurs a penalty equal to the mismatched base's
+	 *         value.
+	 *   R   = Each mismatch incurs a penalty equal to the mismatched base's
+	 *         rounded quality value.  Qualities are rounded off to the
+	 *         nearest 10, and qualities greater than 30 are rounded to 30.
+	 *
+	 * Penalty for position with N (in either read or reference)
+	 * ---------------------------------------------------------
+	 *
+	 * NP={Cxx|Q|RQ} (default: NP=C1)
+	 *
+	 *   Cxx = Each alignment position with an N in either the read or the
+	 *         reference costs xx.  If NP=Cxx is specified, quality values are
+	 *         ignored when assessing penalties for Ns.
+	 *   Q   = Each alignment position with an N in either the read or the
+	 *         reference incurs a penalty equal to the read base's quality
+	 *         value.
+	 *   R   = Each alignment position with an N in either the read or the
+	 *         reference incurs a penalty equal to the read base's rounded
+	 *         quality value.  Qualities are rounded off to the nearest 10,
+	 *         and qualities greater than 30 are rounded to 30.
+	 *
+	 * Penalty for a read gap
+	 * ----------------------
+	 *
+	 * RDG=xx,yy (default: RDG=5,3)
+	 *
+	 *   xx    = Read gap open penalty.
+	 *   yy    = Read gap extension penalty.
+	 *
+	 * Total cost incurred by a read gap = xx + (yy * gap length)
+	 *
+	 * Penalty for a reference gap
+	 * ---------------------------
+	 *
+	 * RFG=xx,yy (default: RFG=5,3)
+	 *
+	 *   xx    = Reference gap open penalty.
+	 *   yy    = Reference gap extension penalty.
+	 *
+	 * Total cost incurred by a reference gap = xx + (yy * gap length)
+	 *
+	 * Minimum score for valid alignment
+	 * ---------------------------------
+	 *
+	 * MIN=xx,yy (defaults: MIN=-0.6,-0.6, or MIN=0.0,0.66 if --local is set)
+	 *
+	 *   xx,yy = For a read of length N, the total score must be at least
+	 *           xx + (read length * yy) for the alignment to be valid.  The
+	 *           total score is the sum of all negative penalties (from
+	 *           mismatches and gaps) and all positive bonuses.  The minimum
+	 *           can be negative (and is by default in global alignment mode).
+	 *
+	 * N ceiling
+	 * ---------
+	 *
+	 * NCEIL=xx,yy (default: NCEIL=0.0,0.15)
+	 *
+	 *   xx,yy = For a read of length N, the number of alignment
+	 *           positions with an N in either the read or the
+	 *           reference cannot exceed
+	 *           ceiling = xx + (read length * yy).  If the ceiling is
+	 *           exceeded, the alignment is considered invalid.
+	 *
+	 * Seeds
+	 * -----
+	 *
+	 * SEED=mm,len,ival (default: SEED=0,22)
+	 *
+	 *   mm   = Maximum number of mismatches allowed within a seed.
+	 *          Must be >= 0 and <= 2.  Note that 2-mismatch mode is
+	 *          not fully sensitive; i.e. some 2-mismatch seed
+	 *          alignments may be missed.
+	 *   len  = Length of seed.
+	 *   ival = Interval between seeds.  If not specified, seed
+	 *          interval is determined by IVAL.
+	 *
+	 * Seed interval
+	 * -------------
+	 *
+	 * IVAL={L|S|C},xx,yy (default: IVAL=S,1.0,0.0)
+	 *
+	 *   L  = let interval between seeds be a linear function of the
+	 *        read length.  xx and yy are the constant and linear
+	 *        coefficients respectively.  In other words, the interval
+	 *        equals a * len + b, where len is the read length.
+	 *        Intervals less than 1 are rounded up to 1.
+	 *   S  = let interval between seeds be a function of the square
+	 *        root of the read length.  xx and yy are the
+	 *        coefficients.  In other words, the interval equals
+	 *        a * sqrt(len) + b, where len is the read length.
+	 *        Intervals less than 1 are rounded up to 1.
+	 *   C  = Like S but uses cube root of length instead of square
+	 *        root.
+	 *
+	 * Example 1:
+	 *
+	 *  SEED=1,10,5 and read sequence is TGCTATCGTACGATCGTAC:
+	 *
+	 *  The following seeds are extracted from the forward
+	 *  representation of the read and aligned to the reference
+	 *  allowing up to 1 mismatch:
+	 *
+	 *  Read:    TGCTATCGTACGATCGTACA
+	 *
+	 *  Seed 1+: TGCTATCGTA
+	 *  Seed 2+:      TCGTACGATC
+	 *  Seed 3+:           CGATCGTACA
+	 *
+	 *  ...and the following are extracted from the reverse-complement
+	 *  representation of the read and align to the reference allowing
+	 *  up to 1 mismatch:
+	 *
+	 *  Seed 1-: TACGATAGCA
+	 *  Seed 2-:      GATCGTACGA
+	 *  Seed 3-:           TGTACGATCG
+	 *
+	 * Example 2:
+	 *
+	 *  SEED=1,20,20 and read sequence is TGCTATCGTACGATC.  The seed
+	 *  length is 20 but the read is only 15 characters long.  In this
+	 *  case, Bowtie2 automatically shrinks the seed length to be equal
+	 *  to the read length.
+	 *
+	 *  Read:    TGCTATCGTACGATC
+	 *
+	 *  Seed 1+: TGCTATCGTACGATC
+	 *  Seed 1-: GATCGTACGATAGCA
+	 *
+	 * Example 3:
+	 *
+	 *  SEED=1,10,10 and read sequence is TGCTATCGTACGATC.  Only one seed
+	 *  fits on the read; a second seed would overhang the end of the read
+	 *  by 5 positions.  In this case, Bowtie2 extracts one seed.
+	 *
+	 *  Read:    TGCTATCGTACGATC
+	 *
+	 *  Seed 1+: TGCTATCGTA
+	 *  Seed 1-: TACGATAGCA
+	 */
+	static void parseString(
+		const       std::string& s,
+		bool        local,
+		bool        noisyHpolymer,
+		bool        ignoreQuals,
+		int&        bonusMatchType,
+		int&        bonusMatch,
+		int&        penMmcType,
+		int&        penMmcMax,
+		int&        penMmcMin,
+		int&        penNType,
+		int&        penN,
+		int&        penRdExConst,
+		int&        penRfExConst,
+		int&        penRdExLinear,
+		int&        penRfExLinear,
+		SimpleFunc& costMin,
+		SimpleFunc& nCeil,
+		bool&       nCatPair,
+		int&        multiseedMms,
+		int&        multiseedLen,
+		SimpleFunc& multiseedIval,
+		size_t&     failStreak,
+		size_t&     seedRounds,
+        SimpleFunc* penCanSplice = NULL,
+        SimpleFunc* penNoncanSplice = NULL,
+        SimpleFunc* penIntronLen = NULL);
+};
+
+#endif /*ndef ALIGNER_SEED_POLICY_H_*/
diff --git a/aligner_sw.cpp b/aligner_sw.cpp
new file mode 100644
index 0000000..de35257
--- /dev/null
+++ b/aligner_sw.cpp
@@ -0,0 +1,2990 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <limits>
+// -- BTL remove --
+//#include <stdlib.h>
+//#include <sys/time.h>
+// -- --
+#include "aligner_sw.h"
+#include "aligner_result.h"
+#include "search_globals.h"
+#include "scoring.h"
+#include "mask.h"
+
+/**
+ * Initialize with a new read.
+ */
+void SwAligner::initRead(
+	const BTDnaString& rdfw, // forward read sequence
+	const BTDnaString& rdrc, // revcomp read sequence
+	const BTString& qufw,    // forward read qualities
+	const BTString& qurc,    // reverse read qualities
+	size_t rdi,              // offset of first read char to align
+	size_t rdf,              // offset of last read char to align
+	const Scoring& sc)       // scoring scheme
+{
+	assert_gt(rdf, rdi);
+	int nceil = sc.nCeil.f<int>((double)rdfw.length());
+	rdfw_    = &rdfw;      // read sequence
+	rdrc_    = &rdrc;      // read sequence
+	qufw_    = &qufw;      // read qualities
+	qurc_    = &qurc;      // read qualities
+	rdi_     = rdi;        // offset of first read char to align
+	rdf_     = rdf;        // offset of last read char to align
+	sc_      = &sc;        // scoring scheme
+	nceil_   = nceil;      // max # Ns allowed in ref portion of aln
+	readSse16_ = false;    // true -> sse16 from now on for this read
+	initedRead_ = true;
+#ifndef NO_SSE
+	sseU8fwBuilt_  = false;  // built fw query profile, 8-bit score
+	sseU8rcBuilt_  = false;  // built rc query profile, 8-bit score
+	sseI16fwBuilt_ = false;  // built fw query profile, 16-bit score
+	sseI16rcBuilt_ = false;  // built rc query profile, 16-bit score
+#endif
+}
+
+/**
+ * Initialize with a new alignment problem.
+ */
+void SwAligner::initRef(
+	bool fw,               // whether to forward or revcomp read is aligning
+	TRefId refidx,         // id of reference aligned against
+	const DPRect& rect,    // DP rectangle
+	char *rf,              // reference sequence
+	size_t rfi,            // offset of first reference char to align to
+	size_t rff,            // offset of last reference char to align to
+	TRefOff reflen,        // length of reference sequence
+	const Scoring& sc,     // scoring scheme
+	TAlScore minsc,        // minimum score
+	bool enable8,          // use 8-bit SSE if possible?
+	size_t cminlen,        // minimum length for using checkpointing scheme
+	size_t cpow2,          // interval b/t checkpointed diags; 1 << this
+	bool doTri,            // triangular mini-fills?
+	bool extend)           // is this a seed extension?
+{
+	size_t readGaps = sc.maxReadGaps(minsc, rdfw_->length());
+	size_t refGaps  = sc.maxRefGaps(minsc, rdfw_->length());
+	assert_geq(readGaps, 0);
+	assert_geq(refGaps, 0);
+	assert_gt(rff, rfi);
+	rdgap_       = readGaps;  // max # gaps in read
+	rfgap_       = refGaps;   // max # gaps in reference
+	state_       = STATE_INITED;
+	fw_          = fw;       // orientation
+	rd_          = fw ? rdfw_ : rdrc_; // read sequence
+	qu_          = fw ? qufw_ : qurc_; // quality sequence
+	refidx_      = refidx;   // id of reference aligned against
+	rf_          = rf;       // reference sequence
+	rfi_         = rfi;      // offset of first reference char to align to
+	rff_         = rff;      // offset of last reference char to align to
+	reflen_      = reflen;   // length of entire reference sequence
+	rect_        = &rect;    // DP rectangle
+	minsc_       = minsc;    // minimum score
+	cural_       = 0;        // idx of next alignment to give out
+	initedRef_   = true;     // indicate we've initialized the ref portion
+	enable8_     = enable8;  // use 8-bit SSE if possible?
+	extend_      = extend;   // true iff this is a seed extension
+	cperMinlen_  = cminlen;  // reads shorter than this won't use checkpointer
+	cperPerPow2_ = cpow2;    // interval b/t checkpointed diags; 1 << this
+	cperEf_      = true;     // whether to checkpoint H, E, and F
+	cperTri_     = doTri;    // triangular mini-fills?
+	bter_.initRef(
+		fw_ ? rdfw_->buf() : // in: read sequence
+			  rdrc_->buf(), 
+		fw_ ? qufw_->buf() : // in: quality sequence
+			  qurc_->buf(),
+                  // daehwan
+		// rd_->length(),       // in: read sequence length
+                  rdf_ - rdi_,
+		rf_ + rfi_,          // in: reference sequence
+		rff_ - rfi_,         // in: in-rectangle reference sequence length
+		reflen,              // in: total reference sequence length
+		refidx_,             // in: reference id
+		rfi_,                // in: reference offset
+		fw_,                 // in: orientation
+		rect_,               // in: DP rectangle
+		&cper_,              // in: checkpointer
+		*sc_,                // in: scoring scheme
+		nceil_);             // in: N ceiling
+}
+	
+/**
+ * Given a read, an alignment orientation, a range of characters in a reference
+ * sequence, and a bit-encoded version of the reference, set up and execute the
+ * corresponding dynamic programming problem.
+ *
+ * The caller has already narrowed down the relevant portion of the reference
+ * using, e.g., the location of a seed hit, or the range of possible fragment
+ * lengths if we're searching for the opposite mate in a pair.
+ */
+void SwAligner::initRef(
+	bool fw,               // whether to forward or revcomp read is aligning
+	TRefId refidx,         // reference aligned against
+	const DPRect& rect,    // DP rectangle
+	const BitPairReference& refs, // Reference strings
+	TRefOff reflen,        // length of reference sequence
+	const Scoring& sc,     // scoring scheme
+	TAlScore minsc,        // minimum score
+	bool enable8,          // use 8-bit SSE if possible?
+	size_t cminlen,        // minimum length for using checkpointing scheme
+	size_t cpow2,          // interval b/t checkpointed diags; 1 << this
+	bool doTri,            // triangular mini-fills?
+	bool extend,           // true iff this is a seed extension
+	size_t  upto,          // count the number of Ns up to this offset
+	size_t& nsUpto)        // output: the number of Ns up to 'upto'
+{
+	TRefOff rfi = rect.refl;
+	TRefOff rff = rect.refr + 1;
+	assert_gt(rff, rfi);
+	// Capture an extra reference character outside the rectangle so that we
+	// can check matches in the next column over to the right
+	rff++;
+	// rflen = full length of the reference substring to consider, including
+	// overhang off the boundaries of the reference sequence
+	const size_t rflen = (size_t)(rff - rfi);
+	// Figure the number of Ns we're going to add to either side
+	size_t leftNs  =
+		(rfi >= 0               ? 0 : (size_t)std::abs(static_cast<long>(rfi)));
+	leftNs = min(leftNs, rflen);
+	size_t rightNs =
+		(rff <= (TRefOff)reflen ? 0 : (size_t)std::abs(static_cast<long>(rff - reflen)));
+	rightNs = min(rightNs, rflen);
+	// rflenInner = length of just the portion that doesn't overhang ref ends
+	assert_geq(rflen, leftNs + rightNs);
+	const size_t rflenInner = rflen - (leftNs + rightNs);
+#ifndef NDEBUG
+	bool haveRfbuf2 = false;
+	EList<char> rfbuf2(rflen);
+	// This is really slow, so only do it some of the time
+	if((rand() % 10) == 0) {
+		TRefOff rfii = rfi;
+		for(size_t i = 0; i < rflen; i++) {
+			if(rfii < 0 || (TRefOff)rfii >= reflen) {
+				rfbuf2.push_back(4);
+			} else {
+				rfbuf2.push_back(refs.getBase(refidx, (uint32_t)rfii));
+			}
+			rfii++;
+		}
+		haveRfbuf2 = true;
+	}
+#endif
+	// rfbuf_ = uint32_t list large enough to accommodate both the reference
+	// sequence and any Ns we might add to either side.
+	rfwbuf_.resize((rflen + 16) / 4);
+	int offset = refs.getStretch(
+		rfwbuf_.ptr(),               // buffer to store words in
+		refidx,                      // which reference
+		(rfi < 0) ? 0 : (size_t)rfi, // starting offset (can't be < 0)
+		rflenInner                   // length to grab (exclude overhang)
+		ASSERT_ONLY(, tmp_destU32_));// for BitPairReference::getStretch()
+	assert_leq(offset, 16);
+	rf_ = (char*)rfwbuf_.ptr() + offset;
+	// Shift ref chars away from 0 so we can stick Ns at the beginning
+	if(leftNs > 0) {
+		// Slide everyone down
+		for(size_t i = rflenInner; i > 0; i--) {
+			rf_[i+leftNs-1] = rf_[i-1];
+		}
+		// Add Ns
+		for(size_t i = 0; i < leftNs; i++) {
+			rf_[i] = 4;
+		}
+	}
+	if(rightNs > 0) {
+		// Add Ns to the end
+		for(size_t i = 0; i < rightNs; i++) {
+			rf_[i + leftNs + rflenInner] = 4;
+		}
+	}
+#ifndef NDEBUG
+	// Sanity check reference characters
+	for(size_t i = 0; i < rflen; i++) {
+		assert(!haveRfbuf2 || rf_[i] == rfbuf2[i]);
+		assert_range(0, 4, (int)rf_[i]);
+	}
+#endif
+	// Count Ns and convert reference characters into A/C/G/T masks.  Ambiguous
+	// nucleotides (IUPAC codes) have more than one mask bit set.  If a
+	// reference scanner was provided, use it to opportunistically resolve seed
+	// hits.
+	nsUpto = 0;
+	for(size_t i = 0; i < rflen; i++) {
+		// rf_[i] gets mask version of reference char, with N=16
+		if(i < upto && rf_[i] > 3) {
+			nsUpto++;
+		}
+		rf_[i] = (1 << rf_[i]);
+	}
+	// Correct for having captured an extra reference character
+	rff--;
+	initRef(
+		fw,          // whether to forward or revcomp read is aligning
+		refidx,      // id of reference aligned against
+		rect,        // DP rectangle
+		rf_,         // reference sequence, wrapped up in BTString object
+		0,           // use the whole thing
+		(size_t)(rff - rfi), // ditto
+		reflen,      // reference length
+		sc,          // scoring scheme
+		minsc,       // minimum score
+		enable8,     // use 8-bit SSE if possible?
+		cminlen,     // minimum length for using checkpointing scheme
+		cpow2,       // interval b/t checkpointed diags; 1 << this
+		doTri,       // triangular mini-fills?
+		extend);     // true iff this is a seed extension
+}
+
+/**
+ * Align read 'rd' to reference using read & reference information given
+ * last time init() was called.
+ */
+bool SwAligner::align(
+                      RandomSource& rnd, // source of pseudo-randoms
+                      TAlScore& best)    // best alignment score observed in DP matrix
+{
+    assert(initedRef() && initedRead());
+    assert_eq(STATE_INITED, state_);
+    state_ = STATE_ALIGNED;
+    // Reset solutions lists
+    btncand_.clear();
+    btncanddone_.clear();
+    btncanddoneSucc_ = btncanddoneFail_ = 0;
+    best = std::numeric_limits<TAlScore>::min();
+    sse8succ_ = sse16succ_ = false;
+    int flag = 0;
+    size_t rdlen = rdf_ - rdi_;
+    bool checkpointed = rdlen >= cperMinlen_;
+    bool gathered = false; // Did gathering happen along with alignment?
+    if(sc_->monotone) {
+        // End-to-end
+        if(enable8_ && !readSse16_ && minsc_ >= -254) {
+            // 8-bit end-to-end
+            if(checkpointed) {
+                best = alignGatherEE8(flag, false);
+                if(flag == 0) {
+                    gathered = true;
+                }
+            } else {
+                best = alignNucleotidesEnd2EndSseU8(flag, false);
+#ifndef NDEBUG
+                int flagtmp = 0;
+                TAlScore besttmp = alignGatherEE8(flagtmp, true); // debug
+                assert_eq(flagtmp, flag);
+                assert_eq(besttmp, best);
+#endif
+            }
+            sse8succ_ = (flag == 0);
+#ifndef NDEBUG
+            {
+                int flag2 = 0;
+                TAlScore best2 = alignNucleotidesEnd2EndSseI16(flag2, true);
+                {
+                    int flagtmp = 0;
+                    TAlScore besttmp = alignGatherEE16(flagtmp, true);
+                    assert_eq(flagtmp, flag2);
+                    assert(flag2 != 0 || best2 == besttmp);
+                }
+                assert(flag < 0 || best == best2);
+                sse16succ_ = (flag2 == 0);
+            }
+#endif /*ndef NDEBUG*/
+        } else {
+            // 16-bit end-to-end
+            if(checkpointed) {
+                best = alignGatherEE16(flag, false);
+                if(flag == 0) {
+                    gathered = true;
+                }
+            } else {
+                best = alignNucleotidesEnd2EndSseI16(flag, false);
+#ifndef NDEBUG
+                int flagtmp = 0;
+                TAlScore besttmp = alignGatherEE16(flagtmp, true);
+                assert_eq(flagtmp, flag);
+                assert_eq(besttmp, best);
+#endif
+            }
+            sse16succ_ = (flag == 0);
+        }
+    } else {
+        // Local
+        flag = -2;
+        if(enable8_ && !readSse16_) {
+            // 8-bit local
+            if(checkpointed) {
+                best = alignGatherLoc8(flag, false);
+                if(flag == 0) {
+                    gathered = true;
+                }
+            } else {
+                best = alignNucleotidesLocalSseU8(flag, false);
+#ifndef NDEBUG
+                int flagtmp = 0;
+                TAlScore besttmp = alignGatherLoc8(flagtmp, true);
+                assert_eq(flag, flagtmp);
+                assert_eq(best, besttmp);
+#endif
+            }
+        }
+        if(flag == -2) {
+            // 16-bit local
+            flag = 0;
+            if(checkpointed) {
+                best = alignNucleotidesLocalSseI16(flag, false);
+                best = alignGatherLoc16(flag, false);
+                if(flag == 0) {
+                    gathered = true;
+                }
+            } else {
+                best = alignNucleotidesLocalSseI16(flag, false);
+#ifndef NDEBUG
+                int flagtmp = 0;
+                TAlScore besttmp = alignGatherLoc16(flagtmp, true);
+                assert_eq(flag, flagtmp);
+                assert_eq(best, besttmp);
+#endif
+            }
+            sse16succ_ = (flag == 0);
+        } else {
+            sse8succ_ = (flag == 0);
+#ifndef NDEBUG
+            int flag2 = 0;
+            TAlScore best2 = alignNucleotidesLocalSseI16(flag2, true);
+            {
+                int flagtmp = 0;
+                TAlScore besttmp = alignGatherLoc16(flagtmp, true);
+                assert_eq(flag2, flagtmp);
+                assert(flag2 != 0 || best2 == besttmp);
+            }
+            assert(flag2 < 0 || best == best2);
+            sse16succ_ = (flag2 == 0);
+#endif /*ndef NDEBUG*/
+        }
+    }
+#ifndef NDEBUG
+    if(!checkpointed && (rand() & 15) == 0 && sse8succ_ && sse16succ_) {
+        SSEData& d8  = fw_ ? sseU8fw_  : sseU8rc_;
+        SSEData& d16 = fw_ ? sseI16fw_ : sseI16rc_;
+        assert_eq(d8.mat_.nrow(), d16.mat_.nrow());
+        assert_eq(d8.mat_.ncol(), d16.mat_.ncol());
+        for(size_t i = 0; i < d8.mat_.nrow(); i++) {
+            for(size_t j = 0; j < colstop_; j++) {
+                int h8  = d8.mat_.helt(i, j);
+                int h16 = d16.mat_.helt(i, j);
+                int e8  = d8.mat_.eelt(i, j);
+                int e16 = d16.mat_.eelt(i, j);
+                int f8  = d8.mat_.felt(i, j);
+                int f16 = d16.mat_.felt(i, j);
+                TAlScore h8s  =
+                (sc_->monotone ? (h8  - 0xff  ) : h8);
+                TAlScore h16s =
+                (sc_->monotone ? (h16 - 0x7fff) : (h16 + 0x8000));
+                TAlScore e8s  =
+                (sc_->monotone ? (e8  - 0xff  ) : e8);
+                TAlScore e16s =
+                (sc_->monotone ? (e16 - 0x7fff) : (e16 + 0x8000));
+                TAlScore f8s  =
+                (sc_->monotone ? (f8  - 0xff  ) : f8);
+                TAlScore f16s =
+                (sc_->monotone ? (f16 - 0x7fff) : (f16 + 0x8000));
+                if(h8s < minsc_) {
+                    h8s = minsc_ - 1;
+                }
+                if(h16s < minsc_) {
+                    h16s = minsc_ - 1;
+                }
+                if(e8s < minsc_) {
+                    e8s = minsc_ - 1;
+                }
+                if(e16s < minsc_) {
+                    e16s = minsc_ - 1;
+                }
+                if(f8s < minsc_) {
+                    f8s = minsc_ - 1;
+                }
+                if(f16s < minsc_) {
+                    f16s = minsc_ - 1;
+                }
+                if((h8 != 0 || (int16_t)h16 != (int16_t)0x8000) && h8 > 0) {
+                    assert_eq(h8s, h16s);
+                }
+                if((e8 != 0 || (int16_t)e16 != (int16_t)0x8000) && e8 > 0) {
+                    assert_eq(e8s, e16s);
+                }
+                if((f8 != 0 || (int16_t)f16 != (int16_t)0x8000) && f8 > 0) {
+                    assert_eq(f8s, f16s);
+                }
+            }
+        }
+    }
+#endif
+    assert(repOk());
+    cural_ = 0;
+    if(best == MIN_I64 || best < minsc_) {
+        return false;
+    }
+    if(!gathered) {
+        // Look for solutions using SSE matrix
+        assert(sse8succ_ || sse16succ_);
+        if(sc_->monotone) {
+            if(sse8succ_) {
+                gatherCellsNucleotidesEnd2EndSseU8(best);
+#ifndef NDEBUG
+                if(sse16succ_) {
+                    cand_tmp_ = btncand_;
+                    gatherCellsNucleotidesEnd2EndSseI16(best);
+                    cand_tmp_.sort();
+                    btncand_.sort();
+                    assert(cand_tmp_ == btncand_);
+                }
+#endif /*ndef NDEBUG*/
+            } else {
+                gatherCellsNucleotidesEnd2EndSseI16(best);
+            }
+        } else {
+            if(sse8succ_) {
+                gatherCellsNucleotidesLocalSseU8(best);
+#ifndef NDEBUG
+                if(sse16succ_) {
+                    cand_tmp_ = btncand_;
+                    gatherCellsNucleotidesLocalSseI16(best);
+                    cand_tmp_.sort();
+                    btncand_.sort();
+                    assert(cand_tmp_ == btncand_);
+                }
+#endif /*ndef NDEBUG*/
+            } else {
+                gatherCellsNucleotidesLocalSseI16(best);
+            }
+        }
+    }
+    if(!btncand_.empty()) {
+        btncand_.sort();
+    }
+    return !btncand_.empty();
+}
+
+/**
+ * Populate the given SwResult with information about the "next best"
+ * alignment if there is one.  If there isn't one, false is returned.  Note
+ * that false might be returned even though a call to done() would have
+ * returned false.
+ */
+bool SwAligner::nextAlignment(
+	SwResult& res,
+	TAlScore minsc,
+	RandomSource& rnd)
+{
+	assert(initedRead() && initedRef());
+	assert_eq(STATE_ALIGNED, state_);
+	assert(repOk());
+	if(done()) {
+		res.reset();
+		return false;
+	}
+	assert(!done());
+	size_t off = 0, nbts = 0;
+	assert_lt(cural_, btncand_.size());
+	assert(res.repOk());
+	// For each candidate cell that we should try to backtrack from...
+	const size_t candsz = btncand_.size();
+	size_t SQ = dpRows() >> 4;
+	if(SQ == 0) SQ = 1;
+	size_t rdlen = rdf_ - rdi_;
+	bool checkpointed = rdlen >= cperMinlen_;
+	while(cural_ < candsz) {
+		// Doing 'continue' anywhere in here simply causes us to move on to the
+		// next candidate
+		if(btncand_[cural_].score < minsc) {
+			btncand_[cural_].fate = BT_CAND_FATE_FILT_SCORE;
+			nbtfiltsc_++; cural_++; continue;
+		}
+		nbts = 0;
+		assert(sse8succ_ || sse16succ_);
+		size_t row = btncand_[cural_].row;
+		size_t col = btncand_[cural_].col;
+		assert_lt(row, dpRows());
+		assert_lt((TRefOff)col, rff_-rfi_);
+		if(sse16succ_) {
+			SSEData& d = fw_ ? sseI16fw_ : sseI16rc_;
+			if(!checkpointed && d.mat_.reset_[row] && d.mat_.reportedThrough(row, col)) {
+				// Skipping this candidate because a previous candidate already
+				// moved through this cell
+				btncand_[cural_].fate = BT_CAND_FATE_FILT_START;
+				//cerr << "  skipped because starting cell was covered" << endl;
+				nbtfiltst_++; cural_++; continue;
+			}
+		} else if(sse8succ_) {
+			SSEData& d = fw_ ? sseU8fw_ : sseU8rc_;
+			if(!checkpointed && d.mat_.reset_[row] && d.mat_.reportedThrough(row, col)) {
+				// Skipping this candidate because a previous candidate already
+				// moved through this cell
+				btncand_[cural_].fate = BT_CAND_FATE_FILT_START;
+				//cerr << "  skipped because starting cell was covered" << endl;
+				nbtfiltst_++; cural_++; continue;
+			}
+		}
+		if(sc_->monotone) {
+			bool ret = false;
+			if(sse8succ_) {
+				uint32_t reseed = rnd.nextU32() + 1;
+				rnd.init(reseed);
+				res.reset();
+				if(checkpointed) {
+					size_t maxiter = MAX_SIZE_T;
+					size_t niter = 0;
+					ret = backtrace(
+						btncand_[cural_].score, // in: expected score
+						true,     // in: use mini-fill?
+						true,     // in: use checkpoints?
+						res,      // out: store results (edits and scores) here
+						off,      // out: store diagonal projection of origin
+						row,      // start in this rectangle row
+						col,      // start in this rectangle column
+						maxiter,  // max # extensions to try
+						niter,    // # extensions tried
+						rnd);     // random gen, to choose among equal paths
+				} else {
+					ret = backtraceNucleotidesEnd2EndSseU8(
+						btncand_[cural_].score, // in: expected score
+						res,    // out: store results (edits and scores) here
+						off,    // out: store diagonal projection of origin
+						nbts,   // out: # backtracks
+						row,    // start in this rectangle row
+						col,    // start in this rectangle column
+						rnd);   // random gen, to choose among equal paths
+				}
+#ifndef NDEBUG
+				// if(...) statement here should check not whether the primary
+				// alignment was checkpointed, but whether a checkpointed
+				// alignment was done at all.
+				if(!checkpointed) {
+					SwResult res2;
+					size_t maxiter2 = MAX_SIZE_T;
+					size_t niter2 = 0;
+					bool ret2 = backtrace(
+						btncand_[cural_].score, // in: expected score
+						true,     // in: use mini-fill?
+						true,     // in: use checkpoints?
+						res2,     // out: store results (edits and scores) here
+						off,      // out: store diagonal projection of origin
+						row,      // start in this rectangle row
+						col,      // start in this rectangle column
+						maxiter2, // max # extensions to try
+						niter2,   // # extensions tried
+						rnd);     // random gen, to choose among equal paths
+					// After the first alignment, there's no guarantee we'll
+					// get the same answer from both backtrackers because of
+					// differences in how they handle marking cells as
+					// reported-through.
+					assert(cural_ > 0 || !ret || ret == ret2);
+				}
+				if(sse16succ_ && !checkpointed) {
+					SwResult res2;
+					size_t off2, nbts2 = 0;
+					rnd.init(reseed);
+					bool ret2 = backtraceNucleotidesEnd2EndSseI16(
+						btncand_[cural_].score, // in: expected score
+						res2,   // out: store results (edits and scores) here
+						off2,   // out: store diagonal projection of origin
+						nbts2,  // out: # backtracks
+						row,    // start in this rectangle row
+						col,    // start in this rectangle column
+						rnd);   // random gen, to choose among equal paths
+					assert_eq(ret, ret2);
+					assert_eq(nbts, nbts2);
+					assert(!ret || res2.alres.score() == res.alres.score());
+#if 0
+					if(!checkpointed && (rand() & 15) == 0) {
+						// Check that same cells are reported through
+						SSEData& d8  = fw_ ? sseU8fw_  : sseU8rc_;
+						SSEData& d16 = fw_ ? sseI16fw_ : sseI16rc_;
+						for(size_t i = d8.mat_.nrow(); i > 0; i--) {
+							for(size_t j = 0; j < d8.mat_.ncol(); j++) {
+								assert_eq(d8.mat_.reportedThrough(i-1, j),
+										  d16.mat_.reportedThrough(i-1, j));
+							}
+						}
+					}
+#endif
+				}
+#endif
+				rnd.init(reseed+1); // debug/release pseudo-randoms in lock step
+			} else if(sse16succ_) {
+				uint32_t reseed = rnd.nextU32() + 1;
+				res.reset();
+				if(checkpointed) {
+					size_t maxiter = MAX_SIZE_T;
+					size_t niter = 0;
+					ret = backtrace(
+						btncand_[cural_].score, // in: expected score
+						true,     // in: use mini-fill?
+						true,     // in: use checkpoints?
+						res,      // out: store results (edits and scores) here
+						off,      // out: store diagonal projection of origin
+						row,      // start in this rectangle row
+						col,      // start in this rectangle column
+						maxiter,  // max # extensions to try
+						niter,    // # extensions tried
+						rnd);     // random gen, to choose among equal paths
+				} else {
+					ret = backtraceNucleotidesEnd2EndSseI16(
+						btncand_[cural_].score, // in: expected score
+						res,    // out: store results (edits and scores) here
+						off,    // out: store diagonal projection of origin
+						nbts,   // out: # backtracks
+						row,    // start in this rectangle row
+						col,    // start in this rectangle column
+						rnd);   // random gen, to choose among equal paths
+				}
+#ifndef NDEBUG
+				// if(...) statement here should check not whether the primary
+				// alignment was checkpointed, but whether a checkpointed
+				// alignment was done at all.
+				if(!checkpointed) {
+					SwResult res2;
+					size_t maxiter2 = MAX_SIZE_T;
+					size_t niter2 = 0;
+					bool ret2 = backtrace(
+						btncand_[cural_].score, // in: expected score
+						true,     // in: use mini-fill?
+						true,     // in: use checkpoints?
+						res2,     // out: store results (edits and scores) here
+						off,      // out: store diagonal projection of origin
+						row,      // start in this rectangle row
+						col,      // start in this rectangle column
+						maxiter2, // max # extensions to try
+						niter2,   // # extensions tried
+						rnd);     // random gen, to choose among equal paths
+					// After the first alignment, there's no guarantee we'll
+					// get the same answer from both backtrackers because of
+					// differences in how they handle marking cells as
+					// reported-through.
+					assert(cural_ > 0 || !ret || ret == ret2);
+				}
+#endif
+				rnd.init(reseed); // debug/release pseudo-randoms in lock step
+			}
+			if(ret) {
+				btncand_[cural_].fate = BT_CAND_FATE_SUCCEEDED;
+				break;
+			} else {
+				btncand_[cural_].fate = BT_CAND_FATE_FAILED;
+			}
+		} else {
+			// Local alignment
+			// Check if this solution is "dominated" by a prior one.
+			// Domination is a heuristic designed to eliminate the vast
+			// majority of valid-but-redundant candidates lying in the
+			// "penumbra" of a high-scoring alignment.
+			bool dom = false;
+			{
+				size_t donesz = btncanddone_.size();
+				const size_t col = btncand_[cural_].col;
+				const size_t row = btncand_[cural_].row;
+				for(size_t i = 0; i < donesz; i++) {
+					assert_gt(btncanddone_[i].fate, 0);
+					size_t colhi = col, rowhi = row;
+					size_t rowlo = btncanddone_[i].row;
+					size_t collo = btncanddone_[i].col;
+					if(colhi < collo) swap(colhi, collo);
+					if(rowhi < rowlo) swap(rowhi, rowlo);
+					if(colhi - collo <= SQ && rowhi - rowlo <= SQ) {
+						// Skipping this candidate because it's "dominated" by
+						// a previous candidate
+						dom = true;
+						break;
+					}
+				}
+			}
+			if(dom) {
+				btncand_[cural_].fate = BT_CAND_FATE_FILT_DOMINATED;
+				nbtfiltdo_++;
+				cural_++;
+				continue;
+			}
+			bool ret = false;
+			if(sse8succ_) {
+				uint32_t reseed = rnd.nextU32() + 1;
+				res.reset();
+				rnd.init(reseed);
+				if(checkpointed) {
+					size_t maxiter = MAX_SIZE_T;
+					size_t niter = 0;
+					ret = backtrace(
+						btncand_[cural_].score, // in: expected score
+						true,     // in: use mini-fill?
+						true,     // in: use checkpoints?
+						res,      // out: store results (edits and scores) here
+						off,      // out: store diagonal projection of origin
+						row,      // start in this rectangle row
+						col,      // start in this rectangle column
+						maxiter,  // max # extensions to try
+						niter,    // # extensions tried
+						rnd);     // random gen, to choose among equal paths
+				} else {
+					ret = backtraceNucleotidesLocalSseU8(
+						btncand_[cural_].score, // in: expected score
+						res,    // out: store results (edits and scores) here
+						off,    // out: store diagonal projection of origin
+						nbts,   // out: # backtracks
+						row,    // start in this rectangle row
+						col,    // start in this rectangle column
+						rnd);   // random gen, to choose among equal paths
+				}
+#ifndef NDEBUG
+				// if(...) statement here should check not whether the primary
+				// alignment was checkpointed, but whether a checkpointed
+				// alignment was done at all.
+				if(!checkpointed) {
+					SwResult res2;
+					size_t maxiter2 = MAX_SIZE_T;
+					size_t niter2 = 0;
+					bool ret2 = backtrace(
+						btncand_[cural_].score, // in: expected score
+						true,     // in: use mini-fill?
+						true,     // in: use checkpoints?
+						res2,     // out: store results (edits and scores) here
+						off,      // out: store diagonal projection of origin
+						row,      // start in this rectangle row
+						col,      // start in this rectangle column
+						maxiter2, // max # extensions to try
+						niter2,   // # extensions tried
+						rnd);     // random gen, to choose among equal paths
+					// After the first alignment, there's no guarantee we'll
+					// get the same answer from both backtrackers because of
+					// differences in how they handle marking cells as
+					// reported-through.
+					assert(cural_ > 0 || !ret || ret == ret2);
+				}
+				if(!checkpointed && sse16succ_) {
+					SwResult res2;
+					size_t off2, nbts2 = 0;
+					rnd.init(reseed); // same b/t backtrace calls
+					bool ret2 = backtraceNucleotidesLocalSseI16(
+						btncand_[cural_].score, // in: expected score
+						res2,   // out: store results (edits and scores) here
+						off2,   // out: store diagonal projection of origin
+						nbts2,  // out: # backtracks
+						row,    // start in this rectangle row
+						col,    // start in this rectangle column
+						rnd);   // random gen, to choose among equal paths
+					assert_eq(ret, ret2);
+					assert_eq(nbts, nbts2);
+					assert(!ret || res2.alres.score() == res.alres.score());
+#if 0
+					if(!checkpointed && (rand() & 15) == 0) {
+						// Check that same cells are reported through
+						SSEData& d8  = fw_ ? sseU8fw_  : sseU8rc_;
+						SSEData& d16 = fw_ ? sseI16fw_ : sseI16rc_;
+						for(size_t i = d8.mat_.nrow(); i > 0; i--) {
+							for(size_t j = 0; j < d8.mat_.ncol(); j++) {
+								assert_eq(d8.mat_.reportedThrough(i-1, j),
+										  d16.mat_.reportedThrough(i-1, j));
+							}
+						}
+					}
+#endif
+				}
+#endif
+				rnd.init(reseed+1); // debug/release pseudo-randoms in lock step
+			} else if(sse16succ_) {
+				uint32_t reseed = rnd.nextU32() + 1;
+				res.reset();
+				if(checkpointed) {
+					size_t maxiter = MAX_SIZE_T;
+					size_t niter = 0;
+					ret = backtrace(
+						btncand_[cural_].score, // in: expected score
+						true,     // in: use mini-fill?
+						true,     // in: use checkpoints?
+						res,      // out: store results (edits and scores) here
+						off,      // out: store diagonal projection of origin
+						row,      // start in this rectangle row
+						col,      // start in this rectangle column
+						maxiter,  // max # extensions to try
+						niter,    // # extensions tried
+						rnd);     // random gen, to choose among equal paths
+				} else {
+					ret = backtraceNucleotidesLocalSseI16(
+						btncand_[cural_].score, // in: expected score
+						res,    // out: store results (edits and scores) here
+						off,    // out: store diagonal projection of origin
+						nbts,   // out: # backtracks
+						row,    // start in this rectangle row
+						col,    // start in this rectangle column
+						rnd);   // random gen, to choose among equal paths
+				}
+#ifndef NDEBUG
+				// if(...) statement here should check not whether the primary
+				// alignment was checkpointed, but whether a checkpointed
+				// alignment was done at all.
+				if(!checkpointed) {
+					SwResult res2;
+					size_t maxiter2 = MAX_SIZE_T;
+					size_t niter2 = 0;
+					bool ret2 = backtrace(
+						btncand_[cural_].score, // in: expected score
+						true,     // in: use mini-fill?
+						true,     // in: use checkpoints?
+						res2,     // out: store results (edits and scores) here
+						off,      // out: store diagonal projection of origin
+						row,      // start in this rectangle row
+						col,      // start in this rectangle column
+						maxiter2, // max # extensions to try
+						niter2,   // # extensions tried
+						rnd);     // random gen, to choose among equal paths
+					// After the first alignment, there's no guarantee we'll
+					// get the same answer from both backtrackers because of
+					// differences in how they handle marking cells as
+					// reported-through.
+					assert(cural_ > 0 || !ret || ret == ret2);
+				}
+#endif
+				rnd.init(reseed); // same b/t backtrace calls
+			}
+			if(ret) {
+				btncand_[cural_].fate = BT_CAND_FATE_SUCCEEDED;
+				btncanddone_.push_back(btncand_[cural_]);
+				btncanddoneSucc_++;
+				assert(res.repOk());
+				break;
+			} else {
+				btncand_[cural_].fate = BT_CAND_FATE_FAILED;
+				btncanddone_.push_back(btncand_[cural_]);
+				btncanddoneFail_++;
+			}
+		}
+		cural_++;
+	} // while(cural_ < btncand_.size())
+	if(cural_ == btncand_.size()) {
+		assert(res.repOk());
+		return false;
+	}
+	// assert(!res.alres.empty());
+	assert(res.repOk());
+	if(!fw_) {
+		// All edits are currently w/r/t upstream end; if read aligned
+		// to Crick strand, we need to invert them so that they're
+		// w/r/t the read's 5' end instead.
+		// res.alres.invertEdits();
+	}
+	cural_++;
+	assert(res.repOk());
+	return true;
+}
+
+#ifdef MAIN_ALIGNER_SW
+
+#include <sstream>
+#include <utility>
+#include <getopt.h>
+#include "scoring.h"
+#include "aligner_seed_policy.h"
+
// Globals and statics used by the standalone MAIN_ALIGNER_SW test driver.
// Most are scoring-scheme knobs; values are assigned in doTests()/main.
int  gGapBarrier;            // NOTE(review): required definition for aligner objects;
                             // presumably # rows near read ends barred to gaps — confirm
int  gSnpPhred;              // SNP penalty; set via -s/--snppen (see printUsage)
static int bonusMatchType;   // how to reward matches
static int bonusMatch;       // constant if match bonus is a constant
static int penMmcType;       // how to penalize mismatches
static int penMmc;           // constant if mm penalty is a constant
static int penNType;         // how to penalize Ns in the read
static int penN;             // constant if N penalty is a constant
static bool nPairCat;        // true -> concatenate mates before N filter
static int penRdExConst;     // constant coeff for cost of gap in read
static int penRfExConst;     // constant coeff for cost of gap in ref
static int penRdExLinear;    // linear coeff for cost of gap in read
static int penRfExLinear;    // linear coeff for cost of gap in ref
static float costMinConst;   // constant coeff for min score w/r/t read len
static float costMinLinear;  // linear coeff for min score w/r/t read len
static float costFloorConst; // constant coeff for score floor w/r/t read len
static float costFloorLinear;// linear coeff for score floor w/r/t read len
static float nCeilConst;     // constant coeff for N ceiling w/r/t read len
static float nCeilLinear;    // linear coeff for N ceiling w/r/t read len
static bool  nCatPair;       // concat mates before applying N filter?
                             // NOTE(review): appears redundant with nPairCat
                             // above — confirm which one is actually read
static int multiseedMms;     // mismatches permitted in a multiseed seed
static int multiseedLen;     // length of multiseed seeds
static int multiseedIvalType; // type of function relating seed interval to read len
static float multiseedIvalA;  // constant coeff for seed interval — TODO confirm
static float multiseedIvalB;  // linear coeff for seed interval — TODO confirm
static float posmin;          // NOTE(review): not used in visible code — confirm meaning
static float posfrac;         // NOTE(review): not used in visible code — confirm meaning
static float rowmult;         // NOTE(review): not used in visible code — confirm meaning

// Codes for long options that have no single-character short form; start
// above the printable-character range so they cannot collide with short opts.
enum {
	ARG_TESTS = 256
};
+
+static const char *short_opts = "s:m:r:d:i:";
+static struct option long_opts[] = {
+	{(char*)"snppen",       required_argument, 0, 's'},
+	{(char*)"misspen",      required_argument, 0, 'm'},
+	{(char*)"seed",         required_argument, 0, 'r'},
+	{(char*)"align-policy", no_argument,       0, 'A'},
+	{(char*)"test",         no_argument,       0, ARG_TESTS},
+};
+
/**
 * Print the usage message for the standalone aligner_sw test driver to 'os'.
 * Fix: the seed option was documented as "-r/-seed"; the long form declared
 * in long_opts is "--seed".
 */
static void printUsage(ostream& os) {
	os << "Usage: aligner_sw <read-seq> <ref-nuc-seq> [options]*" << endl;
	os << "Options:" << endl;
	os << "  -s/--snppen <int>   penalty incurred by SNP; used for decoding"
	   << endl;
	os << "  -m/--misspen <int>  quality to use for read chars" << endl;
	os << "  -r/--seed <int>     seed for pseudo-random generator" << endl;
}
+
/**
 * Parse a value of type T from the C-string 's' using stream extraction
 * (operator>>).  The result of a failed extraction is whatever operator>>
 * leaves in the target.
 */
template<typename T>
T parse(const char *s) {
	stringstream ss(s);
	T parsed;
	ss >> parsed;
	return parsed;
}
+
// Reusable buffers shared by the doTestCase* helpers so each case avoids
// reallocating.
static EList<bool> stbuf, enbuf; // start/end-column masks (enbuf filled in doTestCase)
static BTDnaString btread;       // read sequence installed by doTestCase2/3/4
static BTString btqual;          // read qualities
static BTString btref;           // reference sequence as given by the caller
static BTString btref2;          // padded, mask-encoded working copy of the reference

// Passed to initRead as the reverse-complement read/qualities.
// NOTE(review): doTestCase assigns these the *forward* read/qualities
// verbatim — confirm that is intended for this test driver.
static BTDnaString readrc;
static BTString qualrc;
+
+/**
+ * Helper function for running a case consisting of a read (sequence
+ * and quality), a reference string, and an offset that anchors the 0th
+ * character of the read to a reference position.
+ */
+static void doTestCase(
+	SwAligner&         al,
+	const BTDnaString& read,
+	const BTString&    qual,
+	const BTString&    refin,
+	TRefOff            off,
+	EList<bool>       *en,
+	const Scoring&     sc,
+	TAlScore           minsc,
+	SwResult&          res,
+	bool               nsInclusive,
+	bool               filterns,
+	uint32_t           seed)
+{
+	RandomSource rnd(seed);
+	btref2 = refin;
+	assert_eq(read.length(), qual.length());
+	size_t nrow = read.length();
+	TRefOff rfi, rff;
+	// Calculate the largest possible number of read and reference gaps given
+	// 'minsc' and 'pens'
+	size_t maxgaps;
+	size_t padi, padf;
+	{
+		int readGaps = sc.maxReadGaps(minsc, read.length());
+		int refGaps = sc.maxRefGaps(minsc, read.length());
+		assert_geq(readGaps, 0);
+		assert_geq(refGaps, 0);
+		int maxGaps = max(readGaps, refGaps);
+		padi = 2 * maxGaps;
+		padf = maxGaps;
+		maxgaps = (size_t)maxGaps;
+	}
+	size_t nceil = (size_t)sc.nCeil.f((double)read.length());
+	size_t width = 1 + padi + padf;
+	rfi = off;
+	off = 0;
+	// Pad the beginning of the reference with Ns if necessary
+	if(rfi < padi) {
+		size_t beginpad = (size_t)(padi - rfi);
+		for(size_t i = 0; i < beginpad; i++) {
+			btref2.insert('N', 0);
+			off--;
+		}
+		rfi = 0;
+	} else {
+		rfi -= padi;
+	}
+	assert_geq(rfi, 0);
+	// Pad the end of the reference with Ns if necessary
+	while(rfi + nrow + padi + padf > btref2.length()) {
+		btref2.append('N');
+	}
+	rff = rfi + nrow + padi + padf;
+	// Convert reference string to masks
+	for(size_t i = 0; i < btref2.length(); i++) {
+		if(toupper(btref2[i]) == 'N' && !nsInclusive) {
+			btref2.set(16, i);
+		} else {
+			int num = 0;
+			int alts[] = {4, 4, 4, 4};
+			decodeNuc(toupper(btref2[i]), num, alts);
+			assert_leq(num, 4);
+			assert_gt(num, 0);
+			btref2.set(0, i);
+			for(int j = 0; j < num; j++) {
+				btref2.set(btref2[i] | (1 << alts[j]), i);
+			}
+		}
+	}
+	bool fw = true;
+	uint32_t refidx = 0;
+	size_t solwidth = width;
+	if(maxgaps >= solwidth) {
+		solwidth = 0;
+	} else {
+		solwidth -= maxgaps;
+	}
+	if(en == NULL) {
+		enbuf.resize(solwidth);
+		enbuf.fill(true);
+		en = &enbuf;
+	}
+	assert_geq(rfi, 0);
+	assert_gt(rff, rfi);
+	readrc = read;
+	qualrc = qual;
+	al.initRead(
+		read,          // read sequence
+		readrc,
+		qual,          // read qualities
+		qualrc,
+		0,             // offset of first character within 'read' to consider
+		read.length(), // offset of last char (exclusive) in 'read' to consider
+		floorsc);      // local-alignment score floor
+	al.initRef(
+		fw,            // 'read' is forward version of read?
+		refidx,        // id of reference aligned to
+		off,           // offset of upstream ref char aligned against
+		btref2.wbuf(), // reference sequence (masks)
+		rfi,           // offset of first char in 'ref' to consider
+		rff,           // offset of last char (exclusive) in 'ref' to consider
+		width,         // # bands to do (width of parallelogram)
+		solwidth,      // # rightmost cols where solns can end
+		sc,            // scoring scheme
+		minsc,         // minimum score for valid alignment
+		maxgaps,       // max of max # read gaps, ref gaps
+		0,             // amount to truncate on left-hand side
+		en);           // mask indicating which columns we can end in
+	if(filterns) {
+		al.filter((int)nceil);
+	}
+	al.align(rnd);
+}
+
+/**
+ * Another interface for running a case.
+ */
+static void doTestCase2(
+	SwAligner&         al,
+	const char        *read,
+	const char        *qual,
+	const char        *refin,
+	TRefOff            off,
+	const Scoring&     sc,
+	float              costMinConst,
+	float              costMinLinear,
+	SwResult&          res,
+	bool               nsInclusive = false,
+	bool               filterns = false,
+	uint32_t           seed = 0)
+{
+	btread.install(read, true);
+	TAlScore minsc = (TAlScore)(Scoring::linearFunc(
+		btread.length(),
+		costMinConst,
+		costMinLinear));
+	TAlScore floorsc = (TAlScore)(Scoring::linearFunc(
+		btread.length(),
+		costFloorConst,
+		costFloorLinear));
+	btqual.install(qual);
+	btref.install(refin);
+	doTestCase(
+		al,
+		btread,
+		btqual,
+		btref,
+		off,
+		NULL,
+		sc,  
+		minsc,
+		floorsc,
+		res,
+		nsInclusive,
+		filterns,
+		seed
+	);
+}
+
+/**
+ * Another interface for running a case.
+ */
+static void doTestCase3(
+	SwAligner&         al,
+	const char        *read,
+	const char        *qual,
+	const char        *refin,
+	TRefOff            off,
+	Scoring&           sc,
+	float              costMinConst,
+	float              costMinLinear,
+	float              nCeilConst,
+	float              nCeilLinear,
+	SwResult&          res,
+	bool               nsInclusive = false,
+	bool               filterns = false,
+	uint32_t           seed = 0)
+{
+	btread.install(read, true);
+	// Calculate the penalty ceiling for the read
+	TAlScore minsc = (TAlScore)(Scoring::linearFunc(
+		btread.length(),
+		costMinConst,
+		costMinLinear));
+	TAlScore floorsc = (TAlScore)(Scoring::linearFunc(
+		btread.length(),
+		costFloorConst,
+		costFloorLinear));
+	btqual.install(qual);
+	btref.install(refin);
+	sc.nCeil.setType(SIMPLE_FUNC_LINEAR);
+	sc.nCeil.setConst(costMinConst);
+	sc.nCeil.setCoeff(costMinLinear);
+	doTestCase(
+		al,
+		btread,
+		btqual,
+		btref,
+		off,
+		NULL,
+		sc,  
+		minsc,
+		floorsc,
+		res,
+		nsInclusive,
+		filterns,
+		seed
+	);
+}
+
+/**
+ * Another interface for running a case.  Like doTestCase3 but caller specifies
+ * st_ and en_ lists.
+ */
+static void doTestCase4(
+	SwAligner&         al,
+	const char        *read,
+	const char        *qual,
+	const char        *refin,
+	TRefOff            off,
+	EList<bool>&       en,
+	Scoring&           sc,
+	float              costMinConst,
+	float              costMinLinear,
+	float              nCeilConst,
+	float              nCeilLinear,
+	SwResult&          res,
+	bool               nsInclusive = false,
+	bool               filterns = false,
+	uint32_t           seed = 0)
+{
+	btread.install(read, true);
+	// Calculate the penalty ceiling for the read
+	TAlScore minsc = (TAlScore)(Scoring::linearFunc(
+		btread.length(),
+		costMinConst,
+		costMinLinear));
+	TAlScore floorsc = (TAlScore)(Scoring::linearFunc(
+		btread.length(),
+		costFloorConst,
+		costFloorLinear));
+	btqual.install(qual);
+	btref.install(refin);
+	sc.nCeil.setType(SIMPLE_FUNC_LINEAR);
+	sc.nCeil.setConst(costMinConst);
+	sc.nCeil.setCoeff(costMinLinear);
+	doTestCase(
+		al,
+		btread,
+		btqual,
+		btref,
+		off,
+		&en,
+		sc,  
+		minsc,
+		floorsc,
+		res,
+		nsInclusive,
+		filterns,
+		seed
+	);
+}
+
+/**
+ * Do a set of unit tests.
+ */
+static void doTests() {
+	bonusMatchType  = DEFAULT_MATCH_BONUS_TYPE;
+	bonusMatch      = DEFAULT_MATCH_BONUS;
+	penMmcType      = DEFAULT_MM_PENALTY_TYPE;
+	penMmc          = DEFAULT_MM_PENALTY;
+	penSnp          = DEFAULT_SNP_PENALTY;
+	penNType        = DEFAULT_N_PENALTY_TYPE;
+	penN            = DEFAULT_N_PENALTY;
+	nPairCat        = DEFAULT_N_CAT_PAIR;
+	penRdExConst    = DEFAULT_READ_GAP_CONST;
+	penRfExConst    = DEFAULT_REF_GAP_CONST;
+	penRdExLinear   = DEFAULT_READ_GAP_LINEAR;
+	penRfExLinear   = DEFAULT_REF_GAP_LINEAR;
+	costMinConst    = DEFAULT_MIN_CONST;
+	costMinLinear   = DEFAULT_MIN_LINEAR;
+	costFloorConst  = DEFAULT_FLOOR_CONST;
+	costFloorLinear = DEFAULT_FLOOR_LINEAR;
+	nCeilConst      = 1.0f; // constant factor in N ceil w/r/t read len
+	nCeilLinear     = 0.1f; // coeff of linear term in N ceil w/r/t read len
+	multiseedMms    = DEFAULT_SEEDMMS;
+	multiseedLen    = DEFAULT_SEEDLEN;
+	// Set up penalities
+	Scoring sc(
+		bonusMatch,
+		penMmcType,    // how to penalize mismatches
+		30,        // constant if mm pelanty is a constant
+		30,        // penalty for decoded SNP
+		costMinConst,  // constant factor in N ceiling w/r/t read length
+		costMinLinear, // coeff of linear term in N ceiling w/r/t read length
+		costFloorConst,  // constant factor in N ceiling w/r/t read length
+		costFloorLinear, // coeff of linear term in N ceiling w/r/t read length
+		nCeilConst,    // constant factor in N ceiling w/r/t read length
+		nCeilLinear,   // coeff of linear term in N ceiling w/r/t read length
+		penNType,      // how to penalize Ns in the read
+		penN,          // constant if N pelanty is a constant
+		nPairCat,      // true -> concatenate mates before N filtering
+		25,  // constant coeff for cost of gap in read
+		25,  // constant coeff for cost of gap in ref
+		15, // linear coeff for cost of gap in read
+		15, // linear coeff for cost of gap in ref
+		1,             // # rows at top/bot can only be entered diagonally
+		-1,            // min row idx to backtrace from; -1 = no limit
+		false          // sort results first by row then by score?
+	);
+	// Set up alternative penalities
+	Scoring sc2(
+		bonusMatch,
+		COST_MODEL_QUAL, // how to penalize mismatches
+		30,          // constant if mm pelanty is a constant
+		30,          // penalty for decoded SNP
+		costMinConst,  // constant factor in N ceiling w/r/t read length
+		costMinLinear, // coeff of linear term in N ceiling w/r/t read length
+		costFloorConst,  // constant factor in N ceiling w/r/t read length
+		costFloorLinear, // coeff of linear term in N ceiling w/r/t read length
+		1.0f,            // constant factor in N ceiling w/r/t read length
+		1.0f,            // coeff of linear term in N ceiling w/r/t read length
+		penNType,        // how to penalize Ns in the read
+		penN,            // constant if N pelanty is a constant
+		nPairCat,        // true -> concatenate mates before N filtering
+		25,    // constant coeff for cost of gap in read
+		25,    // constant coeff for cost of gap in ref
+		15,   // linear coeff for cost of gap in read
+		15,   // linear coeff for cost of gap in ref
+		1,               // # rows at top/bot can only be entered diagonally
+		-1,              // min row idx to backtrace from; -1 = no limit
+		false            // sort results first by row then by score?
+	);
+	SwResult res;
+	
+	//
+	// Basic nucleotide-space tests
+	//
+	cerr << "Running tests..." << endl;
+	int tests = 1;
+	bool nIncl = false;
+	bool nfilter = false;
+
+	SwAligner al;
+	RandomSource rnd(73);
+	for(int i = 0; i < 3; i++) {
+		cerr << "  Test " << tests++ << " (nuc space, offset "
+		     << (i*4) << ", exact)...";
+		sc.rdGapConst = 40;
+		sc.rfGapConst = 40;
+		sc.rdGapLinear = 15;
+		sc.rfGapLinear = 15;
+	//        A           C           G           T           A           C           G           T
+	//    H   E   F   H   E   F   H   E   F   H   E   F   H   E   F   H   E   F   H   E   F   H   E   F
+	// A  0   lo  lo -30  lo  lo -30  lo  lo -30 lo lo 0 lo lo -30 lo lo-30 lo lo-30 lo lo
+	// C -30  lo -55  0  -85 -85 -55 -55 -85
+	// G -30  lo -70 -55 -85 -55  0 -100-100
+	// T -30  lo -85 -60 -85 -70 -55-100 -55
+	// A  0   lo -85 -55 -55 -85 -70 -70 -70
+	// C -30  lo -55  0  -85-100 -55 -55 -85
+	// G -30  lo -70 -55 -85 -55  0 -100-100
+	// T -30  lo -85 -60 -85 -70 -55-100 -55
+		doTestCase2(
+			al,
+			"ACGTACGT",         // read
+			"IIIIIIII",         // qual
+			"ACGTACGTACGTACGT", // ref in
+			i*4,                // off
+			sc,                 // scoring scheme
+			-30.0f,             // const coeff for cost ceiling
+			0.0f,               // linear coeff for cost ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		assert(!al.done());
+		al.nextAlignment(res, rnd);
+		assert(!res.empty());
+		assert_eq(i*4, res.alres.refoff());
+		assert_eq(8, res.alres.refExtent());
+		assert_eq(res.alres.score().gaps(), 0);
+		assert_eq(res.alres.score().score(), 0);
+		assert_eq(res.alres.score().ns(), 0);
+		assert(res.alres.ned().empty());
+		assert(res.alres.aed().empty());
+		res.reset();
+		al.nextAlignment(res, rnd);
+		assert(res.empty());
+		assert(al.done());
+		res.reset();
+		cerr << "PASSED" << endl;
+		
+		cerr << "  Test " << tests++ << " (nuc space, offset " << (i*4)
+		     << ", 1mm allowed by minsc)...";
+		sc.setMmPen(COST_MODEL_CONSTANT, 30);
+		//sc.setMatchBonus(10);
+		doTestCase2(
+			al,
+			"ACGTTCGT",         // read
+			"IIIIIIII",         // qual
+			"ACGTACGTACGTACGT", // ref in
+			i*4,                // off
+			sc,                 // scoring scheme
+			-30.0f,             // const coeff for cost ceiling
+			0.0f,               // linear coeff for cost ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		assert(!al.done());
+		al.nextAlignment(res, rnd);
+		assert(!res.empty());
+		assert_eq(i*4, res.alres.refoff());
+		assert_eq(8, res.alres.refExtent());
+		assert_eq(res.alres.score().gaps(), 0);
+		assert_eq(res.alres.score().score(), -30);
+		assert_eq(res.alres.score().ns(), 0);
+		res.reset();
+		al.nextAlignment(res, rnd);
+		assert(res.empty());
+		assert(al.done());
+		cerr << "PASSED" << endl;
+		
+		cerr << "  Test " << tests++ << " (nuc space, offset " << (i*4)
+		     << ", 1mm allowed by minsc, check qual 1)...";
+		doTestCase2(
+			al,
+			"ACGTTCGT",         // read
+			"ABCDEFGH",         // qual
+			"ACGTACGTACGTACGT", // ref in
+			i*4,                // off
+			sc2,                // scoring scheme
+			-40.0f,             // const coeff for cost ceiling
+			0.0f,               // linear coeff for cost ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		assert(!al.done());
+		size_t lo, hi;
+		if(i == 0) {
+			lo = 0; hi = 1;
+		} else if(i == 1) {
+			lo = 1; hi = 2;
+		} else {
+			lo = 2; hi = 3;
+		}
+		for(size_t j = lo; j < hi; j++) {
+			al.nextAlignment(res, rnd);
+			assert(!res.empty());
+			assert_eq(j*4, res.alres.refoff());
+			assert_eq(8, res.alres.refExtent());
+			assert_eq(res.alres.score().gaps(), 0);
+			assert_eq(res.alres.score().score(), -36);
+			assert_eq(res.alres.score().ns(), 0);
+			res.reset();
+		}
+		al.nextAlignment(res, rnd);
+		assert(res.empty());
+		res.reset();
+		cerr << "PASSED" << endl;
+
+		cerr << "  Test " << tests++ << " (nuc space, offset " << (i*4)
+		     << ", 1mm allowed by minsc, check qual 2)...";
+		doTestCase2(
+			al,
+			"ACGAACGT",         // read
+			"ABCDEFGH",         // qual
+			"ACGTACGTACGTACGT", // ref in
+			i*4,                // off
+			sc2,                // scoring scheme
+			-40.0f,             // const coeff for cost ceiling
+			0.0f,               // linear coeff for cost ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		assert(!al.done());
+		al.nextAlignment(res, rnd);
+		assert(!res.empty());
+		assert_eq(i*4, res.alres.refoff());
+		assert_eq(8, res.alres.refExtent());
+		assert_eq(res.alres.score().gaps(), 0);
+		assert_eq(res.alres.score().score(), -35);
+		assert_eq(res.alres.score().ns(), 0);
+		res.reset();
+		al.nextAlignment(res, rnd);
+		assert(res.empty());
+		res.reset();
+		assert(res.empty());
+		cerr << "PASSED" << endl;
+		
+		cerr << "  Test " << tests++ << " (nuc space, offset " << (i*4)
+		     << ", 1mm allowed by minsc, check qual )...";
+		assert(res.empty());
+		doTestCase2(
+			al,
+			"TCGTACGT",         // read
+			"ABCDEFGH",         // qual
+			"ACGTACGTACGTACGT", // ref in
+			i*4,                // off
+			sc2,                // scoring scheme
+			-40.0f,             // const coeff for cost ceiling
+			0.0f,               // linear coeff for cost ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		assert(!al.done());
+		al.nextAlignment(res, rnd);
+		assert(!res.empty());
+		assert_eq(i*4, res.alres.refoff());
+		assert_eq(8, res.alres.refExtent());
+		assert_eq(res.alres.score().gaps(), 0);
+		assert_eq(res.alres.score().score(), -32);
+		assert_eq(res.alres.score().ns(), 0);
+		res.reset();
+		al.nextAlignment(res, rnd);
+		assert(res.empty());
+		res.reset();
+		assert(res.empty());
+		cerr << "PASSED" << endl;
+		
+		cerr << "  Test " << tests++ << " (nuc space, offset " << (i*4)
+		     << ", 1mm at the beginning, allowed by minsc)...";
+		doTestCase2(
+			al,
+			"CCGTACGT",         // read
+			"IIIIIIII",         // qual
+			"ACGTACGTACGTACGT", // ref in
+			i*4,                // off
+			sc,                 // scoring scheme
+			-30.0f,             // const coeff for cost ceiling
+			0.0f,               // linear coeff for cost ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		assert(!al.done());
+		al.nextAlignment(res, rnd);
+		assert(!res.empty());
+		assert_eq(i*4, res.alres.refoff());
+		assert_eq(8, res.alres.refExtent());
+		assert_eq(res.alres.score().gaps(), 0);
+		assert_eq(res.alres.score().score(), -30);
+		assert_eq(res.alres.score().ns(), 0);
+		assert_eq(1, res.alres.ned().size());
+		assert_eq(0, res.alres.aed().size());
+		res.reset();
+		al.nextAlignment(res, rnd);
+		assert(res.empty());
+		res.reset();
+		cerr << "PASSED" << endl;
+		
+		cerr << "  Test " << tests++ << " (nuc space, offset " << (i*4)
+		     << ", 1 n in read, allowed)...";
+		doTestCase3(
+			al,
+			"ACGTNCGT",         // read
+			"IIIIIIII",         // qual
+			"ACGTACGTACGTACGT", // ref in
+			i*4,                // off
+			sc,                 // scoring scheme
+			-30.0f,             // const coeff for cost ceiling
+			0.0f,               // linear coeff for cost ceiling
+			1.0f,               // allow 1 N
+			0.0f,               // allow 1 N
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		assert(!al.done());
+		al.nextAlignment(res, rnd);
+		assert(!res.empty());
+		assert_eq(i*4, res.alres.refoff());
+		assert_eq(8, res.alres.refExtent());
+		assert_eq(res.alres.score().gaps(), 0);
+		assert_eq(res.alres.score().score(), -1);
+		assert_eq(res.alres.score().ns(), 1);
+		res.reset();
+		al.nextAlignment(res, rnd);
+		assert(res.empty());
+		res.reset();
+		cerr << "PASSED" << endl;
+		
+		cerr << "  Test " << tests++ << " (nuc space, offset " << (i*4)
+		     << ", 2 n in read, allowed)...";
+		doTestCase3(
+			al,
+			"ACGNNCGT",         // read
+			"IIIIIIII",         // qual
+			"ACGTACGTACGTACGT", // ref in
+			i*4,                // off
+			sc,                 // scoring scheme
+			-30.0f,             // const coeff for cost ceiling
+			0.0f,               // linear coeff for cost ceiling
+			2.0f,               // const coeff for N ceiling
+			0.0f,               // linear coeff for N ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		assert(!al.done());
+		al.nextAlignment(res, rnd);
+		assert(!res.empty());
+		assert_eq(i*4, res.alres.refoff());
+		assert_eq(8, res.alres.refExtent());
+		assert_eq(res.alres.score().gaps(), 0);
+		assert_eq(res.alres.score().score(), -2);
+		assert_eq(res.alres.score().ns(), 2);
+		res.reset();
+		al.nextAlignment(res, rnd);
+		assert(res.empty());
+		res.reset();
+		cerr << "PASSED" << endl;
+		
+		cerr << "  Test " << tests++ << " (nuc space, offset " << (i*4)
+		     << ", 2 n in read, 1 at beginning, allowed)...";
+		doTestCase2(
+			al,
+			"NCGTNCGT",         // read
+			"IIIIIIII",         // qual
+			"ACGTACGTACGTACGT", // ref in
+			i*4,                // off
+			sc,                 // scoring scheme
+			-30.0f,             // const coeff for cost ceiling
+			0.0f,               // linear coeff for cost ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		assert(!al.done());
+		al.nextAlignment(res, rnd);
+		assert(!res.empty());
+		assert_eq(i*4, res.alres.refoff());
+		assert_eq(8, res.alres.refExtent());
+		assert_eq(res.alres.score().gaps(), 0);
+		assert_eq(res.alres.score().score(), -2);
+		assert_eq(res.alres.score().ns(), 2);
+		res.reset();
+		al.nextAlignment(res, rnd);
+		assert(res.empty());
+		res.reset();
+		cerr << "PASSED" << endl;
+		
+		cerr << "  Test " << tests++ << " (nuc space, offset " << (i*4)
+		     << ", 1 n in ref, allowed)...";
+		doTestCase2(
+			al,
+			"ACGTACGT",         // read
+			"IIIIIIII",         // qual
+			"ACGTNCGTACGTANGT", // ref in
+			i*4,                // off
+			sc,                 // scoring scheme
+			-30.0f,             // const coeff for cost ceiling
+			0.0f,               // linear coeff for cost ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		assert(!al.done());
+		al.nextAlignment(res, rnd);
+		assert(!res.empty());
+		assert_eq(i*4, res.alres.refoff());
+		assert_eq(8, res.alres.refExtent());
+		assert_eq(res.alres.score().gaps(), 0);
+		assert_eq(res.alres.score().score(), -1);
+		assert_eq(res.alres.score().ns(), 1);
+		res.reset();
+		al.nextAlignment(res, rnd);
+		assert(res.empty());
+		res.reset();
+		cerr << "PASSED" << endl;
+		
+		cerr << "  Test " << tests++ << " (nuc space, offset " << (i*4)
+		     << ", 1mm disallowed by minsc)...";
+		doTestCase2(
+			al,
+			"ACGTTCGT",         // read
+			"IIIIIIII",         // qual
+			"ACGTACGTACGTACGT", // ref in
+			i*4,                // off
+			sc,                 // scoring scheme
+			-10.0f,             // const coeff for cost ceiling
+			0.0f,               // linear coeff for cost ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		al.nextAlignment(res, rnd);
+		assert(res.empty());
+		assert(al.done());
+		// Read gap with equal read and ref gap penalties
+		cerr << "PASSED" << endl;
+
+		cerr << "  Test " << tests++ << " (nuc space, offset " << (i*4)
+		     << ", read gap allowed by minsc)...";
+		assert(res.empty());
+		sc.rfGapConst = 25;
+		sc.rdGapConst = 25;
+		sc.rfGapLinear = 15;
+		sc.rdGapLinear = 15;
+		doTestCase2(
+			al,
+			"ACGTCGT",          // read
+			"IIIIIII",          // qual
+			"ACGTACGTACGTACGT", // ref in
+			i*4,                // off
+			sc,                 // scoring scheme
+			-40.0f,             // const coeff for cost ceiling
+			0.0f,               // linear coeff for cost ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		assert(!al.done());
+		al.nextAlignment(res, rnd);
+		assert(!res.empty());
+		assert_eq(i*4, res.alres.refoff());
+		assert_eq(8, res.alres.refExtent());
+		assert_eq(res.alres.score().gaps(), 1);
+		assert_eq(res.alres.score().score(), -40);
+		assert_eq(res.alres.score().ns(), 0);
+		res.reset();
+		al.nextAlignment(res, rnd);
+		assert(res.empty());
+		res.reset();
+		cerr << "PASSED" << endl;
+
+		cerr << "  Test " << tests++ << " (nuc space, offset " << (i*4)
+		     << ", read gap disallowed by minsc)...";
+		sc.rfGapConst = 25;
+		sc.rdGapConst = 25;
+		sc.rfGapLinear = 15;
+		sc.rdGapLinear = 15;
+		doTestCase2(
+			al,
+			"ACGTCGT",          // read
+			"IIIIIII",          // qual
+			"ACGTACGTACGTACGT", // ref in
+			i*4,                // off
+			sc,                 // scoring scheme
+			-30.0f,             // const coeff for cost ceiling
+			0.0f,               // linear coeff for cost ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		al.nextAlignment(res, rnd);
+		assert(res.empty());
+		assert(al.done());
+		res.reset();
+
+		cerr << "PASSED" << endl;
+		// Ref gap with equal read and ref gap penalties
+		cerr << "  Test " << tests++ << " (nuc space, offset " << (i*4)
+		     << ", ref gap allowed by minsc)...";
+		doTestCase2(
+			al,
+			"ACGTAACGT",        // read
+			"IIIIIIIII",        // qual
+			"ACGTACGTACGTACGT", // ref in
+			i*4,                // off
+			sc,                 // scoring scheme
+			-40.0f,             // const coeff for cost ceiling
+			0.0f,               // linear coeff for cost ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		assert(!al.done());
+		al.nextAlignment(res, rnd);
+		assert(!res.empty());
+		assert_eq(i*4, res.alres.refoff());
+		assert_eq(8, res.alres.refExtent());
+		assert_eq(res.alres.score().gaps(), 1);
+		assert_eq(res.alres.score().score(), -40);
+		assert_eq(res.alres.score().ns(), 0);
+		res.reset();
+		al.nextAlignment(res, rnd);
+		assert(res.empty());
+		res.reset();
+		cerr << "PASSED" << endl;
+
+		cerr << "  Test " << tests++ << " (nuc space, offset " << (i*4)
+		     << ", read gap disallowed by gap barrier)...";
+		sc.rfGapConst = 25;
+		sc.rdGapConst = 25;
+		sc.rfGapLinear = 15;
+		sc.rdGapLinear = 15;
+		sc.gapbar = 4;
+		doTestCase2(
+			al,
+			"ACGTCGT",          // read
+			"IIIIIII",          // qual
+			"ACGTACGTACGTACGT", // ref in
+			i*4,                // off
+			sc,                 // scoring scheme
+			-40.0f,             // const coeff for cost ceiling
+			0.0f,               // linear coeff for cost ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		sc.gapbar = 1;
+		al.nextAlignment(res, rnd);
+		assert(res.empty());
+		assert(al.done());
+		res.reset();
+
+		cerr << "PASSED" << endl;
+		// Ref gap with equal read and ref gap penalties
+		cerr << "  Test " << tests++ << " (nuc space, offset " << (i*4)
+		     << ", ref gap allowed by minsc, gapbar=3)...";
+		sc.gapbar = 3;
+		doTestCase2(
+			al,
+			"ACGTAACGT",        // read
+			"IIIIIIIII",        // qual
+			"ACGTACGTACGTACGT", // ref in
+			i*4,                // off
+			sc,                 // scoring scheme
+			-40.0f,             // const coeff for cost ceiling
+			0.0f,               // linear coeff for cost ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		sc.gapbar = 1;
+		assert(!al.done());
+		al.nextAlignment(res, rnd);
+		assert(!res.empty());
+		assert_eq(i*4, res.alres.refoff());
+		assert_eq(8, res.alres.refExtent());
+		assert_eq(res.alres.score().gaps(), 1);
+		assert_eq(res.alres.score().score(), -40);
+		assert_eq(res.alres.score().ns(), 0);
+		res.reset();
+		al.nextAlignment(res, rnd);
+		assert(res.empty());
+		res.reset();
+		cerr << "PASSED" << endl;
+
+		// Ref gap with equal read and ref gap penalties
+		cerr << "  Test " << tests++ << " (nuc space, offset " << (i*4)
+		     << ", ref gap allowed by minsc, gapbar=4)...";
+		sc.gapbar = 4;
+		doTestCase2(
+			al,
+			"ACGTAACGT",        // read
+			"IIIIIIIII",        // qual
+			"ACGTACGTACGTACGT", // ref in
+			i*4,                // off
+			sc,                 // scoring scheme
+			-40.0f,             // const coeff for cost ceiling
+			0.0f,               // linear coeff for cost ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		sc.gapbar = 1;
+		assert(!al.done());
+		al.nextAlignment(res, rnd);
+		assert(!res.empty());
+		assert_eq(i*4, res.alres.refoff());
+		assert_eq(8, res.alres.refExtent());
+		assert_eq(res.alres.score().gaps(), 1);
+		assert_eq(res.alres.score().score(), -40);
+		assert_eq(res.alres.score().ns(), 0);
+		res.reset();
+		al.nextAlignment(res, rnd);
+		assert(res.empty());
+		res.reset();
+		cerr << "PASSED" << endl;
+		
+		cerr << "  Test " << tests++ << " (nuc space, offset " << (i*4)
+		     << ", ref gap disallowed by minsc)...";
+		doTestCase2(
+			al,
+			"ACGTAACGT",        // read
+			"IIIIIIIII",        // qual
+			"ACGTACGTACGTACGT", // ref in
+			i*4,                // off
+			sc,                 // scoring scheme
+			-30.0f,             // const coeff for cost ceiling
+			0.0f,               // linear coeff for cost ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		al.nextAlignment(res, rnd);
+		assert(res.empty());
+		res.reset();
+		assert(al.done());
+		cerr << "PASSED" << endl;
+
+		cerr << "  Test " << tests++ << " (nuc space, offset " << (i*4)
+		     << ", ref gap disallowed by gap barrier)...";
+		sc.gapbar = 5;
+		doTestCase2(
+			al,
+			"ACGTAACGT",        // read
+			"IIIIIIIII",        // qual
+			"ACGTACGTACGTACGT", // ref in
+			i*4,                // off
+			sc,                 // scoring scheme
+			-40.0f,             // const coeff for cost ceiling
+			0.0f,               // linear coeff for cost ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		sc.gapbar = 1;
+		al.nextAlignment(res, rnd);
+		assert(res.empty());
+		res.reset();
+		assert(al.done());
+		cerr << "PASSED" << endl;
+		
+		// Read gap with one read gap and zero ref gaps allowed
+		cerr << "  Test " << tests++ << " (nuc space, offset " << (i*4)
+		     << ", 1 read gap, ref gaps disallowed by minsc)...";
+		sc.rfGapConst = 35;
+		sc.rdGapConst = 25;
+		sc.rfGapLinear = 20;
+		sc.rdGapLinear = 10;
+		doTestCase2(
+			al,
+			"ACGTCGT",          // read
+			"IIIIIII",          // qual
+			"ACGTACGTACGTACGT", // ref in
+			i*4,                // off
+			sc,                 // scoring scheme
+			-40.0f,             // const coeff for cost ceiling
+			0.0f,               // linear coeff for cost ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		assert(!al.done());
+		al.nextAlignment(res, rnd);
+		assert(!res.empty());
+		assert_eq(i*4, res.alres.refoff());
+		assert_eq(8, res.alres.refExtent());
+		assert_eq(res.alres.score().gaps(), 1);
+		assert_eq(res.alres.score().score(), -35);
+		assert_eq(res.alres.score().ns(), 0);
+		res.reset();
+		al.nextAlignment(res, rnd);
+		assert(res.empty());
+		res.reset();
+		cerr << "PASSED" << endl;
+		
+		cerr << "  Test " << tests++ << " (nuc space, offset " << (i*4)
+		     << ", gaps disallowed by minsc)...";
+		sc.rfGapConst = 25;
+		sc.rdGapConst = 25;
+		sc.rfGapLinear = 10;
+		sc.rdGapLinear = 10;
+		doTestCase2(
+			al,
+			"ACGTCGT",          // read
+			"IIIIIII",          // qual 
+			"ACGTACGTACGTACGT", // ref 
+			i*4,                // off
+			sc,                 // scoring scheme
+			-30.0f,             // const coeff for cost ceiling
+			0.0f,               // linear coeff for cost ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		al.nextAlignment(res, rnd);
+		assert(res.empty());
+		res.reset();
+		assert(res.empty());
+		cerr << "PASSED" << endl;
+		
+		// Ref gap with one ref gap and zero read gaps allowed
+		sc.rfGapConst = 25;
+		sc.rdGapConst = 35;
+		sc.rfGapLinear = 12;
+		sc.rdGapLinear = 22;
+		cerr << "  Test " << tests++ << " (nuc space, offset " << (i*4)
+		     << ", 1 ref gap, read gaps disallowed by minsc)...";
+		assert(res.empty());
+		doTestCase2(
+			al,
+			"ACGTAACGT",
+			"IIIIIIIII",
+			"ACGTACGTACGTACGT",
+			i*4,                // off
+			sc,                 // scoring scheme
+			-40.0f,             // const coeff for cost ceiling
+			0.0f,               // linear coeff for cost ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		assert(!al.done());
+		al.nextAlignment(res, rnd);
+		assert(!res.empty());
+		assert_eq(i*4, res.alres.refoff());
+		assert_eq(8, res.alres.refExtent());
+		assert_eq(res.alres.score().gaps(), 1);
+		assert_eq(res.alres.score().score(), -37);
+		assert_eq(res.alres.score().ns(), 0);
+		res.reset();
+		al.nextAlignment(res, rnd);
+		assert(res.empty());
+		res.reset();
+		cerr << "PASSED" << endl;
+		
+		cerr << "  Test " << tests++ << " (nuc space, offset " << (i*4)
+			<< ", gaps disallowed by minsc)...";
+		doTestCase2(
+			al,
+			"ACGTAACGT",
+			"IIIIIIIII",
+			"ACGTACGTACGTACGT",
+			i*4,                // off
+			sc,                 // scoring scheme
+			-30.0f,             // const coeff for cost ceiling
+			0.0f,               // linear coeff for cost ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		al.nextAlignment(res, rnd);
+		assert(res.empty());
+		assert(al.done());
+		cerr << "PASSED" << endl;
+		
+		// Read gap with one read gap and two ref gaps allowed
+		sc.rfGapConst = 20;
+		sc.rdGapConst = 25;
+		sc.rfGapLinear = 10;
+		sc.rdGapLinear = 15;
+		cerr << "  Test " << tests++ << " (nuc space, offset " << (i*4)
+		     << ", 1 read gap, 2 ref gaps allowed by minsc)...";
+		doTestCase2(
+			al,
+			"ACGTCGT",
+			"IIIIIII",
+			"ACGTACGTACGTACGT",
+			i*4,                // off
+			sc,                 // scoring scheme
+			-40.0f,             // const coeff for cost ceiling
+			0.0f,               // linear coeff for cost ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		assert(!al.done());
+		al.nextAlignment(res, rnd);
+		assert(!res.empty());
+		assert_eq(i*4, res.alres.refoff());
+		assert_eq(8, res.alres.refExtent());
+		assert_eq(res.alres.score().gaps(), 1);
+		assert_eq(res.alres.score().score(), -40);
+		assert_eq(res.alres.score().ns(), 0);
+		res.reset();
+		al.nextAlignment(res, rnd);
+		assert(res.empty());
+		res.reset();
+		cerr << "PASSED" << endl;
+		
+		cerr << "  Test " << tests++ << " (nuc space, offset " << (i*4)
+		     << ", gaps disallowed by minsc)...";
+		doTestCase2(
+			al,
+			"ACGTCGT",
+			"IIIIIII",
+			"ACGTACGTACGTACGT",
+			i*4,                // off
+			sc,                 // scoring scheme
+			-30.0f,             // const coeff for cost ceiling
+			0.0f,               // linear coeff for cost ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		al.nextAlignment(res, rnd);
+		assert(res.empty());
+		assert(al.done());
+		cerr << "PASSED" << endl;
+
+		// Ref gap with one ref gap and two read gaps allowed
+		sc.rfGapConst = 25;
+		sc.rdGapConst = 11;  // if this were 10, we'd have ties
+		sc.rfGapLinear = 15;
+		sc.rdGapLinear = 10;
+		cerr << "  Test " << tests++ << " (nuc space, offset " << (i*4)
+		     << ", 1 ref gap, 2 read gaps allowed by minsc)...";
+		doTestCase2(
+			al,
+			"ACGTAACGT",
+			"IIIIIIIII",
+			"ACGTACGTACGTACGT",
+			i*4,                // off
+			sc,                 // scoring scheme
+			-40.0f,             // const coeff for cost ceiling
+			0.0f,               // linear coeff for cost ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		assert(!al.done());
+		al.nextAlignment(res, rnd);
+		assert(!res.empty());
+		assert_eq(i*4, res.alres.refoff());
+		assert_eq(8, res.alres.refExtent());
+		assert_eq(res.alres.score().gaps(), 1);
+		assert_eq(res.alres.score().score(), -40);
+		assert_eq(res.alres.score().ns(), 0);
+		res.reset();
+		al.nextAlignment(res, rnd);
+		assert(res.empty());
+		res.reset();
+		cerr << "PASSED" << endl;
+		
+		cerr << "  Test " << tests++ << " (nuc space, offset " << (i*4) << ", gaps disallowed by minsc)...";
+		doTestCase2(
+			al,
+			"ACGTAACGT",
+			"IIIIIIIII",
+			"ACGTACGTACGTACGT",
+			i*4,                // off
+			sc,                 // scoring scheme
+			-30.0f,             // const coeff for cost ceiling
+			0.0f,               // linear coeff for cost ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		al.nextAlignment(res, rnd);
+		assert(res.empty());
+		res.reset();
+		assert(al.done());
+		cerr << "PASSED" << endl;
+		
+		// Read gap with two read gaps and two ref gaps allowed
+		sc.rfGapConst = 15;
+		sc.rdGapConst = 15;
+		sc.rfGapLinear = 10;
+		sc.rdGapLinear = 10;
+		cerr << "  Test " << tests++ << " (nuc space, offset " << (i*4)
+		     << ", 2 ref gaps, 2 read gaps allowed by minsc)...";
+		doTestCase3(
+			al,
+			"ACGTCGT",
+			"IIIIIII",
+			"ACGTACGTACGTACGT",
+			i*4,                // off
+			sc,                 // scoring scheme
+			-40.0f,             // const coeff for cost ceiling
+			0.0f,               // linear coeff for cost ceiling
+			1.0,                // const coeff for N ceiling
+			0.0,                // linear coeff for N ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			true);              // filter Ns
+		assert(!al.done());
+		al.nextAlignment(res, rnd);
+		if(!res.empty()) {
+			//al.printResultStacked(res, cerr); cerr << endl;
+		}
+		assert(!res.empty());
+		assert_eq(i*4, res.alres.refoff());
+		assert_eq(8, res.alres.refExtent());
+		assert_eq(res.alres.score().gaps(), 1);
+		assert_eq(res.alres.score().score(), -25);
+		assert_eq(res.alres.score().ns(), 0);
+		res.reset();
+		al.nextAlignment(res, rnd);
+		// The following alignment is possible when i == 2:
+		//   ACGTACGTACGTACGTN
+		// A             x
+		// C              x
+		// G               x
+		// T                x
+		// C                x
+		// G                x
+		// T                 x
+		assert(i == 2 || res.empty());
+		res.reset();
+		cerr << "PASSED" << endl;
+		
+		sc.rfGapConst = 10;
+		sc.rdGapConst = 10;
+		sc.rfGapLinear = 10;
+		sc.rdGapLinear = 10;
+		cerr << "  Test " << tests++ << " (nuc space, offset " << (i*4)
+		     << ", 1 ref gap, 1 read gap allowed by minsc)...";
+		doTestCase2(
+			al,
+			"ACGTCGT",
+			"IIIIIII",
+			"ACGTACGTACGTACGT",
+			i*4,                // off
+			sc,                 // scoring scheme
+			-30.0f,             // const coeff for cost ceiling
+			0.0f,               // linear coeff for cost ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		al.nextAlignment(res, rnd);
+		assert(!res.empty());
+		assert_eq(i*4, res.alres.refoff());
+		assert_eq(8, res.alres.refExtent());
+		assert_eq(res.alres.score().gaps(), 1);
+		assert_eq(res.alres.score().score(), -20);
+		assert_eq(res.alres.score().ns(), 0);
+		res.reset();
+		cerr << "PASSED" << endl;
+		
+		// Ref gap with two ref gaps and zero read gaps allowed
+		sc.rfGapConst = 15;
+		sc.rdGapConst = 15;
+		sc.rfGapLinear = 5;
+		sc.rdGapLinear = 5;
+		cerr << "  Test " << tests++ << " (nuc space, offset " << (i*4)
+		     << ", 2 ref gaps, 2 read gaps allowed by minsc)...";
+		// Careful: it might be possible for the read to align with overhang
+		// instead of with a gap
+		doTestCase3(
+			al,
+			"ACGTAACGT",
+			"IIIIIIIII",
+			"ACGTACGTACGTACGT",
+			i*4,                // off
+			sc,                 // scoring scheme
+			-35.0f,             // const coeff for cost ceiling
+			0.0f,               // linear coeff for cost ceiling
+			1.0f,               // needed to avoid overhang alignments
+			0.0f,               // needed to avoid overhang alignments
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			true);              // filter Ns
+		if(i == 0) {
+			lo = 0; hi = 1;
+		} else if(i == 1) {
+			lo = 1; hi = 2;
+		} else {
+			lo = 2; hi = 3;
+		}
+		for(size_t j = lo; j < hi; j++) {
+			al.nextAlignment(res, rnd);
+			assert(!res.empty());
+			//al.printResultStacked(res, cerr); cerr << endl;
+			assert(res.alres.refoff() == 0 ||
+			       res.alres.refoff() == 4 ||
+				   res.alres.refoff() == 8);
+			assert_eq(8, res.alres.refExtent());
+			assert_eq(res.alres.score().gaps(), 1);
+			assert_eq(res.alres.score().score(), -20);
+			assert_eq(res.alres.score().ns(), 0);
+			res.reset();
+		}
+		al.nextAlignment(res, rnd);
+		//assert(res.empty());
+		//res.reset();
+		cerr << "PASSED" << endl;
+		
+		sc.rfGapConst = 25;
+		sc.rdGapConst = 25;
+		sc.rfGapLinear = 4;
+		sc.rdGapLinear = 4;
+		cerr << "  Test " << tests++ << " (nuc space, offset " << (i*4)
+		     << ", 1 ref gap, 1 read gap allowed by minsc)...";
+		doTestCase2(
+			al,
+			"ACGTAACGT",
+			"IIIIIIIII",
+			"ACGTACGTACGTACGT",
+			i*4,                // off
+			sc,                 // scoring scheme
+			-30.0f,             // const coeff for cost ceiling
+			0.0f,               // linear coeff for cost ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		al.nextAlignment(res, rnd);
+		assert(!res.empty());
+		assert_eq(i*4, res.alres.refoff());
+		assert_eq(8, res.alres.refExtent());
+		assert_eq(res.alres.score().gaps(), 1);
+		assert_eq(res.alres.score().score(), -29);
+		assert_eq(res.alres.score().ns(), 0);
+		res.reset();
+		cerr << "PASSED" << endl;
+		
+		cerr << "  Test " << tests++ << " (nuc space, offset " << (i*4)
+		     << ", short read)...";
+		doTestCase2(
+			al,
+			"A",
+			"I",
+			"AAAAAAAAAAAA",
+			i*4,                // off
+			sc,                 // scoring scheme
+			-30.0f,             // const coeff for cost ceiling
+			0.0f,               // linear coeff for cost ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		al.nextAlignment(res, rnd);
+		assert(!res.empty());
+		assert_eq(res.alres.score().gaps(), 0);
+		assert_eq(res.alres.score().score(), 0);
+		assert_eq(res.alres.score().ns(), 0);
+		res.reset();
+		cerr << "PASSED" << endl;
+
+		if(i == 0) {
+			cerr << "  Test " << tests++
+			     << " (nuc space, offset 0, short read & ref)...";
+			doTestCase2(
+				al,
+				"A",
+				"I",
+				"A",
+				0,                  // off
+				sc,                 // scoring scheme
+				-30.0f,             // const coeff for cost ceiling
+				0.0f,               // linear coeff for cost ceiling
+				res,                // result
+				nIncl,              // Ns inclusive (not mismatches)
+				nfilter);           // filter Ns
+			al.nextAlignment(res, rnd);
+			assert(!res.empty());
+			assert_eq(res.alres.score().gaps(), 0);
+			assert_eq(res.alres.score().score(), 0);
+			assert_eq(res.alres.score().ns(), 0);
+			res.reset();
+			cerr << "PASSED" << endl;
+		}
+
+		cerr << "  Test " << tests++ << " (nuc space, offset " << (i*4)
+		     << ", short read, many allowed gaps)...";
+		doTestCase2(
+			al,
+			"A",
+			"I",
+			"AAAAAAAAAAAA",
+			i*4,                // off
+			sc,                 // scoring scheme
+			-150.0f,            // const coeff for cost ceiling
+			0.0f,               // linear coeff for cost ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		al.nextAlignment(res, rnd);
+		assert(!res.empty());
+		assert_eq(res.alres.score().gaps(), 0);
+		assert_eq(res.alres.score().score(), 0);
+		assert_eq(res.alres.score().ns(), 0);
+		res.reset();
+		cerr << "PASSED" << endl;
+
+		if(i == 0) {
+			cerr << "  Test " << tests++
+			     << " (nuc space, offset 0, short read & ref, "
+				 << "many allowed gaps)...";
+			doTestCase2(
+				al,
+				"A",
+				"I",
+				"A",
+				0,                  // off
+				sc,                 // scoring scheme
+				-150.0f,            // const coeff for cost ceiling
+				0.0f,               // linear coeff for cost ceiling
+				res,                // result
+				nIncl,              // Ns inclusive (not mismatches)
+				nfilter);           // filter Ns
+			al.nextAlignment(res, rnd);
+			assert(!res.empty());
+			assert_eq(res.alres.score().gaps(), 0);
+			assert_eq(res.alres.score().score(), 0);
+			assert_eq(res.alres.score().ns(), 0);
+			res.reset();
+			cerr << "PASSED" << endl;
+		}
+	}
+
+	// A test case where a valid alignment with a worse score should be
+	// accepted over a valid alignment with a better score but too many
+	// Ns
+	cerr << "  Test " << tests++ << " (N ceiling 1)...";
+	sc.mmcostType = COST_MODEL_CONSTANT;
+	sc.mmcost = 10;
+	sc.snp = 30;
+	sc.nCeilConst  = 0.0f;
+	sc.nCeilLinear = 0.0f;
+	sc.rfGapConst  = 10;
+	sc.rdGapLinear = 10;
+	sc.rfGapConst  = 10;
+	sc.rfGapLinear = 10;
+	sc.setNPen(COST_MODEL_CONSTANT, 2);
+	sc.gapbar = 1;
+	// No Ns allowed, so this hit should be filtered
+	doTestCase2(
+		al,
+		"ACGTACGT", // read seq
+		"IIIIIIII", // read quals
+		"NCGTACGT", // ref seq
+		0,          // offset
+		sc,         // scoring scheme
+		-25.0f,     // const coeff for cost ceiling
+		0.0f,       // linear coeff for cost ceiling
+		res,        // result
+		false,      // Ns are not inclusive
+		true,       // nfilter
+		0);
+	al.nextAlignment(res, rnd);
+	assert(res.empty());
+	cerr << "PASSED" << endl;
+	res.reset();
+
+	// 1 N allowed, so this hit should stand
+	cerr << "  Test " << tests++ << " (N ceiling 2)...";
+	doTestCase3(
+		al,
+		"ACGTACGT", // read seq
+		"IIIIIIII", // read quals
+		"NCGTACGT", // ref seq
+		0,          // offset
+		sc,         // scoring scheme
+		-25.0f,     // const coeff for cost ceiling
+		0.0f,       // linear coeff for cost ceiling
+		1.0f,       // constant coefficient for # Ns allowed
+		0.0f,       // linear coefficient for # Ns allowed
+		res,        // result
+		false,      // Ns are not inclusive
+		false,      // nfilter - NOTE: FILTER OFF
+		0);
+	al.nextAlignment(res, rnd);
+	assert(!res.empty());
+	assert_eq(0,  res.alres.score().gaps());
+	assert_eq(-2, res.alres.score().score());
+	assert_eq(1,  res.alres.score().ns());
+	cerr << "PASSED" << endl;
+	res.reset();
+
+	// 1 N allowed, but we set st_ such that this hit should not stand
+	for(size_t i = 0; i < 2; i++) {
+		cerr << "  Test " << tests++ << " (N ceiling 2 with st_ override)...";
+		EList<bool> en;
+		en.resize(3); en.fill(true);
+		if(i == 1) {
+			en[1] = false;
+		}
+		sc.rfGapConst  = 10;
+		sc.rdGapLinear = 10;
+		sc.rfGapConst  = 10;
+		sc.rfGapLinear = 10;
+		doTestCase4(
+			al,
+			"ACGTACGT", // read seq
+			"IIIIIIII", // read quals
+			"NCGTACGT", // ref seq
+			0,          // offset
+			en,         // rectangle columns where solution can end
+			sc,         // scoring scheme
+			-25.0f,     // const coeff for cost ceiling
+			0.0f,       // linear coeff for cost ceiling
+			1.0f,       // constant coefficient for # Ns allowed
+			0.0f,       // linear coefficient for # Ns allowed
+			res,        // result
+			false,      // Ns are not inclusive
+			false,      // nfilter - NOTE: FILTER OFF
+			0);
+		al.nextAlignment(res, rnd);
+		if(i > 0) {
+			assert(res.empty());
+		} else {
+			assert(!res.empty());
+		}
+		cerr << "PASSED" << endl;
+		res.reset();
+	}
+
+	// 1 N allowed (nCeilConst = 1), so this hit should stand even with the N filter on
+	cerr << "  Test " << tests++ << " (N ceiling 3)...";
+	sc.nCeilConst = 1.0f;
+	sc.nCeilLinear = 0.0f;
+	doTestCase2(
+		al,
+		"ACGTACGT", // read seq
+		"IIIIIIII", // read quals
+		"NCGTACGT", // ref seq
+		0,          // offset
+		sc,         // scoring scheme
+		-25.0f,     // const coeff for cost ceiling
+		0.0f,       // linear coeff for cost ceiling
+		res,        // result
+		false,      // Ns are not inclusive
+		true,       // nfilter - NOTE: FILTER ON
+		0);
+	al.nextAlignment(res, rnd);
+	assert(!res.empty());
+	assert_eq(0,  res.alres.score().gaps());
+	assert_eq(-2, res.alres.score().score());
+	assert_eq(1,  res.alres.score().ns());
+	cerr << "PASSED" << endl;
+	res.reset();
+
+	// Redundant alignments should be reported only once
+	cerr << "  Test " << tests++ << " (redundant alignment elimination 1)...";
+	sc.nCeilConst = 1.0f;
+	sc.nCeilLinear = 0.0f;
+	sc.rfGapConst  = 25;
+	sc.rdGapLinear = 15;
+	sc.rfGapConst  = 25;
+	sc.rfGapLinear = 15;
+	doTestCase2(
+		al,
+		//                   1         2         3         4
+		//         01234567890123456789012345678901234567890123456
+		          "AGGCTATGCCTCTGACGCGATATCGGCGCCCACTTCAGAGCTAACCG",
+		          "IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII",
+		  "TTTTTTTTAGGCTATGCCTCTGACGCGATATCGGCGCCCACTTCAGAGCTAACCGTTTTTTT",
+		// 01234567890123456789012345678901234567890123456789012345678901
+		//           1         2         3         4         5         6
+		8,          // offset
+		sc,         // scoring scheme
+		-25.0f,     // const coeff for cost ceiling
+		-5.0f,      // linear coeff for cost ceiling
+		res,        // result
+		false,      // Ns are not inclusive
+		true,       // nfilter - NOTE: FILTER ON
+		0);
+	al.nextAlignment(res, rnd);
+	assert(!res.empty());
+	assert_eq(8, res.alres.refoff());
+	assert_eq(47, res.alres.refExtent());
+	assert_eq(0, res.alres.score().gaps());
+	assert_eq(0, res.alres.score().score());
+	assert_eq(0, res.alres.score().ns());
+	res.reset();
+	al.nextAlignment(res, rnd);
+	assert(res.empty());
+	assert(al.done());
+	cerr << "PASSED" << endl;
+	res.reset();
+	
+}
+
+/**
+ * Do a set of unit tests for local alignment.
+ */
+static void doLocalTests() {
+	bonusMatchType  = DEFAULT_MATCH_BONUS_TYPE;
+	bonusMatch      = DEFAULT_MATCH_BONUS_LOCAL;
+	penMmcType      = DEFAULT_MM_PENALTY_TYPE;
+	penMmc          = DEFAULT_MM_PENALTY;
+	penSnp          = DEFAULT_SNP_PENALTY;
+	penNType        = DEFAULT_N_PENALTY_TYPE;
+	penN            = DEFAULT_N_PENALTY;
+	nPairCat        = DEFAULT_N_CAT_PAIR;
+	penRdExConst    = DEFAULT_READ_GAP_CONST;
+	penRfExConst    = DEFAULT_REF_GAP_CONST;
+	penRdExLinear   = DEFAULT_READ_GAP_LINEAR;
+	penRfExLinear   = DEFAULT_REF_GAP_LINEAR;
+	costMinConst    = DEFAULT_MIN_CONST_LOCAL;
+	costMinLinear   = DEFAULT_MIN_LINEAR_LOCAL;
+	costFloorConst  = DEFAULT_FLOOR_CONST_LOCAL;
+	costFloorLinear = DEFAULT_FLOOR_LINEAR_LOCAL;
+	nCeilConst      = 1.0f; // constant factor in N ceil w/r/t read len
+	nCeilLinear     = 0.1f; // coeff of linear term in N ceil w/r/t read len
+	multiseedMms    = DEFAULT_SEEDMMS;
+	multiseedLen    = DEFAULT_SEEDLEN;
+	// Set up penalties
+	Scoring sc(
+		10,
+		penMmcType,    // how to penalize mismatches
+		30,            // constant if mm penalty is a constant
+		penSnp,        // penalty for decoded SNP
+		costMinConst,  // constant factor in N ceiling w/r/t read length
+		costMinLinear, // coeff of linear term in N ceiling w/r/t read length
+		costFloorConst,  // constant factor in N ceiling w/r/t read length
+		costFloorLinear, // coeff of linear term in N ceiling w/r/t read length
+		nCeilConst,    // constant factor in N ceiling w/r/t read length
+		nCeilLinear,   // coeff of linear term in N ceiling w/r/t read length
+		penNType,      // how to penalize Ns in the read
+		penN,          // constant if N penalty is a constant
+		nPairCat,      // true -> concatenate mates before N filtering
+		25,            // constant coeff for cost of gap in read
+		25,            // constant coeff for cost of gap in ref
+		15,            // linear coeff for cost of gap in read
+		15,            // linear coeff for cost of gap in ref
+		1,             // # rows at top/bot can only be entered diagonally
+		-1,            // min row idx to backtrace from; -1 = no limit
+		false          // sort results first by row then by score?
+	);
+	SwResult res;
+	
+	//
+	// Basic nucleotide-space tests
+	//
+	cerr << "Running local tests..." << endl;
+	int tests = 1;
+	bool nIncl = false;
+	bool nfilter = false;
+
+	SwAligner al;
+	RandomSource rnd(73);
+	for(int i = 0; i < 3; i++) {
+		cerr << "  Test " << tests++ << " (short nuc space, offset "
+		     << (i*4) << ", exact)...";
+		sc.rdGapConst = 40;
+		sc.rfGapConst = 40;
+		doTestCase2(
+			al,
+			"ACGT",             // read
+			"IIII",             // qual
+			"ACGTACGTACGTACGT", // ref in
+			i*4,                // off
+			sc,                 // scoring scheme
+			0.0f,               // const coeff for cost ceiling
+			8.0f,               // linear coeff for cost ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		assert(!al.done());
+		al.nextAlignment(res, rnd);
+		assert(!res.empty());
+		assert_eq(i*4, res.alres.refoff());
+		assert_eq(4, res.alres.refExtent());
+		assert_eq(res.alres.score().gaps(), 0);
+		assert_eq(res.alres.score().score(), 40);
+		assert_eq(res.alres.score().ns(), 0);
+		assert(res.alres.ned().empty());
+		assert(res.alres.aed().empty());
+		res.reset();
+		al.nextAlignment(res, rnd);
+		assert(res.empty());
+		assert(al.done());
+		res.reset();
+		cerr << "PASSED" << endl;
+
+		//     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
+		//     A C G T A C G T A C G T A C G T
+		// 0 C
+		// 1 C   x
+		// 2 G     x
+		// 3 T       x
+
+		cerr << "  Test " << tests++ << " (short nuc space, offset "
+		     << (i*4) << ", 1mm)...";
+		sc.rdGapConst = 40;
+		sc.rfGapConst = 40;
+		doTestCase2(
+			al,
+			"CCGT",             // read
+			"IIII",             // qual
+			"ACGTACGTACGTACGT", // ref in
+			i*4,                // off
+			sc,                 // scoring scheme
+			0.0f,               // const coeff for cost ceiling
+			7.0f,               // linear coeff for cost ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		assert(!al.done());
+		al.nextAlignment(res, rnd);
+		assert(!res.empty());
+		assert_eq(i*4+1, res.alres.refoff());
+		assert_eq(3, res.alres.refExtent());
+		assert_eq(res.alres.score().gaps(), 0);
+		assert_eq(res.alres.score().score(), 30);
+		assert_eq(res.alres.score().ns(), 0);
+		assert(res.alres.ned().empty());
+		assert(res.alres.aed().empty());
+		res.reset();
+		al.nextAlignment(res, rnd);
+		assert(res.empty());
+		assert(al.done());
+		res.reset();
+		cerr << "PASSED" << endl;
+
+		cerr << "  Test " << tests++ << " (short nuc space, offset "
+		     << (i*4) << ", 1mm)...";
+		sc.rdGapConst = 40;
+		sc.rfGapConst = 40;
+		doTestCase2(
+			al,
+			"ACGA",             // read
+			"IIII",             // qual
+			"ACGTACGTACGTACGT", // ref in
+			i*4,                // off
+			sc,                 // scoring scheme
+			0.0f,               // const coeff for cost ceiling
+			7.0f,               // linear coeff for cost ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		assert(!al.done());
+		al.nextAlignment(res, rnd);
+		assert(!res.empty());
+		assert_eq(i*4, res.alres.refoff());
+		assert_eq(3, res.alres.refExtent());
+		assert_eq(res.alres.score().gaps(), 0);
+		assert_eq(res.alres.score().score(), 30);
+		assert_eq(res.alres.score().ns(), 0);
+		assert(res.alres.ned().empty());
+		assert(res.alres.aed().empty());
+		res.reset();
+		al.nextAlignment(res, rnd);
+		assert(res.empty());
+		assert(al.done());
+		res.reset();
+		cerr << "PASSED" << endl;
+
+		if(i == 0) {
+			cerr << "  Test " << tests++ << " (short nuc space, offset "
+				 << (i*4) << ", 1mm, match bonus=20)...";
+			sc.rdGapConst = 40;
+			sc.rfGapConst = 40;
+			sc.setMatchBonus(20);
+			doTestCase2(
+				al,
+				"TTGT",             // read
+				"IIII",             // qual
+				"TTGA",             // ref in
+				i*4,                // off
+				sc,                 // scoring scheme
+				25.0f,               // const coeff for cost ceiling
+				0.0f,               // linear coeff for cost ceiling
+				res,                // result
+				nIncl,              // Ns inclusive (not mismatches)
+				nfilter);           // filter Ns
+			assert(!al.done());
+			al.nextAlignment(res, rnd);
+			assert(!res.empty());
+			assert_eq(i*4, res.alres.refoff());
+			assert_eq(3, res.alres.refExtent());
+			assert_eq(res.alres.score().gaps(), 0);
+			assert_eq(res.alres.score().score(), 60);
+			assert_eq(res.alres.score().ns(), 0);
+			assert(res.alres.ned().empty());
+			assert(res.alres.aed().empty());
+			res.reset();
+			al.nextAlignment(res, rnd);
+			assert(res.empty());
+			assert(al.done());
+			res.reset();
+			sc.setMatchBonus(10);
+			cerr << "PASSED" << endl;
+		}
+
+		cerr << "  Test " << tests++ << " (nuc space, offset "
+		     << (i*4) << ", exact)...";
+		sc.rdGapConst = 40;
+		sc.rfGapConst = 40;
+		doTestCase2(
+			al,
+			"ACGTACGT",         // read
+			"IIIIIIII",         // qual
+			"ACGTACGTACGTACGT", // ref in
+			i*4,                // off
+			sc,                 // scoring scheme
+			0.0f,               // const coeff for cost ceiling
+			8.0f,               // linear coeff for cost ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		assert(!al.done());
+		al.nextAlignment(res, rnd);
+		assert(!res.empty());
+		assert_eq(i*4, res.alres.refoff());
+		assert_eq(8, res.alres.refExtent());
+		assert_eq(res.alres.score().gaps(), 0);
+		assert_eq(res.alres.score().score(), 80);
+		assert_eq(res.alres.score().ns(), 0);
+		assert(res.alres.ned().empty());
+		assert(res.alres.aed().empty());
+		res.reset();
+		al.nextAlignment(res, rnd);
+		assert(res.empty());
+		assert(al.done());
+		res.reset();
+		cerr << "PASSED" << endl;
+		
+		cerr << "  Test " << tests++ << " (long nuc space, offset "
+		     << (i*8) << ", exact)...";
+		sc.rdGapConst = 40;
+		sc.rfGapConst = 40;
+		doTestCase2(
+			al,
+			"ACGTACGTACGTACGTACGTA", // read
+			"IIIIIIIIIIIIIIIIIIIII",  // qual
+			"ACGTACGTACGTACGTACGTACGTACGTACGTACGTA", // ref in
+		//   ACGTACGTACGTACGTACGT
+		//           ACGTACGTACGTACGTACGT
+		//                   ACGTACGTACGTACGTACGT
+			i*8,                // off
+			sc,                 // scoring scheme
+			0.0f,               // const coeff for cost ceiling
+			8.0f,               // linear coeff for cost ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		assert(!al.done());
+		al.nextAlignment(res, rnd);
+		assert(!res.empty());
+		assert_eq(i*8, res.alres.refoff());
+		assert_eq(21, res.alres.refExtent());
+		assert_eq(res.alres.score().gaps(), 0);
+		assert_eq(res.alres.score().score(), 210);
+		assert_eq(res.alres.score().ns(), 0);
+		assert(res.alres.ned().empty());
+		assert(res.alres.aed().empty());
+		res.reset();
+		al.nextAlignment(res, rnd);
+		//assert(res.empty());
+		//assert(al.done());
+		res.reset();
+		cerr << "PASSED" << endl;
+
+		cerr << "  Test " << tests++ << " (nuc space, offset " << (i*4)
+		     << ", 1mm allowed by minsc)...";
+		doTestCase2(
+			al,
+			"ACGTTCGT",         // read
+			"IIIIIIII",         // qual
+			"ACGTACGTACGTACGT", // ref in
+			i*4,                // off
+			sc,                 // scoring scheme
+			0.0f,               // const coeff for cost ceiling
+			5.0f,               // linear coeff for cost ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		assert(!al.done());
+		al.nextAlignment(res, rnd);
+		assert(!res.empty());
+		assert_eq(i*4, res.alres.refoff());
+		assert_eq(8, res.alres.refExtent());
+		assert_eq(res.alres.score().gaps(), 0);
+		assert_eq(res.alres.score().score(), 40);
+		assert_eq(res.alres.score().ns(), 0);
+		res.reset();
+		al.nextAlignment(res, rnd);
+		//assert(res.empty());
+		//assert(al.done());
+		cerr << "PASSED" << endl;
+
+		cerr << "  Test " << tests++ << " (long nuc space, offset "
+		     << (i*8) << ", 6mm allowed by minsc)...";
+		sc.rdGapConst = 50;
+		sc.rfGapConst = 50;
+		sc.rdGapLinear = 45;
+		sc.rfGapLinear = 45;
+		doTestCase2(
+			al,
+			"ACGTACGATGCATCGTACGTA", // read
+			"IIIIIIIIIIIIIIIIIIIII",  // qual
+			"ACGTACGTACGTACGTACGTACGTACGTACGTACGTA", // ref in
+		//   ACGTACGTACGTACGTACGT
+		//           ACGTACGTACGTACGTACGT
+		//                   ACGTACGTACGTACGTACGT
+			i*8,                // off
+			sc,                 // scoring scheme
+			0.0f,               // const coeff for cost ceiling
+			1.0f,               // linear coeff for cost ceiling
+			res,                // result
+			nIncl,              // Ns inclusive (not mismatches)
+			nfilter);           // filter Ns
+		assert(!al.done());
+		al.nextAlignment(res, rnd);
+		assert(!res.empty());
+		assert_eq(i*8 + 13, res.alres.refoff());
+		assert_eq(8, res.alres.refExtent());
+		assert_eq(res.alres.score().gaps(), 0);
+		assert_eq(res.alres.score().score(), 80);
+		assert_eq(res.alres.score().ns(), 0);
+		assert(res.alres.ned().empty());
+		assert(res.alres.aed().empty());
+		res.reset();
+		al.nextAlignment(res, rnd);
+		res.reset();
+		cerr << "PASSED" << endl;
+	}
+}
+
+int main(int argc, char **argv) {
+	int option_index = 0;
+	int next_option;
+	unsigned seed = 0;
+	gGapBarrier = 1;
+	gSnpPhred = 30;
+	bool nsInclusive = false;
+	bool nfilter = false;
+	bonusMatchType  = DEFAULT_MATCH_BONUS_TYPE;
+	bonusMatch      = DEFAULT_MATCH_BONUS;
+	penMmcType      = DEFAULT_MM_PENALTY_TYPE;
+	penMmc          = DEFAULT_MM_PENALTY;
+	penSnp          = DEFAULT_SNP_PENALTY;
+	penNType        = DEFAULT_N_PENALTY_TYPE;
+	penN            = DEFAULT_N_PENALTY;
+	penRdExConst    = DEFAULT_READ_GAP_CONST;
+	penRfExConst    = DEFAULT_REF_GAP_CONST;
+	penRdExLinear   = DEFAULT_READ_GAP_LINEAR;
+	penRfExLinear   = DEFAULT_REF_GAP_LINEAR;
+	costMinConst    = DEFAULT_MIN_CONST;
+	costMinLinear   = DEFAULT_MIN_LINEAR;
+	costFloorConst  = DEFAULT_FLOOR_CONST;
+	costFloorLinear = DEFAULT_FLOOR_LINEAR;
+	nCeilConst      = 1.0f; // constant factor in N ceiling w/r/t read length
+	nCeilLinear     = 1.0f; // coeff of linear term in N ceiling w/r/t read length
+	nCatPair        = false;
+	multiseedMms    = DEFAULT_SEEDMMS;
+	multiseedLen    = DEFAULT_SEEDLEN;
+	multiseedIvalType = DEFAULT_IVAL;
+	multiseedIvalA    = DEFAULT_IVAL_A;
+	multiseedIvalB    = DEFAULT_IVAL_B;
+	mhits           = 1;
+	do {
+		next_option = getopt_long(argc, argv, short_opts, long_opts, &option_index);
+		switch (next_option) {
+			case 's': gSnpPhred  = parse<int>(optarg); break;
+			case 'r': seed       = parse<unsigned>(optarg); break;
+			case ARG_TESTS: {
+				doTests();
+				cout << "PASSED end-to-ends" << endl;
+				doLocalTests();
+				cout << "PASSED locals" << endl;
+				return 0;
+			}
+			case 'A': {
+				bool localAlign = false;
+				bool noisyHpolymer = false;
+				bool ignoreQuals = false;
+				SeedAlignmentPolicy::parseString(
+					optarg,
+					localAlign,
+					noisyHpolymer,
+					ignoreQuals,
+					bonusMatchType,
+					bonusMatch,
+					penMmcType,
+					penMmc,
+					penNType,
+					penN,
+					penRdExConst,
+					penRfExConst,
+					penRdExLinear,
+					penRfExLinear,
+					costMinConst,
+					costMinLinear,
+					costFloorConst,
+					costFloorLinear,
+					nCeilConst,
+					nCeilLinear,
+					nCatPair,
+					multiseedMms,
+					multiseedLen,
+					multiseedIvalType,
+					multiseedIvalA,
+					multiseedIvalB,
+					posmin);
+				break;
+			}
+			case -1: break;
+			default: {
+				cerr << "Unknown option: " << (char)next_option << endl;
+				printUsage(cerr);
+				exit(1);
+			}
+		}
+	} while(next_option != -1);
+	srand(seed);
+	if(argc - optind < 4) {
+		cerr << "Need at least 4 arguments" << endl;
+		printUsage(cerr);
+		exit(1);
+	}
+	BTDnaString read;
+	BTString ref, qual;
+	// Get read
+	read.installChars(argv[optind]);
+	// Get qualities
+	qual.install(argv[optind+1]);
+	assert_eq(read.length(), qual.length());
+	// Get reference
+	ref.install(argv[optind+2]);
+	// Get reference offset
+	size_t off = parse<size_t>(argv[optind+3]);
+	// Set up penalties
+	Scoring sc(
+		false,         // local alignment?
+		false,         // bad homopolymer?
+		bonusMatchType,
+		bonusMatch,
+		penMmcType,    // how to penalize mismatches
+		penMmc,        // constant if mm penalty is a constant
+		costMinConst,
+		costMinLinear,
+		costFloorConst,
+		costFloorLinear,
+		nCeilConst,    // N ceiling constant coefficient
+		nCeilLinear,   // N ceiling linear coefficient
+		penNType,      // how to penalize Ns in the read
+		penN,          // constant if N penalty is a constant
+		nCatPair,      // true -> concatenate mates before N filtering
+		penRdExConst,  // constant cost of extending a gap in the read
+		penRfExConst,  // constant cost of extending a gap in the reference
+		penRdExLinear, // coeff of linear term for cost of gap extension in read
+		penRfExLinear  // coeff of linear term for cost of gap extension in ref
+	);
+	// Calculate the penalty ceiling for the read
+	TAlScore minsc = Scoring::linearFunc(
+		read.length(),
+		costMinConst,
+		costMinLinear);
+	TAlScore floorsc = Scoring::linearFunc(
+		read.length(),
+		costFloorConst,
+		costFloorLinear);
+	SwResult res;
+	SwAligner al;
+	doTestCase(
+		al,
+		read,
+		qual,
+		ref,
+		off,
+		NULL,
+		sc,  
+		minsc,
+		res,
+		nsInclusive,
+		nfilter,
+		seed);
+}
+#endif /*MAIN_ALIGNER_SW*/
diff --git a/aligner_sw.h b/aligner_sw.h
new file mode 100644
index 0000000..699b098
--- /dev/null
+++ b/aligner_sw.h
@@ -0,0 +1,648 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * aligner_sw.h
+ *
+ * Classes and routines for solving dynamic programming problems in aid of read
+ * alignment.  Goals include the ability to handle:
+ *
+ * - Both read alignment, where the query must align end-to-end, and local
+ *   alignment, where we seek a high-scoring alignment that need not involve
+ *   the entire query.
+ * - Situations where: (a) we've found a seed hit and are trying to extend it
+ *   into a larger hit, (b) we've found an alignment for one mate of a pair and
+ *   are trying to find a nearby alignment for the other mate, (c) we're
+ *   aligning against an entire reference sequence.
+ * - Caller-specified indicators for what columns of the dynamic programming
+ *   matrix we are allowed to start in or end in.
+ *
+ * TODO:
+ *
+ * - A slicker way to filter out alignments that violate a ceiling placed on
+ *   the number of Ns permitted in the reference portion of the alignment.
+ *   Right now we accomplish this by masking out ending columns that correspond
+ *   to *ungapped* alignments with too many Ns.  This results in false
+ *   positives and false negatives for gapped alignments.  The margin of error
+ *   (# of Ns by which we might miscount) is bounded by the number of gaps.
+ */
+
+/**
+ *  |-maxgaps-|
+ *  ***********oooooooooooooooooooooo    -
+ *   ***********ooooooooooooooooooooo    |
+ *    ***********oooooooooooooooooooo    |
+ *     ***********ooooooooooooooooooo    |
+ *      ***********oooooooooooooooooo    |
+ *       ***********ooooooooooooooooo read len
+ *        ***********oooooooooooooooo    |
+ *         ***********ooooooooooooooo    |
+ *          ***********oooooooooooooo    |
+ *           ***********ooooooooooooo    |
+ *            ***********oooooooooooo    -
+ *            |-maxgaps-|
+ *  |-readlen-|
+ *  |-------skip--------|
+ */
+
+#ifndef ALIGNER_SW_H_
+#define ALIGNER_SW_H_
+
+#define INLINE_CUPS
+
+#include <stdint.h>
+#include <iostream>
+#include <limits>
+#include "threading.h"
+#include <emmintrin.h>
+#include "aligner_sw_common.h"
+#include "aligner_sw_nuc.h"
+#include "ds.h"
+#include "aligner_seed.h"
+#include "reference.h"
+#include "random_source.h"
+#include "mem_ids.h"
+#include "aligner_result.h"
+#include "mask.h"
+#include "dp_framer.h"
+#include "aligner_swsse.h"
+#include "aligner_bt.h"
+
+#define QUAL2(d, f) sc_->mm((int)(*rd_)[rdi_ + d], \
+							(int)  rf_ [rfi_ + f], \
+							(int)(*qu_)[rdi_ + d] - 33)
+#define QUAL(d)     sc_->mm((int)(*rd_)[rdi_ + d], \
+							(int)(*qu_)[rdi_ + d] - 33)
+#define N_SNP_PEN(c) (((int)rf_[rfi_ + c] > 15) ? sc_->n(30) : sc_->penSnp)
+
+/**
+ * SwAligner
+ * =========
+ *
+ * Encapsulates facilities for alignment using dynamic programming.  Handles
+ * alignment of nucleotide reads against known reference nucleotides.
+ *
+ * The class is stateful.  First the user must call init() to initialize the
+ * object with details regarding the dynamic programming problem to be solved.
+ * Next, the user calls align() to fill the dynamic programming matrix and
+ * calculate summaries describing the solutions.  Finally the user calls 
+ * nextAlignment(...), perhaps repeatedly, to populate the SwResult object with
+ * the next result.  Results are dispensed in best-to-worst, left-to-right
+ * order.
+ *
+ * The class expects the read string, quality string, and reference string
+ * provided by the caller live at least until the user is finished aligning and
+ * obtaining alignments from this object.
+ *
+ * There is a design tradeoff between hiding/exposing details of the genome and
+ * its strands to the SwAligner.  In a sense, a better design is to hide
+ * details such as the id of the reference sequence aligned to, or whether
+ * we're aligning the read in its original forward orientation or its reverse
+ * complement.  But this means that any alignment results returned by SwAligner
+ * have to be extended to include those details before they're useful to the
+ * caller.  We opt for messy but expedient - the reference id and orientation
+ * of the read are given to SwAligner, remembered, and used to populate
+ * SwResults.
+ *
+ * LOCAL VS GLOBAL
+ *
+ * The dynamic programming aligner supports both local and global alignment,
+ * and one option in between.  To implement global alignment, the aligner (a)
+ * allows negative scores (i.e. doesn't necessarily clamp them up to 0), (b)
+ * checks in rows other than the last row for acceptable solutions, and (c)
+ * optionally adds a bonus to the score for matches.
+ * 
+ * For global alignment, we:
+ *
+ * (a) Allow negative scores
+ * (b) Check only in the last row
+ * (c) Either add a bonus for matches or not (doesn't matter)
+ *
+ * For local alignment, we:
+ *
+ * (a) Clamp scores to 0
+ * (b) Check in any row for a sufficiently high score
+ * (c) Add a bonus for matches
+ *
+ * An in-between solution is to allow alignments to be curtailed on the
+ * right-hand side if a better score can be achieved thereby, but not on the
+ * left.  For this, we:
+ *
+ * (a) Allow negative scores
+ * (b) Check in any row for a sufficiently high score
+ * (c) Either add a bonus for matches or not (doesn't matter)
+ *
+ * REDUNDANT ALIGNMENTS
+ *
+ * When are two alignments distinct and when are they redundant (not distinct)?
+ * At one extreme, we might say the best alignment from any given dynamic
+ * programming problem is redundant with all other alignments from that
+ * problem.  At the other extreme, we might say that any two alignments with
+ * distinct starting points and edits are distinct.  The former is probably too
+ * conservative for mate-finding DP problems.  The latter is certainly too
+ * permissive, since two alignments that differ only in how gaps are arranged
+ * should not be considered distinct.
+ *
+ * Some in-between solutions are:
+ *
+ * (a) If two alignments share an end point on either end, they are redundant.
+ *     Otherwise, they are distinct.
+ * (b) If two alignments share *both* end points, they are redundant.
+ * (c) If two alignments share any cells in the DP table, they are redundant.
+ * (d) 2 alignments are redundant if either end within N poss of each other
+ * (e) Like (d) but both instead of either
+ * (f, g) Like d, e, but where N is tied to maxgaps somehow
+ *
+ * Why not (a)?  One reason is that it's possible for two alignments to have
+ * different start & end positions but share many cells.  Consider alignments 1
+ * and 2 below; their end-points are labeled.
+ *
+ *  1 2
+ *  \ \
+ *    -\
+ *      \
+ *       \
+ *        \
+ *        -\
+ *        \ \
+ *        1 2
+ *
+ * 1 and 2 are distinct according to (a) but they share many cells in common.
+ *
+ * Why not (f, g)?  It fixes the problem with (a) above by forcing the
+ * alignments to be spread so far that they can't possibly share diagonal cells
+ * in common
+ */
+class SwAligner {
+
+	typedef std::pair<size_t, size_t> SizeTPair;
+
+	// States that the aligner can be in
+	enum {
+		STATE_UNINIT,  // init() hasn't been called yet
+		STATE_INITED,  // init() has been called, but not align()
+		STATE_ALIGNED, // align() has been called
+	};
+	
+	const static size_t ALPHA_SIZE = 5;
+
+public:
+
+	explicit SwAligner() :
+		sseU8fw_(DP_CAT),
+		sseU8rc_(DP_CAT),
+		sseI16fw_(DP_CAT),
+		sseI16rc_(DP_CAT),
+		state_(STATE_UNINIT),
+		initedRead_(false),
+		readSse16_(false),
+		initedRef_(false),
+		rfwbuf_(DP_CAT),
+		btnstack_(DP_CAT),
+		btcells_(DP_CAT),
+		btdiag_(),
+		btncand_(DP_CAT),
+		btncanddone_(DP_CAT),
+		btncanddoneSucc_(0),
+		btncanddoneFail_(0),
+		cper_(),
+		cperMinlen_(),
+		cperPerPow2_(),
+		cperEf_(),
+		cperTri_(),
+		colstop_(0),
+		lastsolcol_(0),
+		cural_(0)
+		ASSERT_ONLY(, cand_tmp_(DP_CAT))
+	{ }
+
+	/**
+	 * Prepare the dynamic programming driver with a new read and a new scoring
+	 * scheme.
+	 */
+	void initRead(
+		const BTDnaString& rdfw, // read sequence for fw read
+		const BTDnaString& rdrc, // read sequence for rc read
+		const BTString& qufw,    // read qualities for fw read
+		const BTString& qurc,    // read qualities for rc read
+		size_t rdi,              // offset of first read char to align
+		size_t rdf,              // offset of last read char to align
+		const Scoring& sc);      // scoring scheme
+	
+	/**
+	 * Initialize with a new alignment problem.
+	 */
+	void initRef(
+		bool fw,               // whether to forward or revcomp read is aligning
+		TRefId refidx,         // id of reference aligned against
+		const DPRect& rect,    // DP rectangle
+		char *rf,              // reference sequence
+		size_t rfi,            // offset of first reference char to align to
+		size_t rff,            // offset of last reference char to align to
+		TRefOff reflen,        // length of reference sequence
+		const Scoring& sc,     // scoring scheme
+		TAlScore minsc,        // minimum score
+		bool enable8,          // use 8-bit SSE if possible?
+		size_t cminlen,        // minimum length for using checkpointing scheme
+		size_t cpow2,          // interval b/t checkpointed diags; 1 << this
+		bool doTri,            // triangular mini-fills?
+		bool extend);          // true iff this is a seed extension
+
+	/**
+	 * Given a read, an alignment orientation, a range of characters in a
+	 * reference sequence, and a bit-encoded version of the reference,
+	 * execute the corresponding dynamic programming problem.
+	 *
+	 * Here we expect that the caller has already narrowed down the relevant
+	 * portion of the reference (e.g. using a seed hit) and all we do is
+	 * banded dynamic programming in the vicinity of that portion.  This is not
+	 * the function to call if we are trying to solve the whole alignment
+	 * problem with dynamic programming (that is TODO).
+	 *
+	 * Returns true if an alignment was found, false otherwise.
+	 */
+	void initRef(
+		bool fw,               // whether to forward or revcomp read aligned
+		TRefId refidx,         // reference aligned against
+		const DPRect& rect,    // DP rectangle
+		const BitPairReference& refs, // Reference strings
+		TRefOff reflen,        // length of reference sequence
+		const Scoring& sc,     // scoring scheme
+		TAlScore minsc,        // minimum alignment score
+		bool enable8,          // use 8-bit SSE if possible?
+		size_t cminlen,        // minimum length for using checkpointing scheme
+		size_t cpow2,          // interval b/t checkpointed diags; 1 << this
+		bool doTri,            // triangular mini-fills?
+		bool extend,           // true iff this is a seed extension
+		size_t  upto,          // count the number of Ns up to this offset
+		size_t& nsUpto);       // output: the number of Ns up to 'upto'
+
+	/**
+	 * Given a read, an alignment orientation, a range of characters in a
+	 * reference sequence, and a bit-encoded version of the reference, set up
+	 * and execute the corresponding ungapped alignment problem.  There can
+	 * only be one solution.
+	 *
+	 * The caller has already narrowed down the relevant portion of the
+	 * reference using, e.g., the location of a seed hit, or the range of
+	 * possible fragment lengths if we're searching for the opposite mate in a
+	 * pair.
+	 */
+	int ungappedAlign(
+		const BTDnaString&      rd,     // read sequence (could be RC)
+		const BTString&         qu,     // qual sequence (could be rev)
+		const Coord&            coord,  // coordinate aligned to
+		const BitPairReference& refs,   // Reference strings
+		size_t                  reflen, // length of reference sequence
+		const Scoring&          sc,     // scoring scheme
+		bool                    ohang,  // allow overhang?
+		TAlScore                minsc,  // minimum score
+		SwResult&               res);   // put alignment result here
+
+	/**
+	 * Align read 'rd' to reference using read & reference information given
+	 * last time init() was called.  Uses dynamic programming.
+	 */
+	bool align(RandomSource& rnd, TAlScore& best);
+	
+	/**
+	 * Populate the given SwResult with information about the "next best"
+	 * alignment if there is one.  If there isn't one, false is returned.  Note
+	 * that false might be returned even though a call to done() would have
+	 * returned false.
+	 */
+	bool nextAlignment(
+		SwResult& res,
+		TAlScore minsc,
+		RandomSource& rnd);
+	
+	/**
+	 * Print out an alignment result as an ASCII DP table.
+	 */
+	void printResultStacked(
+		const SwResult& res,
+		std::ostream& os)
+	{
+		// res.alres.printStacked(*rd_, os);
+	}
+	
+	/**
+	 * Return true iff there are no more solution cells to backtace from.
+	 * Note that this may return false in situations where there are actually
+	 * no more solutions, but that hasn't been discovered yet.
+	 */
+	bool done() const {
+		assert(initedRead() && initedRef());
+		return cural_ == btncand_.size();
+	}
+
+	/**
+	 * Return true iff this SwAligner has been initialized with a read to align.
+	 */
+	inline bool initedRef() const { return initedRef_; }
+
+	/**
+	 * Return true iff this SwAligner has been initialized with a reference to
+	 * align against.
+	 */
+	inline bool initedRead() const { return initedRead_; }
+	
+	/**
+	 * Reset, signaling that we're done with this dynamic programming problem
+	 * and won't be asking for any more alignments.
+	 */
+	inline void reset() { initedRef_ = initedRead_ = false; }
+
+#ifndef NDEBUG
+	/**
+	 * Check that aligner is internally consistent.
+	 */
+	bool repOk() const {
+		assert_gt(dpRows(), 0);
+		// Check btncand_
+		for(size_t i = 0; i < btncand_.size(); i++) {
+			assert(btncand_[i].repOk());
+			assert_geq(btncand_[i].score, minsc_);
+		}
+		return true;
+	}
+#endif
+	
+	/**
+	 * Return the number of alignments given out so far by nextAlignment().
+	 */
+	size_t numAlignmentsReported() const { return cural_; }
+
+	/**
+	 * Merge tallies in the counters related to filling the DP table.
+	 */
+	void merge(
+		SSEMetrics& sseU8ExtendMet,
+		SSEMetrics& sseU8MateMet,
+		SSEMetrics& sseI16ExtendMet,
+		SSEMetrics& sseI16MateMet,
+		uint64_t&   nbtfiltst,
+		uint64_t&   nbtfiltsc,
+		uint64_t&   nbtfiltdo)
+	{
+		sseU8ExtendMet.merge(sseU8ExtendMet_);
+		sseU8MateMet.merge(sseU8MateMet_);
+		sseI16ExtendMet.merge(sseI16ExtendMet_);
+		sseI16MateMet.merge(sseI16MateMet_);
+		nbtfiltst += nbtfiltst_;
+		nbtfiltsc += nbtfiltsc_;
+		nbtfiltdo += nbtfiltdo_;
+	}
+	
+	/**
+	 * Reset all the counters related to filling in the DP table to 0.
+	 */
+	void resetCounters() {
+		sseU8ExtendMet_.reset();
+		sseU8MateMet_.reset();
+		sseI16ExtendMet_.reset();
+		sseI16MateMet_.reset();
+		nbtfiltst_ = nbtfiltsc_ = nbtfiltdo_ = 0;
+	}
+	
+	/**
+	 * Return the size of the DP problem.
+	 */
+	size_t size() const {
+		return dpRows() * (rff_ - rfi_);
+	}
+
+protected:
+	
+	/**
+	 * Return the number of rows that will be in the dynamic programming table.
+	 */
+	inline size_t dpRows() const {
+		assert(initedRead_);
+		return rdf_ - rdi_;
+	}
+
+	/**
+	 * Align nucleotides from read 'rd' to the reference string 'rf' using
+	 * vector instructions.  Return the score of the best alignment found, or
+	 * the minimum integer if an alignment could not be found.  Flag is set to
+	 * 0 if an alignment is found, -1 if no valid alignment is found, or -2 if
+	 * the score saturated at any point during alignment.
+	 */
+	TAlScore alignNucleotidesEnd2EndSseU8(  // unsigned 8-bit elements
+		int& flag, bool debug);
+	TAlScore alignNucleotidesLocalSseU8(    // unsigned 8-bit elements
+		int& flag, bool debug);
+	TAlScore alignNucleotidesEnd2EndSseI16( // signed 16-bit elements
+		int& flag, bool debug);
+	TAlScore alignNucleotidesLocalSseI16(   // signed 16-bit elements
+		int& flag, bool debug);
+	
+	/**
+	 * Aligns by filling a dynamic programming matrix with the SSE-accelerated,
+	 * banded DP approach of Farrar.  As it goes, it determines which cells we
+	 * might backtrace from and tallies the best (highest-scoring) N backtrace
+	 * candidate cells per diagonal.  Also returns the alignment score of the best
+	 * alignment in the matrix.
+	 *
+	 * This routine does *not* maintain a matrix holding the entire matrix worth of
+	 * scores, nor does it maintain any other dense O(mn) data structure, as this
+	 * would quickly exhaust memory for queries longer than about 10,000 kb.
+	 * Instead, in the fill stage it maintains two columns worth of scores at a
+	 * time (current/previous, or right/left) - these take O(m) space.  When
+	 * finished with the current column, it determines which cells from the
+	 * previous column, if any, are candidates we might backtrace from to find a
+	 * full alignment.  A candidate cell has a score that rises above the threshold
+	 * and isn't improved upon by a match in the next column.  The best N
+	 * candidates per diagonal are stored in a O(m + n) data structure.
+	 */
+	TAlScore alignGatherEE8(                // unsigned 8-bit elements
+		int& flag, bool debug);
+	TAlScore alignGatherLoc8(               // unsigned 8-bit elements
+		int& flag, bool debug);
+	TAlScore alignGatherEE16(               // signed 16-bit elements
+		int& flag, bool debug);
+	TAlScore alignGatherLoc16(              // signed 16-bit elements
+		int& flag, bool debug);
+	
+	/**
+	 * Build query profile look up tables for the read.  The query profile look
+	 * up table is organized as a 1D array indexed by [i][j] where i is the
+	 * reference character in the current DP column (0=A, 1=C, etc), and j is
+	 * the segment of the query we're currently working on.
+	 */
+	void buildQueryProfileEnd2EndSseU8(bool fw);
+	void buildQueryProfileLocalSseU8(bool fw);
+
+	/**
+	 * Build query profile look up tables for the read.  The query profile look
+	 * up table is organized as a 1D array indexed by [i][j] where i is the
+	 * reference character in the current DP column (0=A, 1=C, etc), and j is
+	 * the segment of the query we're currently working on.
+	 */
+	void buildQueryProfileEnd2EndSseI16(bool fw);
+	void buildQueryProfileLocalSseI16(bool fw);
+	
+	bool gatherCellsNucleotidesLocalSseU8(TAlScore best);
+	bool gatherCellsNucleotidesEnd2EndSseU8(TAlScore best);
+
+	bool gatherCellsNucleotidesLocalSseI16(TAlScore best);
+	bool gatherCellsNucleotidesEnd2EndSseI16(TAlScore best);
+
+	bool backtraceNucleotidesLocalSseU8(
+		TAlScore       escore, // in: expected score
+		SwResult&      res,    // out: store results (edits and scores) here
+		size_t&        off,    // out: store diagonal projection of origin
+		size_t&        nbts,   // out: # backtracks
+		size_t         row,    // start in this rectangle row
+		size_t         col,    // start in this rectangle column
+		RandomSource&  rand);  // random gen, to choose among equal paths
+
+	bool backtraceNucleotidesLocalSseI16(
+		TAlScore       escore, // in: expected score
+		SwResult&      res,    // out: store results (edits and scores) here
+		size_t&        off,    // out: store diagonal projection of origin
+		size_t&        nbts,   // out: # backtracks
+		size_t         row,    // start in this rectangle row
+		size_t         col,    // start in this rectangle column
+		RandomSource&  rand);  // random gen, to choose among equal paths
+
+	bool backtraceNucleotidesEnd2EndSseU8(
+		TAlScore       escore, // in: expected score
+		SwResult&      res,    // out: store results (edits and scores) here
+		size_t&        off,    // out: store diagonal projection of origin
+		size_t&        nbts,   // out: # backtracks
+		size_t         row,    // start in this rectangle row
+		size_t         col,    // start in this rectangle column
+		RandomSource&  rand);  // random gen, to choose among equal paths
+
+	bool backtraceNucleotidesEnd2EndSseI16(
+		TAlScore       escore, // in: expected score
+		SwResult&      res,    // out: store results (edits and scores) here
+		size_t&        off,    // out: store diagonal projection of origin
+		size_t&        nbts,   // out: # backtracks
+		size_t         row,    // start in this rectangle row
+		size_t         col,    // start in this rectangle column
+		RandomSource&  rand);  // random gen, to choose among equal paths
+
+	/**
+	 * Given the coordinates (row, col) of a backtrace candidate and the
+	 * score we expect the resulting alignment to have, initialize the
+	 * branch tracer (bter_) and ask it for the next alignment.  Returns
+	 * false if the tracer reports an empty solution set; otherwise
+	 * forwards the result of BtBranchTracer::nextAlignment.
+	 */
+	bool backtrace(
+		TAlScore       escore, // in: expected score
+		bool           fill,   // in: use mini-fill?
+		bool           usecp,  // in: use checkpoints?
+		SwResult&      res,    // out: store results (edits and scores) here
+		size_t&        off,    // out: store diagonal projection of origin
+		size_t         row,    // start in this rectangle row
+		size_t         col,    // start in this rectangle column
+		size_t         maxiter,// max # extensions to try
+		size_t&        niter,  // # extensions tried
+		RandomSource&  rnd)    // random gen, to choose among equal paths
+	{
+		bter_.initBt(
+			escore,              // in: alignment score
+			row,                 // in: start in this row
+			col,                 // in: start in this column
+			fill,                // in: use mini-fill?
+			usecp,               // in: use checkpoints?
+			cperTri_,            // in: triangle-shaped mini-fills?
+			rnd);                // in: random gen, to choose among equal paths
+		assert(bter_.inited());
+		size_t nrej = 0;       // # rejections; local only, not reported back
+		if(bter_.emptySolution()) {
+			return false;
+		} else {
+			return bter_.nextAlignment(maxiter, res, off, nrej, niter, rnd);
+		}
+	}
+
+	const BTDnaString  *rd_;     // read sequence
+	const BTString     *qu_;     // read qualities
+	const BTDnaString  *rdfw_;   // read sequence for fw read
+	const BTDnaString  *rdrc_;   // read sequence for rc read
+	const BTString     *qufw_;   // read qualities for fw read
+	const BTString     *qurc_;   // read qualities for rc read
+	TReadOff            rdi_;    // offset of first read char to align
+	TReadOff            rdf_;    // offset of last read char to align
+	bool                fw_;     // true iff read sequence is original fw read
+	TRefId              refidx_; // id of reference aligned against
+	TRefOff             reflen_; // length of entire reference sequence
+	const DPRect*       rect_;   // DP rectangle
+	char               *rf_;     // reference sequence
+	TRefOff             rfi_;    // offset of first ref char to align to
+	TRefOff             rff_;    // offset of last ref char to align to (excl)
+	size_t              rdgap_;  // max # gaps in read
+	size_t              rfgap_;  // max # gaps in reference
+	bool                enable8_;// enable 8-bit sse
+	bool                extend_; // true iff this is a seed-extend problem
+	const Scoring      *sc_;     // penalties for edit types
+	TAlScore            minsc_;  // penalty ceiling for valid alignments
+	int                 nceil_;  // max # Ns allowed in ref portion of aln
+
+	bool                sse8succ_;  // whether 8-bit worked
+	bool                sse16succ_; // whether 16-bit worked
+	SSEData             sseU8fw_;   // buf for fw query, 8-bit score
+	SSEData             sseU8rc_;   // buf for rc query, 8-bit score
+	SSEData             sseI16fw_;  // buf for fw query, 16-bit score
+	SSEData             sseI16rc_;  // buf for rc query, 16-bit score
+	bool                sseU8fwBuilt_;   // built fw query profile, 8-bit score
+	bool                sseU8rcBuilt_;   // built rc query profile, 8-bit score
+	bool                sseI16fwBuilt_;  // built fw query profile, 16-bit score
+	bool                sseI16rcBuilt_;  // built rc query profile, 16-bit score
+
+	SSEMetrics			sseU8ExtendMet_;
+	SSEMetrics			sseU8MateMet_;
+	SSEMetrics			sseI16ExtendMet_;
+	SSEMetrics			sseI16MateMet_;
+
+	int                 state_;        // state
+	bool                initedRead_;   // true iff initialized with initRead
+	bool                readSse16_;    // true -> sse16 from now on for read
+	bool                initedRef_;    // true iff initialized with initRef
+	EList<uint32_t>     rfwbuf_;       // buffer for wordized ref stretches
+	
+	EList<DpNucFrame>    btnstack_;    // backtrace stack for nucleotides
+	EList<SizeTPair>     btcells_;     // cells involved in current backtrace
+
+	NBest<DpBtCandidate> btdiag_;      // per-diagonal backtrace candidates
+	EList<DpBtCandidate> btncand_;     // cells we might backtrace from
+	EList<DpBtCandidate> btncanddone_; // candidates that we investigated
+	size_t              btncanddoneSucc_; // # investigated and succeeded
+	size_t              btncanddoneFail_; // # investigated and failed
+	
+	BtBranchTracer       bter_;        // backtracer
+	
+	Checkpointer         cper_;        // structure for saving checkpoint cells
+	size_t               cperMinlen_;  // minimum length for using checkpointer
+	size_t               cperPerPow2_; // checkpoint every 1 << perpow2 diags (& next)
+	bool                 cperEf_;      // store E and F in addition to H?
+	bool                 cperTri_;     // checkpoint for triangular mini-fills?
+	
+	size_t              colstop_;      // bailed on DP loop after this many cols
+	size_t              lastsolcol_;   // last DP col with valid cell
+	size_t              cural_;        // index of next alignment to be given
+	
+	uint64_t nbtfiltst_; // # candidates filtered b/c starting cell was seen
+	uint64_t nbtfiltsc_; // # candidates filtered b/c score uninteresting
+	uint64_t nbtfiltdo_; // # candidates filtered b/c dominated by other cell
+	
+	ASSERT_ONLY(SStringExpandable<uint32_t> tmp_destU32_);
+	ASSERT_ONLY(BTDnaString tmp_editstr_, tmp_refstr_);
+	ASSERT_ONLY(EList<DpBtCandidate> cand_tmp_);
+};
+
+#endif /*ALIGNER_SW_H_*/
diff --git a/aligner_sw_common.h b/aligner_sw_common.h
new file mode 100644
index 0000000..48d0169
--- /dev/null
+++ b/aligner_sw_common.h
@@ -0,0 +1,304 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef ALIGNER_SW_COMMON_H_
+#define ALIGNER_SW_COMMON_H_
+
+#include "aligner_result.h"
+
+/**
+ * Encapsulates the result of a dynamic programming alignment, including
+ * colorspace alignments.  In our case, the result is a combination of:
+ *
+ * 1. All the nucleotide edits
+ * 2. All the "edits" where an ambiguous reference char is resolved to
+ *    an unambiguous char.
+ * 3. All the color edits (if applicable)
+ * 4. All the color miscalls (if applicable).  This is a subset of 3.
+ * 5. The score of the best alignment
+ * 6. The score of the second-best alignment
+ *
+ * Having scores for the best and second-best alignments gives us an
+ * idea of where gaps may make reassembly beneficial.
+ */
+struct SwResult {
+
+	// Zero all counters.  nup/ndn were previously left uninitialized by
+	// both the constructor and reset(); initialize them so the object is
+	// fully defined on construction.
+	SwResult() :
+		alres(),
+		sws(0),
+		swcups(0),
+		swrows(0),
+		swskiprows(0),
+		swskip(0),
+		swsucc(0),
+		swfail(0),
+		swbts(0),
+		nup(0),
+		ndn(0)
+	{ }
+
+	/**
+	 * Clear all contents, including the embedded AlnRes.
+	 */
+	void reset() {
+		sws = swcups = swrows = swskiprows = swskip = swsucc =
+		swfail = swbts = 0;
+		nup = ndn = 0; // keep colorspace fields in a defined state too
+		alres.reset();
+	}
+	
+	/**
+	 * Reverse all edit lists.  (Currently a no-op.)
+	 */
+	void reverse() {
+	}
+	
+	/**
+	 * Return true iff no result has been installed.  NOTE(review): this is
+	 * hard-coded to false; callers apparently probe alres directly.
+	 */
+	bool empty() const {
+        return false;
+	}
+	
+#ifndef NDEBUG
+	/**
+	 * Check that result is internally consistent.
+	 */
+	bool repOk() const {
+		//assert(alres.repOk());
+		return true;
+	}
+
+	/**
+	 * Check that result is internally consistent w/r/t read.
+	 */
+	bool repOk(const Read& rd) const {
+		//assert(alres.repOk(rd));
+		return true;
+	}
+#endif
+
+	AlnRes alres;
+	uint64_t sws;    // # DP problems solved
+	uint64_t swcups; // # DP cell updates
+	uint64_t swrows; // # DP row updates
+	uint64_t swskiprows; // # skipped DP row updates (b/c no valid alignments can go thru row)
+	uint64_t swskip; // # DP problems skipped by sse filter
+	uint64_t swsucc; // # DP problems resulting in alignment
+	uint64_t swfail; // # DP problems not resulting in alignment
+	uint64_t swbts;  // # DP backtrace steps
+	
+	int nup;         // upstream decoded nucleotide; for colorspace reads
+	int ndn;         // downstream decoded nucleotide; for colorspace reads
+};
+
+/**
+ * Encapsulates counters that measure how much work has been done by
+ * the dynamic programming driver and aligner.
+ */
+struct SwMetrics {
+
+	SwMetrics() : mutex_m() {
+	    reset();
+	}
+
+	// Zero every counter.
+	void reset() {
+		sws = swcups = swrows = swskiprows = swskip = swsucc = swfail = swbts =
+		sws10 = sws5 = sws3 =
+		rshit = ungapsucc = ungapfail = ungapnodec = 0;
+		exatts = exranges = exrows = exsucc = exooms = 0;
+		mm1atts = mm1ranges = mm1rows = mm1succ = mm1ooms = 0;
+		sdatts = sdranges = sdrows = sdsucc = sdooms = 0;
+	}
+	
+	// Overwrite every counter with the given values.  Not synchronized;
+	// use merge() when the object is shared between threads.
+	void init(
+		uint64_t sws_,
+		uint64_t sws10_,
+		uint64_t sws5_,
+		uint64_t sws3_,
+		uint64_t swcups_,
+		uint64_t swrows_,
+		uint64_t swskiprows_,
+		uint64_t swskip_,
+		uint64_t swsucc_,
+		uint64_t swfail_,
+		uint64_t swbts_,
+		uint64_t rshit_,
+		uint64_t ungapsucc_,
+		uint64_t ungapfail_,
+		uint64_t ungapnodec_,
+		uint64_t exatts_,
+		uint64_t exranges_,
+		uint64_t exrows_,
+		uint64_t exsucc_,
+		uint64_t exooms_,
+		uint64_t mm1atts_,
+		uint64_t mm1ranges_,
+		uint64_t mm1rows_,
+		uint64_t mm1succ_,
+		uint64_t mm1ooms_,
+		uint64_t sdatts_,
+		uint64_t sdranges_,
+		uint64_t sdrows_,
+		uint64_t sdsucc_,
+		uint64_t sdooms_)
+	{
+		sws        = sws_;
+		sws10      = sws10_;
+		sws5       = sws5_;
+		sws3       = sws3_;
+		swcups     = swcups_;
+		swrows     = swrows_;
+		swskiprows = swskiprows_;
+		swskip     = swskip_;
+		swsucc     = swsucc_;
+		swfail     = swfail_;
+		swbts      = swbts_;
+		rshit      = rshit_;    // fix: parameter was previously accepted but never assigned
+		ungapsucc  = ungapsucc_;
+		ungapfail  = ungapfail_;
+		ungapnodec = ungapnodec_;
+		
+		// Exact end-to-end attempts
+		exatts     = exatts_;
+		exranges   = exranges_;
+		exrows     = exrows_;
+		exsucc     = exsucc_;
+		exooms     = exooms_;
+
+		// 1-mismatch end-to-end attempts
+		mm1atts    = mm1atts_;
+		mm1ranges  = mm1ranges_;
+		mm1rows    = mm1rows_;
+		mm1succ    = mm1succ_;
+		mm1ooms    = mm1ooms_;
+		
+		// Seed attempts
+		sdatts     = sdatts_;
+		sdranges   = sdranges_;
+		sdrows     = sdrows_;
+		sdsucc     = sdsucc_;
+		sdooms     = sdooms_;
+	}
+	
+	/**
+	 * Merge (add) the counters in the given SwResult object into this
+	 * SwMetrics object.
+	 */
+	void update(const SwResult& r) {
+		sws        += r.sws;
+		swcups     += r.swcups;
+		swrows     += r.swrows;
+		swskiprows += r.swskiprows;
+		swskip     += r.swskip;
+		swsucc     += r.swsucc;
+		swfail     += r.swfail;
+		swbts      += r.swbts;
+	}
+	
+	/**
+	 * Merge (add) the counters in the given SwMetrics object into this
+	 * object.  This is the only safe way to update a SwMetrics shared
+	 * by multiple threads.
+	 */
+	void merge(const SwMetrics& r, bool getLock = false) {
+        ThreadSafe ts(&mutex_m, getLock);
+		sws        += r.sws;
+		sws10      += r.sws10;
+		sws5       += r.sws5;
+		sws3       += r.sws3;
+		swcups     += r.swcups;
+		swrows     += r.swrows;
+		swskiprows += r.swskiprows;
+		swskip     += r.swskip;
+		swsucc     += r.swsucc;
+		swfail     += r.swfail;
+		swbts      += r.swbts;
+		rshit      += r.rshit;
+		ungapsucc  += r.ungapsucc;
+		ungapfail  += r.ungapfail;
+		ungapnodec += r.ungapnodec;
+		exatts     += r.exatts;
+		exranges   += r.exranges;
+		exrows     += r.exrows;
+		exsucc     += r.exsucc;
+		exooms     += r.exooms;
+		mm1atts    += r.mm1atts;
+		mm1ranges  += r.mm1ranges;
+		mm1rows    += r.mm1rows;
+		mm1succ    += r.mm1succ;
+		mm1ooms    += r.mm1ooms;
+		sdatts     += r.sdatts;
+		sdranges   += r.sdranges;
+		sdrows     += r.sdrows;
+		sdsucc     += r.sdsucc;
+		sdooms     += r.sdooms;
+	}
+	
+	// Tally one gapped DP problem into the cumulative gap-size buckets;
+	// e.g. a problem with max gaps < 3 counts toward sws3, sws5 and sws10.
+	void tallyGappedDp(size_t readGaps, size_t refGaps) {
+		size_t mx = max(readGaps, refGaps);
+		if(mx < 10) sws10++;
+		if(mx < 5)  sws5++;
+		if(mx < 3)  sws3++;
+	}
+
+	uint64_t sws;        // # DP problems solved
+	uint64_t sws10;      // # DP problems solved where max gaps < 10
+	uint64_t sws5;       // # DP problems solved where max gaps < 5
+	uint64_t sws3;       // # DP problems solved where max gaps < 3
+	uint64_t swcups;     // # DP cell updates
+	uint64_t swrows;     // # DP row updates
+	uint64_t swskiprows; // # skipped DP rows (b/c no valid alns go thru row)
+	uint64_t swskip;     // # DP problems skipped by sse filter
+	uint64_t swsucc;     // # DP problems resulting in alignment
+	uint64_t swfail;     // # DP problems not resulting in alignment
+	uint64_t swbts;      // # DP backtrace steps
+	uint64_t rshit;      // # DP problems avoided b/c seed hit was redundant
+	uint64_t ungapsucc;  // # ungapped alignment attempts that succeeded (inferred from name; old comment was a copy-paste)
+	uint64_t ungapfail;  // # ungapped alignment attempts that failed (inferred from name)
+	uint64_t ungapnodec; // # ungapped attempts ending with no decision (inferred from name)
+
+	uint64_t exatts;     // total # attempts at exact-hit end-to-end aln
+	uint64_t exranges;   // total # ranges returned by exact-hit queries
+	uint64_t exrows;     // total # rows returned by exact-hit queries
+	uint64_t exsucc;     // exact-hit yielded non-empty result
+	uint64_t exooms;     // exact-hit offset memory exhausted
+	
+	uint64_t mm1atts;    // total # attempts at 1mm end-to-end aln
+	uint64_t mm1ranges;  // total # ranges returned by 1mm-hit queries
+	uint64_t mm1rows;    // total # rows returned by 1mm-hit queries
+	uint64_t mm1succ;    // 1mm-hit yielded non-empty result
+	uint64_t mm1ooms;    // 1mm-hit offset memory exhausted
+
+	uint64_t sdatts;     // total # attempts to find seed alignments
+	uint64_t sdranges;   // total # seed-alignment ranges found
+	uint64_t sdrows;     // total # seed-alignment rows found
+	uint64_t sdsucc;     // # times seed alignment yielded >= 1 hit
+	uint64_t sdooms;     // # times an OOM occurred during seed alignment
+
+	MUTEX_T mutex_m;
+};
+
+// The various ways that one might backtrack from a later cell (either oall,
+// rdgap or rfgap) to an earlier cell
+enum {
+	SW_BT_OALL_DIAG,         // from oall cell to oall cell (diagonal move)
+	SW_BT_OALL_REF_OPEN,     // from oall cell, opening a gap in the reference
+	SW_BT_OALL_READ_OPEN,    // from oall cell, opening a gap in the read
+	SW_BT_RDGAP_EXTEND,      // from rdgap cell to rdgap cell
+	SW_BT_RFGAP_EXTEND       // from rfgap cell to rfgap cell
+};
+
+#endif /*def ALIGNER_SW_COMMON_H_*/
diff --git a/aligner_sw_nuc.h b/aligner_sw_nuc.h
new file mode 100644
index 0000000..6bec1de
--- /dev/null
+++ b/aligner_sw_nuc.h
@@ -0,0 +1,262 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef ALIGNER_SW_NUC_H_
+#define ALIGNER_SW_NUC_H_
+
+#include <stdint.h>
+#include "aligner_sw_common.h"
+#include "aligner_result.h"
+
+/**
+ * Encapsulates a backtrace stack frame.  It carries enough information that
+ * we can "pop" back up to this frame and make a different backtracking
+ * decision, namely:
+ *
+ * 1. The mask at the decision point.  When we first move through the mask
+ *    and when we backtrack to it, we're careful to mask out the bit
+ *    corresponding to the path we're taking.  When we move through it after
+ *    removing the last bit from the mask, we're careful to pop it from the
+ *    stack.
+ * 2. The sizes of the edit lists.  When we backtrack, we resize the lists
+ *    back down to these sizes to shed any edits introduced since the branch
+ *    point.
+ */
+struct DpNucFrame {
+
+	/**
+	 * Record the state needed to return to this branch point: the edit-list
+	 * and cell-list sizes, the cell coordinates, the gap tallies, the score
+	 * at the branch, and the table the decision was made in.
+	 */
+	void init(
+		size_t   nedSz,
+		size_t   aedSz,
+		size_t   celSz,
+		size_t   curRow,
+		size_t   curCol,
+		size_t   nGaps,
+		size_t   nReadGaps,
+		size_t   nRefGaps,
+		AlnScore sc,
+		int      tab)
+	{
+		nedsz    = nedSz;
+		aedsz    = aedSz;
+		celsz    = celSz;
+		row      = curRow;
+		col      = curCol;
+		gaps     = nGaps;
+		readGaps = nReadGaps;
+		refGaps  = nRefGaps;
+		score    = sc;
+		ct       = tab;
+	}
+
+	size_t   nedsz;    // size of the nucleotide edit list at branch (before
+	                   // adding the branch edit)
+	size_t   aedsz;    // size of ambiguous nucleotide edit list at branch
+	size_t   celsz;    // size of cell-traversed list at branch
+	size_t   row;      // row of cell where branch occurred
+	size_t   col;      // column of cell where branch occurred
+	size_t   gaps;     // number of gaps before branch occurred
+	size_t   readGaps; // number of read gaps before branch occurred
+	size_t   refGaps;  // number of ref gaps before branch occurred
+	AlnScore score;    // score where branch occurred
+	int      ct;       // table type (oall, rdgap or rfgap)
+};
+
+// Possible fates of a backtrace candidate: the backtrace from it either
+// succeeded, failed outright, or was filtered before being attempted
+// (for one of the reasons below).  0 is reserved for "not yet decided".
+enum {
+	BT_CAND_FATE_SUCCEEDED = 1,
+	BT_CAND_FATE_FAILED,
+	BT_CAND_FATE_FILT_START,     // skipped b/c starting cell already explored
+	BT_CAND_FATE_FILT_DOMINATED, // skipped b/c it was dominated
+	BT_CAND_FATE_FILT_SCORE      // skipped b/c score not interesting anymore
+};
+
+/**
+ * Encapsulates a cell that we might want to backtrace from.
+ */
+struct DpBtCandidate {
+
+	DpBtCandidate() { reset(); }
+	
+	DpBtCandidate(size_t row_, size_t col_, TAlScore score_) {
+		init(row_, col_, score_);
+	}
+	
+	void reset() { init(0, 0, 0); }
+	
+	// Set coordinates and score.  fate starts out 0 (invalid) and is set
+	// later according to what happens before / during the backtrace.
+	void init(size_t row_, size_t col_, TAlScore score_) {
+		row = row_;
+		col = col_;
+		score = score_;
+		fate = 0;
+	}
+	
+	/**
+	 * Return true iff this candidate is (heuristically) dominated by the
+	 * given candidate, i.e. iff both cells lie within the same SQ x SQ
+	 * square of the matrix.  Note: only proximity is tested here; the
+	 * candidates' scores are not compared.  (Fixed the previous doc, which
+	 * described a score condition this code never implemented and quoted
+	 * the wrong square size.)
+	 */
+	inline bool dominatedBy(const DpBtCandidate& o) const { // now const: mutates nothing
+		const size_t SQ = 40;
+		size_t rowhi = row;
+		size_t rowlo = o.row;
+		if(rowhi < rowlo) swap(rowhi, rowlo);
+		size_t colhi = col;
+		size_t collo = o.col;
+		if(colhi < collo) swap(colhi, collo);
+		return (colhi - collo) <= SQ &&
+		       (rowhi - rowlo) <= SQ;
+	}
+
+	/**
+	 * Return true if this candidate is "greater than" (should be considered
+	 * later than) the given candidate.  Higher scores sort first; ties are
+	 * broken by preferring the larger row, then the larger column.
+	 */
+	bool operator>(const DpBtCandidate& o) const {
+		if(score < o.score) return true;
+		if(score > o.score) return false;
+		if(row   < o.row  ) return true;
+		if(row   > o.row  ) return false;
+		if(col   < o.col  ) return true;
+		if(col   > o.col  ) return false;
+		return false;
+	}
+
+	/**
+	 * Return true if this candidate is "less than" (should be considered
+	 * sooner than) the given candidate.  Mirror image of operator>.
+	 */
+	bool operator<(const DpBtCandidate& o) const {
+		if(score > o.score) return true;
+		if(score < o.score) return false;
+		if(row   > o.row  ) return true;
+		if(row   < o.row  ) return false;
+		if(col   > o.col  ) return true;
+		if(col   < o.col  ) return false;
+		return false;
+	}
+	
+	/**
+	 * Return true if this candidate equals the given candidate.
+	 */
+	bool operator==(const DpBtCandidate& o) const {
+		return row   == o.row &&
+		       col   == o.col &&
+		       score == o.score;
+	}
+	bool operator>=(const DpBtCandidate& o) const { return !((*this) < o); }
+	bool operator<=(const DpBtCandidate& o) const { return !((*this) > o); }
+	
+#ifndef NDEBUG
+	/**
+	 * Check internal consistency.
+	 */
+	bool repOk() const {
+		assert(VALID_SCORE(score));
+		return true;
+	}
+#endif
+
+	size_t   row;   // cell row
+	size_t   col;   // cell column w/r/t LHS of rectangle
+	TAlScore score; // score of alignment
+	int      fate;  // flag indicating whether we succeeded, failed, skipped
+};
+
+/**
+ * Keeps, for each of nelt_ bins, a list of the best nbest_ results added so
+ * far, ordered by operator>.
+ */
+template <typename T>
+class NBest {
+
+public:
+
+	NBest<T>() { nelt_ = nbest_ = n_ = 0; }
+	
+	// True iff init() has been called with nelt > 0.
+	bool inited() const { return nelt_ > 0; }
+	
+	// Allocate nelt bins of nbest slots each and clear all per-bin counts.
+	void init(size_t nelt, size_t nbest) {
+		nelt_ = nelt;
+		nbest_ = nbest;
+		elts_.resize(nelt * nbest);
+		ncur_.resize(nelt);
+		ncur_.fill(0);
+		n_ = 0;
+	}
+	
+	/**
+	 * Add a new result to bin 'elt'.  Where it gets prioritized in the list of
+	 * results in that bin depends on the result of operator>.  Returns true
+	 * iff the result was inserted into the bin.
+	 */
+	bool add(size_t elt, const T& o) {
+		assert_lt(elt, nelt_);
+		const size_t ncur = ncur_[elt];
+		assert_leq(ncur, nbest_);
+		// NOTE(review): n_ is incremented even when the element does not
+		// make the cut below, so empty() is false after any add() call;
+		// confirm "# results added" is meant as "# add attempts".
+		n_++;
+		for(size_t i = 0; i < nbest_ && i <= ncur; i++) {
+			if(o > elts_[nbest_ * elt + i] || i >= ncur) {
+				// Insert it here
+				// Move everyone from here on down by one slot
+				for(int j = (int)ncur; j > (int)i; j--) {
+					if(j < (int)nbest_) {
+						elts_[nbest_ * elt + j] = elts_[nbest_ * elt + j - 1];
+					}
+				}
+				elts_[nbest_ * elt + i] = o;
+				if(ncur < nbest_) {
+					ncur_[elt]++;
+				}
+				return true;
+			}
+		}
+		return false;
+	}
+	
+	/**
+	 * Return true iff there are no solutions.
+	 */
+	bool empty() const {
+		return n_ == 0;
+	}
+	
+	/**
+	 * Dump all the items in our payload into the given EList.
+	 */
+	template<typename TList>
+	void dump(TList& l) const {
+		if(empty()) return;
+		for(size_t i = 0; i < nelt_; i++) {
+			assert_leq(ncur_[i], nbest_);
+			for(size_t j = 0; j < ncur_[i]; j++) {
+				l.push_back(elts_[i * nbest_ + j]);
+			}
+		}
+	}
+
+protected:
+
+	size_t        nelt_;  // # bins
+	size_t        nbest_; // # slots per bin
+	EList<T>      elts_;  // flat nelt_ x nbest_ payload array
+	EList<size_t> ncur_;  // # occupied slots in each bin
+	size_t        n_;     // total # results added
+};
+
+#endif /*def ALIGNER_SW_NUC_H_*/
diff --git a/aligner_swsse.cpp b/aligner_swsse.cpp
new file mode 100644
index 0000000..d4f7d78
--- /dev/null
+++ b/aligner_swsse.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <string.h>
+#include "aligner_sw_common.h"
+#include "aligner_swsse.h"
+
+/**
+ * Given a number of rows (nrow), a number of columns (ncol), and the
+ * number of words to fit inside a single __m128i vector, initialize the
+ * matrix buffer to accommodate the needed configuration of vectors.
+ */
+void SSEMatrix::init(
+	size_t nrow,
+	size_t ncol,
+	size_t wperv)
+{
+	nrow_ = nrow;
+	ncol_ = ncol;
+	wperv_ = wperv;
+	// # vectors needed to cover one column of nrow words
+	nvecPerCol_ = (nrow + (wperv-1)) / wperv;
+	// The +1 is so that we don't have to special-case the final column;
+	// instead, we just write off the end of the useful part of the table
+	// with pvEStore.
+	try {
+		matbuf_.resizeNoCopy((ncol+1) * nvecPerCell_ * nvecPerCol_);
+	} catch(exception& e) {
+		cerr << "Tried to allocate DP matrix with " << (ncol+1)
+		     << " columns, " << nvecPerCol_
+			 << " vectors per column, and " << nvecPerCell_
+			 << " vectors per cell" << endl;
+		// Rethrow the original exception; 'throw e' would copy it and
+		// slice it down to std::exception.
+		throw;
+	}
+	assert(wperv_ == 8 || wperv_ == 16);
+	// 8 words/vector -> divide by 8 (shift 3); 16 words -> shift 4
+	vecshift_ = (wperv_ == 8) ? 3 : 4;
+	nvecrow_ = (nrow + (wperv_-1)) >> vecshift_;
+	nveccol_ = ncol;
+	colstride_ = nvecPerCol_ * nvecPerCell_;
+	rowstride_ = nvecPerCell_;
+	inited_ = true;
+}
+
+/**
+ * Initialize the matrix of masks and backtracking flags.  Requires that
+ * init() was called first so nrow_/ncol_ are set.
+ */
+void SSEMatrix::initMasks() {
+	assert_gt(nrow_, 0);
+	assert_gt(ncol_, 0);
+	// One mask entry and one reset flag per matrix row
+	masks_.resize(nrow_);
+	reset_.resizeNoCopy(nrow_);
+	reset_.fill(false);
+}
+
+/**
+ * Given a row, col and matrix (i.e. E, F or H), return the corresponding
+ * element.  Rows are striped across the vectors of a column: row maps to
+ * vector (row % nvecrow_) and to lane (row / nvecrow_) within that vector.
+ */
+int SSEMatrix::eltSlow(size_t row, size_t col, size_t mat) const {
+	assert_lt(row, nrow_);
+	assert_lt(col, ncol_);
+	assert_leq(mat, 3);
+	// Move to beginning of column/row
+	size_t rowelt = row / nvecrow_; // lane within the vector
+	size_t rowvec = row % nvecrow_; // vector within the column
+	size_t eltvec = (col * colstride_) + (rowvec * rowstride_) + mat;
+	if(wperv_ == 16) {
+		// 16 words per 128-bit vector => unsigned 8-bit lanes
+		return (int)((uint8_t*)(matbuf_.ptr() + eltvec))[rowelt];
+	} else {
+		assert_eq(8, wperv_);
+		// 8 words per 128-bit vector => signed 16-bit lanes
+		return (int)((int16_t*)(matbuf_.ptr() + eltvec))[rowelt];
+	}
+}
diff --git a/aligner_swsse.h b/aligner_swsse.h
new file mode 100644
index 0000000..8e5bbd3
--- /dev/null
+++ b/aligner_swsse.h
@@ -0,0 +1,500 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef ALIGNER_SWSSE_H_
+#define ALIGNER_SWSSE_H_
+
+#include "ds.h"
+#include "mem_ids.h"
+#include "random_source.h"
+#include "scoring.h"
+#include "mask.h"
+#include "sse_util.h"
+#include <strings.h>
+
+
+struct SSEMetrics {
+	
+	SSEMetrics():mutex_m() { reset(); }
+
+	void clear() { reset(); }
+	void reset() {
+		dp = dpsat = dpfail = dpsucc = 
+		col = cell = inner = fixup =
+		gathsol = bt = btfail = btsucc = btcell =
+		corerej = nrej = 0;
+	}
+
+	void merge(const SSEMetrics& o, bool getLock = false) {
+        ThreadSafe ts(&mutex_m, getLock);
+		dp       += o.dp;
+		dpsat    += o.dpsat;
+		dpfail   += o.dpfail;
+		dpsucc   += o.dpsucc;
+		col      += o.col;
+		cell     += o.cell;
+		inner    += o.inner;
+		fixup    += o.fixup;
+		gathsol  += o.gathsol;
+		bt       += o.bt;
+		btfail   += o.btfail;
+		btsucc   += o.btsucc;
+		btcell   += o.btcell;
+		corerej  += o.corerej;
+		nrej     += o.nrej;
+	}
+
+	uint64_t dp;       // DPs tried
+	uint64_t dpsat;    // DPs saturated
+	uint64_t dpfail;   // DPs failed
+	uint64_t dpsucc;   // DPs succeeded
+	uint64_t col;      // DP columns
+	uint64_t cell;     // DP cells
+	uint64_t inner;    // DP inner loop iters
+	uint64_t fixup;    // DP fixup loop iters
+	uint64_t gathsol;  // DP gather solution cells found
+	uint64_t bt;       // DP backtraces
+	uint64_t btfail;   // DP backtraces failed
+	uint64_t btsucc;   // DP backtraces succeeded
+	uint64_t btcell;   // DP backtrace cells traversed
+	uint64_t corerej;  // DP backtrace core rejections
+	uint64_t nrej;     // DP backtrace N rejections
+	MUTEX_T  mutex_m;
+};
+
+/**
+ * Encapsulates matrix information calculated by the SSE aligner.
+ *
+ * Matrix memory is laid out as follows:
+ *
+ * - Elements (individual cell scores) are packed into __m128i vectors
+ * - Vectors are packed into quartets, quartet elements correspond to: a vector
+ *   from E, one from F, one from H, and one that's "reserved"
+ * - Quartets are packed into columns, where the number of quartets is
+ *   determined by the number of query characters divided by the number of
+ *   elements per vector
+ *
+ * Regarding the "reserved" element of the vector quartet: we use it for two
+ * things.  First, we use the first column of reserved vectors to stage the
+ * initial column of H vectors.  Second, we use the "reserved" vectors during
+ * the backtrace procedure to store information about (a) which cells have been
+ * traversed, (b) whether the cell is "terminal" (in local mode), etc.
+ */
+struct SSEMatrix {
+
+	// Each matrix element is a quartet of vectors.  These constants are used
+	// to identify members of the quartet.
+	const static size_t E   = 0;
+	const static size_t F   = 1;
+	const static size_t H   = 2;
+	const static size_t TMP = 3;
+
+	SSEMatrix(int cat = 0) : nvecPerCell_(4), matbuf_(cat) { }
+
+	/**
+	 * Return a pointer to the matrix buffer.
+	 */
+	inline __m128i *ptr() {
+		assert(inited_);
+		return matbuf_.ptr();
+	}
+	
+	/**
+	 * Return a pointer to the E vector at the given row and column.  Note:
+	 * here row refers to rows of vectors, not rows of elements.
+	 */
+	inline __m128i* evec(size_t row, size_t col) {
+		assert_lt(row, nvecrow_);
+		assert_lt(col, nveccol_);
+		size_t elt = row * rowstride() + col * colstride() + E;
+		assert_lt(elt, matbuf_.size());
+		return ptr() + elt;
+	}
+
+	/**
+	 * Like evec, but it's allowed to ask for a pointer to one column after the
+	 * final one.
+	 */
+	inline __m128i* evecUnsafe(size_t row, size_t col) {
+		assert_lt(row, nvecrow_);
+		assert_leq(col, nveccol_);
+		size_t elt = row * rowstride() + col * colstride() + E;
+		assert_lt(elt, matbuf_.size());
+		return ptr() + elt;
+	}
+
+	/**
+	 * Return a pointer to the F vector at the given row and column.  Note:
+	 * here row refers to rows of vectors, not rows of elements.
+	 */
+	inline __m128i* fvec(size_t row, size_t col) {
+		assert_lt(row, nvecrow_);
+		assert_lt(col, nveccol_);
+		size_t elt = row * rowstride() + col * colstride() + F;
+		assert_lt(elt, matbuf_.size());
+		return ptr() + elt;
+	}
+
+	/**
+	 * Return a pointer to the H vector at the given row and column.  Note:
+	 * here row refers to rows of vectors, not rows of elements.
+	 */
+	inline __m128i* hvec(size_t row, size_t col) {
+		assert_lt(row, nvecrow_);
+		assert_lt(col, nveccol_);
+		size_t elt = row * rowstride() + col * colstride() + H;
+		assert_lt(elt, matbuf_.size());
+		return ptr() + elt;
+	}
+
+	/**
+	 * Return a pointer to the TMP vector at the given row and column.  Note:
+	 * here row refers to rows of vectors, not rows of elements.
+	 */
+	inline __m128i* tmpvec(size_t row, size_t col) {
+		assert_lt(row, nvecrow_);
+		assert_lt(col, nveccol_);
+		size_t elt = row * rowstride() + col * colstride() + TMP;
+		assert_lt(elt, matbuf_.size());
+		return ptr() + elt;
+	}
+
+	/**
+	 * Like tmpvec, but it's allowed to ask for a pointer to one column after
+	 * the final one.
+	 */
+	inline __m128i* tmpvecUnsafe(size_t row, size_t col) {
+		assert_lt(row, nvecrow_);
+		assert_leq(col, nveccol_);
+		size_t elt = row * rowstride() + col * colstride() + TMP;
+		assert_lt(elt, matbuf_.size());
+		return ptr() + elt;
+	}
+	
+	/**
+	 * Given a number of rows (nrow), a number of columns (ncol), and the
+	 * number of words to fit inside a single __m128i vector, initialize the
+	 * matrix buffer to accomodate the needed configuration of vectors.
+	 */
+	void init(
+		size_t nrow,
+		size_t ncol,
+		size_t wperv);
+	
+	/**
+	 * Return the number of __m128i's you need to skip over to get from one
+	 * cell to the cell one column over from it.
+	 */
+	inline size_t colstride() const { return colstride_; }
+
+	/**
+	 * Return the number of __m128i's you need to skip over to get from one
+	 * cell to the cell one row down from it.
+	 */
+	inline size_t rowstride() const { return rowstride_; }
+
+	/**
+	 * Given a row, col and matrix (i.e. E, F or H), return the corresponding
+	 * element.
+	 */
+	int eltSlow(size_t row, size_t col, size_t mat) const;
+	
+	/**
+	 * Given a row, col and matrix (i.e. E, F or H), return the corresponding
+	 * element.
+	 */
+	inline int elt(size_t row, size_t col, size_t mat) const {
+		assert(inited_);
+		assert_lt(row, nrow_);
+		assert_lt(col, ncol_);
+		assert_lt(mat, 3);
+		// Move to beginning of column/row
+		size_t rowelt = row / nvecrow_;
+		size_t rowvec = row % nvecrow_;
+		size_t eltvec = (col * colstride_) + (rowvec * rowstride_) + mat;
+		assert_lt(eltvec, matbuf_.size());
+		if(wperv_ == 16) {
+			return (int)((uint8_t*)(matbuf_.ptr() + eltvec))[rowelt];
+		} else {
+			assert_eq(8, wperv_);
+			return (int)((int16_t*)(matbuf_.ptr() + eltvec))[rowelt];
+		}
+	}
+
+	/**
+	 * Return the element in the E matrix at element row, col.
+	 */
+	inline int eelt(size_t row, size_t col) const {
+		return elt(row, col, E);
+	}
+
+	/**
+	 * Return the element in the F matrix at element row, col.
+	 */
+	inline int felt(size_t row, size_t col) const {
+		return elt(row, col, F);
+	}
+
+	/**
+	 * Return the element in the H matrix at element row, col.
+	 */
+	inline int helt(size_t row, size_t col) const {
+		return elt(row, col, H);
+	}
+	
+	/**
+	 * Return true iff the given cell has its reportedThru bit set.
+	 */
+	inline bool reportedThrough(
+		size_t row,          // current row
+		size_t col) const    // current column
+	{
+		return (masks_[row][col] & (1 << 0)) != 0;
+	}
+
+	/**
+	 * Set the given cell's reportedThru bit.
+	 */
+	inline void setReportedThrough(
+		size_t row,          // current row
+		size_t col)          // current column
+	{
+		masks_[row][col] |= (1 << 0);
+	}
+
+	/**
+	 * Return true iff the H mask has been set with a previous call to hMaskSet.
+	 */
+	bool isHMaskSet(
+		size_t row,          // current row
+		size_t col) const;   // current column
+
+	/**
+	 * Set the given cell's H mask.  This is the mask of remaining legal ways to
+	 * backtrack from the H cell at this coordinate.  It's 5 bits long and has
+	 * offset=2 into the 16-bit field.
+	 */
+	void hMaskSet(
+		size_t row,          // current row
+		size_t col,          // current column
+		int mask);
+
+	/**
+	 * Return true iff the E mask has been set with a previous call to eMaskSet.
+	 */
+	bool isEMaskSet(
+		size_t row,          // current row
+		size_t col) const;   // current column
+
+	/**
+	 * Set the given cell's E mask.  This is the mask of remaining legal ways to
+	 * backtrack from the E cell at this coordinate.  It's 2 bits long and has
+	 * offset=8 into the 16-bit field.
+	 */
+	void eMaskSet(
+		size_t row,          // current row
+		size_t col,          // current column
+		int mask);
+	
+	/**
+	 * Return true iff the F mask has been set with a previous call to fMaskSet.
+	 */
+	bool isFMaskSet(
+		size_t row,          // current row
+		size_t col) const;   // current column
+
+	/**
+	 * Set the given cell's F mask.  This is the mask of remaining legal ways to
+	 * backtrack from the F cell at this coordinate.  It's 2 bits long and has
+	 * offset=11 into the 16-bit field.
+	 */
+	void fMaskSet(
+		size_t row,          // current row
+		size_t col,          // current column
+		int mask);
+
+	/**
+	 * Analyze a cell in the SSE-filled dynamic programming matrix.  Determine &
+	 * memorize ways that we can backtrack from the cell.  If there is at least one
+	 * way to backtrack, select one at random and return the selection.
+	 *
+	 * There are a few subtleties to keep in mind regarding which cells can be at
+	 * the end of a backtrace.  First of all: cells from which we can backtrack
+	 * should not be at the end of a backtrace.  But have to distinguish between
+	 * cells whose masks eventually become 0 (we shouldn't end at those), from
+	 * those whose masks were 0 all along (we can end at those).
+	 */
+	void analyzeCell(
+		size_t row,          // current row
+		size_t col,          // current column
+		size_t ct,           // current cell type: E/F/H
+		int refc,
+		int readc,
+		int readq,
+		const Scoring& sc,   // scoring scheme
+		int64_t offsetsc,    // offset to add to each score
+		RandomSource& rand,  // rand gen for choosing among equal options
+		bool& empty,         // out: =true iff no way to backtrace
+		int& cur,            // out: =type of transition
+		bool& branch,        // out: =true iff we chose among >1 options
+		bool& canMoveThru,   // out: =true iff ...
+		bool& reportedThru); // out: =true iff ...
+
+	/**
+	 * Initialize the matrix of masks and backtracking flags.
+	 */
+	void initMasks();
+
+	/**
+	 * Return the number of rows in the dynamic programming matrix.
+	 */
+	size_t nrow() const {
+		return nrow_;
+	}
+
+	/**
+	 * Return the number of columns in the dynamic programming matrix.
+	 */
+	size_t ncol() const {
+		return ncol_;
+	}
+	
+	/**
+	 * Prepare a row so we can use it to store masks.
+	 */
+	void resetRow(size_t i) {
+		assert(!reset_[i]);
+		masks_[i].resizeNoCopy(ncol_);
+		masks_[i].fillZero();
+		reset_[i] = true;
+	}
+
+	bool             inited_;      // initialized?
+	size_t           nrow_;        // # rows
+	size_t           ncol_;        // # columns
+	size_t           nvecrow_;     // # vector rows (<= nrow_)
+	size_t           nveccol_;     // # vector columns (<= ncol_)
+	size_t           wperv_;       // # words per vector
+	size_t           vecshift_;    // # bits to shift to divide by words per vec
+	size_t           nvecPerCol_;  // # vectors per column
+	size_t           nvecPerCell_; // # vectors per matrix cell (4)
+	size_t           colstride_;   // # vectors b/t adjacent cells in same row
+	size_t           rowstride_;   // # vectors b/t adjacent cells in same col
+	EList_m128i      matbuf_;      // buffer for holding vectors
+	ELList<uint16_t> masks_;       // buffer for masks/backtracking flags
+	EList<bool>      reset_;       // true iff row in masks_ has been reset
+};
+
+/**
+ * All the data associated with the query profile and other data needed for SSE
+ * alignment of a query.
+ */
+struct SSEData {
+	SSEData(int cat = 0) : profbuf_(cat), mat_(cat) { }
+	EList_m128i    profbuf_;     // buffer for query profile & temp vecs
+	EList_m128i    vecbuf_;      // buffer for 2 column vectors (not using mat_)
+	size_t         qprofStride_; // stride for query profile
+	size_t         gbarStride_;  // gap barrier for query profile
+	SSEMatrix      mat_;         // SSE matrix for holding all E, F, H vectors
+	size_t         maxPen_;      // biggest penalty of all
+	size_t         maxBonus_;    // biggest bonus of all
+	size_t         lastIter_;    // which 128-bit striped word has final row?
+	size_t         lastWord_;    // which word within 128-word has final row?
+	int            bias_;        // all scores shifted up by this for unsigned
+};
+
+/**
+ * Return true iff the H mask has been set with a previous call to hMaskSet.
+ */
+inline bool SSEMatrix::isHMaskSet(
+	size_t row,          // current row
+	size_t col) const    // current column
+{
+	return (masks_[row][col] & (1 << 1)) != 0;
+}
+
+/**
+ * Set the given cell's H mask.  This is the mask of remaining legal ways to
+ * backtrack from the H cell at this coordinate.  It's 5 bits long and has
+ * offset=2 into the 16-bit field.
+ */
+inline void SSEMatrix::hMaskSet(
+	size_t row,          // current row
+	size_t col,          // current column
+	int mask)
+{
+	assert_lt(mask, 32);
+	masks_[row][col] &= ~(31 << 1);
+	masks_[row][col] |= (1 << 1 | mask << 2);
+}
+
+/**
+ * Return true iff the E mask has been set with a previous call to eMaskSet.
+ */
+inline bool SSEMatrix::isEMaskSet(
+	size_t row,          // current row
+	size_t col) const    // current column
+{
+	return (masks_[row][col] & (1 << 7)) != 0;
+}
+
+/**
+ * Set the given cell's E mask.  This is the mask of remaining legal ways to
+ * backtrack from the E cell at this coordinate.  It's 2 bits long and has
+ * offset=8 into the 16-bit field.
+ */
+inline void SSEMatrix::eMaskSet(
+	size_t row,          // current row
+	size_t col,          // current column
+	int mask)
+{
+	assert_lt(mask, 4);
+	masks_[row][col] &= ~(7 << 7);
+	masks_[row][col] |=  (1 << 7 | mask << 8);
+}
+
+/**
+ * Return true iff the F mask has been set with a previous call to fMaskSet.
+ */
+inline bool SSEMatrix::isFMaskSet(
+	size_t row,          // current row
+	size_t col) const    // current column
+{
+	return (masks_[row][col] & (1 << 10)) != 0;
+}
+
+/**
+ * Set the given cell's F mask.  This is the mask of remaining legal ways to
+ * backtrack from the F cell at this coordinate.  It's 2 bits long and has
+ * offset=11 into the 16-bit field.
+ */
+inline void SSEMatrix::fMaskSet(
+	size_t row,          // current row
+	size_t col,          // current column
+	int mask)
+{
+	assert_lt(mask, 4);
+	masks_[row][col] &= ~(7 << 10);
+	masks_[row][col] |=  (1 << 10 | mask << 11);
+}
+
// # of __m128i vectors between vertically adjacent cells; matches the
// 4-vector (E, F, H, TMP) quartet stored per matrix cell.
#define ROWSTRIDE_2COL 4
#define ROWSTRIDE 4
+
+#endif /*ndef ALIGNER_SWSSE_H_*/
diff --git a/aligner_swsse_ee_i16.cpp b/aligner_swsse_ee_i16.cpp
new file mode 100644
index 0000000..accc03c
--- /dev/null
+++ b/aligner_swsse_ee_i16.cpp
@@ -0,0 +1,1914 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * aligner_sw_sse.cpp
+ *
+ * Versions of key alignment functions that use vector instructions to
+ * accelerate dynamic programming.  Based chiefly on the striped Smith-Waterman
+ * paper and implementation by Michael Farrar.  See:
+ *
+ * Farrar M. Striped Smith-Waterman speeds database searches six times over
+ * other SIMD implementations. Bioinformatics. 2007 Jan 15;23(2):156-61.
+ * http://sites.google.com/site/farrarmichael/smith-waterman
+ *
+ * While the paper describes an implementation of Smith-Waterman, we extend it
+ * do end-to-end read alignment as well as local alignment.  The change
+ * required for this is minor: we simply let vmax be the maximum element in the
+ * score domain rather than the minimum.
+ *
+ * The vectorized dynamic programming implementation lacks some features that
+ * make it hard to adapt to solving the entire dynamic-programming alignment
+ * problem.  For instance:
+ *
+ * - It doesn't respect gap barriers on either end of the read
+ * - It just gives a maximum; not enough information to backtrace without
+ *   redoing some alignment
+ * - It's a little difficult to handle st_ and en_, especially st_.
+ * - The query profile mechanism makes handling of ambiguous reference bases a
+ *   little tricky (16 cols in query profile lookup table instead of 5)
+ *
+ * Given the drawbacks, it is tempting to use SSE dynamic programming as a
+ * filter rather than as an aligner per se.  Here are a few ideas for how it
+ * can be extended to handle more of the alignment problem:
+ *
+ * - Save calculated scores to a big array as we go.  We return to this array
+ *   to find and backtrace from good solutions.
+ */
+
+#include <limits>
+#include "aligner_sw.h"
+
// Geometry of a 128-bit XMM register when divided into 16-bit words
static const size_t NBYTES_PER_REG  = 16;  // bytes per __m128i
static const size_t NWORDS_PER_REG  = 8;   // 16-bit words per __m128i
static const size_t NBITS_PER_WORD  = 16;  // bits per word
static const size_t NBYTES_PER_WORD = 2;   // bytes per word

// In 16-bit end-to-end mode, we have the option of using signed saturated
// arithmetic.  Because we have signed arithmetic, there's no need to add/subtract
// bias when building and applying the query profile.  The lowest value we can
// use is 0x8000, and the greatest is 0x7fff.

// Type of a single DP matrix cell score in this mode
typedef int16_t TCScore;
+
/**
 * Build query profile look up tables for the read.  The query profile look
 * up table is organized as a 1D array indexed by [i][j] where i is the
 * reference character in the current DP column (0=A, 1=C, etc), and j is
 * the segment of the query we're currently working on.
 *
 * Query-profile vectors and gap-barrier vectors are interleaved with
 * stride 2: even __m128i elements hold the per-segment match/mismatch
 * scores, odd elements hold the gap-barrier words.
 */
void SwAligner::buildQueryProfileEnd2EndSseI16(bool fw) {
	// Only build the profile once per read orientation
	bool& done = fw ? sseI16fwBuilt_ : sseI16rcBuilt_;
	if(done) {
		return;
	}
	done = true;
	const BTDnaString* rd = fw ? rdfw_ : rdrc_;
	const BTString* qu = fw ? qufw_ : qurc_;
    // daehwan - allows to align a portion of a read, not the whole
	// const size_t len = rd->length();
    const size_t len = dpRows();
	// # words per segment = ceil(len / words-per-vector)
	const size_t seglen = (len + (NWORDS_PER_REG-1)) / NWORDS_PER_REG;
	// How many __m128i's are needed
	size_t n128s =
		64 +                    // slack bytes, for alignment?
		(seglen * ALPHA_SIZE)   // query profile data
		* 2;                    // & gap barrier data
	assert_gt(n128s, 0);
	SSEData& d = fw ? sseI16fw_ : sseI16rc_;
	d.profbuf_.resizeNoCopy(n128s);
	assert(!d.profbuf_.empty());
	d.maxPen_      = d.maxBonus_ = 0;
	d.lastIter_    = d.lastWord_ = 0;
	d.qprofStride_ = d.gbarStride_ = 2; // profile & gap-barrier vecs interleave
	d.bias_ = 0; // no bias when words are signed
	// For each reference character A, C, G, T, N ...
	for(size_t refc = 0; refc < ALPHA_SIZE; refc++) {
		// For each segment ...
		for(size_t i = 0; i < seglen; i++) {
			// Striped layout: word k of segment i covers read position
			// i + k*seglen (see 'j += seglen' below)
			size_t j = i;
			int16_t *qprofWords =
				reinterpret_cast<int16_t*>(d.profbuf_.ptr() + (refc * seglen * 2) + (i * 2));
			int16_t *gbarWords =
				reinterpret_cast<int16_t*>(d.profbuf_.ptr() + (refc * seglen * 2) + (i * 2) + 1);
			// For each sub-word (byte) ...
			for(size_t k = 0; k < NWORDS_PER_REG; k++) {
				int sc = 0;
				*gbarWords = 0;
				if(j < len) {
					int readc = (*rd)[j];
					int readq = (*qu)[j];
					// NOTE(review): 'readq - 33' assumes Phred+33-encoded
					// qualities -- confirm against callers
					sc = sc_->score(readc, (int)(1 << refc), readq - 33);
					size_t j_from_end = len - j - 1;
					if(j < (size_t)sc_->gapbar ||
					   j_from_end < (size_t)sc_->gapbar)
					{
						// Inside the gap barrier
						*gbarWords = 0x8000; // add this twice
					}
				}
				if(refc == 0 && j == len-1) {
					// Remember which 128-bit word and which smaller word has
					// the final row
					d.lastIter_ = i;
					d.lastWord_ = k;
				}
				// Track the largest penalty/bonus seen anywhere in the profile
				if(sc < 0) {
					if((size_t)(-sc) > d.maxPen_) {
						d.maxPen_ = (size_t)(-sc);
					}
				} else {
					if((size_t)sc > d.maxBonus_) {
						d.maxBonus_ = (size_t)sc;
					}
				}
				*qprofWords = (int16_t)sc;
				gbarWords++;
				qprofWords++;
				j += seglen; // update offset into query
			}
		}
	}
}
+
+#ifndef NDEBUG
+/**
+ * Return true iff the cell has sane E/F/H values w/r/t its predecessors.
+ */
+static bool cellOkEnd2EndI16(
+	SSEData& d,
+	size_t row,
+	size_t col,
+	int refc,
+	int readc,
+	int readq,
+	const Scoring& sc)     // scoring scheme
+{
+	TCScore floorsc = 0x8000;
+	TCScore ceilsc = MAX_I64;
+	TAlScore offsetsc = -0x7fff;
+	TAlScore sc_h_cur = (TAlScore)d.mat_.helt(row, col);
+	TAlScore sc_e_cur = (TAlScore)d.mat_.eelt(row, col);
+	TAlScore sc_f_cur = (TAlScore)d.mat_.felt(row, col);
+	if(sc_h_cur > floorsc) {
+		sc_h_cur += offsetsc;
+	}
+	if(sc_e_cur > floorsc) {
+		sc_e_cur += offsetsc;
+	}
+	if(sc_f_cur > floorsc) {
+		sc_f_cur += offsetsc;
+	}
+	bool gapsAllowed = true;
+	size_t rowFromEnd = d.mat_.nrow() - row - 1;
+	if(row < (size_t)sc.gapbar || rowFromEnd < (size_t)sc.gapbar) {
+		gapsAllowed = false;
+	}
+	bool e_left_trans = false, h_left_trans = false;
+	bool f_up_trans   = false, h_up_trans = false;
+	bool h_diag_trans = false;
+	if(gapsAllowed) {
+		TAlScore sc_h_left = floorsc;
+		TAlScore sc_e_left = floorsc;
+		TAlScore sc_h_up   = floorsc;
+		TAlScore sc_f_up   = floorsc;
+		if(col > 0 && sc_e_cur > floorsc && sc_e_cur <= ceilsc) {
+			sc_h_left = d.mat_.helt(row, col-1) + offsetsc;
+			sc_e_left = d.mat_.eelt(row, col-1) + offsetsc;
+			e_left_trans = (sc_e_left > floorsc && sc_e_cur == sc_e_left - sc.readGapExtend());
+			h_left_trans = (sc_h_left > floorsc && sc_e_cur == sc_h_left - sc.readGapOpen());
+			assert(e_left_trans || h_left_trans);
+		}
+		if(row > 0 && sc_f_cur > floorsc && sc_f_cur <= ceilsc) {
+			sc_h_up = d.mat_.helt(row-1, col) + offsetsc;
+			sc_f_up = d.mat_.felt(row-1, col) + offsetsc;
+			f_up_trans = (sc_f_up > floorsc && sc_f_cur == sc_f_up - sc.refGapExtend());
+			h_up_trans = (sc_h_up > floorsc && sc_f_cur == sc_h_up - sc.refGapOpen());
+			assert(f_up_trans || h_up_trans);
+		}
+	} else {
+		assert_geq(floorsc, sc_e_cur);
+		assert_geq(floorsc, sc_f_cur);
+	}
+	if(col > 0 && row > 0 && sc_h_cur > floorsc && sc_h_cur <= ceilsc) {
+		TAlScore sc_h_upleft = d.mat_.helt(row-1, col-1) + offsetsc;
+		TAlScore sc_diag = sc.score(readc, (int)refc, readq - 33);
+		h_diag_trans = sc_h_cur == sc_h_upleft + sc_diag;
+	}
+	assert(
+		sc_h_cur <= floorsc ||
+		e_left_trans ||
+		h_left_trans ||
+		f_up_trans   ||
+		h_up_trans   ||
+		h_diag_trans ||
+		sc_h_cur > ceilsc ||
+		row == 0 ||
+		col == 0);
+	return true;
+}
+#endif /*ndef NDEBUG*/
+
#ifdef NDEBUG

// Release build: the vector-assert macros compile away to nothing.
// BUGFIX(consistency): assert_all_leq was previously defined only in the
// debug branch below; define an empty version here too so code that uses
// it still compiles with NDEBUG.
#define assert_all_eq0(x)
#define assert_all_gt(x, y)
#define assert_all_gt_lo(x)
#define assert_all_lt(x, y)
#define assert_all_leq(x, y)
#define assert_all_lt_hi(x)

#else

// Assert that all eight signed 16-bit elements of x equal zero.
#define assert_all_eq0(x) { \
	__m128i z = _mm_setzero_si128(); \
	__m128i tmp = _mm_setzero_si128(); \
	z = _mm_xor_si128(z, z); \
	tmp = _mm_cmpeq_epi16(x, z); \
	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
}

// Assert x[i] > y[i] for all eight 16-bit elements.
#define assert_all_gt(x, y) { \
	__m128i tmp = _mm_cmpgt_epi16(x, y); \
	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
}

// Assert all eight 16-bit elements of x are greater than zero.
#define assert_all_gt_lo(x) { \
	__m128i z = _mm_setzero_si128(); \
	__m128i tmp = _mm_setzero_si128(); \
	z = _mm_xor_si128(z, z); \
	tmp = _mm_cmpgt_epi16(x, z); \
	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
}

// Assert x[i] < y[i] for all eight 16-bit elements.
#define assert_all_lt(x, y) { \
	__m128i tmp = _mm_cmplt_epi16(x, y); \
	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
}

// Assert x[i] <= y[i] for all eight 16-bit elements.
#define assert_all_leq(x, y) { \
	__m128i tmp = _mm_cmpgt_epi16(x, y); \
	assert_eq(0x0000, _mm_movemask_epi8(tmp)); \
}

// Assert all eight 16-bit elements of x are < 0x7fff (signed 16-bit max).
#define assert_all_lt_hi(x) { \
	__m128i z = _mm_setzero_si128(); \
	__m128i tmp = _mm_setzero_si128(); \
	z = _mm_cmpeq_epi16(z, z); \
	z = _mm_srli_epi16(z, 1); \
	tmp = _mm_cmplt_epi16(x, z); \
	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
}
#endif
+
+/**
+ * Aligns by filling a dynamic programming matrix with the SSE-accelerated,
+ * banded DP approach of Farrar.  As it goes, it determines which cells we
+ * might backtrace from and tallies the best (highest-scoring) N backtrace
+ * candidate cells per diagonal.  Also returns the alignment score of the best
+ * alignment in the matrix.
+ *
+ * This routine does *not* maintain a matrix holding the entire matrix worth of
+ * scores, nor does it maintain any other dense O(mn) data structure, as this
+ * would quickly exhaust memory for queries longer than about 10,000 kb.
+ * Instead, in the fill stage it maintains two columns worth of scores at a
+ * time (current/previous, or right/left) - these take O(m) space.  When
+ * finished with the current column, it determines which cells from the
+ * previous column, if any, are candidates we might backtrace from to find a
+ * full alignment.  A candidate cell has a score that rises above the threshold
+ * and isn't improved upon by a match in the next column.  The best N
+ * candidates per diagonal are stored in a O(m + n) data structure.
+ */
+TAlScore SwAligner::alignGatherEE16(int& flag, bool debug) {
+	assert_leq(rdf_, rd_->length());
+	assert_leq(rdf_, qu_->length());
+	assert_lt(rfi_, rff_);
+	assert_lt(rdi_, rdf_);
+	assert_eq(rd_->length(), qu_->length());
+	assert_geq(sc_->gapbar, 1);
+	assert(repOk());
+#ifndef NDEBUG
+	for(size_t i = (size_t)rfi_; i < (size_t)rff_; i++) {
+		assert_range(0, 16, (int)rf_[i]);
+	}
+#endif
+
+	SSEData& d = fw_ ? sseI16fw_ : sseI16rc_;
+	SSEMetrics& met = extend_ ? sseI16ExtendMet_ : sseI16MateMet_;
+	if(!debug) met.dp++;
+	buildQueryProfileEnd2EndSseI16(fw_);
+	assert(!d.profbuf_.empty());
+
+	assert_eq(0, d.maxBonus_);
+	size_t iter =
+		(dpRows() + (NWORDS_PER_REG-1)) / NWORDS_PER_REG; // iter = segLen
+	
+	// Now set up the score vectors.  We just need two columns worth, which
+	// we'll call "left" and "right".
+	d.vecbuf_.resize(4 * 2 * iter);
+	d.vecbuf_.zero();
+	__m128i *vbuf_l = d.vecbuf_.ptr();
+	__m128i *vbuf_r = d.vecbuf_.ptr() + (4 * iter);
+	
+	// This is the data structure that holds candidate cells per diagonal.
+	const size_t ndiags = rff_ - rfi_ + dpRows() - 1;
+	if(!debug) {
+		btdiag_.init(ndiags, 2);
+	}
+
+	// Data structure that holds checkpointed anti-diagonals
+	TAlScore perfectScore = sc_->perfectScore(dpRows());
+	bool checkpoint = true;
+	bool cpdebug = false;
+#ifndef NDEBUG
+	cpdebug = dpRows() < 1000;
+#endif
+	cper_.init(
+		dpRows(),      // # rows
+		rff_ - rfi_,   // # columns
+		cperPerPow2_,  // checkpoint every 1 << perpow2 diags (& next)
+		perfectScore,  // perfect score (for sanity checks)
+		false,         // matrix cells have 8-bit scores?
+		cperTri_,      // triangular mini-fills?
+		false,         // alignment is local?
+		cpdebug);      // save all cells for debugging?
+		
+	// Many thanks to Michael Farrar for releasing his striped Smith-Waterman
+	// implementation:
+	//
+	//  http://sites.google.com/site/farrarmichael/smith-waterman
+	//
	// Much of the implementation below is adapted from Michael's code.
+
+	// Set all elts to reference gap open penalty
+	__m128i rfgapo   = _mm_setzero_si128();
+	__m128i rfgape   = _mm_setzero_si128();
+	__m128i rdgapo   = _mm_setzero_si128();
+	__m128i rdgape   = _mm_setzero_si128();
+	__m128i vlo      = _mm_setzero_si128();
+	__m128i vhi      = _mm_setzero_si128();
+	__m128i vhilsw   = _mm_setzero_si128();
+	__m128i vlolsw   = _mm_setzero_si128();
+	__m128i ve       = _mm_setzero_si128();
+	__m128i vf       = _mm_setzero_si128();
+	__m128i vh       = _mm_setzero_si128();
+	__m128i vhd      = _mm_setzero_si128();
+	__m128i vhdtmp   = _mm_setzero_si128();
+	__m128i vtmp     = _mm_setzero_si128();
+
+	assert_gt(sc_->refGapOpen(), 0);
+	assert_leq(sc_->refGapOpen(), MAX_I16);
+	rfgapo = _mm_insert_epi16(rfgapo, sc_->refGapOpen(), 0);
+	rfgapo = _mm_shufflelo_epi16(rfgapo, 0);
+	rfgapo = _mm_shuffle_epi32(rfgapo, 0);
+	
+	// Set all elts to reference gap extension penalty
+	assert_gt(sc_->refGapExtend(), 0);
+	assert_leq(sc_->refGapExtend(), MAX_I16);
+	assert_leq(sc_->refGapExtend(), sc_->refGapOpen());
+	rfgape = _mm_insert_epi16(rfgape, sc_->refGapExtend(), 0);
+	rfgape = _mm_shufflelo_epi16(rfgape, 0);
+	rfgape = _mm_shuffle_epi32(rfgape, 0);
+
+	// Set all elts to read gap open penalty
+	assert_gt(sc_->readGapOpen(), 0);
+	assert_leq(sc_->readGapOpen(), MAX_I16);
+	rdgapo = _mm_insert_epi16(rdgapo, sc_->readGapOpen(), 0);
+	rdgapo = _mm_shufflelo_epi16(rdgapo, 0);
+	rdgapo = _mm_shuffle_epi32(rdgapo, 0);
+	
+	// Set all elts to read gap extension penalty
+	assert_gt(sc_->readGapExtend(), 0);
+	assert_leq(sc_->readGapExtend(), MAX_I16);
+	assert_leq(sc_->readGapExtend(), sc_->readGapOpen());
+	rdgape = _mm_insert_epi16(rdgape, sc_->readGapExtend(), 0);
+	rdgape = _mm_shufflelo_epi16(rdgape, 0);
+	rdgape = _mm_shuffle_epi32(rdgape, 0);
+
+	// Set all elts to 0x8000 (min value for signed 16-bit)
+	vlo = _mm_cmpeq_epi16(vlo, vlo);             // all elts = 0xffff
+	vlo = _mm_slli_epi16(vlo, NBITS_PER_WORD-1); // all elts = 0x8000
+	
+	// Set all elts to 0x7fff (max value for signed 16-bit)
+	vhi = _mm_cmpeq_epi16(vhi, vhi);             // all elts = 0xffff
+	vhi = _mm_srli_epi16(vhi, 1);                // all elts = 0x7fff
+	
+	// vlolsw: topmost (least sig) word set to 0x8000, all other words=0
+	vlolsw = _mm_shuffle_epi32(vlo, 0);
+	vlolsw = _mm_srli_si128(vlolsw, NBYTES_PER_REG - NBYTES_PER_WORD);
+	
+	// vhilsw: topmost (least sig) word set to 0x7fff, all other words=0
+	vhilsw = _mm_shuffle_epi32(vhi, 0);
+	vhilsw = _mm_srli_si128(vhilsw, NBYTES_PER_REG - NBYTES_PER_WORD);
+	
+	// Points to a long vector of __m128i where each element is a block of
+	// contiguous cells in the E, F or H matrix.  If the index % 3 == 0, then
+	// the block of cells is from the E matrix.  If index % 3 == 1, they're
+	// from the F matrix.  If index % 3 == 2, then they're from the H matrix.
+	// Blocks of cells are organized in the same interleaved manner as they are
+	// calculated by the Farrar algorithm.
+	const __m128i *pvScore; // points into the query profile
+
+	const size_t colstride = ROWSTRIDE_2COL * iter;
+	
+	// Initialize the H and E vectors in the first matrix column
+	__m128i *pvELeft = vbuf_l + 0; __m128i *pvERight = vbuf_r + 0;
+	/* __m128i *pvFLeft = vbuf_l + 1; */ __m128i *pvFRight = vbuf_r + 1;
+	__m128i *pvHLeft = vbuf_l + 2; __m128i *pvHRight = vbuf_r + 2;
+	
+	// Maximum score in final row
+	bool found = false;
+	TCScore lrmax = MIN_I16;
+	
+	for(size_t i = 0; i < iter; i++) {
+		_mm_store_si128(pvERight, vlo); pvERight += ROWSTRIDE_2COL;
+		// Could initialize Hs to high or low.  If high, cells in the lower
		// triangle will have somewhat more legitimate scores, but still won't
+		// be exhaustively scored.
+		_mm_store_si128(pvHRight, vlo); pvHRight += ROWSTRIDE_2COL;
+	}
+	
+	assert_gt(sc_->gapbar, 0);
+	size_t nfixup = 0;
+
+	// Fill in the table as usual but instead of using the same gap-penalty
+	// vector for each iteration of the inner loop, load words out of a
+	// pre-calculated gap vector parallel to the query profile.  The pre-
+	// calculated gap vectors enforce the gap barrier constraint by making it
+	// infinitely costly to introduce a gap in barrier rows.
+	//
+	// AND use a separate loop to fill in the first row of the table, enforcing
+	// the st_ constraints in the process.  This is awkward because it
+	// separates the processing of the first row from the others and might make
+	// it difficult to use the first-row results in the next row, but it might
+	// be the simplest and least disruptive way to deal with the st_ constraint.
+	
+	for(size_t i = (size_t)rfi_; i < (size_t)rff_; i++) {
+		// Swap left and right; vbuf_l is the vector on the left, which we
+		// generally load from, and vbuf_r is the vector on the right, which we
+		// generally store to.
+		swap(vbuf_l, vbuf_r);
+		pvELeft = vbuf_l + 0; pvERight = vbuf_r + 0;
+		/* pvFLeft = vbuf_l + 1; */ pvFRight = vbuf_r + 1;
+		pvHLeft = vbuf_l + 2; pvHRight = vbuf_r + 2;
+		
+		// Fetch the appropriate query profile.  Note that elements of rf_ must
+		// be numbers, not masks.
+		const int refc = (int)rf_[i];
+		
+		// Fetch the appropriate query profile
+		size_t off = (size_t)firsts5[refc] * iter * 2;
+		pvScore = d.profbuf_.ptr() + off; // even elts = query profile, odd = gap barrier
+		
+		// Set all cells to low value
+		vf = _mm_cmpeq_epi16(vf, vf);
+		vf = _mm_slli_epi16(vf, NBITS_PER_WORD-1);
+		vf = _mm_or_si128(vf, vlolsw);
+		
+		// Load H vector from the final row of the previous column
+		vh = _mm_load_si128(pvHLeft + colstride - ROWSTRIDE_2COL);
+		// Shift 2 bytes down so that topmost (least sig) cell gets 0
+		vh = _mm_slli_si128(vh, NBYTES_PER_WORD);
+		// Fill topmost (least sig) cell with high value
+		vh = _mm_or_si128(vh, vhilsw);
+		
+		// For each character in the reference text:
+		size_t j;
+		for(j = 0; j < iter; j++) {
+			// Load cells from E, calculated previously
+			ve = _mm_load_si128(pvELeft);
+			vhd = _mm_load_si128(pvHLeft);
+			assert_all_lt(ve, vhi);
+			pvELeft += ROWSTRIDE_2COL;
+			
+			// Store cells in F, calculated previously
+			vf = _mm_adds_epi16(vf, pvScore[1]); // veto some ref gap extensions
+			vf = _mm_adds_epi16(vf, pvScore[1]); // veto some ref gap extensions
+			_mm_store_si128(pvFRight, vf);
+			pvFRight += ROWSTRIDE_2COL;
+			
+			// Factor in query profile (matches and mismatches)
+			vh = _mm_adds_epi16(vh, pvScore[0]);
+			
+			// Update H, factoring in E and F
+			vh = _mm_max_epi16(vh, vf);
+			
+			// Update vE value
+			vhdtmp = vhd;
+			vhd = _mm_subs_epi16(vhd, rdgapo);
+			vhd = _mm_adds_epi16(vhd, pvScore[1]); // veto some read gap opens
+			vhd = _mm_adds_epi16(vhd, pvScore[1]); // veto some read gap opens
+			ve = _mm_subs_epi16(ve, rdgape);
+			ve = _mm_max_epi16(ve, vhd);
+			vh = _mm_max_epi16(vh, ve);
+
+			// Save the new vH values
+			_mm_store_si128(pvHRight, vh);
+			pvHRight += ROWSTRIDE_2COL;
+			vtmp = vh;
+			assert_all_lt(ve, vhi);
+			
+			// Load the next h value
+			vh = vhdtmp;
+			pvHLeft += ROWSTRIDE_2COL;
+
+			// Save E values
+			_mm_store_si128(pvERight, ve);
+			pvERight += ROWSTRIDE_2COL;
+			
+			// Update vf value
+			vtmp = _mm_subs_epi16(vtmp, rfgapo);
+			vf = _mm_subs_epi16(vf, rfgape);
+			assert_all_lt(vf, vhi);
+			vf = _mm_max_epi16(vf, vtmp);
+			
+			pvScore += 2; // move on to next query profile / gap veto
+		}
+		// pvHStore, pvELoad, pvEStore have all rolled over to the next column
+		pvFRight -= colstride; // reset to start of column
+		vtmp = _mm_load_si128(pvFRight);
+		
+		pvHRight -= colstride; // reset to start of column
+		vh = _mm_load_si128(pvHRight);
+		
+		pvScore = d.profbuf_.ptr() + off + 1; // reset veto vector
+		
+		// vf from last row gets shifted down by one to overlay the first row
+		// rfgape has already been subtracted from it.
+		vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
+		vf = _mm_or_si128(vf, vlolsw);
+		
+		vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
+		vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
+		vf = _mm_max_epi16(vtmp, vf);
+		vtmp = _mm_cmpgt_epi16(vf, vtmp);
+		int cmp = _mm_movemask_epi8(vtmp);
+		
+		// If any element of vtmp is greater than H - gap-open...
+		j = 0;
+		while(cmp != 0x0000) {
+			// Store this vf
+			_mm_store_si128(pvFRight, vf);
+			pvFRight += ROWSTRIDE_2COL;
+			
+			// Update vh w/r/t new vf
+			vh = _mm_max_epi16(vh, vf);
+			
+			// Save vH values
+			_mm_store_si128(pvHRight, vh);
+			pvHRight += ROWSTRIDE_2COL;
+			
+			pvScore += 2;
+			
+			assert_lt(j, iter);
+			if(++j == iter) {
+				pvFRight -= colstride;
+				vtmp = _mm_load_si128(pvFRight);   // load next vf ASAP
+				pvHRight -= colstride;
+				vh = _mm_load_si128(pvHRight);     // load next vh ASAP
+				pvScore = d.profbuf_.ptr() + off + 1;
+				j = 0;
+				vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
+				vf = _mm_or_si128(vf, vlolsw);
+			} else {
+				vtmp = _mm_load_si128(pvFRight);   // load next vf ASAP
+				vh = _mm_load_si128(pvHRight);     // load next vh ASAP
+			}
+			
+			// Update F with another gap extension
+			vf = _mm_subs_epi16(vf, rfgape);
+			vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
+			vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
+			vf = _mm_max_epi16(vtmp, vf);
+			vtmp = _mm_cmpgt_epi16(vf, vtmp);
+			cmp = _mm_movemask_epi8(vtmp);
+			nfixup++;
+		}
+
+		
+		// Check in the last row for the maximum so far
+		__m128i *vtmp = vbuf_r + 2 /* H */ + (d.lastIter_ * ROWSTRIDE_2COL);
+		// Note: we may not want to extract from the final row
+		TCScore lr = ((TCScore*)(vtmp))[d.lastWord_];
+		found = true;
+		if(lr > lrmax) {
+			lrmax = lr;
+		}
+		
+		// Now we'd like to know whether the bottommost element of the right
+		// column is a candidate we might backtrace from.  First question is:
+		// did it exceed the minimum score threshold?
+		TAlScore score = (TAlScore)(lr - 0x7fff);
+		if(lr == MIN_I16) {
+			score = MIN_I64;
+		}
+		if(!debug && score >= minsc_) {
+			DpBtCandidate cand(dpRows() - 1, i - rfi_, score);
+			btdiag_.add(i - rfi_, cand);
+		}
+
+		// Save some elements to checkpoints
+		if(checkpoint) {
+			
+			__m128i *pvE = vbuf_r + 0;
+			__m128i *pvF = vbuf_r + 1;
+			__m128i *pvH = vbuf_r + 2;
+			size_t coli = i - rfi_;
+			if(coli < cper_.locol_) cper_.locol_ = coli;
+			if(coli > cper_.hicol_) cper_.hicol_ = coli;
+			
+			if(cperTri_) {
+				size_t rc_mod = coli & cper_.lomask_;
+				assert_lt(rc_mod, cper_.per_);
+				int64_t row = -rc_mod-1;
+				int64_t row_mod = row;
+				int64_t row_div = 0;
+				size_t idx = coli >> cper_.perpow2_;
+				size_t idxrow = idx * cper_.nrow_;
+				assert_eq(4, ROWSTRIDE_2COL);
+				bool done = false;
+				while(true) {
+					row += (cper_.per_ - 2);
+					row_mod += (cper_.per_ - 2);
+					for(size_t j = 0; j < 2; j++) {
+						row++;
+						row_mod++;
+						if(row >= 0 && (size_t)row < cper_.nrow_) {
+							// Update row divided by iter_ and mod iter_
+							while(row_mod >= (int64_t)iter) {
+								row_mod -= (int64_t)iter;
+								row_div++;
+							}
+							size_t delt = idxrow + row;
+							size_t vecoff = (row_mod << 5) + row_div;
+							assert_lt(row_div, 8);
+							int16_t h_sc = ((int16_t*)pvH)[vecoff];
+							int16_t e_sc = ((int16_t*)pvE)[vecoff];
+							int16_t f_sc = ((int16_t*)pvF)[vecoff];
+							if(h_sc != MIN_I16) h_sc -= 0x7fff;
+							if(e_sc != MIN_I16) e_sc -= 0x7fff;
+							if(f_sc != MIN_I16) f_sc -= 0x7fff;
+							assert_leq(h_sc, cper_.perf_);
+							assert_leq(e_sc, cper_.perf_);
+							assert_leq(f_sc, cper_.perf_);
+							CpQuad *qdiags = ((j == 0) ? cper_.qdiag1s_.ptr() : cper_.qdiag2s_.ptr());
+							qdiags[delt].sc[0] = h_sc;
+							qdiags[delt].sc[1] = e_sc;
+							qdiags[delt].sc[2] = f_sc;
+						} // if(row >= 0 && row < nrow_)
+						else if(row >= 0 && (size_t)row >= cper_.nrow_) {
+							done = true;
+							break;
+						}
+					} // end of loop over anti-diags
+					if(done) {
+						break;
+					}
+					idx++;
+					idxrow += cper_.nrow_;
+				}
+			} else {
+				// If this is the first column, take this opportunity to
+				// pre-calculate the coordinates of the elements we're going to
+				// checkpoint.
+				if(coli == 0) {
+					size_t cpi    = cper_.per_-1;
+					size_t cpimod = cper_.per_-1;
+					size_t cpidiv = 0;
+					cper_.commitMap_.clear();
+					while(cpi < cper_.nrow_) {
+						while(cpimod >= iter) {
+							cpimod -= iter;
+							cpidiv++;
+						}
+						size_t vecoff = (cpimod << 5) + cpidiv;
+						cper_.commitMap_.push_back(vecoff);
+						cpi += cper_.per_;
+						cpimod += cper_.per_;
+					}
+				}
+				// Save all the rows
+				size_t rowoff = 0;
+				size_t sz = cper_.commitMap_.size();
+				for(size_t i = 0; i < sz; i++, rowoff += cper_.ncol_) {
+					size_t vecoff = cper_.commitMap_[i];
+					int16_t h_sc = ((int16_t*)pvH)[vecoff];
+					int16_t e_sc = ((int16_t*)pvE)[vecoff];
+					int16_t f_sc = ((int16_t*)pvF)[vecoff];
+					if(h_sc != MIN_I16) h_sc -= 0x7fff;
+					if(e_sc != MIN_I16) e_sc -= 0x7fff;
+					if(f_sc != MIN_I16) f_sc -= 0x7fff;
+					assert_leq(h_sc, cper_.perf_);
+					assert_leq(e_sc, cper_.perf_);
+					assert_leq(f_sc, cper_.perf_);
+					CpQuad& dst = cper_.qrows_[rowoff + coli];
+					dst.sc[0] = h_sc;
+					dst.sc[1] = e_sc;
+					dst.sc[2] = f_sc;
+				}
+				// Is this a column we'd like to checkpoint?
+				if((coli & cper_.lomask_) == cper_.lomask_) {
+					// Save the column using memcpys
+					assert_gt(coli, 0);
+					size_t wordspercol = cper_.niter_ * ROWSTRIDE_2COL;
+					size_t coloff = (coli >> cper_.perpow2_) * wordspercol;
+					__m128i *dst = cper_.qcols_.ptr() + coloff;
+					memcpy(dst, vbuf_r, sizeof(__m128i) * wordspercol);
+				}
+			}
+			if(cper_.debug_) {
+				// Save the column using memcpys
+				size_t wordspercol = cper_.niter_ * ROWSTRIDE_2COL;
+				size_t coloff = coli * wordspercol;
+				__m128i *dst = cper_.qcolsD_.ptr() + coloff;
+				memcpy(dst, vbuf_r, sizeof(__m128i) * wordspercol);
+			}
+		}
+	}
+	
+	// Update metrics
+	if(!debug) {
+		size_t ninner = (rff_ - rfi_) * iter;
+		met.col   += (rff_ - rfi_);             // DP columns
+		met.cell  += (ninner * NWORDS_PER_REG); // DP cells
+		met.inner += ninner;                    // DP inner loop iters
+		met.fixup += nfixup;                    // DP fixup loop iters
+	}
+
+	flag = 0;
+
+	// Did we find a solution?
+	TAlScore score = MIN_I64;
+	if(!found) {
+		flag = -1; // no
+		if(!debug) met.dpfail++;
+		return MIN_I64;
+	} else {
+		score = (TAlScore)(lrmax - 0x7fff);
+		if(score < minsc_) {
+			flag = -1; // no
+			if(!debug) met.dpfail++;
+			return score;
+		}
+	}
+	
+	// Could we have saturated?
+	if(lrmax == MIN_I16) {
+		flag = -2; // yes
+		if(!debug) met.dpsat++;
+		return MIN_I64;
+	}
+	
+	// Now take all the backtrace candidates in the btdaig_ structure and
+	// dump them into the btncand_ array.  They'll be sorted later.
+	if(!debug) {
+		btdiag_.dump(btncand_);
+		assert(!btncand_.empty());
+	}
+	
+	// Return largest score
+	if(!debug) met.dpsucc++;
+	return score;
+}
+
+/**
+ * Solve the current alignment problem using SSE instructions that operate on 8
+ * signed 16-bit values packed into a single 128-bit register.
+ *
+ * End-to-end DP fill in the style of Farrar's striped Smith-Waterman: the
+ * read is split into NWORDS_PER_REG interleaved segments of length 'iter'
+ * (the segment length), and the E/F/H vectors of a column are laid out with
+ * stride ROWSTRIDE.  Cell scores are stored as signed 16-bit values offset
+ * by 0x7fff (see the 'lrmax - 0x7fff' conversion below), with MIN_I16
+ * (0x8000) acting as negative infinity.
+ *
+ * On return, 'flag' is 0 on success, -1 if no cell in the final row reached
+ * the minimum score minsc_, or -2 if scores may have saturated at MIN_I16.
+ * Returns the best score observed in the final row with the 0x7fff offset
+ * removed, or MIN_I64 on failure/saturation.  When 'debug' is true, the
+ * SSEMetrics counters are not updated.
+ */
+TAlScore SwAligner::alignNucleotidesEnd2EndSseI16(int& flag, bool debug) {
+	assert_leq(rdf_, rd_->length());
+	assert_leq(rdf_, qu_->length());
+	assert_lt(rfi_, rff_);
+	assert_lt(rdi_, rdf_);
+	assert_eq(rd_->length(), qu_->length());
+	assert_geq(sc_->gapbar, 1);
+	assert(repOk());
+#ifndef NDEBUG
+	for(size_t i = (size_t)rfi_; i < (size_t)rff_; i++) {
+		assert_range(0, 16, (int)rf_[i]);
+	}
+#endif
+
+	SSEData& d = fw_ ? sseI16fw_ : sseI16rc_;
+	SSEMetrics& met = extend_ ? sseI16ExtendMet_ : sseI16MateMet_;
+	if(!debug) met.dp++;
+	buildQueryProfileEnd2EndSseI16(fw_);
+	assert(!d.profbuf_.empty());
+
+	assert_eq(0, d.maxBonus_);
+	size_t iter =
+		(dpRows() + (NWORDS_PER_REG-1)) / NWORDS_PER_REG; // iter = segLen
+
+	// Many thanks to Michael Farrar for releasing his striped Smith-Waterman
+	// implementation:
+	//
+	//  http://sites.google.com/site/farrarmichael/smith-waterman
+	//
+	// Much of the implementation below is adapted from Michael's code.
+
+	// Set all elts to reference gap open penalty
+	__m128i rfgapo   = _mm_setzero_si128();
+	__m128i rfgape   = _mm_setzero_si128();
+	__m128i rdgapo   = _mm_setzero_si128();
+	__m128i rdgape   = _mm_setzero_si128();
+	__m128i vlo      = _mm_setzero_si128();
+	__m128i vhi      = _mm_setzero_si128();
+	__m128i vhilsw   = _mm_setzero_si128();
+	__m128i vlolsw   = _mm_setzero_si128();
+	__m128i ve       = _mm_setzero_si128();
+	__m128i vf       = _mm_setzero_si128();
+	__m128i vh       = _mm_setzero_si128();
+#if 0
+	__m128i vhd      = _mm_setzero_si128();
+	__m128i vhdtmp   = _mm_setzero_si128();
+#endif
+	__m128i vtmp     = _mm_setzero_si128();
+
+	assert_gt(sc_->refGapOpen(), 0);
+	assert_leq(sc_->refGapOpen(), MAX_I16);
+	rfgapo = _mm_insert_epi16(rfgapo, sc_->refGapOpen(), 0);
+	rfgapo = _mm_shufflelo_epi16(rfgapo, 0);
+	rfgapo = _mm_shuffle_epi32(rfgapo, 0);
+	
+	// Set all elts to reference gap extension penalty
+	assert_gt(sc_->refGapExtend(), 0);
+	assert_leq(sc_->refGapExtend(), MAX_I16);
+	assert_leq(sc_->refGapExtend(), sc_->refGapOpen());
+	rfgape = _mm_insert_epi16(rfgape, sc_->refGapExtend(), 0);
+	rfgape = _mm_shufflelo_epi16(rfgape, 0);
+	rfgape = _mm_shuffle_epi32(rfgape, 0);
+
+	// Set all elts to read gap open penalty
+	assert_gt(sc_->readGapOpen(), 0);
+	assert_leq(sc_->readGapOpen(), MAX_I16);
+	rdgapo = _mm_insert_epi16(rdgapo, sc_->readGapOpen(), 0);
+	rdgapo = _mm_shufflelo_epi16(rdgapo, 0);
+	rdgapo = _mm_shuffle_epi32(rdgapo, 0);
+	
+	// Set all elts to read gap extension penalty
+	assert_gt(sc_->readGapExtend(), 0);
+	assert_leq(sc_->readGapExtend(), MAX_I16);
+	assert_leq(sc_->readGapExtend(), sc_->readGapOpen());
+	rdgape = _mm_insert_epi16(rdgape, sc_->readGapExtend(), 0);
+	rdgape = _mm_shufflelo_epi16(rdgape, 0);
+	rdgape = _mm_shuffle_epi32(rdgape, 0);
+
+	// Set all elts to 0x8000 (min value for signed 16-bit)
+	vlo = _mm_cmpeq_epi16(vlo, vlo);             // all elts = 0xffff
+	vlo = _mm_slli_epi16(vlo, NBITS_PER_WORD-1); // all elts = 0x8000
+	
+	// Set all elts to 0x7fff (max value for signed 16-bit)
+	vhi = _mm_cmpeq_epi16(vhi, vhi);             // all elts = 0xffff
+	vhi = _mm_srli_epi16(vhi, 1);                // all elts = 0x7fff
+	
+	// vlolsw: topmost (least sig) word set to 0x8000, all other words=0
+	vlolsw = _mm_shuffle_epi32(vlo, 0);
+	vlolsw = _mm_srli_si128(vlolsw, NBYTES_PER_REG - NBYTES_PER_WORD);
+	
+	// vhilsw: topmost (least sig) word set to 0x7fff, all other words=0
+	vhilsw = _mm_shuffle_epi32(vhi, 0);
+	vhilsw = _mm_srli_si128(vhilsw, NBYTES_PER_REG - NBYTES_PER_WORD);
+	
+	// Points to a long vector of __m128i where each element is a block of
+	// contiguous cells in the E, F or H matrix.  If the index % 3 == 0, then
+	// the block of cells is from the E matrix.  If index % 3 == 1, they're
+	// from the F matrix.  If index % 3 == 2, then they're from the H matrix.
+	// Blocks of cells are organized in the same interleaved manner as they are
+	// calculated by the Farrar algorithm.
+	const __m128i *pvScore; // points into the query profile
+
+	d.mat_.init(dpRows(), rff_ - rfi_, NWORDS_PER_REG);
+	const size_t colstride = d.mat_.colstride();
+	assert_eq(ROWSTRIDE, colstride / iter);
+	
+	// Initialize the H and E vectors in the first matrix column
+	__m128i *pvHTmp = d.mat_.tmpvec(0, 0);
+	__m128i *pvETmp = d.mat_.evec(0, 0);
+	
+	// Maximum score in final row
+	bool found = false;
+	TCScore lrmax = MIN_I16;
+	
+	for(size_t i = 0; i < iter; i++) {
+		_mm_store_si128(pvETmp, vlo);
+		// Could initialize Hs to high or low.  If high, cells in the lower
+		// triangle will have somewhat more legitimate scores, but still won't
+		// be exhaustively scored.
+		_mm_store_si128(pvHTmp, vlo);
+		pvETmp += ROWSTRIDE;
+		pvHTmp += ROWSTRIDE;
+	}
+	// These are swapped just before the innermost loop
+	__m128i *pvHStore = d.mat_.hvec(0, 0);
+	__m128i *pvHLoad  = d.mat_.tmpvec(0, 0);
+	__m128i *pvELoad  = d.mat_.evec(0, 0);
+	__m128i *pvEStore = d.mat_.evecUnsafe(0, 1);
+	__m128i *pvFStore = d.mat_.fvec(0, 0);
+	__m128i *pvFTmp   = NULL;
+	
+	assert_gt(sc_->gapbar, 0);
+	size_t nfixup = 0;
+	
+	// Fill in the table as usual but instead of using the same gap-penalty
+	// vector for each iteration of the inner loop, load words out of a
+	// pre-calculated gap vector parallel to the query profile.  The pre-
+	// calculated gap vectors enforce the gap barrier constraint by making it
+	// infinitely costly to introduce a gap in barrier rows.
+	//
+	// AND use a separate loop to fill in the first row of the table, enforcing
+	// the st_ constraints in the process.  This is awkward because it
+	// separates the processing of the first row from the others and might make
+	// it difficult to use the first-row results in the next row, but it might
+	// be the simplest and least disruptive way to deal with the st_ constraint.
+	
+	colstop_ = rff_ - 1;
+	lastsolcol_ = 0;
+	
+	for(size_t i = (size_t)rfi_; i < (size_t)rff_; i++) {
+		assert(pvFStore == d.mat_.fvec(0, i - rfi_));
+		assert(pvHStore == d.mat_.hvec(0, i - rfi_));
+		
+		// Fetch the appropriate query profile.  Note that elements of rf_ must
+		// be numbers, not masks.
+		const int refc = (int)rf_[i];
+		size_t off = (size_t)firsts5[refc] * iter * 2;
+		pvScore = d.profbuf_.ptr() + off; // even elts = query profile, odd = gap barrier
+		
+		// Set all cells to low value
+		vf = _mm_cmpeq_epi16(vf, vf);
+		vf = _mm_slli_epi16(vf, NBITS_PER_WORD-1);
+		vf = _mm_or_si128(vf, vlolsw);
+		
+		// Load H vector from the final row of the previous column
+		vh = _mm_load_si128(pvHLoad + colstride - ROWSTRIDE);
+		// Shift 2 bytes down so that topmost (least sig) cell gets 0
+		vh = _mm_slli_si128(vh, NBYTES_PER_WORD);
+		// Fill topmost (least sig) cell with high value
+		vh = _mm_or_si128(vh, vhilsw);
+		
+		// For each character in the reference text:
+		size_t j;
+		for(j = 0; j < iter; j++) {
+			// Load cells from E, calculated previously
+			ve = _mm_load_si128(pvELoad);
+#if 0
+			vhd = _mm_load_si128(pvHLoad);
+#endif
+			assert_all_lt(ve, vhi);
+			pvELoad += ROWSTRIDE;
+			
+			// Store cells in F, calculated previously.  The gap-barrier veto
+			// is added twice; NOTE(review): presumably doubled so the (large
+			// negative) barrier saturates the score at the 16-bit floor --
+			// confirm against how profbuf_'s odd elements are built.
+			vf = _mm_adds_epi16(vf, pvScore[1]); // veto some ref gap extensions
+			vf = _mm_adds_epi16(vf, pvScore[1]); // veto some ref gap extensions
+			_mm_store_si128(pvFStore, vf);
+			pvFStore += ROWSTRIDE;
+			
+			// Factor in query profile (matches and mismatches)
+			vh = _mm_adds_epi16(vh, pvScore[0]);
+			
+			// Update H, factoring in E and F
+			vh = _mm_max_epi16(vh, ve);
+			vh = _mm_max_epi16(vh, vf);
+			
+			// Save the new vH values
+			_mm_store_si128(pvHStore, vh);
+			pvHStore += ROWSTRIDE;
+			
+			// Update vE value
+			vtmp = vh;
+#if 0
+			vhdtmp = vhd;
+			vhd = _mm_subs_epi16(vhd, rdgapo);
+			vhd = _mm_adds_epi16(vhd, pvScore[1]); // veto some read gap opens
+			vhd = _mm_adds_epi16(vhd, pvScore[1]); // veto some read gap opens
+			ve = _mm_subs_epi16(ve, rdgape);
+			ve = _mm_max_epi16(ve, vhd);
+#else
+			vh = _mm_subs_epi16(vh, rdgapo);
+			vh = _mm_adds_epi16(vh, pvScore[1]); // veto some read gap opens
+			vh = _mm_adds_epi16(vh, pvScore[1]); // veto some read gap opens
+			ve = _mm_subs_epi16(ve, rdgape);
+			ve = _mm_max_epi16(ve, vh);
+#endif
+			assert_all_lt(ve, vhi);
+			
+			// Load the next h value
+#if 0
+			vh = vhdtmp;
+#else
+			vh = _mm_load_si128(pvHLoad);
+#endif
+			pvHLoad += ROWSTRIDE;
+			
+			// Save E values
+			_mm_store_si128(pvEStore, ve);
+			pvEStore += ROWSTRIDE;
+			
+			// Update vf value
+			vtmp = _mm_subs_epi16(vtmp, rfgapo);
+			vf = _mm_subs_epi16(vf, rfgape);
+			assert_all_lt(vf, vhi);
+			vf = _mm_max_epi16(vf, vtmp);
+			
+			pvScore += 2; // move on to next query profile / gap veto
+		}
+		// pvHStore, pvELoad, pvEStore have all rolled over to the next column
+		pvFTmp = pvFStore;
+		pvFStore -= colstride; // reset to start of column
+		vtmp = _mm_load_si128(pvFStore);
+		
+		pvHStore -= colstride; // reset to start of column
+		vh = _mm_load_si128(pvHStore);
+		
+#if 0
+#else
+		pvEStore -= colstride; // reset to start of column
+		ve = _mm_load_si128(pvEStore);
+#endif
+		
+		pvHLoad = pvHStore;    // new pvHLoad = pvHStore
+		pvScore = d.profbuf_.ptr() + off + 1; // reset veto vector
+		
+		// vf from last row gets shifted down by one to overlay the first row
+		// rfgape has already been subtracted from it.
+		vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
+		vf = _mm_or_si128(vf, vlolsw);
+		
+		vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
+		vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
+		vf = _mm_max_epi16(vtmp, vf);
+		vtmp = _mm_cmpgt_epi16(vf, vtmp);
+		int cmp = _mm_movemask_epi8(vtmp);
+		
+		// If any element of vtmp is greater than H - gap-open...
+		// (Farrar's "lazy F" fixup loop: keep re-sweeping the column while the
+		// shifted F values can still improve any cell.)
+		j = 0;
+		while(cmp != 0x0000) {
+			// Store this vf
+			_mm_store_si128(pvFStore, vf);
+			pvFStore += ROWSTRIDE;
+			
+			// Update vh w/r/t new vf
+			vh = _mm_max_epi16(vh, vf);
+			
+			// Save vH values
+			_mm_store_si128(pvHStore, vh);
+			pvHStore += ROWSTRIDE;
+			
+			// Update E in case it can be improved using our new vh
+#if 0
+#else
+			vh = _mm_subs_epi16(vh, rdgapo);
+			vh = _mm_adds_epi16(vh, *pvScore); // veto some read gap opens
+			vh = _mm_adds_epi16(vh, *pvScore); // veto some read gap opens
+			ve = _mm_max_epi16(ve, vh);
+			_mm_store_si128(pvEStore, ve);
+			pvEStore += ROWSTRIDE;
+#endif
+			pvScore += 2;
+			
+			assert_lt(j, iter);
+			if(++j == iter) {
+				pvFStore -= colstride;
+				vtmp = _mm_load_si128(pvFStore);   // load next vf ASAP
+				pvHStore -= colstride;
+				vh = _mm_load_si128(pvHStore);     // load next vh ASAP
+#if 0
+#else
+				pvEStore -= colstride;
+				ve = _mm_load_si128(pvEStore);     // load next ve ASAP
+#endif
+				pvScore = d.profbuf_.ptr() + off + 1;
+				j = 0;
+				vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
+				vf = _mm_or_si128(vf, vlolsw);
+			} else {
+				vtmp = _mm_load_si128(pvFStore);   // load next vf ASAP
+				vh = _mm_load_si128(pvHStore);     // load next vh ASAP
+#if 0
+#else
+				ve = _mm_load_si128(pvEStore);     // load next ve ASAP
+#endif
+			}
+			
+			// Update F with another gap extension
+			vf = _mm_subs_epi16(vf, rfgape);
+			vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
+			vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
+			vf = _mm_max_epi16(vtmp, vf);
+			vtmp = _mm_cmpgt_epi16(vf, vtmp);
+			cmp = _mm_movemask_epi8(vtmp);
+			nfixup++;
+		}
+
+#ifndef NDEBUG
+		if((rand() & 15) == 0) {
+			// This is a work-intensive sanity check; each time we finish filling
+			// a column, we check that each H, E, and F is sensible.
+			for(size_t k = 0; k < dpRows(); k++) {
+				assert(cellOkEnd2EndI16(
+					d,
+					k,                   // row
+					i - rfi_,            // col
+					refc,                // reference mask
+					(int)(*rd_)[rdi_+k], // read char
+					(int)(*qu_)[rdi_+k], // read quality
+					*sc_));              // scoring scheme
+			}
+		}
+#endif
+		
+		// Track the best score in the final row (shadows the __m128i 'vtmp'
+		// register variable declared above)
+		__m128i *vtmp = d.mat_.hvec(d.lastIter_, i-rfi_);
+		// Note: we may not want to extract from the final row
+		TCScore lr = ((TCScore*)(vtmp))[d.lastWord_];
+		found = true;
+		if(lr > lrmax) {
+			lrmax = lr;
+		}
+
+		// pvELoad and pvHLoad are already where they need to be
+		
+		// Adjust the load and store vectors here.  
+		pvHStore = pvHLoad + colstride;
+		pvEStore = pvELoad + colstride;
+		pvFStore = pvFTmp;
+	}
+	
+	// Update metrics
+	if(!debug) {
+		size_t ninner = (rff_ - rfi_) * iter;
+		met.col   += (rff_ - rfi_);             // DP columns
+		met.cell  += (ninner * NWORDS_PER_REG); // DP cells
+		met.inner += ninner;                    // DP inner loop iters
+		met.fixup += nfixup;                    // DP fixup loop iters
+	}
+	
+	flag = 0;
+	
+	// Did we find a solution?
+	TAlScore score = MIN_I64;
+	if(!found) {
+		flag = -1; // no
+		if(!debug) met.dpfail++;
+		return MIN_I64;
+	} else {
+		// Remove the 0x7fff encoding offset to recover the true score
+		score = (TAlScore)(lrmax - 0x7fff);
+		if(score < minsc_) {
+			flag = -1; // no
+			if(!debug) met.dpfail++;
+			return score;
+		}
+	}
+	
+	// Could we have saturated?
+	if(lrmax == MIN_I16) {
+		flag = -2; // yes
+		if(!debug) met.dpsat++;
+		return MIN_I64;
+	}
+	
+	// Return largest score
+	if(!debug) met.dpsucc++;
+	return score;
+}
+
+/**
+ * Given a filled-in DP table, populate the btncand_ list with candidate cells
+ * that might be at the ends of valid alignments.  No need to do this unless
+ * the maximum score returned by the align*() func is >= the minimum.
+ *
+ * Only cells that are exhaustively scored are candidates.  Those are the
+ * cells inside the shape made of o's in this:
+ *
+ *  |-maxgaps-|
+ *  *********************************    -
+ *   ********************************    |
+ *    *******************************    |
+ *     ******************************    |
+ *      *****************************    |
+ *       **************************** read len
+ *        ***************************    |
+ *         **************************    |
+ *          *************************    |
+ *           ************************    |
+ *            ***********oooooooooooo    -
+ *            |-maxgaps-|
+ *  |-readlen-|
+ *  |-------skip--------|
+ *
+ * And it's possible for the shape to be truncated on the left and right sides.
+ *
+ * 'best' is the maximum score returned by the align*() function; it is used
+ * only for sanity checking (every gathered score must be <= best, and best
+ * must be observed at least once).  Returns true iff at least one cell in
+ * the final row scored >= minsc_.
+ */
+bool SwAligner::gatherCellsNucleotidesEnd2EndSseI16(TAlScore best) {
+	// What's the minimum number of rows that can possibly be spanned by an
+	// alignment that meets the minimum score requirement?
+	assert(sse16succ_);
+	const size_t ncol = rff_ - rfi_;
+	const size_t nrow = dpRows();
+	assert_gt(nrow, 0);
+	btncand_.clear();
+	btncanddone_.clear();
+	SSEData& d = fw_ ? sseI16fw_ : sseI16rc_;
+	SSEMetrics& met = extend_ ? sseI16ExtendMet_ : sseI16MateMet_;
+	assert(!d.profbuf_.empty());
+	const size_t colstride = d.mat_.colstride();
+	ASSERT_ONLY(bool sawbest = false);
+	// Walk the last row of the H matrix one column at a time; stored scores
+	// carry a +0x7fff encoding offset which is removed here.
+	__m128i *pvH = d.mat_.hvec(d.lastIter_, 0);
+	for(size_t j = 0; j < ncol; j++) {
+		TAlScore sc = (TAlScore)(((TCScore*)pvH)[d.lastWord_] - 0x7fff);
+		assert_leq(sc, best);
+		ASSERT_ONLY(sawbest = (sawbest || sc == best));
+		if(sc >= minsc_) {
+			// Yes, this is legit
+			met.gathsol++;
+			btncand_.expand();
+			btncand_.back().init(nrow-1, j, sc);
+		}
+		pvH += colstride;
+	}
+	assert(sawbest);
+	// Masks are only needed if we'll actually backtrace from a candidate
+	if(!btncand_.empty()) {
+		d.mat_.initMasks();
+	}
+	return !btncand_.empty();
+}
+
+// Helper macros for walking the striped (Farrar-layout) SSE matrix during
+// backtrace.  'vec' is the current __m128i pointer, 'rowvec' the vector
+// index within the column, 'rowelt' the 16-bit word index within the vector.
+// (Comments must stay outside the macro bodies: a // inside a \-continued
+// definition would comment out the continuation.)
+
+// Move one logical row up.  When already at the top vector of the column
+// (rowvec == 0), wrap to the bottom vector of the same column -- the net
+// pointer change is +(nvecrow_-1)*ROWSTRIDE -- and step back one word.
+#define MOVE_VEC_PTR_UP(vec, rowvec, rowelt) { \
+	if(rowvec == 0) { \
+		rowvec += d.mat_.nvecrow_; \
+		vec += d.mat_.colstride_; \
+		rowelt--; \
+	} \
+	rowvec--; \
+	vec -= ROWSTRIDE; \
+}
+
+// Move one column left (one column = colstride_ __m128i words).
+#define MOVE_VEC_PTR_LEFT(vec, rowvec, rowelt) { vec -= d.mat_.colstride_; }
+
+// Move diagonally: one row up, then one column left.
+#define MOVE_VEC_PTR_UPLEFT(vec, rowvec, rowelt) { \
+ 	MOVE_VEC_PTR_UP(vec, rowvec, rowelt); \
+ 	MOVE_VEC_PTR_LEFT(vec, rowvec, rowelt); \
+}
+
+// Shift all four cursors (current, left, up, up-left) one column left.
+#define MOVE_ALL_LEFT() { \
+	MOVE_VEC_PTR_LEFT(cur_vec, rowvec, rowelt); \
+	MOVE_VEC_PTR_LEFT(left_vec, left_rowvec, left_rowelt); \
+	MOVE_VEC_PTR_LEFT(up_vec, up_rowvec, up_rowelt); \
+	MOVE_VEC_PTR_LEFT(upleft_vec, upleft_rowvec, upleft_rowelt); \
+}
+
+// Shift all four cursors one row up.
+#define MOVE_ALL_UP() { \
+	MOVE_VEC_PTR_UP(cur_vec, rowvec, rowelt); \
+	MOVE_VEC_PTR_UP(left_vec, left_rowvec, left_rowelt); \
+	MOVE_VEC_PTR_UP(up_vec, up_rowvec, up_rowelt); \
+	MOVE_VEC_PTR_UP(upleft_vec, upleft_rowvec, upleft_rowelt); \
+}
+
+// Shift all four cursors diagonally (up and left).
+#define MOVE_ALL_UPLEFT() { \
+	MOVE_VEC_PTR_UPLEFT(cur_vec, rowvec, rowelt); \
+	MOVE_VEC_PTR_UPLEFT(left_vec, left_rowvec, left_rowelt); \
+	MOVE_VEC_PTR_UPLEFT(up_vec, up_rowvec, up_rowelt); \
+	MOVE_VEC_PTR_UPLEFT(upleft_vec, upleft_rowvec, upleft_rowelt); \
+}
+
+// Position all four cursors for logical cell (row, col): compute the word
+// and vector indices from the striped layout, point cur_vec at the cell's
+// vector, then derive the left / up / up-left neighbors from it.
+#define NEW_ROW_COL(row, col) { \
+	rowelt = row / d.mat_.nvecrow_; \
+	rowvec = row % d.mat_.nvecrow_; \
+	eltvec = (col * d.mat_.colstride_) + (rowvec * ROWSTRIDE); \
+	cur_vec = d.mat_.matbuf_.ptr() + eltvec; \
+	left_vec = cur_vec; \
+	left_rowelt = rowelt; \
+	left_rowvec = rowvec; \
+	MOVE_VEC_PTR_LEFT(left_vec, left_rowvec, left_rowelt); \
+	up_vec = cur_vec; \
+	up_rowelt = rowelt; \
+	up_rowvec = rowvec; \
+	MOVE_VEC_PTR_UP(up_vec, up_rowvec, up_rowelt); \
+	upleft_vec = up_vec; \
+	upleft_rowelt = up_rowelt; \
+	upleft_rowvec = up_rowvec; \
+	MOVE_VEC_PTR_LEFT(upleft_vec, upleft_rowvec, upleft_rowelt); \
+}
+
+/**
+ * Given the dynamic programming table and a cell, trace backwards from the
+ * cell and install the edits and score/penalty in the appropriate fields
+ * of res.  The RandomSource is used to break ties among equally good ways
+ * of tracing back.
+ *
+ * Whenever we enter a cell, we check whether the read/ref coordinates of
+ * that cell correspond to a cell we traversed constructing a previous
+ * alignment.  If so, we backtrack to the last decision point, mask out the
+ * path that led to the previously observed cell, and continue along a
+ * different path; or, if there are no more paths to try, we give up.
+ *
+ * If an alignment is found, 'off' is set to the alignment's upstream-most
+ * reference character's offset into the chromosome and true is returned.
+ * Otherwise, false is returned.
+ */
+bool SwAligner::backtraceNucleotidesEnd2EndSseI16(
+	TAlScore       escore, // in: expected score
+	SwResult&      res,    // out: store results (edits and scores) here
+	size_t&        off,    // out: store diagonal projection of origin
+	size_t&        nbts,   // out: # backtracks
+	size_t         row,    // start in this row
+	size_t         col,    // start in this column
+	RandomSource&  rnd)    // random gen, to choose among equal paths
+{
+	assert_lt(row, dpRows());
+	assert_lt(col, (size_t)(rff_ - rfi_));
+	SSEData& d = fw_ ? sseI16fw_ : sseI16rc_;
+	SSEMetrics& met = extend_ ? sseI16ExtendMet_ : sseI16MateMet_;
+	met.bt++;
+	assert(!d.profbuf_.empty());
+	assert_lt(row, rd_->length());
+	btnstack_.clear(); // empty the backtrack stack
+	btcells_.clear();  // empty the cells-so-far list
+	AlnScore score; score.score_ = 0;
+	// score.gaps_ = score.ns_ = 0;
+	size_t origCol = col;
+	size_t gaps = 0, readGaps = 0, refGaps = 0;
+	res.alres.reset();
+    EList<Edit>& ned = res.alres.ned();
+	assert(ned.empty());
+	assert_gt(dpRows(), row);
+	ASSERT_ONLY(size_t trimEnd = dpRows() - row - 1);
+	size_t trimBeg = 0;
+	size_t ct = SSEMatrix::H; // cell type
+	// Row and col in terms of where they fall in the SSE vector matrix
+	size_t rowelt, rowvec, eltvec;
+	size_t left_rowelt, up_rowelt, upleft_rowelt;
+	size_t left_rowvec, up_rowvec, upleft_rowvec;
+	__m128i *cur_vec, *left_vec, *up_vec, *upleft_vec;
+	NEW_ROW_COL(row, col);
+	while((int)row >= 0) {
+		met.btcell++;
+		nbts++;
+		int readc = (*rd_)[rdi_ + row];
+		int refm  = (int)rf_[rfi_ + col];
+		int readq = (*qu_)[row];
+		assert_leq(col, origCol);
+		// Get score in this cell
+		bool empty = false, reportedThru, canMoveThru, branch = false;
+		int cur = SSEMatrix::H;
+		if(!d.mat_.reset_[row]) {
+			d.mat_.resetRow(row);
+		}
+		reportedThru = d.mat_.reportedThrough(row, col);
+		canMoveThru = true;
+		if(reportedThru) {
+			canMoveThru = false;
+		} else {
+			empty = false;
+			if(row > 0) {
+				assert_gt(row, 0);
+				size_t rowFromEnd = d.mat_.nrow() - row - 1;
+				bool gapsAllowed = true;
+				if(row < (size_t)sc_->gapbar ||
+				   rowFromEnd < (size_t)sc_->gapbar)
+				{
+					gapsAllowed = false;
+				}
+				const TAlScore floorsc = MIN_I64;
+				const int offsetsc = -0x7fff;
+				// Move to beginning of column/row
+				if(ct == SSEMatrix::E) { // AKA rdgap
+					assert_gt(col, 0);
+					TAlScore sc_cur = ((TCScore*)(cur_vec + SSEMatrix::E))[rowelt] + offsetsc;
+					assert(gapsAllowed);
+					// Currently in the E matrix; incoming transition must come from the
+					// left.  It's either a gap open from the H matrix or a gap extend from
+					// the E matrix.
+					// TODO: save and restore origMask as well as mask
+					int origMask = 0, mask = 0;
+					// Get H score of cell to the left
+					TAlScore sc_h_left = ((TCScore*)(left_vec + SSEMatrix::H))[left_rowelt] + offsetsc;
+					if(sc_h_left > floorsc && sc_h_left - sc_->readGapOpen() == sc_cur) {
+						mask |= (1 << 0);
+					}
+					// Get E score of cell to the left
+					TAlScore sc_e_left = ((TCScore*)(left_vec + SSEMatrix::E))[left_rowelt] + offsetsc;
+					if(sc_e_left > floorsc && sc_e_left - sc_->readGapExtend() == sc_cur) {
+						mask |= (1 << 1);
+					}
+					origMask = mask;
+					assert(origMask > 0 || sc_cur <= sc_->match());
+					if(d.mat_.isEMaskSet(row, col)) {
+						mask = (d.mat_.masks_[row][col] >> 8) & 3;
+					}
+					if(mask == 3) {
+#if 1
+						// Pick H -> E cell
+						cur = SW_BT_OALL_READ_OPEN;
+						d.mat_.eMaskSet(row, col, 2); // might choose E later
+#else
+						if(rnd.nextU2()) {
+							// Pick H -> E cell
+							cur = SW_BT_OALL_READ_OPEN;
+							d.mat_.eMaskSet(row, col, 2); // might choose E later
+						} else {
+							// Pick E -> E cell
+							cur = SW_BT_RDGAP_EXTEND;
+							d.mat_.eMaskSet(row, col, 1); // might choose H later
+						}
+#endif
+						branch = true;
+					} else if(mask == 2) {
+						// I chose the E cell
+						cur = SW_BT_RDGAP_EXTEND;
+						d.mat_.eMaskSet(row, col, 0); // done
+					} else if(mask == 1) {
+						// I chose the H cell
+						cur = SW_BT_OALL_READ_OPEN;
+						d.mat_.eMaskSet(row, col, 0); // done
+					} else {
+						empty = true;
+						// It's empty, so the only question left is whether we should be
+						// allowed to terminate in this cell.  If it's got a valid score
+						// then we *shouldn't* be allowed to terminate here because that
+						// means it's part of a larger alignment that was already reported.
+						canMoveThru = (origMask == 0);
+					}
+					assert(!empty || !canMoveThru);
+				} else if(ct == SSEMatrix::F) { // AKA rfgap
+					assert_gt(row, 0);
+					assert(gapsAllowed);
+					TAlScore sc_h_up = ((TCScore*)(up_vec  + SSEMatrix::H))[up_rowelt] + offsetsc;
+					TAlScore sc_f_up = ((TCScore*)(up_vec  + SSEMatrix::F))[up_rowelt] + offsetsc;
+					TAlScore sc_cur  = ((TCScore*)(cur_vec + SSEMatrix::F))[rowelt] + offsetsc;
+					// Currently in the F matrix; incoming transition must come from above.
+					// It's either a gap open from the H matrix or a gap extend from the F
+					// matrix.
+					// TODO: save and restore origMask as well as mask
+					int origMask = 0, mask = 0;
+					// Get H score of cell above
+					if(sc_h_up > floorsc && sc_h_up - sc_->refGapOpen() == sc_cur) {
+						mask |= (1 << 0);
+					}
+					// Get F score of cell above
+					if(sc_f_up > floorsc && sc_f_up - sc_->refGapExtend() == sc_cur) {
+						mask |= (1 << 1);
+					}
+					origMask = mask;
+					assert(origMask > 0 || sc_cur <= sc_->match());
+					if(d.mat_.isFMaskSet(row, col)) {
+						mask = (d.mat_.masks_[row][col] >> 11) & 3;
+					}
+					if(mask == 3) {
+#if 1
+						// I chose the H cell
+						cur = SW_BT_OALL_REF_OPEN;
+						d.mat_.fMaskSet(row, col, 2); // might choose E later
+#else
+						if(rnd.nextU2()) {
+							// I chose the H cell
+							cur = SW_BT_OALL_REF_OPEN;
+							d.mat_.fMaskSet(row, col, 2); // might choose E later
+						} else {
+							// I chose the F cell
+							cur = SW_BT_RFGAP_EXTEND;
+							d.mat_.fMaskSet(row, col, 1); // might choose E later
+						}
+#endif
+						branch = true;
+					} else if(mask == 2) {
+						// I chose the F cell
+						cur = SW_BT_RFGAP_EXTEND;
+						d.mat_.fMaskSet(row, col, 0); // done
+					} else if(mask == 1) {
+						// I chose the H cell
+						cur = SW_BT_OALL_REF_OPEN;
+						d.mat_.fMaskSet(row, col, 0); // done
+					} else {
+						empty = true;
+						// It's empty, so the only question left is whether we should be
+						// allowed to terminate in this cell.  If it's got a valid score
+						// then we *shouldn't* be allowed to terminate here because that
+						// means it's part of a larger alignment that was already reported.
+						canMoveThru = (origMask == 0);
+					}
+					assert(!empty || !canMoveThru);
+				} else {
+					assert_eq(SSEMatrix::H, ct);
+					TAlScore sc_cur      = ((TCScore*)(cur_vec + SSEMatrix::H))[rowelt]    + offsetsc;
+					TAlScore sc_f_up     = ((TCScore*)(up_vec  + SSEMatrix::F))[up_rowelt] + offsetsc;
+					TAlScore sc_h_up     = ((TCScore*)(up_vec  + SSEMatrix::H))[up_rowelt] + offsetsc;
+					TAlScore sc_h_left   = col > 0 ? (((TCScore*)(left_vec   + SSEMatrix::H))[left_rowelt]   + offsetsc) : floorsc;
+					TAlScore sc_e_left   = col > 0 ? (((TCScore*)(left_vec   + SSEMatrix::E))[left_rowelt]   + offsetsc) : floorsc;
+					TAlScore sc_h_upleft = col > 0 ? (((TCScore*)(upleft_vec + SSEMatrix::H))[upleft_rowelt] + offsetsc) : floorsc;
+					TAlScore sc_diag     = sc_->score(readc, refm, readq - 33);
+					// TODO: save and restore origMask as well as mask
+					int origMask = 0, mask = 0;
+					if(gapsAllowed) {
+						if(sc_h_up     > floorsc && sc_cur == sc_h_up   - sc_->refGapOpen()) {
+							mask |= (1 << 0);
+						}
+						if(sc_h_left   > floorsc && sc_cur == sc_h_left - sc_->readGapOpen()) {
+							mask |= (1 << 1);
+						}
+						if(sc_f_up     > floorsc && sc_cur == sc_f_up   - sc_->refGapExtend()) {
+							mask |= (1 << 2);
+						}
+						if(sc_e_left   > floorsc && sc_cur == sc_e_left - sc_->readGapExtend()) {
+							mask |= (1 << 3);
+						}
+					}
+					if(sc_h_upleft > floorsc && sc_cur == sc_h_upleft + sc_diag) {
+						mask |= (1 << 4);
+					}
+					origMask = mask;
+					assert(origMask > 0 || sc_cur <= sc_->match());
+					if(d.mat_.isHMaskSet(row, col)) {
+						mask = (d.mat_.masks_[row][col] >> 2) & 31;
+					}
+					assert(gapsAllowed || mask == (1 << 4) || mask == 0);
+					int opts = alts5[mask];
+					int select = -1;
+					if(opts == 1) {
+						select = firsts5[mask];
+						assert_geq(mask, 0);
+						d.mat_.hMaskSet(row, col, 0);
+					} else if(opts > 1) {
+#if 1
+						if(       (mask & 16) != 0) {
+							select = 4; // H diag
+						} else if((mask & 1) != 0) {
+							select = 0; // H up
+						} else if((mask & 4) != 0) {
+							select = 2; // F up
+						} else if((mask & 2) != 0) {
+							select = 1; // H left
+						} else if((mask & 8) != 0) {
+							select = 3; // E left
+						}
+#else
+						select = randFromMask(rnd, mask);
+#endif
+						assert_geq(mask, 0);
+						mask &= ~(1 << select);
+						assert(gapsAllowed || mask == (1 << 4) || mask == 0);
+						d.mat_.hMaskSet(row, col, mask);
+						branch = true;
+					} else { /* No way to backtrack! */ }
+					if(select != -1) {
+						if(select == 4) {
+							cur = SW_BT_OALL_DIAG;
+						} else if(select == 0) {
+							cur = SW_BT_OALL_REF_OPEN;
+						} else if(select == 1) {
+							cur = SW_BT_OALL_READ_OPEN;
+						} else if(select == 2) {
+							cur = SW_BT_RFGAP_EXTEND;
+						} else {
+							assert_eq(3, select)
+							cur = SW_BT_RDGAP_EXTEND;
+						}
+					} else {
+						empty = true;
+						// It's empty, so the only question left is whether we should be
+						// allowed to terminate in this cell.  If it's got a valid score
+						// then we *shouldn't* be allowed to terminate here because that
+						// means it's part of a larger alignment that was already reported.
+						canMoveThru = (origMask == 0);
+					}
+				}
+				assert(!empty || !canMoveThru || ct == SSEMatrix::H);
+			}
+		}
+		d.mat_.setReportedThrough(row, col);
+		assert_eq(gaps, Edit::numGaps(ned));
+		assert_leq(gaps, rdgap_ + rfgap_);
+		// Cell was involved in a previously-reported alignment?
+		if(!canMoveThru) {
+			if(!btnstack_.empty()) {
+				// Remove all the cells from list back to and including the
+				// cell where the branch occurred
+				btcells_.resize(btnstack_.back().celsz);
+				// Pop record off the top of the stack
+				ned.resize(btnstack_.back().nedsz);
+				//aed.resize(btnstack_.back().aedsz);
+				row      = btnstack_.back().row;
+				col      = btnstack_.back().col;
+				gaps     = btnstack_.back().gaps;
+				readGaps = btnstack_.back().readGaps;
+				refGaps  = btnstack_.back().refGaps;
+				score    = btnstack_.back().score;
+				ct       = btnstack_.back().ct;
+				btnstack_.pop_back();
+				assert(!sc_->monotone || score.score() >= escore);
+				NEW_ROW_COL(row, col);
+				continue;
+			} else {
+				// No branch points to revisit; just give up
+				res.reset();
+				met.btfail++; // DP backtraces failed
+				return false;
+			}
+		}
+		assert(!reportedThru);
+		assert(!sc_->monotone || score.score() >= minsc_);
+		if(empty || row == 0) {
+			assert_eq(SSEMatrix::H, ct);
+			btcells_.expand();
+			btcells_.back().first = row;
+			btcells_.back().second = col;
+			// This cell is at the end of a legitimate alignment
+			trimBeg = row;
+			assert_eq(btcells_.size(), dpRows() - trimBeg - trimEnd + readGaps);
+			break;
+		}
+		if(branch) {
+			// Add a frame to the backtrack stack
+			btnstack_.expand();
+			btnstack_.back().init(
+				ned.size(),
+				0,               // aed.size()
+				btcells_.size(),
+				row,
+				col,
+				gaps,
+				readGaps,
+				refGaps,
+				score,
+				(int)ct);
+		}
+		btcells_.expand();
+		btcells_.back().first = row;
+		btcells_.back().second = col;
+		switch(cur) {
+			// Move up and to the left.  If the reference nucleotide in the
+			// source row mismatches the read nucleotide, penalize
+			// it and add a nucleotide mismatch.
+			case SW_BT_OALL_DIAG: {
+				assert_gt(row, 0); assert_gt(col, 0);
+				// Check for color mismatch
+				int readC = (*rd_)[row];
+				int refNmask = (int)rf_[rfi_+col];
+				assert_gt(refNmask, 0);
+				int m = matchesEx(readC, refNmask);
+				ct = SSEMatrix::H;
+				if(m != 1) {
+					Edit e(
+						(int)row,
+						mask2dna[refNmask],
+						"ACGTN"[readC],
+						EDIT_TYPE_MM);
+					assert(e.repOk());
+					assert(ned.empty() || ned.back().pos >= row);
+					ned.push_back(e);
+					int pen = QUAL2(row, col);
+					score.score_ -= pen;
+					assert(!sc_->monotone || score.score() >= escore);
+				} else {
+					// Reward a match
+					int64_t bonus = sc_->match(30);
+					score.score_ += bonus;
+					assert(!sc_->monotone || score.score() >= escore);
+				}
+				if(m == -1) {
+					//score.ns_++;
+				}
+				row--; col--;
+				MOVE_ALL_UPLEFT();
+				assert(VALID_AL_SCORE(score));
+				break;
+			}
+			// Move up.  Add an edit encoding the ref gap.
+			case SW_BT_OALL_REF_OPEN:
+			{
+				assert_gt(row, 0);
+				Edit e(
+					(int)row,
+					'-',
+					"ACGTN"[(int)(*rd_)[row]],
+					EDIT_TYPE_REF_GAP);
+				assert(e.repOk());
+				assert(ned.empty() || ned.back().pos >= row);
+				ned.push_back(e);
+				assert_geq(row, (size_t)sc_->gapbar);
+				assert_geq((int)(rdf_-rdi_-row-1), sc_->gapbar-1);
+				row--;
+				ct = SSEMatrix::H;
+				int pen = sc_->refGapOpen();
+				score.score_ -= pen;
+				assert(!sc_->monotone || score.score() >= minsc_);
+				gaps++; refGaps++;
+				assert_eq(gaps, Edit::numGaps(ned));
+				assert_leq(gaps, rdgap_ + rfgap_);
+				MOVE_ALL_UP();
+				break;
+			}
+			// Move up.  Add an edit encoding the ref gap.
+			case SW_BT_RFGAP_EXTEND:
+			{
+				assert_gt(row, 1);
+				Edit e(
+					(int)row,
+					'-',
+					"ACGTN"[(int)(*rd_)[row]],
+					EDIT_TYPE_REF_GAP);
+				assert(e.repOk());
+				assert(ned.empty() || ned.back().pos >= row);
+				ned.push_back(e);
+				assert_geq(row, (size_t)sc_->gapbar);
+				assert_geq((int)(rdf_-rdi_-row-1), sc_->gapbar-1);
+				row--;
+				ct = SSEMatrix::F;
+				int pen = sc_->refGapExtend();
+				score.score_ -= pen;
+				assert(!sc_->monotone || score.score() >= minsc_);
+				gaps++; refGaps++;
+				assert_eq(gaps, Edit::numGaps(ned));
+				assert_leq(gaps, rdgap_ + rfgap_);
+				MOVE_ALL_UP();
+				break;
+			}
+			case SW_BT_OALL_READ_OPEN:
+			{
+				assert_gt(col, 0);
+				Edit e(
+					(int)row+1,
+					mask2dna[(int)rf_[rfi_+col]],
+					'-',
+					EDIT_TYPE_READ_GAP);
+				assert(e.repOk());
+				assert(ned.empty() || ned.back().pos >= row);
+				ned.push_back(e);
+				assert_geq(row, (size_t)sc_->gapbar);
+				assert_geq((int)(rdf_-rdi_-row-1), sc_->gapbar-1);
+				col--;
+				ct = SSEMatrix::H;
+				int pen = sc_->readGapOpen();
+				score.score_ -= pen;
+				assert(!sc_->monotone || score.score() >= minsc_);
+				gaps++; readGaps++;
+				assert_eq(gaps, Edit::numGaps(ned));
+				assert_leq(gaps, rdgap_ + rfgap_);
+				MOVE_ALL_LEFT();
+				break;
+			}
+			case SW_BT_RDGAP_EXTEND:
+			{
+				assert_gt(col, 1);
+				Edit e(
+					(int)row+1,
+					mask2dna[(int)rf_[rfi_+col]],
+					'-',
+					EDIT_TYPE_READ_GAP);
+				assert(e.repOk());
+				assert(ned.empty() || ned.back().pos >= row);
+				ned.push_back(e);
+				assert_geq(row, (size_t)sc_->gapbar);
+				assert_geq((int)(rdf_-rdi_-row-1), sc_->gapbar-1);
+				col--;
+				ct = SSEMatrix::E;
+				int pen = sc_->readGapExtend();
+				score.score_ -= pen;
+				assert(!sc_->monotone || score.score() >= minsc_);
+				gaps++; readGaps++;
+				assert_eq(gaps, Edit::numGaps(ned));
+				assert_leq(gaps, rdgap_ + rfgap_);
+				MOVE_ALL_LEFT();
+				break;
+			}
+			default: throw 1;
+		}
+	} // while((int)row > 0)
+	assert_eq(0, trimBeg);
+	assert_eq(0, trimEnd);
+	assert_geq(col, 0);
+	assert_eq(SSEMatrix::H, ct);
+	// The number of cells in the backtrace should equal the number of read
+	// bases after trimming plus the number of gaps
+	assert_eq(btcells_.size(), dpRows() - trimBeg - trimEnd + readGaps);
+	// Check whether we went through a core diagonal and set 'reported' flag on
+	// each cell
+	bool overlappedCoreDiag = false;
+	for(size_t i = 0; i < btcells_.size(); i++) {
+		size_t rw = btcells_[i].first;
+		size_t cl = btcells_[i].second;
+		// Calculate the diagonal within the *trimmed* rectangle, i.e. the
+		// rectangle we dealt with in align, gather and backtrack.
+		int64_t diagi = cl - rw;
+		// Now adjust to the diagonal within the *untrimmed* rectangle by
+		// adding on the amount trimmed from the left.
+		diagi += rect_->triml;
+		if(diagi >= 0) {
+			size_t diag = (size_t)diagi;
+			if(diag >= rect_->corel && diag <= rect_->corer) {
+				overlappedCoreDiag = true;
+				break;
+			}
+		}
+		assert(d.mat_.reportedThrough(rw, cl));
+	}
+	if(!overlappedCoreDiag) {
+		// Must overlap a core diagonal.  Otherwise, we run the risk of
+		// reporting an alignment that overlaps (and trumps) a higher-scoring
+		// alignment that lies partially outside the dynamic programming
+		// rectangle.
+		res.reset();
+		met.corerej++;
+		return false;
+	}
+	int readC = (*rd_)[rdi_+row];      // get last char in read
+	int refNmask = (int)rf_[rfi_+col]; // get last ref char ref involved in aln
+	assert_gt(refNmask, 0);
+	int m = matchesEx(readC, refNmask);
+	if(m != 1) {
+		Edit e((int)row, mask2dna[refNmask], "ACGTN"[readC], EDIT_TYPE_MM);
+		assert(e.repOk());
+		assert(ned.empty() || ned.back().pos >= row);
+		ned.push_back(e);
+		score.score_ -= QUAL2(row, col);
+		assert_geq(score.score(), minsc_);
+	} else {
+		score.score_ += sc_->match(30);
+	}
+	if(m == -1) {
+		//score.ns_++;
+	}
+#if 0
+	if(score.ns_ > nceil_) {
+		// Alignment has too many Ns in it!
+		res.reset();
+		met.nrej++;
+		return false;
+	}
+#endif
+	res.reverse();
+	assert(Edit::repOk(ned, (*rd_)));
+	assert_eq(score.score(), escore);
+	assert_leq(gaps, rdgap_ + rfgap_);
+	off = col;
+	assert_lt(col + (size_t)rfi_, (size_t)rff_);
+	// score.gaps_ = gaps;
+	res.alres.setScore(score);
+#if 0
+	res.alres.setShape(
+		refidx_,                  // ref id
+		off + rfi_ + rect_->refl, // 0-based ref offset
+		reflen_,                  // reference length
+		fw_,                      // aligned to Watson?
+		rdf_ - rdi_,              // read length
+		true,                     // pretrim soft?
+		0,                        // pretrim 5' end
+		0,                        // pretrim 3' end
+		true,                     // alignment trim soft?
+		fw_ ? trimBeg : trimEnd,  // alignment trim 5' end
+		fw_ ? trimEnd : trimBeg); // alignment trim 3' end
+#endif
+	size_t refns = 0;
+	for(size_t i = col; i <= origCol; i++) {
+		if((int)rf_[rfi_+i] > 15) {
+			refns++;
+		}
+	}
+	// res.alres.setRefNs(refns);
+	assert(Edit::repOk(ned, (*rd_), true, trimBeg, trimEnd));
+	assert(res.repOk());
+#ifndef NDEBUG
+	size_t gapsCheck = 0;
+	for(size_t i = 0; i < ned.size(); i++) {
+		if(ned[i].isGap()) gapsCheck++;
+	}
+	assert_eq(gaps, gapsCheck);
+	BTDnaString refstr;
+	for(size_t i = col; i <= origCol; i++) {
+		refstr.append(firsts5[(int)rf_[rfi_+i]]);
+	}
+	BTDnaString editstr;
+    // daehwan
+	// Edit::toRef((*rd_), ned, editstr, true, trimBeg, trimEnd);
+    Edit::toRef((*rd_), ned, editstr, true, trimBeg + rdi_, trimEnd + (rd_->length() - rdf_));
+	if(refstr != editstr) {
+		cerr << "Decoded nucleotides and edits don't match reference:" << endl;
+		cerr << "           score: " << score.score()
+		     << " (" << gaps << " gaps)" << endl;
+		cerr << "           edits: ";
+		Edit::print(cerr, ned);
+		cerr << endl;
+		cerr << "    decoded nucs: " << (*rd_) << endl;
+		cerr << "     edited nucs: " << editstr << endl;
+		cerr << "  reference nucs: " << refstr << endl;
+		assert(0);
+	}
+#endif
+	met.btsucc++; // DP backtraces succeeded
+	return true;
+}
diff --git a/aligner_swsse_ee_u8.cpp b/aligner_swsse_ee_u8.cpp
new file mode 100644
index 0000000..ef45da0
--- /dev/null
+++ b/aligner_swsse_ee_u8.cpp
@@ -0,0 +1,1905 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * aligner_sw_sse.cpp
+ *
+ * Versions of key alignment functions that use vector instructions to
+ * accelerate dynamic programming.  Based chiefly on the striped Smith-Waterman
+ * paper and implementation by Michael Farrar.  See:
+ *
+ * Farrar M. Striped Smith-Waterman speeds database searches six times over
+ * other SIMD implementations. Bioinformatics. 2007 Jan 15;23(2):156-61.
+ * http://sites.google.com/site/farrarmichael/smith-waterman
+ *
+ * While the paper describes an implementation of Smith-Waterman, we extend it
+ * to do end-to-end read alignment as well as local alignment.  The change
+ * required for this is minor: we simply let vmax be the maximum element in the
+ * score domain rather than the minimum.
+ *
+ * The vectorized dynamic programming implementation lacks some features that
+ * make it hard to adapt to solving the entire dynamic-programming alignment
+ * problem.  For instance:
+ *
+ * - It doesn't respect gap barriers on either end of the read
+ * - It just gives a maximum; not enough information to backtrace without
+ *   redoing some alignment
+ * - It's a little difficult to handle st_ and en_, especially st_.
+ * - The query profile mechanism makes handling of ambiguous reference bases a
+ *   little tricky (16 cols in query profile lookup table instead of 5)
+ *
+ * Given the drawbacks, it is tempting to use SSE dynamic programming as a
+ * filter rather than as an aligner per se.  Here are a few ideas for how it
+ * can be extended to handle more of the alignment problem:
+ *
+ * - Save calculated scores to a big array as we go.  We return to this array
+ *   to find and backtrace from good solutions.
+ */
+
+#include <limits>
+#include "aligner_sw.h"
+
+static const size_t NBYTES_PER_REG  = 16;
+static const size_t NWORDS_PER_REG  = 16;
+// static const size_t NBITS_PER_WORD  = 8;
+static const size_t NBYTES_PER_WORD = 1;
+
+// In end-to-end mode, we start high (255) and go low (0).  Factoring in
+// a query profile involves unsigned saturating subtraction, so all the
+// query profile elements should be expressed as a positive penalty rather
+// than a negative score.
+
+typedef uint8_t TCScore;
+
+/**
+ * Build query profile look up tables for the read.  The query profile look
+ * up table is organized as a 1D array indexed by [i][j] where i is the
+ * reference character in the current DP column (0=A, 1=C, etc), and j is
+ * the segment of the query we're currently working on.
+ */
+void SwAligner::buildQueryProfileEnd2EndSseU8(bool fw) {
+	// Memoized: build at most once per read orientation (fw / revcomp).
+	bool& done = fw ? sseU8fwBuilt_ : sseU8rcBuilt_;
+	if(done) {
+		return;
+	}
+	done = true;
+	const BTDnaString* rd = fw ? rdfw_ : rdrc_;
+	const BTString* qu = fw ? qufw_ : qurc_;
+    // daehwan - allows to align a portion of a read, not the whole.
+	// const size_t len = rd->length();
+    const size_t len = dpRows();
+	// Number of striped segments per column (Farrar layout): each of the
+	// NWORDS_PER_REG byte lanes covers one segment-sized stripe of the read.
+	const size_t seglen = (len + (NWORDS_PER_REG-1)) / NWORDS_PER_REG;
+	// How many __m128i's are needed
+	size_t n128s =
+		64 +                    // slack bytes, for alignment?
+		(seglen * ALPHA_SIZE)   // query profile data
+		* 2;                    // & gap barrier data
+	assert_gt(n128s, 0);
+	SSEData& d = fw ? sseU8fw_ : sseU8rc_;
+	d.profbuf_.resizeNoCopy(n128s);
+	assert(!d.profbuf_.empty());
+	d.maxPen_      = d.maxBonus_ = 0;
+	d.lastIter_    = d.lastWord_ = 0;
+	// Query-profile and gap-barrier vectors are interleaved with stride 2
+	d.qprofStride_ = d.gbarStride_ = 2;
+	d.bias_ = 0; // no bias needed for end-to-end alignment; just use subtraction
+	// For each reference character A, C, G, T, N ...
+	for(size_t refc = 0; refc < ALPHA_SIZE; refc++) {
+		// For each segment ...
+		for(size_t i = 0; i < seglen; i++) {
+			size_t j = i;
+			// Even vector = query-profile penalties, odd vector = gap barrier
+			uint8_t *qprofWords =
+				reinterpret_cast<uint8_t*>(d.profbuf_.ptr() + (refc * seglen * 2) + (i * 2));
+			uint8_t *gbarWords =
+				reinterpret_cast<uint8_t*>(d.profbuf_.ptr() + (refc * seglen * 2) + (i * 2) + 1);
+			// For each sub-word (byte) ...
+			for(size_t k = 0; k < NWORDS_PER_REG; k++) {
+				int sc = 0;
+				*gbarWords = 0;
+				if(j < len) {
+					int readc = (*rd)[j];
+					int readq = (*qu)[j]; // quality char; Phred = readq - 33
+					sc = sc_->score(readc, (int)(1 << refc), readq - 33);
+					// Make score positive, to fit in an unsigned
+					sc = -sc;
+					assert_range(0, 255, sc);
+					size_t j_from_end = len - j - 1;
+					if(j < (size_t)sc_->gapbar ||
+					   j_from_end < (size_t)sc_->gapbar)
+					{
+						// Inside the gap barrier
+						*gbarWords = 0xff;
+					}
+				}
+				if(refc == 0 && j == len-1) {
+					// Remember which 128-bit word and which smaller word has
+					// the final row
+					d.lastIter_ = i;
+					d.lastWord_ = k;
+				}
+				if((size_t)sc > d.maxPen_) {
+					d.maxPen_ = (size_t)sc; // track worst single-cell penalty
+				}
+				*qprofWords = (uint8_t)sc;
+				gbarWords++;
+				qprofWords++;
+				j += seglen; // update offset into query
+			}
+		}
+	}
+}
+
+#ifndef NDEBUG
+/**
+ * Return true iff the cell has sane E/F/H values w/r/t its predecessors.
+ */
+static bool cellOkEnd2EndU8(
+	SSEData& d,            // SSE state, including the filled score matrix
+	size_t row,            // row of the cell to check
+	size_t col,            // column of the cell to check
+	int refc,              // reference-character mask in this column
+	int readc,             // read character in this row
+	int readq,             // read quality in this row (Phred+33)
+	const Scoring& sc)     // scoring scheme
+{
+	// Scores are stored as unsigned bytes where 0 is the floor ("invalid")
+	// value; real scores are recovered by adding offsetsc (-0xff).
+	TCScore floorsc = 0;
+	TAlScore ceilsc = MAX_I64;
+	TAlScore offsetsc = -0xff;
+	TAlScore sc_h_cur = (TAlScore)d.mat_.helt(row, col);
+	TAlScore sc_e_cur = (TAlScore)d.mat_.eelt(row, col);
+	TAlScore sc_f_cur = (TAlScore)d.mat_.felt(row, col);
+	if(sc_h_cur > floorsc) {
+		sc_h_cur += offsetsc;
+	}
+	if(sc_e_cur > floorsc) {
+		sc_e_cur += offsetsc;
+	}
+	if(sc_f_cur > floorsc) {
+		sc_f_cur += offsetsc;
+	}
+	// Gaps are disallowed within 'gapbar' rows of either end of the read
+	bool gapsAllowed = true;
+	size_t rowFromEnd = d.mat_.nrow() - row - 1;
+	if(row < (size_t)sc.gapbar || rowFromEnd < (size_t)sc.gapbar) {
+		gapsAllowed = false;
+	}
+	// Flags: which incoming transition(s) could explain this cell's scores
+	bool e_left_trans = false, h_left_trans = false;
+	bool f_up_trans   = false, h_up_trans = false;
+	bool h_diag_trans = false;
+	if(gapsAllowed) {
+		TAlScore sc_h_left = floorsc;
+		TAlScore sc_e_left = floorsc;
+		TAlScore sc_h_up   = floorsc;
+		TAlScore sc_f_up   = floorsc;
+		// E (read gap) must follow from a gap open or extend to the left
+		if(col > 0 && sc_e_cur > floorsc && sc_e_cur <= ceilsc) {
+			sc_h_left = d.mat_.helt(row, col-1) + offsetsc;
+			sc_e_left = d.mat_.eelt(row, col-1) + offsetsc;
+			e_left_trans = (sc_e_left > floorsc && sc_e_cur == sc_e_left - sc.readGapExtend());
+			h_left_trans = (sc_h_left > floorsc && sc_e_cur == sc_h_left - sc.readGapOpen());
+			assert(e_left_trans || h_left_trans);
+			// Check that we couldn't have got a better E score
+			assert_geq(sc_e_cur, sc_e_left - sc.readGapExtend());
+			assert_geq(sc_e_cur, sc_h_left - sc.readGapOpen());
+		}
+		// F (ref gap) must follow from a gap open or extend from above
+		if(row > 0 && sc_f_cur > floorsc && sc_f_cur <= ceilsc) {
+			sc_h_up = d.mat_.helt(row-1, col) + offsetsc;
+			sc_f_up = d.mat_.felt(row-1, col) + offsetsc;
+			f_up_trans = (sc_f_up > floorsc && sc_f_cur == sc_f_up - sc.refGapExtend());
+			h_up_trans = (sc_h_up > floorsc && sc_f_cur == sc_h_up - sc.refGapOpen());
+			assert(f_up_trans || h_up_trans);
+			// Check that we couldn't have got a better F score
+			assert_geq(sc_f_cur, sc_f_up - sc.refGapExtend());
+			assert_geq(sc_f_cur, sc_h_up - sc.refGapOpen());
+		}
+	} else {
+		// Inside the gap barrier, E and F must be at the floor
+		assert_geq(floorsc, sc_e_cur);
+		assert_geq(floorsc, sc_f_cur);
+	}
+	// H may also follow from a diagonal (match/mismatch) move
+	if(col > 0 && row > 0 && sc_h_cur > floorsc && sc_h_cur <= ceilsc) {
+		TAlScore sc_h_upleft = d.mat_.helt(row-1, col-1) + offsetsc;
+		TAlScore sc_diag = sc.score(readc, (int)refc, readq - 33);
+		h_diag_trans = sc_h_cur == sc_h_upleft + sc_diag;
+	}
+	// Cell is sane if some transition explains it, or it's at the floor,
+	// above the ceiling, or on a matrix border
+	assert(
+		sc_h_cur <= floorsc ||
+		e_left_trans ||
+		h_left_trans ||
+		f_up_trans   ||
+		h_up_trans   ||
+		h_diag_trans ||
+		sc_h_cur > ceilsc ||
+		row == 0 ||
+		col == 0);
+	return true;
+}
+#endif /*ndef NDEBUG*/
+
+#ifdef NDEBUG
+
+// Release build: the vector sanity-check macros compile away to nothing.
+#define assert_all_eq0(x)
+#define assert_all_gt(x, y)
+#define assert_all_gt_lo(x)
+#define assert_all_lt(x, y)
+#define assert_all_lt_hi(x)
+
+#else
+
+// Debug build: each macro checks a property of all 16 byte lanes of an
+// __m128i by building a comparison mask and testing the 16-bit result of
+// _mm_movemask_epi8 (0xffff = all lanes true, 0x0000 = all lanes false).
+// NOTE(review): _mm_cmpgt_epu8, _mm_cmpeq_epu8, _mm_srli_epu8 and
+// _mm_cmplt_epu8 are not standard SSE2 intrinsic names; presumably they
+// are project-provided wrappers -- confirm they are defined in an
+// included header.
+
+// Assert every lane of x is zero (16-bit equality with zero also implies
+// every byte is zero).
+#define assert_all_eq0(x) { \
+	__m128i z = _mm_setzero_si128(); \
+	__m128i tmp = _mm_setzero_si128(); \
+	z = _mm_xor_si128(z, z); \
+	tmp = _mm_cmpeq_epi16(x, z); \
+	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
+}
+
+// Assert every unsigned byte of x is greater than the matching byte of y.
+#define assert_all_gt(x, y) { \
+	__m128i tmp = _mm_cmpgt_epu8(x, y); \
+	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
+}
+
+// Assert every unsigned byte of x is greater than zero.
+#define assert_all_gt_lo(x) { \
+	__m128i z = _mm_setzero_si128(); \
+	__m128i tmp = _mm_setzero_si128(); \
+	z = _mm_xor_si128(z, z); \
+	tmp = _mm_cmpgt_epu8(x, z); \
+	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
+}
+
+// Assert every unsigned byte of x is less than the matching byte of y:
+// saturating y - x is zero exactly where x >= y, so no lane may be zero.
+#define assert_all_lt(x, y) { \
+	__m128i z = _mm_setzero_si128(); \
+	z = _mm_xor_si128(z, z); \
+	__m128i tmp = _mm_subs_epu8(y, x); \
+	tmp = _mm_cmpeq_epi16(tmp, z); \
+	assert_eq(0x0000, _mm_movemask_epi8(tmp)); \
+}
+
+// Assert every unsigned byte of x is below 0x7f (all-ones shifted right
+// by one bit).
+#define assert_all_lt_hi(x) { \
+	__m128i z = _mm_setzero_si128(); \
+	__m128i tmp = _mm_setzero_si128(); \
+	z = _mm_cmpeq_epu8(z, z); \
+	z = _mm_srli_epu8(z, 1); \
+	tmp = _mm_cmplt_epu8(x, z); \
+	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
+}
+#endif
+
+/**
+ * Aligns by filling a dynamic programming matrix with the SSE-accelerated,
+ * banded DP approach of Farrar.  As it goes, it determines which cells we
+ * might backtrace from and tallies the best (highest-scoring) N backtrace
+ * candidate cells per diagonal.  Also returns the alignment score of the best
+ * alignment in the matrix.
+ *
+ * This routine does *not* maintain a matrix holding the entire matrix worth of
+ * scores, nor does it maintain any other dense O(mn) data structure, as this
+ * would quickly exhaust memory for queries longer than about 10,000 kb.
+ * Instead, in the fill stage it maintains two columns worth of scores at a
+ * time (current/previous, or right/left) - these take O(m) space.  When
+ * finished with the current column, it determines which cells from the
+ * previous column, if any, are candidates we might backtrace from to find a
+ * full alignment.  A candidate cell has a score that rises above the threshold
+ * and isn't improved upon by a match in the next column.  The best N
+ * candidates per diagonal are stored in a O(m + n) data structure.
+ */
+TAlScore SwAligner::alignGatherEE8(int& flag, bool debug) {
+	assert_leq(rdf_, rd_->length());
+	assert_leq(rdf_, qu_->length());
+	assert_lt(rfi_, rff_);
+	assert_lt(rdi_, rdf_);
+	assert_eq(rd_->length(), qu_->length());
+	assert_geq(sc_->gapbar, 1);
+	assert(repOk());
+#ifndef NDEBUG
+	for(size_t i = (size_t)rfi_; i < (size_t)rff_; i++) {
+		assert_range(0, 16, (int)rf_[i]);
+	}
+#endif
+
+	SSEData& d = fw_ ? sseU8fw_ : sseU8rc_;
+	SSEMetrics& met = extend_ ? sseU8ExtendMet_ : sseU8MateMet_;
+	if(!debug) met.dp++;
+	buildQueryProfileEnd2EndSseU8(fw_);
+	assert(!d.profbuf_.empty());
+
+	assert_eq(0, d.maxBonus_);
+	size_t iter =
+		(dpRows() + (NWORDS_PER_REG-1)) / NWORDS_PER_REG; // iter = segLen
+	
+	int dup;
+	
+	// Now set up the score vectors.  We just need two columns worth, which
+	// we'll call "left" and "right".
+	d.vecbuf_.resize(4 * 2 * iter);
+	d.vecbuf_.zero();
+	__m128i *vbuf_l = d.vecbuf_.ptr();
+	__m128i *vbuf_r = d.vecbuf_.ptr() + (4 * iter);
+
+	// This is the data structure that holds candidate cells per diagonal.
+	const size_t ndiags = rff_ - rfi_ + dpRows() - 1;
+	if(!debug) {
+		btdiag_.init(ndiags, 2);
+	}
+
+	// Data structure that holds checkpointed anti-diagonals
+	TAlScore perfectScore = sc_->perfectScore(dpRows());
+	bool checkpoint = true;
+	bool cpdebug = false;
+#ifndef NDEBUG
+	cpdebug = dpRows() < 1000;
+#endif
+	cper_.init(
+		dpRows(),      // # rows
+		rff_ - rfi_,   // # columns
+		cperPerPow2_,  // checkpoint every 1 << perpow2 diags (& next)
+		perfectScore,  // perfect score (for sanity checks)
+		true,          // matrix cells have 8-bit scores?
+		cperTri_,      // triangular mini-fills?
+		false,         // alignment is local?
+		cpdebug);      // save all cells for debugging?
+
+	// Many thanks to Michael Farrar for releasing his striped Smith-Waterman
+	// implementation:
+	//
+	//  http://sites.google.com/site/farrarmichael/smith-waterman
+	//
+	// Much of the implementation below is adapted from Michael's code.
+
+	// Set all elts to reference gap open penalty
+	__m128i rfgapo   = _mm_setzero_si128();
+	__m128i rfgape   = _mm_setzero_si128();
+	__m128i rdgapo   = _mm_setzero_si128();
+	__m128i rdgape   = _mm_setzero_si128();
+	__m128i vlo      = _mm_setzero_si128();
+	__m128i vhi      = _mm_setzero_si128();
+	__m128i ve       = _mm_setzero_si128();
+	__m128i vf       = _mm_setzero_si128();
+	__m128i vh       = _mm_setzero_si128();
+	__m128i vhd      = _mm_setzero_si128();
+	__m128i vhdtmp   = _mm_setzero_si128();
+	__m128i vtmp     = _mm_setzero_si128();
+	__m128i vzero    = _mm_setzero_si128();
+	__m128i vhilsw   = _mm_setzero_si128();
+
+	assert_gt(sc_->refGapOpen(), 0);
+	assert_leq(sc_->refGapOpen(), MAX_U8);
+	dup = (sc_->refGapOpen() << 8) | (sc_->refGapOpen() & 0x00ff);
+	rfgapo = _mm_insert_epi16(rfgapo, dup, 0);
+	rfgapo = _mm_shufflelo_epi16(rfgapo, 0);
+	rfgapo = _mm_shuffle_epi32(rfgapo, 0);
+	
+	// Set all elts to reference gap extension penalty
+	assert_gt(sc_->refGapExtend(), 0);
+	assert_leq(sc_->refGapExtend(), MAX_U8);
+	assert_leq(sc_->refGapExtend(), sc_->refGapOpen());
+	dup = (sc_->refGapExtend() << 8) | (sc_->refGapExtend() & 0x00ff);
+	rfgape = _mm_insert_epi16(rfgape, dup, 0);
+	rfgape = _mm_shufflelo_epi16(rfgape, 0);
+	rfgape = _mm_shuffle_epi32(rfgape, 0);
+
+	// Set all elts to read gap open penalty
+	assert_gt(sc_->readGapOpen(), 0);
+	assert_leq(sc_->readGapOpen(), MAX_U8);
+	dup = (sc_->readGapOpen() << 8) | (sc_->readGapOpen() & 0x00ff);
+	rdgapo = _mm_insert_epi16(rdgapo, dup, 0);
+	rdgapo = _mm_shufflelo_epi16(rdgapo, 0);
+	rdgapo = _mm_shuffle_epi32(rdgapo, 0);
+	
+	// Set all elts to read gap extension penalty
+	assert_gt(sc_->readGapExtend(), 0);
+	assert_leq(sc_->readGapExtend(), MAX_U8);
+	assert_leq(sc_->readGapExtend(), sc_->readGapOpen());
+	dup = (sc_->readGapExtend() << 8) | (sc_->readGapExtend() & 0x00ff);
+	rdgape = _mm_insert_epi16(rdgape, dup, 0);
+	rdgape = _mm_shufflelo_epi16(rdgape, 0);
+	rdgape = _mm_shuffle_epi32(rdgape, 0);
+	
+	vhi = _mm_cmpeq_epi16(vhi, vhi); // all elts = 0xffff
+	vlo = _mm_xor_si128(vlo, vlo);   // all elts = 0
+	
+	// vhilsw: topmost (least sig) word set to 0x7fff, all other words=0
+	vhilsw = _mm_shuffle_epi32(vhi, 0);
+	vhilsw = _mm_srli_si128(vhilsw, NBYTES_PER_REG - NBYTES_PER_WORD);
+	
+	// Points to a long vector of __m128i where each element is a block of
+	// contiguous cells in the E, F or H matrix.  If the index % 3 == 0, then
+	// the block of cells is from the E matrix.  If index % 3 == 1, they're
+	// from the F matrix.  If index % 3 == 2, then they're from the H matrix.
+	// Blocks of cells are organized in the same interleaved manner as they are
+	// calculated by the Farrar algorithm.
+	const __m128i *pvScore; // points into the query profile
+
+	const size_t colstride = ROWSTRIDE_2COL * iter;
+	
+	// Initialize the H and E vectors in the first matrix column
+	__m128i *pvELeft = vbuf_l + 0; __m128i *pvERight = vbuf_r + 0;
+	/* __m128i *pvFLeft = vbuf_l + 1; */ __m128i *pvFRight = vbuf_r + 1;
+	__m128i *pvHLeft = vbuf_l + 2; __m128i *pvHRight = vbuf_r + 2;
+	
+	// Maximum score in final row
+	bool found = false;
+	TCScore lrmax = MIN_U8;
+	
+	for(size_t i = 0; i < iter; i++) {
+		_mm_store_si128(pvERight, vlo); pvERight += ROWSTRIDE_2COL;
+		// Could initialize Hs to high or low.  If high, cells in the lower
+		// triangle will have somewhat more legitimate scores, but still won't
+		// be exhaustively scored.
+		_mm_store_si128(pvHRight, vlo); pvHRight += ROWSTRIDE_2COL;
+	}
+	
+	assert_gt(sc_->gapbar, 0);
+	size_t nfixup = 0;
+
+	// Fill in the table as usual but instead of using the same gap-penalty
+	// vector for each iteration of the inner loop, load words out of a
+	// pre-calculated gap vector parallel to the query profile.  The pre-
+	// calculated gap vectors enforce the gap barrier constraint by making it
+	// infinitely costly to introduce a gap in barrier rows.
+	//
+	// AND use a separate loop to fill in the first row of the table, enforcing
+	// the st_ constraints in the process.  This is awkward because it
+	// separates the processing of the first row from the others and might make
+	// it difficult to use the first-row results in the next row, but it might
+	// be the simplest and least disruptive way to deal with the st_ constraint.
+	
+	for(size_t i = (size_t)rfi_; i < (size_t)rff_; i++) {
+		// Swap left and right; vbuf_l is the vector on the left, which we
+		// generally load from, and vbuf_r is the vector on the right, which we
+		// generally store to.
+		swap(vbuf_l, vbuf_r);
+		pvELeft = vbuf_l + 0; pvERight = vbuf_r + 0;
+		/* pvFLeft = vbuf_l + 1; */ pvFRight = vbuf_r + 1;
+		pvHLeft = vbuf_l + 2; pvHRight = vbuf_r + 2;
+		
+		// Fetch the appropriate query profile.  Note that elements of rf_ must
+		// be numbers, not masks.
+		const int refc = (int)rf_[i];
+		
+		// Fetch the appropriate query profile
+		size_t off = (size_t)firsts5[refc] * iter * 2;
+		pvScore = d.profbuf_.ptr() + off; // even elts = query profile, odd = gap barrier
+		
+		// Set all cells to low value
+		vf = _mm_xor_si128(vf, vf);
+
+		// Load H vector from the final row of the previous column
+		vh = _mm_load_si128(pvHLeft + colstride - ROWSTRIDE_2COL);
+		// Shift 2 bytes down so that topmost (least sig) cell gets 0
+		vh = _mm_slli_si128(vh, NBYTES_PER_WORD);
+		// Fill topmost (least sig) cell with high value
+		vh = _mm_or_si128(vh, vhilsw);
+		
+		// For each character in the reference text:
+		size_t j;
+		for(j = 0; j < iter; j++) {
+			// Load cells from E, calculated previously
+			ve = _mm_load_si128(pvELeft);
+			vhd = _mm_load_si128(pvHLeft);
+			assert_all_lt(ve, vhi);
+			pvELeft += ROWSTRIDE_2COL;
+			
+			// Store cells in F, calculated previously
+			vf = _mm_subs_epu8(vf, pvScore[1]); // veto some ref gap extensions
+			_mm_store_si128(pvFRight, vf);
+			pvFRight += ROWSTRIDE_2COL;
+			
+			// Factor in query profile (matches and mismatches)
+			vh = _mm_subs_epu8(vh, pvScore[0]);
+			
+			// Update H, factoring in E and F
+			vh = _mm_max_epu8(vh, vf);
+			
+			// Update vE value
+			vhdtmp = vhd;
+			vhd = _mm_subs_epu8(vhd, rdgapo);
+			vhd = _mm_subs_epu8(vhd, pvScore[1]); // veto some read gap opens
+			ve = _mm_subs_epu8(ve, rdgape);
+			ve = _mm_max_epu8(ve, vhd);
+			vh = _mm_max_epu8(vh, ve);
+			
+			// Save the new vH values
+			_mm_store_si128(pvHRight, vh);
+			pvHRight += ROWSTRIDE_2COL;
+			vtmp = vh;
+			assert_all_lt(ve, vhi);
+			
+			// Load the next h value
+			vh = vhdtmp;
+			pvHLeft += ROWSTRIDE_2COL;
+
+			// Save E values
+			_mm_store_si128(pvERight, ve);
+			pvERight += ROWSTRIDE_2COL;
+			
+			// Update vf value
+			vtmp = _mm_subs_epu8(vtmp, rfgapo);
+
+			vf = _mm_subs_epu8(vf, rfgape);
+			assert_all_lt(vf, vhi);
+			vf = _mm_max_epu8(vf, vtmp);
+			
+			pvScore += 2; // move on to next query profile / gap veto
+		}
+		// pvHStore, pvELoad, pvEStore have all rolled over to the next column
+		pvFRight -= colstride; // reset to start of column
+		vtmp = _mm_load_si128(pvFRight);
+		
+		pvHRight -= colstride; // reset to start of column
+		vh = _mm_load_si128(pvHRight);
+		
+		pvScore = d.profbuf_.ptr() + off + 1; // reset veto vector
+		
+		// vf from last row gets shifted down by one to overlay the first row
+		// rfgape has already been subtracted from it.
+		vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
+		
+		vf = _mm_subs_epu8(vf, *pvScore); // veto some ref gap extensions
+		vf = _mm_max_epu8(vtmp, vf);
+		vtmp = _mm_subs_epu8(vf, vtmp);
+		vtmp = _mm_cmpeq_epi8(vtmp, vzero);
+		int cmp = _mm_movemask_epi8(vtmp);
+		
+		// If any element of vtmp is greater than H - gap-open...
+		j = 0;
+		while(cmp != 0xffff) {
+			// Store this vf
+			_mm_store_si128(pvFRight, vf);
+			pvFRight += ROWSTRIDE_2COL;
+			
+			// Update vh w/r/t new vf
+			vh = _mm_max_epu8(vh, vf);
+			
+			// Save vH values
+			_mm_store_si128(pvHRight, vh);
+			pvHRight += ROWSTRIDE_2COL;
+			
+			pvScore += 2;
+			
+			assert_lt(j, iter);
+			if(++j == iter) {
+				pvFRight -= colstride;
+				vtmp = _mm_load_si128(pvFRight);   // load next vf ASAP
+				pvHRight -= colstride;
+				vh = _mm_load_si128(pvHRight);     // load next vh ASAP
+				pvScore = d.profbuf_.ptr() + off + 1;
+				j = 0;
+				vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
+			} else {
+				vtmp = _mm_load_si128(pvFRight);   // load next vf ASAP
+				vh = _mm_load_si128(pvHRight);     // load next vh ASAP
+			}
+			
+			// Update F with another gap extension
+			vf = _mm_subs_epu8(vf, rfgape);
+			vf = _mm_subs_epu8(vf, *pvScore); // veto some ref gap extensions
+			vf = _mm_max_epu8(vtmp, vf);
+			vtmp = _mm_subs_epu8(vf, vtmp);
+			vtmp = _mm_cmpeq_epi8(vtmp, vzero);
+			cmp = _mm_movemask_epi8(vtmp);
+			nfixup++;
+		}
+		
+		// Check in the last row for the maximum so far
+		__m128i *vtmp = vbuf_r + 2 /* H */ + (d.lastIter_ * ROWSTRIDE_2COL);
+		// Note: we may not want to extract from the final row
+		TCScore lr = ((TCScore*)(vtmp))[d.lastWord_];
+		found = true;
+		if(lr > lrmax) {
+			lrmax = lr;
+		}
+		
+		// Now we'd like to know whether the bottommost element of the right
+		// column is a candidate we might backtrace from.  First question is:
+		// did it exceed the minimum score threshold?
+		TAlScore score = (TAlScore)(lr - 0xff);
+		if(lr == MIN_U8) {
+			score = MIN_I64;
+		}
+		if(!debug && score >= minsc_) {
+			DpBtCandidate cand(dpRows() - 1, i - rfi_, score);
+			btdiag_.add(i - rfi_, cand);
+		}
+
+		// Save some elements to checkpoints
+		if(checkpoint) {
+			
+			__m128i *pvE = vbuf_r + 0;
+			__m128i *pvF = vbuf_r + 1;
+			__m128i *pvH = vbuf_r + 2;
+			size_t coli = i - rfi_;
+			if(coli < cper_.locol_) cper_.locol_ = coli;
+			if(coli > cper_.hicol_) cper_.hicol_ = coli;
+			
+			if(cperTri_) {
+				size_t rc_mod = coli & cper_.lomask_;
+				assert_lt(rc_mod, cper_.per_);
+				int64_t row = -rc_mod-1;
+				int64_t row_mod = row;
+				int64_t row_div = 0;
+				size_t idx = coli >> cper_.perpow2_;
+				size_t idxrow = idx * cper_.nrow_;
+				assert_eq(4, ROWSTRIDE_2COL);
+				bool done = false;
+				while(true) {
+					row += (cper_.per_ - 2);
+					row_mod += (cper_.per_ - 2);
+					for(size_t j = 0; j < 2; j++) {
+						row++;
+						row_mod++;
+						if(row >= 0 && (size_t)row < cper_.nrow_) {
+							// Update row divided by iter_ and mod iter_
+							while(row_mod >= (int64_t)iter) {
+								row_mod -= (int64_t)iter;
+								row_div++;
+							}
+							size_t delt = idxrow + row;
+							size_t vecoff = (row_mod << 6) + row_div;
+							assert_lt(row_div, 16);
+							int16_t h_sc = ((uint8_t*)pvH)[vecoff];
+							int16_t e_sc = ((uint8_t*)pvE)[vecoff];
+							int16_t f_sc = ((uint8_t*)pvF)[vecoff];
+							if(h_sc == 0) h_sc = MIN_I16;
+							else h_sc -= 0xff;
+							if(e_sc == 0) e_sc = MIN_I16;
+							else e_sc -= 0xff;
+							if(f_sc == 0) f_sc = MIN_I16;
+							else f_sc -= 0xff;
+							assert_leq(h_sc, cper_.perf_);
+							assert_leq(e_sc, cper_.perf_);
+							assert_leq(f_sc, cper_.perf_);
+							CpQuad *qdiags = ((j == 0) ? cper_.qdiag1s_.ptr() : cper_.qdiag2s_.ptr());
+							qdiags[delt].sc[0] = h_sc;
+							qdiags[delt].sc[1] = e_sc;
+							qdiags[delt].sc[2] = f_sc;
+						} // if(row >= 0 && row < nrow_)
+						else if(row >= 0 && (size_t)row >= cper_.nrow_) {
+							done = true;
+							break;
+						}
+					} // end of loop over anti-diags
+					if(done) {
+						break;
+					}
+					idx++;
+					idxrow += cper_.nrow_;
+				}
+			} else {
+				// If this is the first column, take this opportunity to
+				// pre-calculate the coordinates of the elements we're going to
+				// checkpoint.
+				if(coli == 0) {
+					size_t cpi    = cper_.per_-1;
+					size_t cpimod = cper_.per_-1;
+					size_t cpidiv = 0;
+					cper_.commitMap_.clear();
+					while(cpi < cper_.nrow_) {
+						while(cpimod >= iter) {
+							cpimod -= iter;
+							cpidiv++;
+						}
+						size_t vecoff = (cpimod << 6) + cpidiv;
+						cper_.commitMap_.push_back(vecoff);
+						cpi += cper_.per_;
+						cpimod += cper_.per_;
+					}
+				}
+				// Save all the rows
+				size_t rowoff = 0;
+				size_t sz = cper_.commitMap_.size();
+				for(size_t i = 0; i < sz; i++, rowoff += cper_.ncol_) {
+					size_t vecoff = cper_.commitMap_[i];
+					int16_t h_sc = ((uint8_t*)pvH)[vecoff];
+					//int16_t e_sc = ((uint8_t*)pvE)[vecoff];
+					int16_t f_sc = ((uint8_t*)pvF)[vecoff];
+					if(h_sc == 0) h_sc = MIN_I16;
+					else h_sc -= 0xff;
+					//if(e_sc == 0) e_sc = MIN_I16;
+					//else e_sc -= 0xff;
+					if(f_sc == 0) f_sc = MIN_I16;
+					else f_sc -= 0xff;
+					assert_leq(h_sc, cper_.perf_);
+					//assert_leq(e_sc, cper_.perf_);
+					assert_leq(f_sc, cper_.perf_);
+					CpQuad& dst = cper_.qrows_[rowoff + coli];
+					dst.sc[0] = h_sc;
+					//dst.sc[1] = e_sc;
+					dst.sc[2] = f_sc;
+				}
+				// Is this a column we'd like to checkpoint?
+				if((coli & cper_.lomask_) == cper_.lomask_) {
+					// Save the column using memcpys
+					assert_gt(coli, 0);
+					size_t wordspercol = cper_.niter_ * ROWSTRIDE_2COL;
+					size_t coloff = (coli >> cper_.perpow2_) * wordspercol;
+					__m128i *dst = cper_.qcols_.ptr() + coloff;
+					memcpy(dst, vbuf_r, sizeof(__m128i) * wordspercol);
+				}
+			}
+			if(cper_.debug_) {
+				// Save the column using memcpys
+				size_t wordspercol = cper_.niter_ * ROWSTRIDE_2COL;
+				size_t coloff = coli * wordspercol;
+				__m128i *dst = cper_.qcolsD_.ptr() + coloff;
+				memcpy(dst, vbuf_r, sizeof(__m128i) * wordspercol);
+			}
+		}
+	}
+	
+	// Update metrics
+	if(!debug) {
+		size_t ninner = (rff_ - rfi_) * iter;
+		met.col   += (rff_ - rfi_);             // DP columns
+		met.cell  += (ninner * NWORDS_PER_REG); // DP cells
+		met.inner += ninner;                    // DP inner loop iters
+		met.fixup += nfixup;                    // DP fixup loop iters
+	}
+
+	flag = 0;
+
+	// Did we find a solution?
+	TAlScore score = MIN_I64;
+	if(!found) {
+		flag = -1; // no
+		if(!debug) met.dpfail++;
+		return MIN_I64;
+	} else {
+		score = (TAlScore)(lrmax - 0xff);
+		if(score < minsc_) {
+			flag = -1; // no
+			if(!debug) met.dpfail++;
+			return score;
+		}
+	}
+	
+	// Could we have saturated?
+	if(lrmax == MIN_U8) {
+		flag = -2; // yes
+		if(!debug) met.dpsat++;
+		return MIN_I64;
+	}
+
+	// Now take all the backtrace candidates in the btdaig_ structure and
+	// dump them into the btncand_ array.  They'll be sorted later.
+	if(!debug) {
+		btdiag_.dump(btncand_);
+		assert(!btncand_.empty());
+	}
+	
+	// Return largest score
+	if(!debug) met.dpsucc++;
+	return score;
+}
+
+/**
+ * Solve the current alignment problem using SSE instructions that operate on 16
+ * unsigned 8-bit values packed into a single 128-bit register.
+ *
+ * Uses Farrar's striped dynamic-programming layout.  Cell scores are stored as
+ * unsigned bytes offset by 0xff, so a stored 0 represents "minus infinity" and
+ * the saturating subtracts (_mm_subs_epu8) clamp penalties at that floor.
+ *
+ * @param flag  out: set to 0 on success, -1 when no cell in the final row met
+ *              the minimum score, -2 when the 8-bit range may have saturated
+ * @param debug in: when true, skip metrics accounting
+ * @return best score observed in the final row (offset removed), or MIN_I64
+ *         on failure or saturation
+ */
+TAlScore SwAligner::alignNucleotidesEnd2EndSseU8(int& flag, bool debug) {
+	assert_leq(rdf_, rd_->length());
+	assert_leq(rdf_, qu_->length());
+	assert_lt(rfi_, rff_);
+	assert_lt(rdi_, rdf_);
+	assert_eq(rd_->length(), qu_->length());
+	assert_geq(sc_->gapbar, 1);
+	assert(repOk());
+#ifndef NDEBUG
+	// Reference characters must be small numeric codes, not bitmasks
+	for(size_t i = (size_t)rfi_; i < (size_t)rff_; i++) {
+		assert_range(0, 16, (int)rf_[i]);
+	}
+#endif
+
+	// Pick the buffers for the current strand and the metrics bucket for the
+	// current phase (seed extension vs. mate alignment)
+	SSEData& d = fw_ ? sseU8fw_ : sseU8rc_;
+	SSEMetrics& met = extend_ ? sseU8ExtendMet_ : sseU8MateMet_;
+	if(!debug) met.dp++;
+	buildQueryProfileEnd2EndSseU8(fw_);
+	assert(!d.profbuf_.empty());
+
+	assert_eq(0, d.maxBonus_); // end-to-end mode: no match bonus
+	size_t iter =
+		(dpRows() + (NWORDS_PER_REG-1)) / NWORDS_PER_REG; // iter = segLen
+
+	int dup;
+	
+	// Many thanks to Michael Farrar for releasing his striped Smith-Waterman
+	// implementation:
+	//
+	//  http://sites.google.com/site/farrarmichael/smith-waterman
+	//
+	// Much of the implementation below is adapted from Michael's code.
+
+	// Set all elts to reference gap open penalty
+	__m128i rfgapo   = _mm_setzero_si128();
+	__m128i rfgape   = _mm_setzero_si128();
+	__m128i rdgapo   = _mm_setzero_si128();
+	__m128i rdgape   = _mm_setzero_si128();
+	__m128i vlo      = _mm_setzero_si128();
+	__m128i vhi      = _mm_setzero_si128();
+	__m128i ve       = _mm_setzero_si128();
+	__m128i vf       = _mm_setzero_si128();
+	__m128i vh       = _mm_setzero_si128();
+#if 0
+	__m128i vhd      = _mm_setzero_si128();
+	__m128i vhdtmp   = _mm_setzero_si128();
+#endif
+	__m128i vtmp     = _mm_setzero_si128();
+	__m128i vzero    = _mm_setzero_si128();
+	__m128i vhilsw   = _mm_setzero_si128();
+
+	assert_gt(sc_->refGapOpen(), 0);
+	assert_leq(sc_->refGapOpen(), MAX_U8);
+	// Duplicate the 8-bit penalty into both bytes of a 16-bit word, then
+	// broadcast that word to all eight lanes of the register
+	dup = (sc_->refGapOpen() << 8) | (sc_->refGapOpen() & 0x00ff);
+	rfgapo = _mm_insert_epi16(rfgapo, dup, 0);
+	rfgapo = _mm_shufflelo_epi16(rfgapo, 0);
+	rfgapo = _mm_shuffle_epi32(rfgapo, 0);
+	
+	// Set all elts to reference gap extension penalty
+	assert_gt(sc_->refGapExtend(), 0);
+	assert_leq(sc_->refGapExtend(), MAX_U8);
+	assert_leq(sc_->refGapExtend(), sc_->refGapOpen());
+	dup = (sc_->refGapExtend() << 8) | (sc_->refGapExtend() & 0x00ff);
+	rfgape = _mm_insert_epi16(rfgape, dup, 0);
+	rfgape = _mm_shufflelo_epi16(rfgape, 0);
+	rfgape = _mm_shuffle_epi32(rfgape, 0);
+
+	// Set all elts to read gap open penalty
+	assert_gt(sc_->readGapOpen(), 0);
+	assert_leq(sc_->readGapOpen(), MAX_U8);
+	dup = (sc_->readGapOpen() << 8) | (sc_->readGapOpen() & 0x00ff);
+	rdgapo = _mm_insert_epi16(rdgapo, dup, 0);
+	rdgapo = _mm_shufflelo_epi16(rdgapo, 0);
+	rdgapo = _mm_shuffle_epi32(rdgapo, 0);
+	
+	// Set all elts to read gap extension penalty
+	assert_gt(sc_->readGapExtend(), 0);
+	assert_leq(sc_->readGapExtend(), MAX_U8);
+	assert_leq(sc_->readGapExtend(), sc_->readGapOpen());
+	dup = (sc_->readGapExtend() << 8) | (sc_->readGapExtend() & 0x00ff);
+	rdgape = _mm_insert_epi16(rdgape, dup, 0);
+	rdgape = _mm_shufflelo_epi16(rdgape, 0);
+	rdgape = _mm_shuffle_epi32(rdgape, 0);
+	
+	vhi = _mm_cmpeq_epi16(vhi, vhi); // all elts = 0xffff
+	vlo = _mm_xor_si128(vlo, vlo);   // all elts = 0
+	
+	// vhilsw: topmost (least sig) word set to all-ones (high), other words=0
+	vhilsw = _mm_shuffle_epi32(vhi, 0);
+	vhilsw = _mm_srli_si128(vhilsw, NBYTES_PER_REG - NBYTES_PER_WORD);
+	
+	// Points to a long vector of __m128i where each element is a block of
+	// contiguous cells in the E, F or H matrix.  If the index % 3 == 0, then
+	// the block of cells is from the E matrix.  If index % 3 == 1, they're
+	// from the F matrix.  If index % 3 == 2, then they're from the H matrix.
+	// Blocks of cells are organized in the same interleaved manner as they are
+	// calculated by the Farrar algorithm.
+	const __m128i *pvScore; // points into the query profile
+
+	d.mat_.init(dpRows(), rff_ - rfi_, NWORDS_PER_REG);
+	const size_t colstride = d.mat_.colstride();
+	//const size_t rowstride = d.mat_.rowstride();
+	assert_eq(ROWSTRIDE, colstride / iter);
+	
+	// Initialize the H and E vectors in the first matrix column
+	__m128i *pvHTmp = d.mat_.tmpvec(0, 0);
+	__m128i *pvETmp = d.mat_.evec(0, 0);
+	
+	// Maximum score in final row
+	bool found = false;
+	TCScore lrmax = MIN_U8;
+	
+	for(size_t i = 0; i < iter; i++) {
+		_mm_store_si128(pvETmp, vlo);
+		_mm_store_si128(pvHTmp, vlo); // start high in end-to-end mode
+		pvETmp += ROWSTRIDE;
+		pvHTmp += ROWSTRIDE;
+	}
+	// These are swapped just before the innermost loop
+	__m128i *pvHStore = d.mat_.hvec(0, 0);
+	__m128i *pvHLoad  = d.mat_.tmpvec(0, 0);
+	__m128i *pvELoad  = d.mat_.evec(0, 0);
+	__m128i *pvEStore = d.mat_.evecUnsafe(0, 1);
+	__m128i *pvFStore = d.mat_.fvec(0, 0);
+	__m128i *pvFTmp   = NULL;
+	
+	assert_gt(sc_->gapbar, 0);
+	size_t nfixup = 0;
+	
+	// Fill in the table as usual but instead of using the same gap-penalty
+	// vector for each iteration of the inner loop, load words out of a
+	// pre-calculated gap vector parallel to the query profile.  The pre-
+	// calculated gap vectors enforce the gap barrier constraint by making it
+	// infinitely costly to introduce a gap in barrier rows.
+	//
+	// AND use a separate loop to fill in the first row of the table, enforcing
+	// the st_ constraints in the process.  This is awkward because it
+	// separates the processing of the first row from the others and might make
+	// it difficult to use the first-row results in the next row, but it might
+	// be the simplest and least disruptive way to deal with the st_ constraint.
+
+	colstop_ = rff_ - 1;
+	lastsolcol_ = 0;
+
+	// Outer loop: one iteration per reference character (one DP column)
+	for(size_t i = (size_t)rfi_; i < (size_t)rff_; i++) {
+		assert(pvFStore == d.mat_.fvec(0, i - rfi_));
+		assert(pvHStore == d.mat_.hvec(0, i - rfi_));
+		
+		// Fetch the appropriate query profile.  Note that elements of rf_ must
+		// be numbers, not masks.
+		const int refc = (int)rf_[i];
+		size_t off = (size_t)firsts5[refc] * iter * 2;
+		pvScore = d.profbuf_.ptr() + off; // even elts = query profile, odd = gap barrier
+		
+		// Set all cells to low value
+		vf = _mm_xor_si128(vf, vf);
+
+		// Load H vector from the final row of the previous column
+		vh = _mm_load_si128(pvHLoad + colstride - ROWSTRIDE);
+		// Shift 2 bytes down so that topmost (least sig) cell gets 0
+		vh = _mm_slli_si128(vh, NBYTES_PER_WORD);
+		// Fill topmost (least sig) cell with high value
+		vh = _mm_or_si128(vh, vhilsw);
+		
+		// For each character in the reference text:
+		size_t j;
+		for(j = 0; j < iter; j++) {
+			// Load cells from E, calculated previously
+			ve = _mm_load_si128(pvELoad);
+#if 0
+			vhd = _mm_load_si128(pvHLoad);
+#endif
+			assert_all_lt(ve, vhi);
+			pvELoad += ROWSTRIDE;
+			
+			// Store cells in F, calculated previously
+			vf = _mm_subs_epu8(vf, pvScore[1]); // veto some ref gap extensions
+			_mm_store_si128(pvFStore, vf);
+			pvFStore += ROWSTRIDE;
+			
+			// Factor in query profile (matches and mismatches)
+			vh = _mm_subs_epu8(vh, pvScore[0]);
+			
+			// Update H, factoring in E and F
+			vh = _mm_max_epu8(vh, ve);
+			vh = _mm_max_epu8(vh, vf);
+			
+			// Save the new vH values
+			_mm_store_si128(pvHStore, vh);
+			pvHStore += ROWSTRIDE;
+			
+			// Update vE value
+			vtmp = vh;
+#if 0
+			vhdtmp = vhd;
+			vhd = _mm_subs_epu8(vhd, rdgapo);
+			vhd = _mm_subs_epu8(vhd, pvScore[1]); // veto some read gap opens
+			ve = _mm_subs_epu8(ve, rdgape);
+			ve = _mm_max_epu8(ve, vhd);
+#else
+			vh = _mm_subs_epu8(vh, rdgapo);
+			vh = _mm_subs_epu8(vh, pvScore[1]); // veto some read gap opens
+			ve = _mm_subs_epu8(ve, rdgape);
+			ve = _mm_max_epu8(ve, vh);
+#endif
+			assert_all_lt(ve, vhi);
+			
+			// Load the next h value
+#if 0
+			vh = vhdtmp;
+#else
+			vh = _mm_load_si128(pvHLoad);
+#endif
+			pvHLoad += ROWSTRIDE;
+			
+			// Save E values
+			_mm_store_si128(pvEStore, ve);
+			pvEStore += ROWSTRIDE;
+			
+			// Update vf value
+			vtmp = _mm_subs_epu8(vtmp, rfgapo);
+			vf = _mm_subs_epu8(vf, rfgape);
+			assert_all_lt(vf, vhi);
+			vf = _mm_max_epu8(vf, vtmp);
+			
+			pvScore += 2; // move on to next query profile / gap veto
+		}
+		// pvHStore, pvELoad, pvEStore have all rolled over to the next column
+		pvFTmp = pvFStore;
+		pvFStore -= colstride; // reset to start of column
+		vtmp = _mm_load_si128(pvFStore);
+		
+		pvHStore -= colstride; // reset to start of column
+		vh = _mm_load_si128(pvHStore);
+		
+#if 0
+#else
+		pvEStore -= colstride; // reset to start of column
+		ve = _mm_load_si128(pvEStore);
+#endif
+		
+		pvHLoad = pvHStore;    // new pvHLoad = pvHStore
+		pvScore = d.profbuf_.ptr() + off + 1; // reset veto vector
+		
+		// vf from last row gets shifted down by one to overlay the first row
+		// rfgape has already been subtracted from it.
+		vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
+		
+		vf = _mm_subs_epu8(vf, *pvScore); // veto some ref gap extensions
+		vf = _mm_max_epu8(vtmp, vf);
+		vtmp = _mm_subs_epu8(vf, vtmp);
+		vtmp = _mm_cmpeq_epi8(vtmp, vzero);
+		int cmp = _mm_movemask_epi8(vtmp);
+		
+		// Farrar "lazy F" fixup loop: re-propagate F down the column until no
+		// cell improves.  If any element of vtmp is greater than H - gap-open...
+		j = 0;
+		while(cmp != 0xffff) {
+			// Store this vf
+			_mm_store_si128(pvFStore, vf);
+			pvFStore += ROWSTRIDE;
+			
+			// Update vh w/r/t new vf
+			vh = _mm_max_epu8(vh, vf);
+			
+			// Save vH values
+			_mm_store_si128(pvHStore, vh);
+			pvHStore += ROWSTRIDE;
+			
+			// Update E in case it can be improved using our new vh
+#if 0
+#else
+			vh = _mm_subs_epu8(vh, rdgapo);
+			vh = _mm_subs_epu8(vh, *pvScore); // veto some read gap opens
+			ve = _mm_max_epu8(ve, vh);
+			_mm_store_si128(pvEStore, ve);
+			pvEStore += ROWSTRIDE;
+#endif
+			pvScore += 2;
+			
+			assert_lt(j, iter);
+			if(++j == iter) {
+				pvFStore -= colstride;
+				vtmp = _mm_load_si128(pvFStore);   // load next vf ASAP
+				pvHStore -= colstride;
+				vh = _mm_load_si128(pvHStore);     // load next vh ASAP
+#if 0
+#else
+				pvEStore -= colstride;
+				ve = _mm_load_si128(pvEStore);     // load next ve ASAP
+#endif
+				pvScore = d.profbuf_.ptr() + off + 1;
+				j = 0;
+				vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
+			} else {
+				vtmp = _mm_load_si128(pvFStore);   // load next vf ASAP
+				vh = _mm_load_si128(pvHStore);     // load next vh ASAP
+#if 0
+#else
+				ve = _mm_load_si128(pvEStore);     // load next ve ASAP
+#endif
+			}
+			
+			// Update F with another gap extension
+			vf = _mm_subs_epu8(vf, rfgape);
+			vf = _mm_subs_epu8(vf, *pvScore); // veto some ref gap extensions
+			vf = _mm_max_epu8(vtmp, vf);
+			vtmp = _mm_subs_epu8(vf, vtmp);
+			vtmp = _mm_cmpeq_epi8(vtmp, vzero);
+			cmp = _mm_movemask_epi8(vtmp);
+			nfixup++;
+		}
+		
+#ifndef NDEBUG
+		if(true && (rand() & 15) == 0) {
+			// This is a work-intensive sanity check; each time we finish filling
+			// a column, we check that each H, E, and F is sensible.
+			for(size_t k = 0; k < dpRows(); k++) {
+				assert(cellOkEnd2EndU8(
+					d,
+					k,                   // row
+					i - rfi_,            // col
+					refc,                // reference mask
+					(int)(*rd_)[rdi_+k], // read char
+					(int)(*qu_)[rdi_+k], // read quality
+					*sc_));              // scoring scheme
+			}
+		}
+#endif
+		
+		// Check the bottom (final) row of this column for a new maximum
+		__m128i *vtmp = d.mat_.hvec(d.lastIter_, i-rfi_);
+		// Note: we may not want to extract from the final row
+		TCScore lr = ((TCScore*)(vtmp))[d.lastWord_];
+		found = true;
+		if(lr > lrmax) {
+			lrmax = lr;
+		}
+
+		// pvELoad and pvHLoad are already where they need to be
+		
+		// Adjust the load and store vectors here.  
+		pvHStore = pvHLoad + colstride;
+		pvEStore = pvELoad + colstride;
+		pvFStore = pvFTmp;
+	}
+	
+	// Update metrics
+	if(!debug) {
+		size_t ninner = (rff_ - rfi_) * iter;
+		met.col   += (rff_ - rfi_);             // DP columns
+		met.cell  += (ninner * NWORDS_PER_REG); // DP cells
+		met.inner += ninner;                    // DP inner loop iters
+		met.fixup += nfixup;                    // DP fixup loop iters
+	}
+	
+	flag = 0;
+	
+	// Did we find a solution?
+	TAlScore score = MIN_I64;
+	if(!found) {
+		flag = -1; // no
+		if(!debug) met.dpfail++;
+		return MIN_I64;
+	} else {
+		// Remove the 0xff offset used by the unsigned 8-bit representation
+		score = (TAlScore)(lrmax - 0xff);
+		if(score < minsc_) {
+			flag = -1; // no
+			if(!debug) met.dpfail++;
+			return score;
+		}
+	}
+	
+	// Could we have saturated?
+	if(lrmax == MIN_U8) {
+		flag = -2; // yes
+		if(!debug) met.dpsat++;
+		return MIN_I64;
+	}
+	
+	// Return largest score
+	if(!debug) met.dpsucc++;
+	return score;
+}
+
+/**
+ * Given a filled-in DP table, populate the btncand_ list with candidate cells
+ * that might be at the ends of valid alignments.  No need to do this unless
+ * the maximum score returned by the align*() func is >= the minimum.
+ *
+ * Only cells that are exhaustively scored are candidates.  Those are the
+ * cells inside the shape made of o's in this:
+ *
+ *  |-maxgaps-|
+ *  *********************************    -
+ *   ********************************    |
+ *    *******************************    |
+ *     ******************************    |
+ *      *****************************    |
+ *       **************************** read len
+ *        ***************************    |
+ *         **************************    |
+ *          *************************    |
+ *           ************************    |
+ *            ***********oooooooooooo    -
+ *            |-maxgaps-|
+ *  |-readlen-|
+ *  |-------skip--------|
+ *
+ * And it's possible for the shape to be truncated on the left and right sides.
+ *
+ * @param best the maximum score returned by the align*() routine; used here
+ *             only for sanity checking (some final-row cell must achieve it)
+ * @return true iff at least one candidate cell met the minimum score
+ */
+bool SwAligner::gatherCellsNucleotidesEnd2EndSseU8(TAlScore best) {
+	// What's the minimum number of rows that can possibly be spanned by an
+	// alignment that meets the minimum score requirement?
+	assert(sse8succ_);
+	const size_t ncol = rff_ - rfi_;
+	const size_t nrow = dpRows();
+	assert_gt(nrow, 0);
+	btncand_.clear();
+	btncanddone_.clear();
+	SSEData& d = fw_ ? sseU8fw_ : sseU8rc_;
+	SSEMetrics& met = extend_ ? sseU8ExtendMet_ : sseU8MateMet_;
+	assert(!d.profbuf_.empty());
+	const size_t colstride = d.mat_.colstride();
+	ASSERT_ONLY(bool sawbest = false);
+	// Walk the final (bottom) row one column at a time; in end-to-end mode
+	// only bottom-row cells can end a valid alignment
+	__m128i *pvH = d.mat_.hvec(d.lastIter_, 0);
+	for(size_t j = 0; j < ncol; j++) {
+		// Remove the 0xff offset used by the unsigned 8-bit representation
+		TAlScore sc = (TAlScore)(((TCScore*)pvH)[d.lastWord_] - 0xff);
+		assert_leq(sc, best);
+		ASSERT_ONLY(sawbest = (sawbest || sc == best));
+		if(sc >= minsc_) {
+			// Yes, this is legit
+			met.gathsol++;
+			btncand_.expand();
+			btncand_.back().init(nrow-1, j, sc);
+		}
+		pvH += colstride;
+	}
+	assert(sawbest);
+	if(!btncand_.empty()) {
+		d.mat_.initMasks(); // prepare backtrace masks only if we'll backtrace
+	}
+	return !btncand_.empty();
+}
+
+// Move one DP matrix cursor up one row in the striped (Farrar) layout.
+// 'vec' is the __m128i pointer, 'rowvec' the vector-row index within the
+// column, and 'rowelt' the lane index within each vector.  Wrapping past
+// vector row 0 jumps forward a whole column stride and back one lane.
+#define MOVE_VEC_PTR_UP(vec, rowvec, rowelt) { \
+	if(rowvec == 0) { \
+		rowvec += d.mat_.nvecrow_; \
+		vec += d.mat_.colstride_; \
+		rowelt--; \
+	} \
+	rowvec--; \
+	vec -= ROWSTRIDE; \
+}
+
+// Move one cursor left one column; rowvec/rowelt are unchanged.
+#define MOVE_VEC_PTR_LEFT(vec, rowvec, rowelt) { vec -= d.mat_.colstride_; }
+
+// Move one cursor diagonally (up one row, left one column).
+#define MOVE_VEC_PTR_UPLEFT(vec, rowvec, rowelt) { \
+ 	MOVE_VEC_PTR_UP(vec, rowvec, rowelt); \
+ 	MOVE_VEC_PTR_LEFT(vec, rowvec, rowelt); \
+}
+
+// Move all four backtrace cursors (current, left, up, upleft) left.
+#define MOVE_ALL_LEFT() { \
+	MOVE_VEC_PTR_LEFT(cur_vec, rowvec, rowelt); \
+	MOVE_VEC_PTR_LEFT(left_vec, left_rowvec, left_rowelt); \
+	MOVE_VEC_PTR_LEFT(up_vec, up_rowvec, up_rowelt); \
+	MOVE_VEC_PTR_LEFT(upleft_vec, upleft_rowvec, upleft_rowelt); \
+}
+
+// Move all four backtrace cursors up.
+#define MOVE_ALL_UP() { \
+	MOVE_VEC_PTR_UP(cur_vec, rowvec, rowelt); \
+	MOVE_VEC_PTR_UP(left_vec, left_rowvec, left_rowelt); \
+	MOVE_VEC_PTR_UP(up_vec, up_rowvec, up_rowelt); \
+	MOVE_VEC_PTR_UP(upleft_vec, upleft_rowvec, upleft_rowelt); \
+}
+
+// Move all four backtrace cursors diagonally (up and left).
+#define MOVE_ALL_UPLEFT() { \
+	MOVE_VEC_PTR_UPLEFT(cur_vec, rowvec, rowelt); \
+	MOVE_VEC_PTR_UPLEFT(left_vec, left_rowvec, left_rowelt); \
+	MOVE_VEC_PTR_UPLEFT(up_vec, up_rowvec, up_rowelt); \
+	MOVE_VEC_PTR_UPLEFT(upleft_vec, upleft_rowvec, upleft_rowelt); \
+}
+
+// Translate a logical (row, col) into striped-layout coordinates and
+// initialize the current cursor plus its left, up, and upleft neighbors.
+// Expects rowelt/rowvec/eltvec and the four *_vec/*_rowvec/*_rowelt
+// variables to be declared in the enclosing scope.
+#define NEW_ROW_COL(row, col) { \
+	rowelt = row / d.mat_.nvecrow_; \
+	rowvec = row % d.mat_.nvecrow_; \
+	eltvec = (col * d.mat_.colstride_) + (rowvec * ROWSTRIDE); \
+	cur_vec = d.mat_.matbuf_.ptr() + eltvec; \
+	left_vec = cur_vec; \
+	left_rowelt = rowelt; \
+	left_rowvec = rowvec; \
+	MOVE_VEC_PTR_LEFT(left_vec, left_rowvec, left_rowelt); \
+	up_vec = cur_vec; \
+	up_rowelt = rowelt; \
+	up_rowvec = rowvec; \
+	MOVE_VEC_PTR_UP(up_vec, up_rowvec, up_rowelt); \
+	upleft_vec = up_vec; \
+	upleft_rowelt = up_rowelt; \
+	upleft_rowvec = up_rowvec; \
+	MOVE_VEC_PTR_LEFT(upleft_vec, upleft_rowvec, upleft_rowelt); \
+}
+
+/**
+ * Given the dynamic programming table and a cell, trace backwards from the
+ * cell and install the edits and score/penalty in the appropriate fields
+ * of res.  The RandomSource is used to break ties among equally good ways
+ * of tracing back.
+ *
+ * Whenever we enter a cell, we check whether the read/ref coordinates of
+ * that cell correspond to a cell we traversed constructing a previous
+ * alignment.  If so, we backtrack to the last decision point, mask out the
+ * path that led to the previously observed cell, and continue along a
+ * different path; or, if there are no more paths to try, we give up.
+ *
+ * If an alignment is found, 'off' is set to the alignment's upstream-most
+ * reference character's offset into the chromosome and true is returned.
+ * Otherwise, false is returned.
+ */
+bool SwAligner::backtraceNucleotidesEnd2EndSseU8(
+	TAlScore       escore, // in: expected score
+	SwResult&      res,    // out: store results (edits and scores) here
+	size_t&        off,    // out: store diagonal projection of origin
+	size_t&        nbts,   // out: # backtracks
+	size_t         row,    // start in this row
+	size_t         col,    // start in this column
+	RandomSource&  rnd)    // random gen, to choose among equal paths
+{
+	assert_lt(row, dpRows());
+	assert_lt(col, (size_t)(rff_ - rfi_));
+	SSEData& d = fw_ ? sseU8fw_ : sseU8rc_;
+	SSEMetrics& met = extend_ ? sseU8ExtendMet_ : sseU8MateMet_;
+	met.bt++;
+	assert(!d.profbuf_.empty());
+	assert_lt(row, rd_->length());
+	btnstack_.clear(); // empty the backtrack stack
+	btcells_.clear();  // empty the cells-so-far list
+	AlnScore score; score.score_ = 0;
+	// score.gaps_ = score.ns_ = 0;
+	size_t origCol = col;
+	size_t gaps = 0, readGaps = 0, refGaps = 0;
+	res.alres.reset();
+    EList<Edit>& ned = res.alres.ned();
+	assert(ned.empty());
+	assert_gt(dpRows(), row);
+	ASSERT_ONLY(size_t trimEnd = dpRows() - row - 1);
+	size_t trimBeg = 0;
+	size_t ct = SSEMatrix::H; // cell type
+	// Row and col in terms of where they fall in the SSE vector matrix
+	size_t rowelt, rowvec, eltvec;
+	size_t left_rowelt, up_rowelt, upleft_rowelt;
+	size_t left_rowvec, up_rowvec, upleft_rowvec;
+	__m128i *cur_vec, *left_vec, *up_vec, *upleft_vec;
+	NEW_ROW_COL(row, col);
+	while((int)row >= 0) {
+		met.btcell++;
+		nbts++;
+		int readc = (*rd_)[rdi_ + row];
+		int refm  = (int)rf_[rfi_ + col];
+		int readq = (*qu_)[row];
+		assert_leq(col, origCol);
+		// Get score in this cell
+		bool empty = false, reportedThru, canMoveThru, branch = false;
+		int cur = SSEMatrix::H;
+		if(!d.mat_.reset_[row]) {
+			d.mat_.resetRow(row);
+		}
+		reportedThru = d.mat_.reportedThrough(row, col);
+		canMoveThru = true;
+		if(reportedThru) {
+			canMoveThru = false;
+		} else {
+			empty = false;
+			if(row > 0) {
+				assert_gt(row, 0);
+				size_t rowFromEnd = d.mat_.nrow() - row - 1;
+				bool gapsAllowed = true;
+				if(row < (size_t)sc_->gapbar ||
+				   rowFromEnd < (size_t)sc_->gapbar)
+				{
+					gapsAllowed = false;
+				}
+				const TAlScore floorsc = MIN_I64;
+				const int offsetsc = -0xff;
+				// Move to beginning of column/row
+				if(ct == SSEMatrix::E) { // AKA rdgap
+					assert_gt(col, 0);
+					TAlScore sc_cur = ((TCScore*)(cur_vec + SSEMatrix::E))[rowelt] + offsetsc;
+					assert(gapsAllowed);
+					// Currently in the E matrix; incoming transition must come from the
+					// left.  It's either a gap open from the H matrix or a gap extend from
+					// the E matrix.
+					// TODO: save and restore origMask as well as mask
+					int origMask = 0, mask = 0;
+					// Get H score of cell to the left
+					TAlScore sc_h_left = ((TCScore*)(left_vec + SSEMatrix::H))[left_rowelt] + offsetsc;
+					if(sc_h_left > floorsc && sc_h_left - sc_->readGapOpen() == sc_cur) {
+						mask |= (1 << 0);
+					}
+					// Get E score of cell to the left
+					TAlScore sc_e_left = ((TCScore*)(left_vec + SSEMatrix::E))[left_rowelt] + offsetsc;
+					if(sc_e_left > floorsc && sc_e_left - sc_->readGapExtend() == sc_cur) {
+						mask |= (1 << 1);
+					}
+					origMask = mask;
+					assert(origMask > 0 || sc_cur <= sc_->match());
+					if(d.mat_.isEMaskSet(row, col)) {
+						mask = (d.mat_.masks_[row][col] >> 8) & 3;
+					}
+					if(mask == 3) {
+#if 1
+						// Pick H -> E cell
+						cur = SW_BT_OALL_READ_OPEN;
+						d.mat_.eMaskSet(row, col, 2); // might choose E later
+#else
+						if(rnd.nextU2()) {
+							// Pick H -> E cell
+							cur = SW_BT_OALL_READ_OPEN;
+							d.mat_.eMaskSet(row, col, 2); // might choose E later
+						} else {
+							// Pick E -> E cell
+							cur = SW_BT_RDGAP_EXTEND;
+							d.mat_.eMaskSet(row, col, 1); // might choose H later
+						}
+#endif
+						branch = true;
+					} else if(mask == 2) {
+						// I chose the E cell
+						cur = SW_BT_RDGAP_EXTEND;
+						d.mat_.eMaskSet(row, col, 0); // done
+					} else if(mask == 1) {
+						// I chose the H cell
+						cur = SW_BT_OALL_READ_OPEN;
+						d.mat_.eMaskSet(row, col, 0); // done
+					} else {
+						empty = true;
+						// It's empty, so the only question left is whether we should be
+						// allowed to terminate in this cell.  If it's got a valid score
+						// then we *shouldn't* be allowed to terminate here because that
+						// means it's part of a larger alignment that was already reported.
+						canMoveThru = (origMask == 0);
+					}
+					assert(!empty || !canMoveThru);
+				} else if(ct == SSEMatrix::F) { // AKA rfgap
+					assert_gt(row, 0);
+					assert(gapsAllowed);
+					TAlScore sc_h_up = ((TCScore*)(up_vec  + SSEMatrix::H))[up_rowelt] + offsetsc;
+					TAlScore sc_f_up = ((TCScore*)(up_vec  + SSEMatrix::F))[up_rowelt] + offsetsc;
+					TAlScore sc_cur  = ((TCScore*)(cur_vec + SSEMatrix::F))[rowelt] + offsetsc;
+					// Currently in the F matrix; incoming transition must come from above.
+					// It's either a gap open from the H matrix or a gap extend from the F
+					// matrix.
+					// TODO: save and restore origMask as well as mask
+					int origMask = 0, mask = 0;
+					// Get H score of cell above
+					if(sc_h_up > floorsc && sc_h_up - sc_->refGapOpen() == sc_cur) {
+						mask |= (1 << 0);
+					}
+					// Get F score of cell above
+					if(sc_f_up > floorsc && sc_f_up - sc_->refGapExtend() == sc_cur) {
+						mask |= (1 << 1);
+					}
+					origMask = mask;
+					assert(origMask > 0 || sc_cur <= sc_->match());
+					if(d.mat_.isFMaskSet(row, col)) {
+						mask = (d.mat_.masks_[row][col] >> 11) & 3;
+					}
+					if(mask == 3) {
+#if 1
+						// I chose the H cell
+						cur = SW_BT_OALL_REF_OPEN;
+						d.mat_.fMaskSet(row, col, 2); // might choose E later
+#else
+						if(rnd.nextU2()) {
+							// I chose the H cell
+							cur = SW_BT_OALL_REF_OPEN;
+							d.mat_.fMaskSet(row, col, 2); // might choose E later
+						} else {
+							// I chose the F cell
+							cur = SW_BT_RFGAP_EXTEND;
+							d.mat_.fMaskSet(row, col, 1); // might choose E later
+						}
+#endif
+						branch = true;
+					} else if(mask == 2) {
+						// I chose the F cell
+						cur = SW_BT_RFGAP_EXTEND;
+						d.mat_.fMaskSet(row, col, 0); // done
+					} else if(mask == 1) {
+						// I chose the H cell
+						cur = SW_BT_OALL_REF_OPEN;
+						d.mat_.fMaskSet(row, col, 0); // done
+					} else {
+						empty = true;
+						// It's empty, so the only question left is whether we should be
+						// allowed to terminate in this cell.  If it's got a valid score
+						// then we *shouldn't* be allowed to terminate here because that
+						// means it's part of a larger alignment that was already reported.
+						canMoveThru = (origMask == 0);
+					}
+					assert(!empty || !canMoveThru);
+				} else {
+					assert_eq(SSEMatrix::H, ct);
+					TAlScore sc_cur      = ((TCScore*)(cur_vec + SSEMatrix::H))[rowelt]    + offsetsc;
+					TAlScore sc_f_up     = ((TCScore*)(up_vec  + SSEMatrix::F))[up_rowelt] + offsetsc;
+					TAlScore sc_h_up     = ((TCScore*)(up_vec  + SSEMatrix::H))[up_rowelt] + offsetsc;
+					TAlScore sc_h_left   = col > 0 ? (((TCScore*)(left_vec   + SSEMatrix::H))[left_rowelt]   + offsetsc) : floorsc;
+					TAlScore sc_e_left   = col > 0 ? (((TCScore*)(left_vec   + SSEMatrix::E))[left_rowelt]   + offsetsc) : floorsc;
+					TAlScore sc_h_upleft = col > 0 ? (((TCScore*)(upleft_vec + SSEMatrix::H))[upleft_rowelt] + offsetsc) : floorsc;
+					TAlScore sc_diag     = sc_->score(readc, refm, readq - 33);
+					// TODO: save and restore origMask as well as mask
+					int origMask = 0, mask = 0;
+					if(gapsAllowed) {
+						if(sc_h_up     > floorsc && sc_cur == sc_h_up   - sc_->refGapOpen()) {
+							mask |= (1 << 0);
+						}
+						if(sc_h_left   > floorsc && sc_cur == sc_h_left - sc_->readGapOpen()) {
+							mask |= (1 << 1);
+						}
+						if(sc_f_up     > floorsc && sc_cur == sc_f_up   - sc_->refGapExtend()) {
+							mask |= (1 << 2);
+						}
+						if(sc_e_left   > floorsc && sc_cur == sc_e_left - sc_->readGapExtend()) {
+							mask |= (1 << 3);
+						}
+					}
+					if(sc_h_upleft > floorsc && sc_cur == sc_h_upleft + sc_diag) {
+						mask |= (1 << 4);
+					}
+					origMask = mask;
+					assert(origMask > 0 || sc_cur <= sc_->match());
+					if(d.mat_.isHMaskSet(row, col)) {
+						mask = (d.mat_.masks_[row][col] >> 2) & 31;
+					}
+					assert(gapsAllowed || mask == (1 << 4) || mask == 0);
+					int opts = alts5[mask];
+					int select = -1;
+					if(opts == 1) {
+						select = firsts5[mask];
+						assert_geq(mask, 0);
+						d.mat_.hMaskSet(row, col, 0);
+					} else if(opts > 1) {
+#if 1
+						if(       (mask & 16) != 0) {
+							select = 4; // H diag
+						} else if((mask & 1) != 0) {
+							select = 0; // H up
+						} else if((mask & 4) != 0) {
+							select = 2; // F up
+						} else if((mask & 2) != 0) {
+							select = 1; // H left
+						} else if((mask & 8) != 0) {
+							select = 3; // E left
+						}
+#else
+						select = randFromMask(rnd, mask);
+#endif
+						assert_geq(mask, 0);
+						mask &= ~(1 << select);
+						assert(gapsAllowed || mask == (1 << 4) || mask == 0);
+						d.mat_.hMaskSet(row, col, mask);
+						branch = true;
+					} else { /* No way to backtrack! */ }
+					if(select != -1) {
+						if(select == 4) {
+							cur = SW_BT_OALL_DIAG;
+						} else if(select == 0) {
+							cur = SW_BT_OALL_REF_OPEN;
+						} else if(select == 1) {
+							cur = SW_BT_OALL_READ_OPEN;
+						} else if(select == 2) {
+							cur = SW_BT_RFGAP_EXTEND;
+						} else {
+							assert_eq(3, select)
+							cur = SW_BT_RDGAP_EXTEND;
+						}
+					} else {
+						empty = true;
+						// It's empty, so the only question left is whether we should be
+						// allowed to terminate in this cell.  If it's got a valid score
+						// then we *shouldn't* be allowed to terminate here because that
+						// means it's part of a larger alignment that was already reported.
+						canMoveThru = (origMask == 0);
+					}
+				}
+				assert(!empty || !canMoveThru || ct == SSEMatrix::H);
+			}
+		}
+		//cerr << "reportedThrough rejected (" << row << ", " << col << ")" << endl;
+		d.mat_.setReportedThrough(row, col);
+		assert_eq(gaps, Edit::numGaps(ned));
+		assert_leq(gaps, rdgap_ + rfgap_);
+		// Cell was involved in a previously-reported alignment?
+		if(!canMoveThru) {
+			if(!btnstack_.empty()) {
+				// Remove all the cells from list back to and including the
+				// cell where the branch occurred
+				btcells_.resize(btnstack_.back().celsz);
+				// Pop record off the top of the stack
+				ned.resize(btnstack_.back().nedsz);
+				//aed.resize(btnstack_.back().aedsz);
+				row      = btnstack_.back().row;
+				col      = btnstack_.back().col;
+				gaps     = btnstack_.back().gaps;
+				readGaps = btnstack_.back().readGaps;
+				refGaps  = btnstack_.back().refGaps;
+				score    = btnstack_.back().score;
+				ct       = btnstack_.back().ct;
+				btnstack_.pop_back();
+				assert(!sc_->monotone || score.score() >= escore);
+				NEW_ROW_COL(row, col);
+				continue;
+			} else {
+				// No branch points to revisit; just give up
+				res.reset();
+				met.btfail++; // DP backtraces failed
+				return false;
+			}
+		}
+		assert(!reportedThru);
+		assert(!sc_->monotone || score.score() >= minsc_);
+		if(empty || row == 0) {
+			assert_eq(SSEMatrix::H, ct);
+			btcells_.expand();
+			btcells_.back().first = row;
+			btcells_.back().second = col;
+			// This cell is at the end of a legitimate alignment
+			trimBeg = row;
+			assert_eq(0, trimBeg);
+			assert_eq(btcells_.size(), dpRows() - trimBeg - trimEnd + readGaps);
+			break;
+		}
+		if(branch) {
+			// Add a frame to the backtrack stack
+			btnstack_.expand();
+			btnstack_.back().init(
+				ned.size(),
+				0,               // aed.size()
+				btcells_.size(),
+				row,
+				col,
+				gaps,
+				readGaps,
+				refGaps,
+				score,
+				(int)ct);
+		}
+		btcells_.expand();
+		btcells_.back().first = row;
+		btcells_.back().second = col;
+		switch(cur) {
+			// Move up and to the left.  If the reference nucleotide in the
+			// source row mismatches the read nucleotide, penalize
+			// it and add a nucleotide mismatch.
+			case SW_BT_OALL_DIAG: {
+				assert_gt(row, 0); assert_gt(col, 0);
+				// Check for color mismatch
+				int readC = (*rd_)[row];
+				int refNmask = (int)rf_[rfi_+col];
+				assert_gt(refNmask, 0);
+				int m = matchesEx(readC, refNmask);
+				ct = SSEMatrix::H;
+				if(m != 1) {
+					Edit e(
+						(int)row,
+						mask2dna[refNmask],
+						"ACGTN"[readC],
+						EDIT_TYPE_MM);
+					assert(e.repOk());
+					assert(ned.empty() || ned.back().pos >= row);
+					ned.push_back(e);
+					int pen = QUAL2(row, col);
+					score.score_ -= pen;
+					assert(!sc_->monotone || score.score() >= escore);
+				} else {
+					// Reward a match
+					int64_t bonus = sc_->match(30);
+					score.score_ += bonus;
+					assert(!sc_->monotone || score.score() >= escore);
+				}
+				if(m == -1) {
+					// score.ns_++;
+				}
+				row--; col--;
+				MOVE_ALL_UPLEFT();
+				assert(VALID_AL_SCORE(score));
+				break;
+			}
+			// Move up.  Add an edit encoding the ref gap.
+			case SW_BT_OALL_REF_OPEN:
+			{
+				assert_gt(row, 0);
+				Edit e(
+					(int)row,
+					'-',
+					"ACGTN"[(int)(*rd_)[row]],
+					EDIT_TYPE_REF_GAP);
+				assert(e.repOk());
+				assert(ned.empty() || ned.back().pos >= row);
+				ned.push_back(e);
+				assert_geq(row, (size_t)sc_->gapbar);
+				assert_geq((int)(rdf_-rdi_-row-1), sc_->gapbar-1);
+				row--;
+				ct = SSEMatrix::H;
+				int pen = sc_->refGapOpen();
+				score.score_ -= pen;
+				assert(!sc_->monotone || score.score() >= minsc_);
+				gaps++; refGaps++;
+				assert_eq(gaps, Edit::numGaps(ned));
+				assert_leq(gaps, rdgap_ + rfgap_);
+				MOVE_ALL_UP();
+				break;
+			}
+			// Move up.  Add an edit encoding the ref gap.
+			case SW_BT_RFGAP_EXTEND:
+			{
+				assert_gt(row, 1);
+				Edit e(
+					(int)row,
+					'-',
+					"ACGTN"[(int)(*rd_)[row]],
+					EDIT_TYPE_REF_GAP);
+				assert(e.repOk());
+				assert(ned.empty() || ned.back().pos >= row);
+				ned.push_back(e);
+				assert_geq(row, (size_t)sc_->gapbar);
+				assert_geq((int)(rdf_-rdi_-row-1), sc_->gapbar-1);
+				row--;
+				ct = SSEMatrix::F;
+				int pen = sc_->refGapExtend();
+				score.score_ -= pen;
+				assert(!sc_->monotone || score.score() >= minsc_);
+				gaps++; refGaps++;
+				assert_eq(gaps, Edit::numGaps(ned));
+				assert_leq(gaps, rdgap_ + rfgap_);
+				MOVE_ALL_UP();
+				break;
+			}
+			case SW_BT_OALL_READ_OPEN:
+			{
+				assert_gt(col, 0);
+				Edit e(
+					(int)row+1,
+					mask2dna[(int)rf_[rfi_+col]],
+					'-',
+					EDIT_TYPE_READ_GAP);
+				assert(e.repOk());
+				assert(ned.empty() || ned.back().pos >= row);
+				ned.push_back(e);
+				assert_geq(row, (size_t)sc_->gapbar);
+				assert_geq((int)(rdf_-rdi_-row-1), sc_->gapbar-1);
+				col--;
+				ct = SSEMatrix::H;
+				int pen = sc_->readGapOpen();
+				score.score_ -= pen;
+				assert(!sc_->monotone || score.score() >= minsc_);
+				gaps++; readGaps++;
+				assert_eq(gaps, Edit::numGaps(ned));
+				assert_leq(gaps, rdgap_ + rfgap_);
+				MOVE_ALL_LEFT();
+				break;
+			}
+			case SW_BT_RDGAP_EXTEND:
+			{
+				assert_gt(col, 1);
+				Edit e(
+					(int)row+1,
+					mask2dna[(int)rf_[rfi_+col]],
+					'-',
+					EDIT_TYPE_READ_GAP);
+				assert(e.repOk());
+				assert(ned.empty() || ned.back().pos >= row);
+				ned.push_back(e);
+				assert_geq(row, (size_t)sc_->gapbar);
+				assert_geq((int)(rdf_-rdi_-row-1), sc_->gapbar-1);
+				col--;
+				ct = SSEMatrix::E;
+				int pen = sc_->readGapExtend();
+				score.score_ -= pen;
+				assert(!sc_->monotone || score.score() >= minsc_);
+				gaps++; readGaps++;
+				assert_eq(gaps, Edit::numGaps(ned));
+				assert_leq(gaps, rdgap_ + rfgap_);
+				MOVE_ALL_LEFT();
+				break;
+			}
+			default: throw 1;
+		}
+	} // while((int)row > 0)
+	assert_eq(0, trimBeg);
+	assert_eq(0, trimEnd);
+	assert_geq(col, 0);
+	assert_eq(SSEMatrix::H, ct);
+	// The number of cells in the backtracs should equal the number of read
+	// bases after trimming plus the number of gaps
+	assert_eq(btcells_.size(), dpRows() - trimBeg - trimEnd + readGaps);
+	// Check whether we went through a core diagonal and set 'reported' flag on
+	// each cell
+	bool overlappedCoreDiag = false;
+	for(size_t i = 0; i < btcells_.size(); i++) {
+		size_t rw = btcells_[i].first;
+		size_t cl = btcells_[i].second;
+		// Calculate the diagonal within the *trimmed* rectangle, i.e. the
+		// rectangle we dealt with in align, gather and backtrack.
+		int64_t diagi = cl - rw;
+		// Now adjust to the diagonal within the *untrimmed* rectangle by
+		// adding on the amount trimmed from the left.
+		diagi += rect_->triml;
+		if(diagi >= 0) {
+			size_t diag = (size_t)diagi;
+			if(diag >= rect_->corel && diag <= rect_->corer) {
+				overlappedCoreDiag = true;
+				break;
+			}
+		}
+#ifndef NDEBUG
+		//assert(!d.mat_.reportedThrough(rw, cl));
+		//d.mat_.setReportedThrough(rw, cl);
+		assert(d.mat_.reportedThrough(rw, cl));
+#endif
+	}
+	if(!overlappedCoreDiag) {
+		// Must overlap a core diagonal.  Otherwise, we run the risk of
+		// reporting an alignment that overlaps (and trumps) a higher-scoring
+		// alignment that lies partially outside the dynamic programming
+		// rectangle.
+		res.reset();
+		met.corerej++;
+		return false;
+	}
+	int readC = (*rd_)[rdi_+row];      // get last char in read
+	int refNmask = (int)rf_[rfi_+col]; // get last ref char ref involved in aln
+	assert_gt(refNmask, 0);
+	int m = matchesEx(readC, refNmask);
+	if(m != 1) {
+		Edit e((int)row, mask2dna[refNmask], "ACGTN"[readC], EDIT_TYPE_MM);
+		assert(e.repOk());
+		assert(ned.empty() || ned.back().pos >= row);
+		ned.push_back(e);
+		score.score_ -= QUAL2(row, col);
+		assert_geq(score.score(), minsc_);
+	} else {
+		score.score_ += sc_->match(30);
+	}
+	if(m == -1) {
+		// score.ns_++;
+	}
+#if 0
+	if(score.ns_ > nceil_) {
+		// Alignment has too many Ns in it!
+		res.reset();
+		met.nrej++;
+		return false;
+	}
+#endif
+	res.reverse();
+	assert(Edit::repOk(ned, (*rd_)));
+	assert_eq(score.score(), escore);
+	assert_leq(gaps, rdgap_ + rfgap_);
+	off = col;
+	assert_lt(col + (size_t)rfi_, (size_t)rff_);
+	// score.gaps_ = gaps;
+	res.alres.setScore(score);
+#if 0
+	res.alres.setShape(
+		refidx_,                  // ref id
+		off + rfi_ + rect_->refl, // 0-based ref offset
+		reflen_,                  // length of entire reference
+		fw_,                      // aligned to Watson?
+		rdf_ - rdi_,              // read length
+		true,                     // pretrim soft?
+		0,                        // pretrim 5' end
+		0,                        // pretrim 3' end
+		true,                     // alignment trim soft?
+		fw_ ? trimBeg : trimEnd,  // alignment trim 5' end
+		fw_ ? trimEnd : trimBeg); // alignment trim 3' end
+#endif
+	size_t refns = 0;
+	for(size_t i = col; i <= origCol; i++) {
+		if((int)rf_[rfi_+i] > 15) {
+			refns++;
+		}
+	}
+	// res.alres.setRefNs(refns);
+	assert(Edit::repOk(ned, (*rd_), true, trimBeg, trimEnd));
+	assert(res.repOk());
+#ifndef NDEBUG
+	size_t gapsCheck = 0;
+	for(size_t i = 0; i < ned.size(); i++) {
+		if(ned[i].isGap()) gapsCheck++;
+	}
+	assert_eq(gaps, gapsCheck);
+	BTDnaString refstr;
+	for(size_t i = col; i <= origCol; i++) {
+		refstr.append(firsts5[(int)rf_[rfi_+i]]);
+	}
+	BTDnaString editstr;
+    // daehwan
+	// Edit::toRef((*rd_), ned, editstr, true, trimBeg, trimEnd);
+    Edit::toRef((*rd_), ned, editstr, true, trimBeg + rdi_, trimEnd + (rd_->length() - rdf_));
+	if(refstr != editstr) {
+		cerr << "Decoded nucleotides and edits don't match reference:" << endl;
+		cerr << "           score: " << score.score()
+		     << " (" << gaps << " gaps)" << endl;
+		cerr << "           edits: ";
+		Edit::print(cerr, ned);
+		cerr << endl;
+		cerr << "    decoded nucs: " << (*rd_) << endl;
+		cerr << "     edited nucs: " << editstr << endl;
+		cerr << "  reference nucs: " << refstr << endl;
+		assert(0);
+	}
+#endif
+	met.btsucc++; // DP backtraces succeeded
+	return true;
+}
diff --git a/aligner_swsse_loc_i16.cpp b/aligner_swsse_loc_i16.cpp
new file mode 100644
index 0000000..2593b3c
--- /dev/null
+++ b/aligner_swsse_loc_i16.cpp
@@ -0,0 +1,2275 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * aligner_sw_sse.cpp
+ *
+ * Versions of key alignment functions that use vector instructions to
+ * accelerate dynamic programming.  Based chiefly on the striped Smith-Waterman
+ * paper and implementation by Michael Farrar.  See:
+ *
+ * Farrar M. Striped Smith-Waterman speeds database searches six times over
+ * other SIMD implementations. Bioinformatics. 2007 Jan 15;23(2):156-61.
+ * http://sites.google.com/site/farrarmichael/smith-waterman
+ *
+ * While the paper describes an implementation of Smith-Waterman, we extend it
+ * to do end-to-end read alignment as well as local alignment.  The change
+ * required for this is minor: we simply let vmax be the maximum element in the
+ * score domain rather than the minimum.
+ *
+ * The vectorized dynamic programming implementation lacks some features that
+ * make it hard to adapt to solving the entire dynamic-programming alignment
+ * problem.  For instance:
+ *
+ * - It doesn't respect gap barriers on either end of the read
+ * - It just gives a maximum; not enough information to backtrace without
+ *   redoing some alignment
+ * - It's a little difficult to handle st_ and en_, especially st_.
+ * - The query profile mechanism makes handling of ambiguous reference bases a
+ *   little tricky (16 cols in query profile lookup table instead of 5)
+ *
+ * Given the drawbacks, it is tempting to use SSE dynamic programming as a
+ * filter rather than as an aligner per se.  Here are a few ideas for how it
+ * can be extended to handle more of the alignment problem:
+ *
+ * - Save calculated scores to a big array as we go.  We return to this array
+ *   to find and backtrace from good solutions.
+ */
+
+#include <limits>
+#include "aligner_sw.h"
+
+// Geometry of one 128-bit SSE register in 16-bit mode: 8 words per
+// register, 16 bits (2 bytes) per word.
+static const size_t NBYTES_PER_REG  = 16;
+static const size_t NWORDS_PER_REG  = 8;
+static const size_t NBITS_PER_WORD  = 16;
+static const size_t NBYTES_PER_WORD = 2;
+
+// In 16-bit local mode, we have the option of using signed saturated
+// arithmetic.  Because we have signed arithmetic, there's no need to
+// add/subtract bias when building and applying the query profile.  The lowest
+// value we can use is 0x8000, greatest is 0x7fff.
+
+// Score type of a single DP matrix cell in 16-bit mode (signed).
+typedef int16_t TCScore;
+
+/**
+ * Build query profile look up tables for the read.  The query profile look
+ * up table is organized as a 1D array indexed by [i][j] where i is the
+ * reference character in the current DP column (0=A, 1=C, etc), and j is
+ * the segment of the query we're currently working on.
+ */
+void SwAligner::buildQueryProfileLocalSseI16(bool fw) {
+	// Build at most once per orientation; the built-flag caches the work.
+	bool& done = fw ? sseI16fwBuilt_ : sseI16rcBuilt_;
+	if(done) {
+		return;
+	}
+	done = true;
+	const BTDnaString* rd = fw ? rdfw_ : rdrc_;
+	const BTString* qu = fw ? qufw_ : qurc_;
+	const size_t len = rd->length();
+	// Farrar "segment length": read positions are striped across the 8
+	// 16-bit lanes of each vector, so each lane k covers position j = i + k*seglen.
+	const size_t seglen = (len + (NWORDS_PER_REG-1)) / NWORDS_PER_REG;
+	// How many __m128i's are needed
+	size_t n128s =
+		64 +                    // slack bytes, for alignment?
+		(seglen * ALPHA_SIZE)   // query profile data
+		* 2;                    // & gap barrier data
+	assert_gt(n128s, 0);
+	SSEData& d = fw ? sseI16fw_ : sseI16rc_;
+	d.profbuf_.resizeNoCopy(n128s);
+	assert(!d.profbuf_.empty());
+	d.maxPen_      = d.maxBonus_ = 0;
+	d.lastIter_    = d.lastWord_ = 0;
+	d.qprofStride_ = d.gbarStride_ = 2;
+	d.bias_ = 0; // no bias when words are signed
+	// For each reference character A, C, G, T, N ...
+	for(size_t refc = 0; refc < ALPHA_SIZE; refc++) {
+		// For each segment ...
+		for(size_t i = 0; i < seglen; i++) {
+			size_t j = i;
+			// Profile and gap-barrier vectors are interleaved in profbuf_:
+			// even __m128i slots hold scores, odd slots hold barrier words
+			// (hence the stride of 2 set above).
+			int16_t *qprofWords =
+				reinterpret_cast<int16_t*>(d.profbuf_.ptr() + (refc * seglen * 2) + (i * 2));
+			int16_t *gbarWords =
+				reinterpret_cast<int16_t*>(d.profbuf_.ptr() + (refc * seglen * 2) + (i * 2) + 1);
+			// For each of the 8 16-bit words in this vector ...
+			for(size_t k = 0; k < NWORDS_PER_REG; k++) {
+				int sc = 0;
+				*gbarWords = 0;
+				if(j < len) {
+					int readc = (*rd)[j];
+					int readq = (*qu)[j];
+					// Score read char vs. ref char mask (1 << refc);
+					// readq - 33 converts ASCII quality (+33 offset).
+					sc = sc_->score(readc, (int)(1 << refc), readq - 33);
+					size_t j_from_end = len - j - 1;
+					if(j < (size_t)sc_->gapbar ||
+					   j_from_end < (size_t)sc_->gapbar)
+					{
+						// Inside the gap barrier
+						*gbarWords = 0x8000; // add this twice
+					}
+				}
+				if(refc == 0 && j == len-1) {
+					// Remember which 128-bit word and which smaller word has
+					// the final row
+					d.lastIter_ = i;
+					d.lastWord_ = k;
+				}
+				// Track the largest penalty and bonus seen anywhere in the
+				// profile; used elsewhere for saturation/overflow bounds.
+				if(sc < 0) {
+					if((size_t)(-sc) > d.maxPen_) {
+						d.maxPen_ = (size_t)(-sc);
+					}
+				} else {
+					if((size_t)sc > d.maxBonus_) {
+						d.maxBonus_ = (size_t)sc;
+					}
+				}
+				*qprofWords = (int16_t)sc;
+				gbarWords++;
+				qprofWords++;
+				j += seglen; // update offset into query
+			}
+		}
+	}
+}
+
+#ifndef NDEBUG
+/**
+ * Return true iff the cell has sane E/F/H values w/r/t its predecessors.
+ */
+static bool cellOkLocalI16(
+	SSEData& d,            // SSE state, including the DP matrix to check
+	size_t row,            // row (read position) of cell to check
+	size_t col,            // column (reference position) of cell to check
+	int refc,              // reference character for this column
+	int readc,             // read character for this row
+	int readq,             // read quality for this row (readq - 33 is used)
+	const Scoring& sc)     // scoring scheme
+{
+	// Debug helper: verifies the cell's H/E/F values are each explainable
+	// by a legal DP transition from a predecessor.  Always returns true;
+	// violations trip the asserts below.
+	TCScore floorsc = MIN_I16;
+	// NOTE(review): MIN_I16-1 wraps when narrowed to the signed 16-bit
+	// TCScore, making ceilsc the maximum representable score (a saturation
+	// ceiling).  Looks deliberate, but worth confirming.
+	TCScore ceilsc = MIN_I16-1;
+	// Stored cell words are biased; adding 0x8000 recovers the actual score.
+	TAlScore offsetsc = 0x8000;
+	TAlScore sc_h_cur = (TAlScore)d.mat_.helt(row, col);
+	TAlScore sc_e_cur = (TAlScore)d.mat_.eelt(row, col);
+	TAlScore sc_f_cur = (TAlScore)d.mat_.felt(row, col);
+	// Un-bias only cells that hold a real value (above the floor)
+	if(sc_h_cur > floorsc) {
+		sc_h_cur += offsetsc;
+	}
+	if(sc_e_cur > floorsc) {
+		sc_e_cur += offsetsc;
+	}
+	if(sc_f_cur > floorsc) {
+		sc_f_cur += offsetsc;
+	}
+	// Gaps are disallowed within 'gapbar' rows of either end of the read
+	bool gapsAllowed = true;
+	size_t rowFromEnd = d.mat_.nrow() - row - 1;
+	if(row < (size_t)sc.gapbar || rowFromEnd < (size_t)sc.gapbar) {
+		gapsAllowed = false;
+	}
+	bool e_left_trans = false, h_left_trans = false;
+	bool f_up_trans   = false, h_up_trans = false;
+	bool h_diag_trans = false;
+	if(gapsAllowed) {
+		TAlScore sc_h_left = floorsc;
+		TAlScore sc_e_left = floorsc;
+		TAlScore sc_h_up   = floorsc;
+		TAlScore sc_f_up   = floorsc;
+		// E (read gap): must be reachable from the left neighbor, either
+		// by opening from H or extending from E
+		if(col > 0 && sc_e_cur > floorsc && sc_e_cur <= ceilsc) {
+			sc_h_left = d.mat_.helt(row, col-1) + offsetsc;
+			sc_e_left = d.mat_.eelt(row, col-1) + offsetsc;
+			e_left_trans = (sc_e_left > floorsc && sc_e_cur == sc_e_left - sc.readGapExtend());
+			h_left_trans = (sc_h_left > floorsc && sc_e_cur == sc_h_left - sc.readGapOpen());
+			assert(e_left_trans || h_left_trans);
+		}
+		// F (ref gap): must be reachable from the cell above, either by
+		// opening from H or extending from F
+		if(row > 0 && sc_f_cur > floorsc && sc_f_cur <= ceilsc) {
+			sc_h_up = d.mat_.helt(row-1, col) + offsetsc;
+			sc_f_up = d.mat_.felt(row-1, col) + offsetsc;
+			f_up_trans = (sc_f_up > floorsc && sc_f_cur == sc_f_up - sc.refGapExtend());
+			h_up_trans = (sc_h_up > floorsc && sc_f_cur == sc_h_up - sc.refGapOpen());
+			assert(f_up_trans || h_up_trans);
+		}
+	} else {
+		// Inside the gap barrier, E and F must hold no real value
+		assert_geq(floorsc, sc_e_cur);
+		assert_geq(floorsc, sc_f_cur);
+	}
+	// H: check the diagonal (match/mismatch) transition
+	if(col > 0 && row > 0 && sc_h_cur > floorsc && sc_h_cur <= ceilsc) {
+		TAlScore sc_h_upleft = d.mat_.helt(row-1, col-1) + offsetsc;
+		TAlScore sc_diag = sc.score(readc, (int)refc, readq - 33);
+		h_diag_trans = sc_h_cur == sc_h_upleft + sc_diag;
+	}
+	// H must be explainable by at least one incoming transition unless it
+	// is empty, saturated, or lies on the top row / leftmost column
+	assert(
+		sc_h_cur <= floorsc ||
+		e_left_trans ||
+		h_left_trans ||
+		f_up_trans   ||
+		h_up_trans   ||
+		h_diag_trans ||
+		sc_h_cur > ceilsc ||
+		row == 0 ||
+		col == 0);
+	return true;
+}
+#endif /*ndef NDEBUG*/
+
+#ifdef NDEBUG
+
+// In release builds the SIMD sanity-check macros compile to nothing.
+// This list must stay in sync with the debug definitions below; the
+// original omitted assert_all_leq here, which would break release
+// compilation of any translation unit that used it.
+#define assert_all_eq0(x)
+#define assert_all_gt(x, y)
+#define assert_all_gt_lo(x)
+#define assert_all_lt(x, y)
+#define assert_all_leq(x, y)
+#define assert_all_lt_hi(x)
+
+#else
+
+// Assert that all eight 16-bit lanes of x equal 0.
+#define assert_all_eq0(x) { \
+	__m128i z = _mm_setzero_si128(); \
+	__m128i tmp = _mm_setzero_si128(); \
+	z = _mm_xor_si128(z, z); \
+	tmp = _mm_cmpeq_epi16(x, z); \
+	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
+}
+
+// Assert that every lane of x is (signed) greater than the same lane of y.
+#define assert_all_gt(x, y) { \
+	__m128i tmp = _mm_cmpgt_epi16(x, y); \
+	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
+}
+
+// Assert that every lane of x is (signed) greater than 0.
+#define assert_all_gt_lo(x) { \
+	__m128i z = _mm_setzero_si128(); \
+	__m128i tmp = _mm_setzero_si128(); \
+	z = _mm_xor_si128(z, z); \
+	tmp = _mm_cmpgt_epi16(x, z); \
+	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
+}
+
+// Assert that every lane of x is (signed) less than the same lane of y.
+#define assert_all_lt(x, y) { \
+	__m128i tmp = _mm_cmplt_epi16(x, y); \
+	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
+}
+
+// Assert that no lane of x exceeds the same lane of y (lane-wise x <= y).
+#define assert_all_leq(x, y) { \
+	__m128i tmp = _mm_cmpgt_epi16(x, y); \
+	assert_eq(0x0000, _mm_movemask_epi8(tmp)); \
+}
+
+// Assert that every lane of x is less than 0x7fff (the signed 16-bit max,
+// built here by setting all bits and shifting right by one).
+#define assert_all_lt_hi(x) { \
+	__m128i z = _mm_setzero_si128(); \
+	__m128i tmp = _mm_setzero_si128(); \
+	z = _mm_cmpeq_epi16(z, z); \
+	z = _mm_srli_epi16(z, 1); \
+	tmp = _mm_cmplt_epi16(x, z); \
+	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
+}
+#endif
+
+/**
+ * Aligns by filling a dynamic programming matrix with the SSE-accelerated,
+ * banded DP approach of Farrar.  As it goes, it determines which cells we
+ * might backtrace from and tallies the best (highest-scoring) N backtrace
+ * candidate cells per diagonal.  Also returns the alignment score of the best
+ * alignment in the matrix.
+ *
+ * This routine does *not* maintain a matrix holding the entire matrix worth of
+ * scores, nor does it maintain any other dense O(mn) data structure, as this
+ * would quickly exhaust memory for queries longer than about 10,000 kb.
+ * Instead, in the fill stage it maintains two columns worth of scores at a
+ * time (current/previous, or right/left) - these take O(m) space.  When
+ * finished with the current column, it determines which cells from the
+ * previous column, if any, are candidates we might backtrace from to find a
+ * full alignment.  A candidate cell has a score that rises above the threshold
+ * and isn't improved upon by a match in the next column.  The best N
+ * candidates per diagonal are stored in a O(m + n) data structure.
+ */
+TAlScore SwAligner::alignGatherLoc16(int& flag, bool debug) {
+	assert_leq(rdf_, rd_->length());
+	assert_leq(rdf_, qu_->length());
+	assert_lt(rfi_, rff_);
+	assert_lt(rdi_, rdf_);
+	assert_eq(rd_->length(), qu_->length());
+	assert_geq(sc_->gapbar, 1);
+	assert_gt(minsc_, 0);
+	assert_leq(minsc_, MAX_I16);
+	assert(repOk());
+#ifndef NDEBUG
+	for(size_t i = (size_t)rfi_; i < (size_t)rff_; i++) {
+		assert_range(0, 16, (int)rf_[i]);
+	}
+#endif
+
+	SSEData& d = fw_ ? sseI16fw_ : sseI16rc_;
+	SSEMetrics& met = extend_ ? sseI16ExtendMet_ : sseI16MateMet_;
+	if(!debug) met.dp++;
+	buildQueryProfileLocalSseI16(fw_);
+	assert(!d.profbuf_.empty());
+
+	assert_gt(d.maxBonus_, 0);
+	size_t iter =
+		(dpRows() + (NWORDS_PER_REG-1)) / NWORDS_PER_REG; // iter = segLen
+	
+	// Now set up the score vectors.  We just need two columns worth, which
+	// we'll call "left" and "right".
+	d.vecbuf_.resize(ROWSTRIDE_2COL * iter * 2);
+	d.vecbuf_.zero();
+	__m128i *vbuf_l = d.vecbuf_.ptr();
+	__m128i *vbuf_r = d.vecbuf_.ptr() + (ROWSTRIDE_2COL * iter);
+	
+	// This is the data structure that holds candidate cells per diagonal.
+	const size_t ndiags = rff_ - rfi_ + dpRows() - 1;
+	if(!debug) {
+		btdiag_.init(ndiags, 2);
+	}
+	
+	// Data structure that holds checkpointed anti-diagonals
+	TAlScore perfectScore = sc_->perfectScore(dpRows());
+	bool checkpoint = true;
+	bool cpdebug = false;
+#ifndef NDEBUG
+	cpdebug = dpRows() < 1000;
+#endif
+	cper_.init(
+		dpRows(),      // # rows
+		rff_ - rfi_,   // # columns
+		cperPerPow2_,  // checkpoint every 1 << perpow2 diags (& next)
+		perfectScore,  // perfect score (for sanity checks)
+		false,         // matrix cells have 8-bit scores?
+		cperTri_,      // triangular mini-fills?
+		true,          // alignment is local?
+		cpdebug);      // save all cells for debugging?
+
+	// Many thanks to Michael Farrar for releasing his striped Smith-Waterman
+	// implementation:
+	//
+	//  http://sites.google.com/site/farrarmichael/smith-waterman
+	//
+	// Much of the implementation below is adapted from Michael's code.
+
+	// Set all elts to reference gap open penalty
+	__m128i rfgapo   = _mm_setzero_si128();
+	__m128i rfgape   = _mm_setzero_si128();
+	__m128i rdgapo   = _mm_setzero_si128();
+	__m128i rdgape   = _mm_setzero_si128();
+	__m128i vlo      = _mm_setzero_si128();
+	__m128i vhi      = _mm_setzero_si128();
+	__m128i vlolsw   = _mm_setzero_si128();
+	__m128i vmax     = _mm_setzero_si128();
+	__m128i vcolmax  = _mm_setzero_si128();
+	__m128i vmaxtmp  = _mm_setzero_si128();
+	__m128i ve       = _mm_setzero_si128();
+	__m128i vf       = _mm_setzero_si128();
+	__m128i vh       = _mm_setzero_si128();
+	__m128i vhd      = _mm_setzero_si128();
+	__m128i vhdtmp   = _mm_setzero_si128();
+	__m128i vtmp     = _mm_setzero_si128();
+	__m128i vzero    = _mm_setzero_si128();
+	__m128i vminsc   = _mm_setzero_si128();
+
+	assert_gt(sc_->refGapOpen(), 0);
+	assert_leq(sc_->refGapOpen(), MAX_I16);
+	rfgapo = _mm_insert_epi16(rfgapo, sc_->refGapOpen(), 0);
+	rfgapo = _mm_shufflelo_epi16(rfgapo, 0);
+	rfgapo = _mm_shuffle_epi32(rfgapo, 0);
+	
+	// Set all elts to reference gap extension penalty
+	assert_gt(sc_->refGapExtend(), 0);
+	assert_leq(sc_->refGapExtend(), MAX_I16);
+	assert_leq(sc_->refGapExtend(), sc_->refGapOpen());
+	rfgape = _mm_insert_epi16(rfgape, sc_->refGapExtend(), 0);
+	rfgape = _mm_shufflelo_epi16(rfgape, 0);
+	rfgape = _mm_shuffle_epi32(rfgape, 0);
+
+	// Set all elts to read gap open penalty
+	assert_gt(sc_->readGapOpen(), 0);
+	assert_leq(sc_->readGapOpen(), MAX_I16);
+	rdgapo = _mm_insert_epi16(rdgapo, sc_->readGapOpen(), 0);
+	rdgapo = _mm_shufflelo_epi16(rdgapo, 0);
+	rdgapo = _mm_shuffle_epi32(rdgapo, 0);
+	
+	// Set all elts to read gap extension penalty
+	assert_gt(sc_->readGapExtend(), 0);
+	assert_leq(sc_->readGapExtend(), MAX_I16);
+	assert_leq(sc_->readGapExtend(), sc_->readGapOpen());
+	rdgape = _mm_insert_epi16(rdgape, sc_->readGapExtend(), 0);
+	rdgape = _mm_shufflelo_epi16(rdgape, 0);
+	rdgape = _mm_shuffle_epi32(rdgape, 0);
+	
+	// Set all elts to minimum score threshold.  Actually, to 1 less than the
+	// threshold so we can use gt instead of geq.
+	vminsc = _mm_insert_epi16(vminsc, (int)minsc_-1, 0);
+	vminsc = _mm_shufflelo_epi16(vminsc, 0);
+	vminsc = _mm_shuffle_epi32(vminsc, 0);
+
+	// Set all elts to 0x8000 (min value for signed 16-bit)
+	vlo = _mm_cmpeq_epi16(vlo, vlo);             // all elts = 0xffff
+	vlo = _mm_slli_epi16(vlo, NBITS_PER_WORD-1); // all elts = 0x8000
+	
+	// Set all elts to 0x7fff (max value for signed 16-bit)
+	vhi = _mm_cmpeq_epi16(vhi, vhi);             // all elts = 0xffff
+	vhi = _mm_srli_epi16(vhi, 1);                // all elts = 0x7fff
+	
+	// Set all elts to 0x8000 (min value for signed 16-bit)
+	vmax = vlo;
+	
+	// vlolsw: topmost (least sig) word set to 0x8000, all other words=0
+	vlolsw = _mm_shuffle_epi32(vlo, 0);
+	vlolsw = _mm_srli_si128(vlolsw, NBYTES_PER_REG - NBYTES_PER_WORD);
+	
+	// Points to a long vector of __m128i where each element is a block of
+	// contiguous cells in the E, F or H matrix.  If the index % 3 == 0, then
+	// the block of cells is from the E matrix.  If index % 3 == 1, they're
+	// from the F matrix.  If index % 3 == 2, then they're from the H matrix.
+	// Blocks of cells are organized in the same interleaved manner as they are
+	// calculated by the Farrar algorithm.
+	const __m128i *pvScore; // points into the query profile
+
+	const size_t colstride = ROWSTRIDE_2COL * iter;
+	
+	// Initialize the H and E vectors in the first matrix column
+	__m128i *pvELeft = vbuf_l + 0; __m128i *pvERight = vbuf_r + 0;
+	//__m128i *pvFLeft = vbuf_l + 1;
+	__m128i *pvFRight = vbuf_r + 1;
+	__m128i *pvHLeft = vbuf_l + 2; __m128i *pvHRight = vbuf_r + 2;
+	
+	for(size_t i = 0; i < iter; i++) {
+		// start low in local mode
+		_mm_store_si128(pvERight, vlo); pvERight += ROWSTRIDE_2COL;
+		_mm_store_si128(pvHRight, vlo); pvHRight += ROWSTRIDE_2COL;
+		// Note: right and left are going to be swapped as soon as we enter
+		// the outer loop below
+	}
+	
+	assert_gt(sc_->gapbar, 0);
+	size_t nfixup = 0;
+	TAlScore matchsc = sc_->match(30);
+	TAlScore leftmax = MIN_I64;
+
+	// Fill in the table as usual but instead of using the same gap-penalty
+	// vector for each iteration of the inner loop, load words out of a
+	// pre-calculated gap vector parallel to the query profile.  The pre-
+	// calculated gap vectors enforce the gap barrier constraint by making it
+	// infinitely costly to introduce a gap in barrier rows.
+	//
+	// AND use a separate loop to fill in the first row of the table, enforcing
+	// the st_ constraints in the process.  This is awkward because it
+	// separates the processing of the first row from the others and might make
+	// it difficult to use the first-row results in the next row, but it might
+	// be the simplest and least disruptive way to deal with the st_ constraint.
+	
+	size_t off = MAX_SIZE_T, lastoff;
+	bool bailed = false;
+	for(size_t i = (size_t)rfi_; i < (size_t)rff_; i++) {
+		// Swap left and right; vbuf_l is the vector on the left, which we
+		// generally load from, and vbuf_r is the vector on the right, which we
+		// generally store to.
+		swap(vbuf_l, vbuf_r);
+		pvELeft = vbuf_l + 0; pvERight = vbuf_r + 0;
+		/* pvFLeft = vbuf_l + 1; */ pvFRight = vbuf_r + 1;
+		pvHLeft = vbuf_l + 2; pvHRight = vbuf_r + 2;
+		
+		// Fetch this column's reference mask
+		const int refm = (int)rf_[i];
+		
+		// Fetch the appropriate query profile
+		lastoff = off;
+		off = (size_t)firsts5[refm] * iter * 2;
+		pvScore = d.profbuf_.ptr() + off; // even elts = query profile, odd = gap barrier
+		
+		// Load H vector from the final row of the previous column.
+		// ??? perhaps we should calculate the next iter's F instead of the
+		// current iter's?  The way we currently do it, seems like it will
+		// almost always require at least one fixup loop iter (to recalculate
+		// this topmost F).
+		vh = _mm_load_si128(pvHLeft + colstride - ROWSTRIDE_2COL);
+		
+		// Set all F cells to low value
+		vf = _mm_cmpeq_epi16(vf, vf);
+		vf = _mm_slli_epi16(vf, NBITS_PER_WORD-1);
+		vf = _mm_or_si128(vf, vlolsw);
+		// vf now contains the vertical contribution
+
+		// Store cells in F, calculated previously
+		// No need to veto ref gap extensions, they're all 0x8000s
+		_mm_store_si128(pvFRight, vf);
+		pvFRight += ROWSTRIDE_2COL;
+		
+		// Shift down so that topmost (least sig) cell gets 0
+		vh = _mm_slli_si128(vh, NBYTES_PER_WORD);
+		// Fill topmost (least sig) cell with low value
+		vh = _mm_or_si128(vh, vlolsw);
+		
+		// We pull out one loop iteration to make it easier to veto values in the top row
+		
+		// Load cells from E, calculated previously
+		ve = _mm_load_si128(pvELeft);
+		vhd = _mm_load_si128(pvHLeft);
+		assert_all_lt(ve, vhi);
+		pvELeft += ROWSTRIDE_2COL;
+		// ve now contains the horizontal contribution
+		
+		// Factor in query profile (matches and mismatches)
+		vh = _mm_adds_epi16(vh, pvScore[0]);
+		// vh now contains the diagonal contribution
+		
+		// Update vE value
+		vhdtmp = vhd;
+		vhd = _mm_subs_epi16(vhd, rdgapo);
+		vhd = _mm_adds_epi16(vhd, pvScore[1]); // veto some read gap opens
+		vhd = _mm_adds_epi16(vhd, pvScore[1]); // veto some read gap opens
+		ve = _mm_subs_epi16(ve, rdgape);
+		ve = _mm_max_epi16(ve, vhd);
+
+		// Update H, factoring in E and F
+		vh = _mm_max_epi16(vh, ve);
+		// F won't change anything!
+
+		vf = vh;
+
+		// Update highest score so far
+		vcolmax = vh;
+		
+		// Save the new vH values
+		_mm_store_si128(pvHRight, vh);
+
+		assert_all_lt(ve, vhi);
+
+		vh = vhdtmp;
+
+		assert_all_lt(ve, vhi);
+		pvHRight += ROWSTRIDE_2COL;
+		pvHLeft += ROWSTRIDE_2COL;
+		
+		// Save E values
+		_mm_store_si128(pvERight, ve);
+		pvERight += ROWSTRIDE_2COL;
+		
+		// Update vf value
+		vf = _mm_subs_epi16(vf, rfgapo);
+		assert_all_lt(vf, vhi);
+		
+		pvScore += 2; // move on to next query profile
+
+		// For each character in the reference text:
+		size_t j;
+		for(j = 1; j < iter; j++) {
+			// Load cells from E, calculated previously
+			ve = _mm_load_si128(pvELeft);
+			vhd = _mm_load_si128(pvHLeft);
+			assert_all_lt(ve, vhi);
+			pvELeft += ROWSTRIDE_2COL;
+			
+			// Store cells in F, calculated previously
+			vf = _mm_adds_epi16(vf, pvScore[1]); // veto some ref gap extensions
+			vf = _mm_adds_epi16(vf, pvScore[1]); // veto some ref gap extensions
+			_mm_store_si128(pvFRight, vf);
+			pvFRight += ROWSTRIDE_2COL;
+			
+			// Factor in query profile (matches and mismatches)
+			vh = _mm_adds_epi16(vh, pvScore[0]);
+			vh = _mm_max_epi16(vh, vf);
+			
+			// Update vE value
+			vhdtmp = vhd;
+			vhd = _mm_subs_epi16(vhd, rdgapo);
+			vhd = _mm_adds_epi16(vhd, pvScore[1]); // veto some read gap opens
+			vhd = _mm_adds_epi16(vhd, pvScore[1]); // veto some read gap opens
+			ve = _mm_subs_epi16(ve, rdgape);
+			ve = _mm_max_epi16(ve, vhd);
+			
+			vh = _mm_max_epi16(vh, ve);
+			vtmp = vh;
+			
+			// Update highest score encountered this far
+			vcolmax = _mm_max_epi16(vcolmax, vh);
+			
+			// Save the new vH values
+			_mm_store_si128(pvHRight, vh);
+
+			vh = vhdtmp;
+
+			assert_all_lt(ve, vhi);
+			pvHRight += ROWSTRIDE_2COL;
+			pvHLeft += ROWSTRIDE_2COL;
+			
+			// Save E values
+			_mm_store_si128(pvERight, ve);
+			pvERight += ROWSTRIDE_2COL;
+			
+			// Update vf value
+			vtmp = _mm_subs_epi16(vtmp, rfgapo);
+			vf = _mm_subs_epi16(vf, rfgape);
+			assert_all_lt(vf, vhi);
+			vf = _mm_max_epi16(vf, vtmp);
+			
+			pvScore += 2; // move on to next query profile / gap veto
+		}
+		// pvHStore, pvELoad, pvEStore have all rolled over to the next column
+		pvFRight -= colstride; // reset to start of column
+		vtmp = _mm_load_si128(pvFRight);
+		
+		pvHRight -= colstride; // reset to start of column
+		vh = _mm_load_si128(pvHRight);
+		
+		pvScore = d.profbuf_.ptr() + off + 1; // reset veto vector
+		
+		// vf from last row gets shifted down by one to overlay the first row
+		// rfgape has already been subtracted from it.
+		vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
+		vf = _mm_or_si128(vf, vlolsw);
+		
+		vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
+		vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
+		vf = _mm_max_epi16(vtmp, vf);
+		vtmp = _mm_cmpgt_epi16(vf, vtmp);
+		int cmp = _mm_movemask_epi8(vtmp);
+		
+		// If any element of vtmp is greater than H - gap-open...
+		j = 0;
+		while(cmp != 0x0000) {
+			// Store this vf
+			_mm_store_si128(pvFRight, vf);
+			pvFRight += ROWSTRIDE_2COL;
+			
+			// Update vh w/r/t new vf
+			vh = _mm_max_epi16(vh, vf);
+			
+			// Save vH values
+			_mm_store_si128(pvHRight, vh);
+			pvHRight += ROWSTRIDE_2COL;
+			
+			// Update highest score encountered so far.
+			vcolmax = _mm_max_epi16(vcolmax, vh);
+
+			pvScore += 2;
+			
+			assert_lt(j, iter);
+			if(++j == iter) {
+				pvFRight -= colstride;
+				vtmp = _mm_load_si128(pvFRight);   // load next vf ASAP
+				pvHRight -= colstride;
+				vh = _mm_load_si128(pvHRight);     // load next vh ASAP
+				pvScore = d.profbuf_.ptr() + off + 1;
+				j = 0;
+				vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
+				vf = _mm_or_si128(vf, vlolsw);
+			} else {
+				vtmp = _mm_load_si128(pvFRight);   // load next vf ASAP
+				vh = _mm_load_si128(pvHRight);     // load next vh ASAP
+			}
+			
+			// Update F with another gap extension
+			vf = _mm_subs_epi16(vf, rfgape);
+			vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
+			vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
+			vf = _mm_max_epi16(vtmp, vf);
+			vtmp = _mm_cmpgt_epi16(vf, vtmp);
+			cmp = _mm_movemask_epi8(vtmp);
+			nfixup++;
+		}
+
+		// Now we'd like to know exactly which cells in the left column are
+		// candidates we might backtrace from.  First question is: did *any*
+		// elements in the column exceed the minimum score threshold?
+		if(!debug && leftmax >= minsc_) {
+			// Yes.  Next question is: which cells are candidates?  We have to
+			// allow matches in the right column to override matches above and
+			// to the left in the left column.
+			assert_gt(i - rfi_, 0);
+			pvHLeft  = vbuf_l + 2;
+			assert_lt(lastoff, MAX_SIZE_T);
+			pvScore = d.profbuf_.ptr() + lastoff; // even elts = query profile, odd = gap barrier
+			for(size_t k = 0; k < iter; k++) {
+				vh = _mm_load_si128(pvHLeft);
+				vtmp = _mm_cmpgt_epi16(pvScore[0], vzero);
+				int cmp = _mm_movemask_epi8(vtmp);
+				if(cmp != 0) {
+					// At least one candidate in this mask.  Now iterate
+					// through vm/vh to evaluate individual cells.
+					for(size_t m = 0; m < NWORDS_PER_REG; m++) {
+						size_t row = k + m * iter;
+						if(row >= dpRows()) {
+							break;
+						}
+						TAlScore sc = (TAlScore)(((TCScore *)&vh)[m] + 0x8000);
+						if(sc >= minsc_) {
+							if(((TCScore *)&vtmp)[m] != 0) {
+								// Add to data structure holding all candidates
+								size_t col = i - rfi_ - 1; // -1 b/c prev col
+								size_t frombot = dpRows() - row - 1;
+								DpBtCandidate cand(row, col, sc);
+								btdiag_.add(frombot + col, cand);
+							}
+						}
+					}
+				}
+				pvHLeft += ROWSTRIDE_2COL;
+				pvScore += 2;
+			}
+		}
+
+		// Save some elements to checkpoints
+		if(checkpoint) {
+			
+			__m128i *pvE = vbuf_r + 0;
+			__m128i *pvF = vbuf_r + 1;
+			__m128i *pvH = vbuf_r + 2;
+			size_t coli = i - rfi_;
+			if(coli < cper_.locol_) cper_.locol_ = coli;
+			if(coli > cper_.hicol_) cper_.hicol_ = coli;
+			
+			if(cperTri_) {
+				size_t rc_mod = coli & cper_.lomask_;
+				assert_lt(rc_mod, cper_.per_);
+				int64_t row = -rc_mod-1;
+				int64_t row_mod = row;
+				int64_t row_div = 0;
+				size_t idx = coli >> cper_.perpow2_;
+				size_t idxrow = idx * cper_.nrow_;
+				assert_eq(4, ROWSTRIDE_2COL);
+				bool done = false;
+				while(true) {
+					row += (cper_.per_ - 2);
+					row_mod += (cper_.per_ - 2);
+					for(size_t j = 0; j < 2; j++) {
+						row++;
+						row_mod++;
+						if(row >= 0 && (size_t)row < cper_.nrow_) {
+							// Update row divided by iter_ and mod iter_
+							while(row_mod >= (int64_t)iter) {
+								row_mod -= (int64_t)iter;
+								row_div++;
+							}
+							size_t delt = idxrow + row;
+							size_t vecoff = (row_mod << 5) + row_div;
+							assert_lt(row_div, 8);
+							int16_t h_sc = ((int16_t*)pvH)[vecoff];
+							int16_t e_sc = ((int16_t*)pvE)[vecoff];
+							int16_t f_sc = ((int16_t*)pvF)[vecoff];
+							h_sc += 0x8000; assert_geq(h_sc, 0);
+							e_sc += 0x8000; assert_geq(e_sc, 0);
+							f_sc += 0x8000; assert_geq(f_sc, 0);
+							assert_leq(h_sc, cper_.perf_);
+							assert_leq(e_sc, cper_.perf_);
+							assert_leq(f_sc, cper_.perf_);
+							CpQuad *qdiags = ((j == 0) ? cper_.qdiag1s_.ptr() : cper_.qdiag2s_.ptr());
+							qdiags[delt].sc[0] = h_sc;
+							qdiags[delt].sc[1] = e_sc;
+							qdiags[delt].sc[2] = f_sc;
+						} // if(row >= 0 && row < nrow_)
+						else if(row >= 0 && (size_t)row >= cper_.nrow_) {
+							done = true;
+							break;
+						}
+					} // end of loop over anti-diags
+					if(done) {
+						break;
+					}
+					idx++;
+					idxrow += cper_.nrow_;
+				}
+			} else {
+				// If this is the first column, take this opportunity to
+				// pre-calculate the coordinates of the elements we're going to
+				// checkpoint.
+				if(coli == 0) {
+					size_t cpi    = cper_.per_-1;
+					size_t cpimod = cper_.per_-1;
+					size_t cpidiv = 0;
+					cper_.commitMap_.clear();
+					while(cpi < cper_.nrow_) {
+						while(cpimod >= iter) {
+							cpimod -= iter;
+							cpidiv++;
+						}
+						size_t vecoff = (cpimod << 5) + cpidiv;
+						cper_.commitMap_.push_back(vecoff);
+						cpi += cper_.per_;
+						cpimod += cper_.per_;
+					}
+				}
+				// Save all the rows
+				size_t rowoff = 0;
+				size_t sz = cper_.commitMap_.size();
+				for(size_t i = 0; i < sz; i++, rowoff += cper_.ncol_) {
+					size_t vecoff = cper_.commitMap_[i];
+					int16_t h_sc = ((int16_t*)pvH)[vecoff];
+					//int16_t e_sc = ((int16_t*)pvE)[vecoff];
+					int16_t f_sc = ((int16_t*)pvF)[vecoff];
+					h_sc += 0x8000; assert_geq(h_sc, 0);
+					//e_sc += 0x8000; assert_geq(e_sc, 0);
+					f_sc += 0x8000; assert_geq(f_sc, 0);
+					assert_leq(h_sc, cper_.perf_);
+					//assert_leq(e_sc, cper_.perf_);
+					assert_leq(f_sc, cper_.perf_);
+					CpQuad& dst = cper_.qrows_[rowoff + coli];
+					dst.sc[0] = h_sc;
+					//dst.sc[1] = e_sc;
+					dst.sc[2] = f_sc;
+				}
+				// Is this a column we'd like to checkpoint?
+				if((coli & cper_.lomask_) == cper_.lomask_) {
+					// Save the column using memcpys
+					assert_gt(coli, 0);
+					size_t wordspercol = cper_.niter_ * ROWSTRIDE_2COL;
+					size_t coloff = (coli >> cper_.perpow2_) * wordspercol;
+					__m128i *dst = cper_.qcols_.ptr() + coloff;
+					memcpy(dst, vbuf_r, sizeof(__m128i) * wordspercol);
+				}
+			}
+			if(cper_.debug_) {
+				// Save the column using memcpys
+				size_t wordspercol = cper_.niter_ * ROWSTRIDE_2COL;
+				size_t coloff = coli * wordspercol;
+				__m128i *dst = cper_.qcolsD_.ptr() + coloff;
+				memcpy(dst, vbuf_r, sizeof(__m128i) * wordspercol);
+			}
+		}
+
+		vmax = _mm_max_epi16(vmax, vcolmax);
+		{
+			// Get single largest score in this column
+			vmaxtmp = vcolmax;
+			vtmp = _mm_srli_si128(vmaxtmp, 8);
+			vmaxtmp = _mm_max_epi16(vmaxtmp, vtmp);
+			vtmp = _mm_srli_si128(vmaxtmp, 4);
+			vmaxtmp = _mm_max_epi16(vmaxtmp, vtmp);
+			vtmp = _mm_srli_si128(vmaxtmp, 2);
+			vmaxtmp = _mm_max_epi16(vmaxtmp, vtmp);
+			int16_t ret = _mm_extract_epi16(vmaxtmp, 0);
+			TAlScore score = (TAlScore)(ret + 0x8000);
+			if(ret == MIN_I16) {
+				score = MIN_I64;
+			}
+			
+			if(score < minsc_) {
+				size_t ncolleft = rff_ - i - 1;
+				if(max<TAlScore>(score, 0) + (TAlScore)ncolleft * matchsc < minsc_) {
+					// Bail!  There can't possibly be a valid alignment that
+					// passes through this column.
+					bailed = true;
+					break;
+				}
+			}
+			
+			leftmax = score;
+		}
+	}
+	
+	lastoff = off;
+	
+	// Now we'd like to know exactly which cells in the *rightmost* column are
+	// candidates we might backtrace from.  Did *any* elements exceed the
+	// minimum score threshold?
+	if(!debug && !bailed && leftmax >= minsc_) {
+		// Yes.  Next question is: which cells are candidates?  We have to
+		// allow matches in the right column to override matches above and
+		// to the left in the left column.
+		pvHLeft  = vbuf_r + 2;
+		assert_lt(lastoff, MAX_SIZE_T);
+		pvScore = d.profbuf_.ptr() + lastoff; // even elts = query profile, odd = gap barrier
+		for(size_t k = 0; k < iter; k++) {
+			vh = _mm_load_si128(pvHLeft);
+			vtmp = _mm_cmpgt_epi16(pvScore[0], vzero);
+			int cmp = _mm_movemask_epi8(vtmp);
+			if(cmp != 0) {
+				// At least one candidate in this mask.  Now iterate
+				// through vm/vh to evaluate individual cells.
+				for(size_t m = 0; m < NWORDS_PER_REG; m++) {
+					size_t row = k + m * iter;
+					if(row >= dpRows()) {
+						break;
+					}
+					TAlScore sc = (TAlScore)(((TCScore *)&vh)[m] + 0x8000);
+					if(sc >= minsc_) {
+						if(((TCScore *)&vtmp)[m] != 0) {
+							// Add to data structure holding all candidates
+							size_t col = rff_ - rfi_ - 1; // -1 b/c prev col
+							size_t frombot = dpRows() - row - 1;
+							DpBtCandidate cand(row, col, sc);
+							btdiag_.add(frombot + col, cand);
+						}
+					}
+				}
+			}
+			pvHLeft += ROWSTRIDE_2COL;
+			pvScore += 2;
+		}
+	}
+
+	// Find largest score in vmax
+	vtmp = _mm_srli_si128(vmax, 8);
+	vmax = _mm_max_epi16(vmax, vtmp);
+	vtmp = _mm_srli_si128(vmax, 4);
+	vmax = _mm_max_epi16(vmax, vtmp);
+	vtmp = _mm_srli_si128(vmax, 2);
+	vmax = _mm_max_epi16(vmax, vtmp);
+	int16_t ret = _mm_extract_epi16(vmax, 0);
+
+	// Update metrics
+	if(!debug) {
+		size_t ninner = (rff_ - rfi_) * iter;
+		met.col   += (rff_ - rfi_);             // DP columns
+		met.cell  += (ninner * NWORDS_PER_REG); // DP cells
+		met.inner += ninner;                    // DP inner loop iters
+		met.fixup += nfixup;                    // DP fixup loop iters
+	}
+
+	flag = 0;
+
+	// Did we find a solution?
+	TAlScore score = MIN_I64;
+	if(ret == MIN_I16) {
+		flag = -1; // no
+		if(!debug) met.dpfail++;
+		return MIN_I64;
+	} else {
+		score = (TAlScore)(ret + 0x8000);
+		if(score < minsc_) {
+			flag = -1; // no
+			if(!debug) met.dpfail++;
+			return score;
+		}
+	}
+	
+	// Could we have saturated?
+	if(ret == MAX_I16) {
+		flag = -2; // yes
+		if(!debug) met.dpsat++;
+		return MIN_I64;
+	}
+	
+	// Now take all the backtrace candidates in the btdaig_ structure and
+	// dump them into the btncand_ array.  They'll be sorted later.
+	if(!debug) {
+		btdiag_.dump(btncand_);	
+		assert(!btncand_.empty());
+	}
+	
+	// Return largest score
+	if(!debug) met.dpsucc++;
+	return score;
+}
+
+/**
+ * Solve the current alignment problem using SSE instructions that operate on 8
+ * signed 16-bit values packed into a single 128-bit register.
+ */
+TAlScore SwAligner::alignNucleotidesLocalSseI16(int& flag, bool debug) {
+	assert_leq(rdf_, rd_->length());
+	assert_leq(rdf_, qu_->length());
+	assert_lt(rfi_, rff_);
+	assert_lt(rdi_, rdf_);
+	assert_eq(rd_->length(), qu_->length());
+	assert_geq(sc_->gapbar, 1);
+	assert(repOk());
+#ifndef NDEBUG
+	for(size_t i = (size_t)rfi_; i < (size_t)rff_; i++) {
+		assert_range(0, 16, (int)rf_[i]);
+	}
+#endif
+
+	SSEData& d = fw_ ? sseI16fw_ : sseI16rc_;
+	SSEMetrics& met = extend_ ? sseI16ExtendMet_ : sseI16MateMet_;
+	if(!debug) met.dp++;
+	buildQueryProfileLocalSseI16(fw_);
+	assert(!d.profbuf_.empty());
+
+	assert_gt(d.maxBonus_, 0);
+	size_t iter =
+		(dpRows() + (NWORDS_PER_REG-1)) / NWORDS_PER_REG; // iter = segLen
+
+	// Many thanks to Michael Farrar for releasing his striped Smith-Waterman
+	// implementation:
+	//
+	//  http://sites.google.com/site/farrarmichael/smith-waterman
+	//
+	// Much of the implementation below is adapted from Michael's code.
+
+	// Set all elts to reference gap open penalty
+	__m128i rfgapo   = _mm_setzero_si128();
+	__m128i rfgape   = _mm_setzero_si128();
+	__m128i rdgapo   = _mm_setzero_si128();
+	__m128i rdgape   = _mm_setzero_si128();
+	__m128i vlo      = _mm_setzero_si128();
+	__m128i vhi      = _mm_setzero_si128();
+	__m128i vlolsw   = _mm_setzero_si128();
+	__m128i vmax     = _mm_setzero_si128();
+	__m128i vcolmax  = _mm_setzero_si128();
+	__m128i vmaxtmp  = _mm_setzero_si128();
+	__m128i ve       = _mm_setzero_si128();
+	__m128i vf       = _mm_setzero_si128();
+	__m128i vh       = _mm_setzero_si128();
+	__m128i vtmp     = _mm_setzero_si128();
+
+	assert_gt(sc_->refGapOpen(), 0);
+	assert_leq(sc_->refGapOpen(), MAX_I16);
+	rfgapo = _mm_insert_epi16(rfgapo, sc_->refGapOpen(), 0);
+	rfgapo = _mm_shufflelo_epi16(rfgapo, 0);
+	rfgapo = _mm_shuffle_epi32(rfgapo, 0);
+	
+	// Set all elts to reference gap extension penalty
+	assert_gt(sc_->refGapExtend(), 0);
+	assert_leq(sc_->refGapExtend(), MAX_I16);
+	assert_leq(sc_->refGapExtend(), sc_->refGapOpen());
+	rfgape = _mm_insert_epi16(rfgape, sc_->refGapExtend(), 0);
+	rfgape = _mm_shufflelo_epi16(rfgape, 0);
+	rfgape = _mm_shuffle_epi32(rfgape, 0);
+
+	// Set all elts to read gap open penalty
+	assert_gt(sc_->readGapOpen(), 0);
+	assert_leq(sc_->readGapOpen(), MAX_I16);
+	rdgapo = _mm_insert_epi16(rdgapo, sc_->readGapOpen(), 0);
+	rdgapo = _mm_shufflelo_epi16(rdgapo, 0);
+	rdgapo = _mm_shuffle_epi32(rdgapo, 0);
+	
+	// Set all elts to read gap extension penalty
+	assert_gt(sc_->readGapExtend(), 0);
+	assert_leq(sc_->readGapExtend(), MAX_I16);
+	assert_leq(sc_->readGapExtend(), sc_->readGapOpen());
+	rdgape = _mm_insert_epi16(rdgape, sc_->readGapExtend(), 0);
+	rdgape = _mm_shufflelo_epi16(rdgape, 0);
+	rdgape = _mm_shuffle_epi32(rdgape, 0);
+
+	// Set all elts to 0x8000 (min value for signed 16-bit)
+	vlo = _mm_cmpeq_epi16(vlo, vlo);             // all elts = 0xffff
+	vlo = _mm_slli_epi16(vlo, NBITS_PER_WORD-1); // all elts = 0x8000
+	
+	// Set all elts to 0x7fff (max value for signed 16-bit)
+	vhi = _mm_cmpeq_epi16(vhi, vhi);             // all elts = 0xffff
+	vhi = _mm_srli_epi16(vhi, 1);                // all elts = 0x7fff
+	
+	// Set all elts to 0x8000 (min value for signed 16-bit)
+	vmax = vlo;
+	
+	// vlolsw: topmost (least sig) word set to 0x8000, all other words=0
+	vlolsw = _mm_shuffle_epi32(vlo, 0);
+	vlolsw = _mm_srli_si128(vlolsw, NBYTES_PER_REG - NBYTES_PER_WORD);
+	
+	// Points to a long vector of __m128i where each element is a block of
+	// contiguous cells in the E, F or H matrix.  If the index % 3 == 0, then
+	// the block of cells is from the E matrix.  If index % 3 == 1, they're
+	// from the F matrix.  If index % 3 == 2, then they're from the H matrix.
+	// Blocks of cells are organized in the same interleaved manner as they are
+	// calculated by the Farrar algorithm.
+	const __m128i *pvScore; // points into the query profile
+
+	d.mat_.init(dpRows(), rff_ - rfi_, NWORDS_PER_REG);
+	const size_t colstride = d.mat_.colstride();
+	//const size_t rowstride = d.mat_.rowstride();
+	assert_eq(ROWSTRIDE, colstride / iter);
+	
+	// Initialize the H and E vectors in the first matrix column
+	__m128i *pvHTmp = d.mat_.tmpvec(0, 0);
+	__m128i *pvETmp = d.mat_.evec(0, 0);
+	
+	for(size_t i = 0; i < iter; i++) {
+		_mm_store_si128(pvETmp, vlo);
+		_mm_store_si128(pvHTmp, vlo); // start low in local mode
+		pvETmp += ROWSTRIDE;
+		pvHTmp += ROWSTRIDE;
+	}
+	// These are swapped just before the innermost loop
+	__m128i *pvHStore = d.mat_.hvec(0, 0);
+	__m128i *pvHLoad  = d.mat_.tmpvec(0, 0);
+	__m128i *pvELoad  = d.mat_.evec(0, 0);
+	__m128i *pvEStore = d.mat_.evecUnsafe(0, 1);
+	__m128i *pvFStore = d.mat_.fvec(0, 0);
+	__m128i *pvFTmp   = NULL;
+	
+	assert_gt(sc_->gapbar, 0);
+	size_t nfixup = 0;
+	TAlScore matchsc = sc_->match(30);
+
+	// Fill in the table as usual but instead of using the same gap-penalty
+	// vector for each iteration of the inner loop, load words out of a
+	// pre-calculated gap vector parallel to the query profile.  The pre-
+	// calculated gap vectors enforce the gap barrier constraint by making it
+	// infinitely costly to introduce a gap in barrier rows.
+	//
+	// AND use a separate loop to fill in the first row of the table, enforcing
+	// the st_ constraints in the process.  This is awkward because it
+	// separates the processing of the first row from the others and might make
+	// it difficult to use the first-row results in the next row, but it might
+	// be the simplest and least disruptive way to deal with the st_ constraint.
+	
+	colstop_ = rff_ - rfi_;
+	lastsolcol_ = 0;
+	for(size_t i = (size_t)rfi_; i < (size_t)rff_; i++) {
+		assert(pvFStore == d.mat_.fvec(0, i - rfi_));
+		assert(pvHStore == d.mat_.hvec(0, i - rfi_));
+		
+		// Fetch this column's reference mask
+		const int refm = (int)rf_[i];
+		
+		// Fetch the appropriate query profile
+		size_t off = (size_t)firsts5[refm] * iter * 2;
+		pvScore = d.profbuf_.ptr() + off; // even elts = query profile, odd = gap barrier
+		
+		// Load H vector from the final row of the previous column
+		vh = _mm_load_si128(pvHLoad + colstride - ROWSTRIDE);
+		
+		// Set all F cells to low value
+		vf = _mm_cmpeq_epi16(vf, vf);
+		vf = _mm_slli_epi16(vf, NBITS_PER_WORD-1);
+		vf = _mm_or_si128(vf, vlolsw);
+		// vf now contains the vertical contribution
+
+		// Store cells in F, calculated previously
+		// No need to veto ref gap extensions, they're all 0x8000s
+		_mm_store_si128(pvFStore, vf);
+		pvFStore += ROWSTRIDE;
+		
+		// Shift down so that topmost (least sig) cell gets 0
+		vh = _mm_slli_si128(vh, NBYTES_PER_WORD);
+		// Fill topmost (least sig) cell with low value
+		vh = _mm_or_si128(vh, vlolsw);
+		
+		// We pull out one loop iteration to make it easier to veto values in the top row
+		
+		// Load cells from E, calculated previously
+		ve = _mm_load_si128(pvELoad);
+		assert_all_lt(ve, vhi);
+		pvELoad += ROWSTRIDE;
+		// ve now contains the horizontal contribution
+		
+		// Factor in query profile (matches and mismatches)
+		vh = _mm_adds_epi16(vh, pvScore[0]);
+		// vh now contains the diagonal contribution
+		
+		// Update H, factoring in E and F
+		vtmp = _mm_max_epi16(vh, ve);
+		// F won't change anything!
+		
+		vh = vtmp;
+		
+		// Update highest score so far
+		vcolmax = vlo;
+		vcolmax = _mm_max_epi16(vcolmax, vh);
+		
+		// Save the new vH values
+		_mm_store_si128(pvHStore, vh);
+		pvHStore += ROWSTRIDE;
+		
+		// Update vE value
+		vf = vh;
+		vh = _mm_subs_epi16(vh, rdgapo);
+		vh = _mm_adds_epi16(vh, pvScore[1]); // veto some read gap opens
+		vh = _mm_adds_epi16(vh, pvScore[1]); // veto some read gap opens
+		ve = _mm_subs_epi16(ve, rdgape);
+		ve = _mm_max_epi16(ve, vh);
+		assert_all_lt(ve, vhi);
+		
+		// Load the next h value
+		vh = _mm_load_si128(pvHLoad);
+		pvHLoad += ROWSTRIDE;
+		
+		// Save E values
+		_mm_store_si128(pvEStore, ve);
+		pvEStore += ROWSTRIDE;
+		
+		// Update vf value
+		vf = _mm_subs_epi16(vf, rfgapo);
+		assert_all_lt(vf, vhi);
+		
+		pvScore += 2; // move on to next query profile
+
+		// For each character in the reference text:
+		size_t j;
+		for(j = 1; j < iter; j++) {
+			// Load cells from E, calculated previously
+			ve = _mm_load_si128(pvELoad);
+			assert_all_lt(ve, vhi);
+			pvELoad += ROWSTRIDE;
+			
+			// Store cells in F, calculated previously
+			vf = _mm_adds_epi16(vf, pvScore[1]); // veto some ref gap extensions
+			vf = _mm_adds_epi16(vf, pvScore[1]); // veto some ref gap extensions
+			_mm_store_si128(pvFStore, vf);
+			pvFStore += ROWSTRIDE;
+			
+			// Factor in query profile (matches and mismatches)
+			vh = _mm_adds_epi16(vh, pvScore[0]);
+			
+			// Update H, factoring in E and F
+			vh = _mm_max_epi16(vh, ve);
+			vh = _mm_max_epi16(vh, vf);
+			
+			// Update highest score encountered this far
+			vcolmax = _mm_max_epi16(vcolmax, vh);
+			
+			// Save the new vH values
+			_mm_store_si128(pvHStore, vh);
+			pvHStore += ROWSTRIDE;
+			
+			// Update vE value
+			vtmp = vh;
+			vh = _mm_subs_epi16(vh, rdgapo);
+			vh = _mm_adds_epi16(vh, pvScore[1]); // veto some read gap opens
+			vh = _mm_adds_epi16(vh, pvScore[1]); // veto some read gap opens
+			ve = _mm_subs_epi16(ve, rdgape);
+			ve = _mm_max_epi16(ve, vh);
+			assert_all_lt(ve, vhi);
+			
+			// Load the next h value
+			vh = _mm_load_si128(pvHLoad);
+			pvHLoad += ROWSTRIDE;
+			
+			// Save E values
+			_mm_store_si128(pvEStore, ve);
+			pvEStore += ROWSTRIDE;
+			
+			// Update vf value
+			vtmp = _mm_subs_epi16(vtmp, rfgapo);
+			vf = _mm_subs_epi16(vf, rfgape);
+			assert_all_lt(vf, vhi);
+			vf = _mm_max_epi16(vf, vtmp);
+			
+			pvScore += 2; // move on to next query profile / gap veto
+		}
+		// pvHStore, pvELoad, pvEStore have all rolled over to the next column
+		pvFTmp = pvFStore;
+		pvFStore -= colstride; // reset to start of column
+		vtmp = _mm_load_si128(pvFStore);
+		
+		pvHStore -= colstride; // reset to start of column
+		vh = _mm_load_si128(pvHStore);
+		
+		pvEStore -= colstride; // reset to start of column
+		ve = _mm_load_si128(pvEStore);
+		
+		pvHLoad = pvHStore;    // new pvHLoad = pvHStore
+		pvScore = d.profbuf_.ptr() + off + 1; // reset veto vector
+		
+		// vf from last row gets shifted down by one to overlay the first row
+		// rfgape has already been subtracted from it.
+		vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
+		vf = _mm_or_si128(vf, vlolsw);
+		
+		vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
+		vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
+		vf = _mm_max_epi16(vtmp, vf);
+		vtmp = _mm_cmpgt_epi16(vf, vtmp);
+		int cmp = _mm_movemask_epi8(vtmp);
+		
+		// If any element of vtmp is greater than H - gap-open...
+		j = 0;
+		while(cmp != 0x0000) {
+			// Store this vf
+			_mm_store_si128(pvFStore, vf);
+			pvFStore += ROWSTRIDE;
+			
+			// Update vh w/r/t new vf
+			vh = _mm_max_epi16(vh, vf);
+			
+			// Save vH values
+			_mm_store_si128(pvHStore, vh);
+			pvHStore += ROWSTRIDE;
+			
+			// Update highest score encountered this far
+			vcolmax = _mm_max_epi16(vcolmax, vh);
+			
+			// Update E in case it can be improved using our new vh
+			vh = _mm_subs_epi16(vh, rdgapo);
+			vh = _mm_adds_epi16(vh, *pvScore); // veto some read gap opens
+			vh = _mm_adds_epi16(vh, *pvScore); // veto some read gap opens
+			ve = _mm_max_epi16(ve, vh);
+			_mm_store_si128(pvEStore, ve);
+			pvEStore += ROWSTRIDE;
+			pvScore += 2;
+			
+			assert_lt(j, iter);
+			if(++j == iter) {
+				pvFStore -= colstride;
+				vtmp = _mm_load_si128(pvFStore);   // load next vf ASAP
+				pvHStore -= colstride;
+				vh = _mm_load_si128(pvHStore);     // load next vh ASAP
+				pvEStore -= colstride;
+				ve = _mm_load_si128(pvEStore);     // load next ve ASAP
+				pvScore = d.profbuf_.ptr() + off + 1;
+				j = 0;
+				vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
+				vf = _mm_or_si128(vf, vlolsw);
+			} else {
+				vtmp = _mm_load_si128(pvFStore);   // load next vf ASAP
+				vh = _mm_load_si128(pvHStore);     // load next vh ASAP
+				ve = _mm_load_si128(pvEStore);     // load next ve ASAP
+			}
+			
+			// Update F with another gap extension
+			vf = _mm_subs_epi16(vf, rfgape);
+			vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
+			vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
+			vf = _mm_max_epi16(vtmp, vf);
+			vtmp = _mm_cmpgt_epi16(vf, vtmp);
+			cmp = _mm_movemask_epi8(vtmp);
+			nfixup++;
+		}
+		
+#ifndef NDEBUG
+		if((rand() & 15) == 0) {
+			// This is a work-intensive sanity check; each time we finish filling
+			// a column, we check that each H, E, and F is sensible.
+			for(size_t k = 0; k < dpRows(); k++) {
+				assert(cellOkLocalI16(
+					d,
+					k,                   // row
+					i - rfi_,            // col
+					refm,                // reference mask
+					(int)(*rd_)[rdi_+k], // read char
+					(int)(*qu_)[rdi_+k], // read quality
+					*sc_));              // scoring scheme
+			}
+		}
+#endif
+
+		// Store column maximum vector in first element of tmp
+		vmax = _mm_max_epi16(vmax, vcolmax);
+		_mm_store_si128(d.mat_.tmpvec(0, i - rfi_), vcolmax);
+
+		{
+			// Get single largest score in this column
+			vmaxtmp = vcolmax;
+			vtmp = _mm_srli_si128(vmaxtmp, 8);
+			vmaxtmp = _mm_max_epi16(vmaxtmp, vtmp);
+			vtmp = _mm_srli_si128(vmaxtmp, 4);
+			vmaxtmp = _mm_max_epi16(vmaxtmp, vtmp);
+			vtmp = _mm_srli_si128(vmaxtmp, 2);
+			vmaxtmp = _mm_max_epi16(vmaxtmp, vtmp);
+			int16_t ret = _mm_extract_epi16(vmaxtmp, 0);
+			TAlScore score = (TAlScore)(ret + 0x8000);
+			
+			if(score < minsc_) {
+				size_t ncolleft = rff_ - i - 1;
+				if(score + (TAlScore)ncolleft * matchsc < minsc_) {
+					// Bail!  We're guaranteed not to see a valid alignment in
+					// the rest of the matrix
+					colstop_ = (i+1) - rfi_;
+					break;
+				}
+			} else {
+				lastsolcol_ = i - rfi_;
+			}
+		}
+
+		// pvELoad and pvHLoad are already where they need to be
+		
+		// Adjust the load and store vectors here.  
+		pvHStore = pvHLoad + colstride;
+		pvEStore = pvELoad + colstride;
+		pvFStore = pvFTmp;
+	}
+
+	// Find largest score in vmax
+	vtmp = _mm_srli_si128(vmax, 8);
+	vmax = _mm_max_epi16(vmax, vtmp);
+	vtmp = _mm_srli_si128(vmax, 4);
+	vmax = _mm_max_epi16(vmax, vtmp);
+	vtmp = _mm_srli_si128(vmax, 2);
+	vmax = _mm_max_epi16(vmax, vtmp);
+	int16_t ret = _mm_extract_epi16(vmax, 0);
+
+	// Update metrics
+	if(!debug) {
+		size_t ninner = (rff_ - rfi_) * iter;
+		met.col   += (rff_ - rfi_);             // DP columns
+		met.cell  += (ninner * NWORDS_PER_REG); // DP cells
+		met.inner += ninner;                    // DP inner loop iters
+		met.fixup += nfixup;                    // DP fixup loop iters
+	}
+
+	flag = 0;
+
+	// Did we find a solution?
+	TAlScore score = MIN_I64;
+	if(ret == MIN_I16) {
+		flag = -1; // no
+		if(!debug) met.dpfail++;
+		return MIN_I64;
+	} else {
+		score = (TAlScore)(ret + 0x8000);
+		if(score < minsc_) {
+			flag = -1; // no
+			if(!debug) met.dpfail++;
+			return score;
+		}
+	}
+	
+	// Could we have saturated?
+	if(ret == MAX_I16) {
+		flag = -2; // yes
+		if(!debug) met.dpsat++;
+		return MIN_I64;
+	}
+	
+	// Return largest score
+	if(!debug) met.dpsucc++;
+	return score;
+}
+
+/**
+ * Given a filled-in DP table, populate the btncand_ list with candidate cells
+ * that might be at the ends of valid alignments.  No need to do this unless
+ * the maximum score returned by the align*() func is >= the minimum.
+ *
+ * We needn't consider cells that have no chance of reaching any of the core
+ * diagonals.  These are the cells that are more than 'maxgaps' cells away from
+ * a core diagonal.
+ *
+ * We need to be careful to consider that the rectangle might be truncated on
+ * one or both ends.
+ *
+ * The seed extend case looks like this:
+ *
+ *      |Rectangle|   0: seed diagonal
+ *      **OO0oo----   o: "RHS gap" diagonals
+ *      -**OO0oo---   O: "LHS gap" diagonals
+ *      --**OO0oo--   *: "LHS extra" diagonals
+ *      ---**OO0oo-   -: cells that can't possibly be involved in a valid    
+ *      ----**OO0oo      alignment that overlaps one of the core diagonals
+ *
+ * The anchor-to-left case looks like this:
+ *
+ *   |Anchor|  | ---- Rectangle ---- |
+ *   o---------OO0000000000000oo------  0: mate diagonal (also core diags!)
+ *   -o---------OO0000000000000oo-----  o: "RHS gap" diagonals
+ *   --o---------OO0000000000000oo----  O: "LHS gap" diagonals
+ *   ---oo--------OO0000000000000oo---  *: "LHS extra" diagonals
+ *   -----o--------OO0000000000000oo--  -: cells that can't possibly be
+ *   ------o--------OO0000000000000oo-     involved in a valid alignment that
+ *   -------o--------OO0000000000000oo     overlaps one of the core diagonals
+ *                     XXXXXXXXXXXXX
+ *                     | RHS Range |
+ *                     ^           ^
+ *                     rl          rr
+ *
+ * The anchor-to-right case looks like this:
+ *
+ *    ll          lr
+ *    v           v
+ *    | LHS Range |
+ *    XXXXXXXXXXXXX          |Anchor|
+ *  OO0000000000000oo--------o--------  0: mate diagonal (also core diags!)
+ *  -OO0000000000000oo--------o-------  o: "RHS gap" diagonals
+ *  --OO0000000000000oo--------o------  O: "LHS gap" diagonals
+ *  ---OO0000000000000oo--------oo----  *: "LHS extra" diagonals
+ *  ----OO0000000000000oo---------o---  -: cells that can't possibly be
+ *  -----OO0000000000000oo---------o--     involved in a valid alignment that
+ *  ------OO0000000000000oo---------o-     overlaps one of the core diagonals
+ *  | ---- Rectangle ---- |
+ */
+bool SwAligner::gatherCellsNucleotidesLocalSseI16(TAlScore best) {
+	// What's the minimum number of rows that can possibly be spanned by an
+	// alignment that meets the minimum score requirement?
+	assert(sse16succ_);
+	size_t bonus = (size_t)sc_->match(30);
+	const size_t ncol = lastsolcol_ + 1;
+	const size_t nrow = dpRows();
+	assert_gt(nrow, 0);
+	btncand_.clear();
+	btncanddone_.clear();
+	SSEData& d = fw_ ? sseI16fw_ : sseI16rc_;
+	SSEMetrics& met = extend_ ? sseI16ExtendMet_ : sseI16MateMet_;
+	assert(!d.profbuf_.empty());
+	//const size_t rowstride = d.mat_.rowstride();
+	//const size_t colstride = d.mat_.colstride();
+	size_t iter = (dpRows() + (NWORDS_PER_REG - 1)) / NWORDS_PER_REG;
+	assert_gt(iter, 0);
+	assert_geq(minsc_, 0);
+	assert_gt(bonus, 0);
+	// With a per-match bonus, an alignment scoring >= minsc_ must span at
+	// least ceil(minsc_ / bonus) read rows; cells in rows above 'minrow'
+	// therefore can't end a valid alignment
+	size_t minrow = (size_t)(((minsc_ + bonus - 1) / bonus) - 1);
+	for(size_t j = 0; j < ncol; j++) {
+		// Establish the range of rows where a backtrace from the cell in this
+		// row/col is close enough to one of the core diagonals that it could
+		// conceivably count
+		// (as written, lo/hi span every row; no per-column trimming is done)
+		size_t nrow_lo = MIN_SIZE_T;
+		size_t nrow_hi = nrow;
+		// First, check if there is a cell in this column with a score
+		// above the score threshold
+		// (horizontal max-reduction of the column-max vector saved by align)
+		__m128i vmax = *d.mat_.tmpvec(0, j);
+		__m128i vtmp = _mm_srli_si128(vmax, 8);
+		vmax = _mm_max_epi16(vmax, vtmp);
+		vtmp = _mm_srli_si128(vmax, 4);
+		vmax = _mm_max_epi16(vmax, vtmp);
+		vtmp = _mm_srli_si128(vmax, 2);
+		vmax = _mm_max_epi16(vmax, vtmp);
+		TAlScore score = (TAlScore)((int16_t)_mm_extract_epi16(vmax, 0) + 0x8000);
+		assert_geq(score, 0);
+#ifndef NDEBUG
+		{
+			// Start in upper vector row and move down
+			TAlScore max = 0;
+			vmax = *d.mat_.tmpvec(0, j);
+			__m128i *pvH = d.mat_.hvec(0, j);
+			for(size_t i = 0; i < iter; i++) {
+				for(size_t k = 0; k < NWORDS_PER_REG; k++) {
+					TAlScore sc = (TAlScore)(((TCScore*)pvH)[k] + 0x8000);
+					TAlScore scm = (TAlScore)(((TCScore*)&vmax)[k] + 0x8000);
+					assert_leq(sc, scm);
+					if(sc > max) {
+						max = sc;
+					}
+				}
+				pvH += ROWSTRIDE;
+			}
+			assert_eq(max, score);
+		}
+#endif
+		if(score < minsc_) {
+			// Scores in column aren't good enough
+			continue;
+		}
+		// Get pointer to first cell in column to examine:
+		__m128i *pvHorig = d.mat_.hvec(0, j);
+		__m128i *pvH     = pvHorig;
+		// Get pointer to the vector in the following column that corresponds
+		// to the cells diagonally down and to the right from the cells in pvH
+		__m128i *pvHSucc = (j < ncol-1) ? d.mat_.hvec(0, j+1) : NULL;
+		// NOTE(review): pvHSucc is advanced below but never dereferenced in
+		// this function; matchSucc is computed from rd_/rf_ directly.  Looks
+		// like dead bookkeeping -- confirm before removing.
+		// Start in upper vector row and move down
+		for(size_t i = 0; i < iter; i++) {
+			if(pvHSucc != NULL) {
+				pvHSucc += ROWSTRIDE;
+				// Striped layout: the successor of the last vector wraps
+				// around to the first vector of the next column
+				if(i == iter-1) {
+					pvHSucc = d.mat_.hvec(0, j+1);
+				}
+			}
+			// Which elements of this vector are exhaustively scored?
+			size_t rdoff = i;
+			for(size_t k = 0; k < NWORDS_PER_REG; k++) {
+				// Is this row, col one that we can potentially backtrace from?
+				// I.e. are we close enough to a core diagonal?
+				if(rdoff >= nrow_lo && rdoff < nrow_hi) {
+					// This cell has been exhaustively scored
+					if(rdoff >= minrow) {
+						// ... and it could potentially score high enough
+						TAlScore sc = (TAlScore)(((TCScore*)pvH)[k] + 0x8000);
+						assert_leq(sc, best);
+						if(sc >= minsc_) {
+							// This is a potential solution
+							// Only take candidates at the end of a run of
+							// matches: this cell must match and the next cell
+							// down the diagonal must not
+							bool matchSucc = false;
+							int readc = (*rd_)[rdoff];
+							int refc = rf_[j + rfi_];
+							bool match = ((refc & (1 << readc)) != 0);
+							if(rdoff < dpRows()-1) {
+								int readcSucc = (*rd_)[rdoff+1];
+								int refcSucc = rf_[j + rfi_ + 1];
+								assert_range(0, 16, refcSucc);
+								matchSucc = ((refcSucc & (1 << readcSucc)) != 0);
+							}
+							if(match && !matchSucc) {
+								// Yes, this is legit
+								met.gathsol++;
+								btncand_.expand();
+								btncand_.back().init(rdoff, j, sc);
+							}
+						}
+					}
+				} else {
+					// Already saw every element in the vector that's been
+					// exhaustively scored
+					break;
+				}
+				// In the striped layout, consecutive elements of one vector
+				// are 'iter' logical rows apart
+				rdoff += iter;
+			}
+			pvH += ROWSTRIDE;
+		}
+	}
+	if(!btncand_.empty()) {
+		d.mat_.initMasks();
+	}
+	return !btncand_.empty();
+}
+
+// Move a striped-matrix vector pointer (plus its rowvec/rowelt coordinates)
+// up one logical DP row.  When already at the top vector of the column's
+// stripe, wrap to the bottom vector and step back one element lane (the
+// colstride_ add followed by the ROWSTRIDE subtract nets out to the last
+// vector of the same column).
+#define MOVE_VEC_PTR_UP(vec, rowvec, rowelt) { \
+	if(rowvec == 0) { \
+		rowvec += d.mat_.nvecrow_; \
+		vec += d.mat_.colstride_; \
+		rowelt--; \
+	} \
+	rowvec--; \
+	vec -= ROWSTRIDE; \
+}
+
+// Move a vector pointer left one DP column (one column stride); the
+// rowvec/rowelt coordinates are unchanged.
+#define MOVE_VEC_PTR_LEFT(vec, rowvec, rowelt) { vec -= d.mat_.colstride_; }
+
+// Diagonal move: compose the up and left moves above.
+#define MOVE_VEC_PTR_UPLEFT(vec, rowvec, rowelt) { \
+ 	MOVE_VEC_PTR_UP(vec, rowvec, rowelt); \
+ 	MOVE_VEC_PTR_LEFT(vec, rowvec, rowelt); \
+}
+
+// Shift all four backtrace cursors (current, left, up, up-left) one column
+// left in lockstep.
+#define MOVE_ALL_LEFT() { \
+	MOVE_VEC_PTR_LEFT(cur_vec, rowvec, rowelt); \
+	MOVE_VEC_PTR_LEFT(left_vec, left_rowvec, left_rowelt); \
+	MOVE_VEC_PTR_LEFT(up_vec, up_rowvec, up_rowelt); \
+	MOVE_VEC_PTR_LEFT(upleft_vec, upleft_rowvec, upleft_rowelt); \
+}
+
+// Shift all four backtrace cursors one row up in lockstep.
+#define MOVE_ALL_UP() { \
+	MOVE_VEC_PTR_UP(cur_vec, rowvec, rowelt); \
+	MOVE_VEC_PTR_UP(left_vec, left_rowvec, left_rowelt); \
+	MOVE_VEC_PTR_UP(up_vec, up_rowvec, up_rowelt); \
+	MOVE_VEC_PTR_UP(upleft_vec, upleft_rowvec, upleft_rowelt); \
+}
+
+// Shift all four backtrace cursors diagonally up-left in lockstep.
+#define MOVE_ALL_UPLEFT() { \
+	MOVE_VEC_PTR_UPLEFT(cur_vec, rowvec, rowelt); \
+	MOVE_VEC_PTR_UPLEFT(left_vec, left_rowvec, left_rowelt); \
+	MOVE_VEC_PTR_UPLEFT(up_vec, up_rowvec, up_rowelt); \
+	MOVE_VEC_PTR_UPLEFT(upleft_vec, upleft_rowvec, upleft_rowelt); \
+}
+
+// Re-derive the striped coordinates and all four cursors from scratch for a
+// given logical (row, col): rowelt = element lane within a vector,
+// rowvec = vector index within the column, eltvec = flat vector offset.
+#define NEW_ROW_COL(row, col) { \
+	rowelt = row / d.mat_.nvecrow_; \
+	rowvec = row % d.mat_.nvecrow_; \
+	eltvec = (col * d.mat_.colstride_) + (rowvec * ROWSTRIDE); \
+	cur_vec = d.mat_.matbuf_.ptr() + eltvec; \
+	left_vec = cur_vec; \
+	left_rowelt = rowelt; \
+	left_rowvec = rowvec; \
+	MOVE_VEC_PTR_LEFT(left_vec, left_rowvec, left_rowelt); \
+	up_vec = cur_vec; \
+	up_rowelt = rowelt; \
+	up_rowvec = rowvec; \
+	MOVE_VEC_PTR_UP(up_vec, up_rowvec, up_rowelt); \
+	upleft_vec = up_vec; \
+	upleft_rowelt = up_rowelt; \
+	upleft_rowvec = up_rowvec; \
+	MOVE_VEC_PTR_LEFT(upleft_vec, upleft_rowvec, upleft_rowelt); \
+}
+
+/**
+ * Given the dynamic programming table and a cell, trace backwards from the
+ * cell and install the edits and score/penalty in the appropriate fields of
+ * res.  The RandomSource is used to break ties among equally good ways of
+ * tracing back.
+ *
+ * Whenever we enter a cell, we check if its read/ref coordinates correspond to
+ * a cell we traversed constructing a previous alignment.  If so, we backtrack
+ * to the last decision point, mask out the path that led to the previously
+ * observed cell, and continue along a different path.  If there are no more
+ * paths to try, we stop.
+ *
+ * If an alignment is found, 'off' is set to the alignment's upstream-most
+ * reference character's offset and true is returned.  Otherwise, false is
+ * returned.
+ *
+ * In local alignment mode, this method is liable to be slow, especially for
+ * long reads.  This is chiefly because if there is one valid solution
+ * (especially if it is pretty high scoring), then many, many paths shooting
+ * off that solution's path will also have valid solutions.
+ */
+bool SwAligner::backtraceNucleotidesLocalSseI16(
+	TAlScore       escore, // in: expected score
+	SwResult&      res,    // out: store results (edits and scores) here
+	size_t&        off,    // out: store diagonal projection of origin
+	size_t&        nbts,   // out: # backtracks
+	size_t         row,    // start in this row
+	size_t         col,    // start in this column
+	RandomSource&  rnd)    // random gen, to choose among equal paths
+{
+	assert_lt(row, dpRows());
+	assert_lt(col, (size_t)(rff_ - rfi_));
+	SSEData& d = fw_ ? sseI16fw_ : sseI16rc_;
+	SSEMetrics& met = extend_ ? sseI16ExtendMet_ : sseI16MateMet_;
+	met.bt++;
+	assert(!d.profbuf_.empty());
+	assert_lt(row, rd_->length());
+	btnstack_.clear(); // empty the backtrack stack
+	btcells_.clear();  // empty the cells-so-far list
+	AlnScore score;
+	// score.score_ = score.gaps_ = score.ns_ = 0;
+	size_t origCol = col;
+	size_t gaps = 0, readGaps = 0, refGaps = 0;
+	res.alres.reset();
+    EList<Edit>& ned = res.alres.ned();
+	assert(ned.empty());
+	assert_gt(dpRows(), row);
+	ASSERT_ONLY(size_t trimEnd = dpRows() - row - 1);
+	size_t trimBeg = 0;
+	size_t ct = SSEMatrix::H; // cell type
+	// Row and col in terms of where they fall in the SSE vector matrix
+	size_t rowelt, rowvec, eltvec;
+	size_t left_rowelt, up_rowelt, upleft_rowelt;
+	size_t left_rowvec, up_rowvec, upleft_rowvec;
+	__m128i *cur_vec, *left_vec, *up_vec, *upleft_vec;
+	const size_t gbar = sc_->gapbar;
+	NEW_ROW_COL(row, col);
+	// If 'backEliminate' is true, then every time we visit a cell, we remove
+	// edges into the cell.  We do this to avoid some of the thrashing around
+	// that occurs when there are lots of valid candidates in the same DP
+	// problem.
+	//const bool backEliminate = true;
+	// 'row' is a size_t; the (int) cast makes this guard false once row--
+	// wraps around past row 0
+	while((int)row >= 0) {
+		// TODO: As soon as we enter a cell, set it as being reported through,
+		// *and* mark all cells that point into this cell as being reported
+		// through.  This will save us from having to consider quite so many
+		// candidates.
+		
+		met.btcell++;
+		nbts++;
+		int readc = (*rd_)[rdi_ + row];
+		int refm  = (int)rf_[rfi_ + col];
+		// NOTE(review): qu_ is indexed without the rdi_ offset, unlike rd_
+		// just above -- correct only if rdi_ == 0 here; confirm
+		int readq = (*qu_)[row];
+		assert_leq(col, origCol);
+		// Get score in this cell
+		bool empty = false, reportedThru, canMoveThru, branch = false;
+		int cur = SSEMatrix::H;
+		if(!d.mat_.reset_[row]) {
+			d.mat_.resetRow(row);
+		}
+		reportedThru = d.mat_.reportedThrough(row, col);
+		canMoveThru = true;
+		if(reportedThru) {
+			canMoveThru = false;
+		} else {
+			empty = false;
+			if(row > 0) {
+				size_t rowFromEnd = d.mat_.nrow() - row - 1;
+				bool gapsAllowed = !(row < gbar || rowFromEnd < gbar);
+				const int floorsc = 0;
+				// Stored TCScore values carry a +0x8000 bias; adding offsetsc
+				// recovers the actual alignment score
+				const int offsetsc = 0x8000;
+				// Move to beginning of column/row
+				if(ct == SSEMatrix::E) { // AKA rdgap
+					assert_gt(col, 0);
+					TAlScore sc_cur = ((TCScore*)(cur_vec + SSEMatrix::E))[rowelt] + offsetsc;
+					assert(gapsAllowed);
+					// Currently in the E matrix; incoming transition must come from the
+					// left.  It's either a gap open from the H matrix or a gap extend from
+					// the E matrix.
+					// TODO: save and restore origMask as well as mask
+					int origMask = 0, mask = 0;
+					// Get H score of cell to the left
+					TAlScore sc_h_left = ((TCScore*)(left_vec + SSEMatrix::H))[left_rowelt] + offsetsc;
+					if(sc_h_left > floorsc && sc_h_left - sc_->readGapOpen() == sc_cur) {
+						mask |= (1 << 0); // horiz H -> E move possible
+					}
+					// Get E score of cell to the left
+					TAlScore sc_e_left = ((TCScore*)(left_vec + SSEMatrix::E))[left_rowelt] + offsetsc;
+					if(sc_e_left > floorsc && sc_e_left - sc_->readGapExtend() == sc_cur) {
+						mask |= (1 << 1); // horiz E -> E move possible
+					}
+					origMask = mask;
+					assert(origMask > 0 || sc_cur <= sc_->match());
+					if(d.mat_.isEMaskSet(row, col)) {
+						mask = (d.mat_.masks_[row][col] >> 8) & 3;
+					}
+					if(mask == 3) {
+						// Horiz H -> E or horiz E -> E moves possible
+#if 1
+						// Pick H -> E cell
+						cur = SW_BT_OALL_READ_OPEN;
+						d.mat_.eMaskSet(row, col, 2); // might choose E later
+#else
+						if(rnd.nextU2()) {
+							// Pick H -> E cell
+							cur = SW_BT_OALL_READ_OPEN;
+							d.mat_.eMaskSet(row, col, 2); // might choose E later
+						} else {
+							// Pick E -> E cell
+							cur = SW_BT_RDGAP_EXTEND;
+							d.mat_.eMaskSet(row, col, 1); // might choose H later
+						}
+#endif
+						branch = true;
+					} else if(mask == 2) {
+						// Only horiz E -> E move possible, pick it
+						cur = SW_BT_RDGAP_EXTEND;
+						d.mat_.eMaskSet(row, col, 0); // done
+					} else if(mask == 1) {
+						// I chose the H cell
+						cur = SW_BT_OALL_READ_OPEN;
+						d.mat_.eMaskSet(row, col, 0); // done
+					} else {
+						empty = true;
+						// It's empty, so the only question left is whether we should be
+						// allowed to terminate in this cell.  If it's got a valid score
+						// then we *shouldn't* be allowed to terminate here because that
+						// means it's part of a larger alignment that was already reported.
+						canMoveThru = (origMask == 0);
+					}
+					if(!branch) {
+						// Is this where we can eliminate some incoming paths as well?
+					}
+					assert(!empty || !canMoveThru);
+				} else if(ct == SSEMatrix::F) { // AKA rfgap
+					assert_gt(row, 0);
+					assert(gapsAllowed);
+					TAlScore sc_h_up = ((TCScore*)(up_vec  + SSEMatrix::H))[up_rowelt] + offsetsc;
+					TAlScore sc_f_up = ((TCScore*)(up_vec  + SSEMatrix::F))[up_rowelt] + offsetsc;
+					TAlScore sc_cur  = ((TCScore*)(cur_vec + SSEMatrix::F))[rowelt] + offsetsc;
+					// Currently in the F matrix; incoming transition must come from above.
+					// It's either a gap open from the H matrix or a gap extend from the F
+					// matrix.
+					// TODO: save and restore origMask as well as mask
+					int origMask = 0, mask = 0;
+					// Get H score of cell above
+					if(sc_h_up > floorsc && sc_h_up - sc_->refGapOpen() == sc_cur) {
+						mask |= (1 << 0);
+					}
+					// Get F score of cell above
+					if(sc_f_up > floorsc && sc_f_up - sc_->refGapExtend() == sc_cur) {
+						mask |= (1 << 1);
+					}
+					origMask = mask;
+					assert(origMask > 0 || sc_cur <= sc_->match());
+					if(d.mat_.isFMaskSet(row, col)) {
+						mask = (d.mat_.masks_[row][col] >> 11) & 3;
+					}
+					if(mask == 3) {
+#if 1
+						// I chose the H cell
+						cur = SW_BT_OALL_REF_OPEN;
+						d.mat_.fMaskSet(row, col, 2); // might choose F later
+#else
+						if(rnd.nextU2()) {
+							// I chose the H cell
+							cur = SW_BT_OALL_REF_OPEN;
+							d.mat_.fMaskSet(row, col, 2); // might choose F later
+						} else {
+							// I chose the F cell
+							cur = SW_BT_RFGAP_EXTEND;
+							d.mat_.fMaskSet(row, col, 1); // might choose H later
+						}
+#endif
+						branch = true;
+					} else if(mask == 2) {
+						// I chose the F cell
+						cur = SW_BT_RFGAP_EXTEND;
+						d.mat_.fMaskSet(row, col, 0); // done
+					} else if(mask == 1) {
+						// I chose the H cell
+						cur = SW_BT_OALL_REF_OPEN;
+						d.mat_.fMaskSet(row, col, 0); // done
+					} else {
+						empty = true;
+						// It's empty, so the only question left is whether we should be
+						// allowed to terminate in this cell.  If it's got a valid score
+						// then we *shouldn't* be allowed to terminate here because that
+						// means it's part of a larger alignment that was already reported.
+						canMoveThru = (origMask == 0);
+					}
+					assert(!empty || !canMoveThru);
+				} else {
+					assert_eq(SSEMatrix::H, ct);
+					TAlScore sc_cur      = ((TCScore*)(cur_vec + SSEMatrix::H))[rowelt]    + offsetsc;
+					TAlScore sc_f_up     = ((TCScore*)(up_vec  + SSEMatrix::F))[up_rowelt] + offsetsc;
+					TAlScore sc_h_up     = ((TCScore*)(up_vec  + SSEMatrix::H))[up_rowelt] + offsetsc;
+					TAlScore sc_h_left   = col > 0 ? (((TCScore*)(left_vec   + SSEMatrix::H))[left_rowelt]   + offsetsc) : floorsc;
+					TAlScore sc_e_left   = col > 0 ? (((TCScore*)(left_vec   + SSEMatrix::E))[left_rowelt]   + offsetsc) : floorsc;
+					TAlScore sc_h_upleft = col > 0 ? (((TCScore*)(upleft_vec + SSEMatrix::H))[upleft_rowelt] + offsetsc) : floorsc;
+					TAlScore sc_diag     = sc_->score(readc, refm, readq - 33);
+					// TODO: save and restore origMask as well as mask
+					int origMask = 0, mask = 0;
+					if(gapsAllowed) {
+						if(sc_h_up     > floorsc && sc_cur == sc_h_up   - sc_->refGapOpen()) {
+							mask |= (1 << 0);
+						}
+						if(sc_h_left   > floorsc && sc_cur == sc_h_left - sc_->readGapOpen()) {
+							mask |= (1 << 1);
+						}
+						if(sc_f_up     > floorsc && sc_cur == sc_f_up   - sc_->refGapExtend()) {
+							mask |= (1 << 2);
+						}
+						if(sc_e_left   > floorsc && sc_cur == sc_e_left - sc_->readGapExtend()) {
+							mask |= (1 << 3);
+						}
+					}
+					if(sc_h_upleft > floorsc && sc_cur == sc_h_upleft + sc_diag) {
+						mask |= (1 << 4); // diagonal (match/mismatch) move possible
+					}
+					origMask = mask;
+					assert(origMask > 0 || sc_cur <= sc_->match());
+					if(d.mat_.isHMaskSet(row, col)) {
+						mask = (d.mat_.masks_[row][col] >> 2) & 31;
+					}
+					assert(gapsAllowed || mask == (1 << 4) || mask == 0);
+					int opts = alts5[mask];
+					int select = -1;
+					if(opts == 1) {
+						select = firsts5[mask];
+						assert_geq(mask, 0);
+						d.mat_.hMaskSet(row, col, 0);
+					} else if(opts > 1) {
+#if 1
+						if(       (mask & 16) != 0) {
+							select = 4; // H diag
+						} else if((mask & 1) != 0) {
+							select = 0; // H up
+						} else if((mask & 4) != 0) {
+							select = 2; // F up
+						} else if((mask & 2) != 0) {
+							select = 1; // H left
+						} else if((mask & 8) != 0) {
+							select = 3; // E left
+						}
+#else
+						select = randFromMask(rnd, mask);
+#endif
+						assert_geq(mask, 0);
+						mask &= ~(1 << select);
+						assert(gapsAllowed || mask == (1 << 4) || mask == 0);
+						d.mat_.hMaskSet(row, col, mask);
+						branch = true;
+					} else { /* No way to backtrack! */ }
+					if(select != -1) {
+						if(select == 4) {
+							cur = SW_BT_OALL_DIAG;
+						} else if(select == 0) {
+							cur = SW_BT_OALL_REF_OPEN;
+						} else if(select == 1) {
+							cur = SW_BT_OALL_READ_OPEN;
+						} else if(select == 2) {
+							cur = SW_BT_RFGAP_EXTEND;
+						} else {
+							// NOTE(review): assert_eq below lacks a trailing
+							// ';' -- this compiles only because assert_eq
+							// expands to a block (or to nothing under NDEBUG);
+							// add the semicolon upstream
+							assert_eq(3, select)
+							cur = SW_BT_RDGAP_EXTEND;
+						}
+					} else {
+						empty = true;
+						// It's empty, so the only question left is whether we should be
+						// allowed to terminate in this cell.  If it's got a valid score
+						// then we *shouldn't* be allowed to terminate here because that
+						// means it's part of a larger alignment that was already reported.
+						canMoveThru = (origMask == 0);
+					}
+				}
+				assert(!empty || !canMoveThru || ct == SSEMatrix::H);
+			} // if(row > 0)
+		} // else clause of if(reportedThru)
+		if(!reportedThru) {
+			d.mat_.setReportedThrough(row, col);
+		}
+		assert(d.mat_.reportedThrough(row, col));
+		//if(backEliminate && row < d.mat_.nrow()-1) {
+		//	// Possibly pick off neighbors below and to the right if the
+		//	// neighbor's only way of backtracking is through this cell.
+		//}
+		assert_eq(gaps, Edit::numGaps(ned));
+		assert_leq(gaps, rdgap_ + rfgap_);
+		// Cell was involved in a previously-reported alignment?
+		if(!canMoveThru) {
+			if(!btnstack_.empty()) {
+				// Remove all the cells from list back to and including the
+				// cell where the branch occurred
+				btcells_.resize(btnstack_.back().celsz);
+				// Pop record off the top of the stack
+				ned.resize(btnstack_.back().nedsz);
+				//aed.resize(btnstack_.back().aedsz);
+				row      = btnstack_.back().row;
+				col      = btnstack_.back().col;
+				gaps     = btnstack_.back().gaps;
+				readGaps = btnstack_.back().readGaps;
+				refGaps  = btnstack_.back().refGaps;
+				score    = btnstack_.back().score;
+				ct       = btnstack_.back().ct;
+				btnstack_.pop_back();
+				assert(!sc_->monotone || score.score() >= escore);
+				NEW_ROW_COL(row, col);
+				continue;
+			} else {
+				// No branch points to revisit; just give up
+				res.reset();
+				met.btfail++; // DP backtraces failed
+				return false;
+			}
+		}
+		assert(!reportedThru);
+		assert(!sc_->monotone || score.score() >= minsc_);
+		if(empty || row == 0) {
+			assert_eq(SSEMatrix::H, ct);
+			btcells_.expand();
+			btcells_.back().first = row;
+			btcells_.back().second = col;
+			// This cell is at the end of a legitimate alignment
+			trimBeg = row;
+			assert_eq(btcells_.size(), dpRows() - trimBeg - trimEnd + readGaps);
+			break;
+		}
+		if(branch) {
+			// Add a frame to the backtrack stack
+			btnstack_.expand();
+			btnstack_.back().init(
+				ned.size(),
+				0,               // aed.size()
+				btcells_.size(),
+				row,
+				col,
+				gaps,
+				readGaps,
+				refGaps,
+				score,
+				(int)ct);
+		}
+		btcells_.expand();
+		btcells_.back().first = row;
+		btcells_.back().second = col;
+		switch(cur) {
+			// Move up and to the left.  If the reference nucleotide in the
+			// source row mismatches the read nucleotide, penalize
+			// it and add a nucleotide mismatch.
+			case SW_BT_OALL_DIAG: {
+				assert_gt(row, 0); assert_gt(col, 0);
+				// NOTE(review): rd_ indexed without the rdi_ offset here,
+				// unlike the (*rd_)[rdi_+row] used after the loop -- OK only
+				// if rdi_ == 0; confirm
+				int readC = (*rd_)[row];
+				int refNmask = (int)rf_[rfi_+col];
+				assert_gt(refNmask, 0);
+				int m = matchesEx(readC, refNmask);
+				ct = SSEMatrix::H;
+				if(m != 1) {
+					Edit e(
+						(int)row,
+						mask2dna[refNmask],
+						"ACGTN"[readC],
+						EDIT_TYPE_MM);
+					assert(e.repOk());
+					assert(ned.empty() || ned.back().pos >= row);
+					ned.push_back(e);
+					int pen = QUAL2(row, col);
+					score.score_ -= pen;
+					assert(!sc_->monotone || score.score() >= escore);
+				} else {
+					// Reward a match
+					int64_t bonus = sc_->match(30);
+					score.score_ += bonus;
+					assert(!sc_->monotone || score.score() >= escore);
+				}
+				if(m == -1) {
+					// score.ns_++;
+				}
+				row--; col--;
+				MOVE_ALL_UPLEFT();
+				assert(VALID_AL_SCORE(score));
+				break;
+			}
+			// Move up.  Add an edit encoding the ref gap.
+			case SW_BT_OALL_REF_OPEN:
+			{
+				assert_gt(row, 0);
+				Edit e(
+					(int)row,
+					'-',
+					"ACGTN"[(int)(*rd_)[row]],
+					EDIT_TYPE_REF_GAP);
+				assert(e.repOk());
+				assert(ned.empty() || ned.back().pos >= row);
+				ned.push_back(e);
+				assert_geq(row, (size_t)sc_->gapbar);
+				assert_geq((int)(rdf_-rdi_-row-1), sc_->gapbar-1);
+				row--;
+				ct = SSEMatrix::H;
+				int pen = sc_->refGapOpen();
+				score.score_ -= pen;
+				assert(!sc_->monotone || score.score() >= minsc_);
+				gaps++; refGaps++;
+				assert_eq(gaps, Edit::numGaps(ned));
+				assert_leq(gaps, rdgap_ + rfgap_);
+				MOVE_ALL_UP();
+				break;
+			}
+			// Move up.  Add an edit encoding the ref gap.
+			case SW_BT_RFGAP_EXTEND:
+			{
+				assert_gt(row, 1);
+				Edit e(
+					(int)row,
+					'-',
+					"ACGTN"[(int)(*rd_)[row]],
+					EDIT_TYPE_REF_GAP);
+				assert(e.repOk());
+				assert(ned.empty() || ned.back().pos >= row);
+				ned.push_back(e);
+				assert_geq(row, (size_t)sc_->gapbar);
+				assert_geq((int)(rdf_-rdi_-row-1), sc_->gapbar-1);
+				row--;
+				ct = SSEMatrix::F;
+				int pen = sc_->refGapExtend();
+				score.score_ -= pen;
+				assert(!sc_->monotone || score.score() >= minsc_);
+				gaps++; refGaps++;
+				assert_eq(gaps, Edit::numGaps(ned));
+				assert_leq(gaps, rdgap_ + rfgap_);
+				MOVE_ALL_UP();
+				break;
+			}
+			// Move left.  Add an edit encoding the read gap.
+			case SW_BT_OALL_READ_OPEN:
+			{
+				assert_gt(col, 0);
+				Edit e(
+					(int)row+1,
+					mask2dna[(int)rf_[rfi_+col]],
+					'-',
+					EDIT_TYPE_READ_GAP);
+				assert(e.repOk());
+				assert(ned.empty() || ned.back().pos >= row);
+				ned.push_back(e);
+				assert_geq(row, (size_t)sc_->gapbar);
+				assert_geq((int)(rdf_-rdi_-row-1), sc_->gapbar-1);
+				col--;
+				ct = SSEMatrix::H;
+				int pen = sc_->readGapOpen();
+				score.score_ -= pen;
+				assert(!sc_->monotone || score.score() >= minsc_);
+				gaps++; readGaps++;
+				assert_eq(gaps, Edit::numGaps(ned));
+				assert_leq(gaps, rdgap_ + rfgap_);
+				MOVE_ALL_LEFT();
+				break;
+			}
+			// Move left.  Add an edit encoding the read gap.
+			case SW_BT_RDGAP_EXTEND:
+			{
+				assert_gt(col, 1);
+				Edit e(
+					(int)row+1,
+					mask2dna[(int)rf_[rfi_+col]],
+					'-',
+					EDIT_TYPE_READ_GAP);
+				assert(e.repOk());
+				assert(ned.empty() || ned.back().pos >= row);
+				ned.push_back(e);
+				assert_geq(row, (size_t)sc_->gapbar);
+				assert_geq((int)(rdf_-rdi_-row-1), sc_->gapbar-1);
+				col--;
+				ct = SSEMatrix::E;
+				int pen = sc_->readGapExtend();
+				score.score_ -= pen;
+				assert(!sc_->monotone || score.score() >= minsc_);
+				gaps++; readGaps++;
+				assert_eq(gaps, Edit::numGaps(ned));
+				assert_leq(gaps, rdgap_ + rfgap_);
+				MOVE_ALL_LEFT();
+				break;
+			}
+			default: throw 1;
+		}
+	} // while((int)row >= 0)
+	// (col is unsigned, so this assert is vacuous)
+	assert_geq(col, 0);
+	assert_eq(SSEMatrix::H, ct);
+	// The number of cells in the backtrace should equal the number of read
+	// bases after trimming plus the number of gaps
+	assert_eq(btcells_.size(), dpRows() - trimBeg - trimEnd + readGaps);
+	// Check whether we went through a core diagonal and set 'reported' flag on
+	// each cell
+	bool overlappedCoreDiag = false;
+	for(size_t i = 0; i < btcells_.size(); i++) {
+		size_t rw = btcells_[i].first;
+		size_t cl = btcells_[i].second;
+		// Calculate the diagonal within the *trimmed* rectangle, i.e. the
+		// rectangle we dealt with in align, gather and backtrack.
+		int64_t diagi = cl - rw;
+		// Now adjust to the diagonal within the *untrimmed* rectangle by
+		// adding on the amount trimmed from the left.
+		diagi += rect_->triml;
+		if(diagi >= 0) {
+			size_t diag = (size_t)diagi;
+			if(diag >= rect_->corel && diag <= rect_->corer) {
+				overlappedCoreDiag = true;
+				break;
+			}
+		}
+#ifndef NDEBUG
+		//assert(!d.mat_.reportedThrough(rw, cl));
+		//d.mat_.setReportedThrough(rw, cl);
+		assert(d.mat_.reportedThrough(rw, cl));
+#endif
+	}
+	if(!overlappedCoreDiag) {
+		// Must overlap a core diagonal.  Otherwise, we run the risk of
+		// reporting an alignment that overlaps (and trumps) a higher-scoring
+		// alignment that lies partially outside the dynamic programming
+		// rectangle.
+		res.reset();
+		met.corerej++;
+		return false;
+	}
+	int readC = (*rd_)[rdi_+row];      // get last char in read
+	int refNmask = (int)rf_[rfi_+col]; // get last ref char ref involved in aln
+	assert_gt(refNmask, 0);
+	int m = matchesEx(readC, refNmask);
+	if(m != 1) {
+		Edit e((int)row, mask2dna[refNmask], "ACGTN"[readC], EDIT_TYPE_MM);
+		assert(e.repOk());
+		assert(ned.empty() || ned.back().pos >= row);
+		ned.push_back(e);
+		score.score_ -= QUAL2(row, col);
+		assert_geq(score.score(), minsc_);
+	} else {
+		score.score_ += sc_->match(30);
+	}
+	if(m == -1) {
+		// score.ns_++;
+	}
+#if 0
+	if(score.ns_ > nceil_) {
+		// Alignment has too many Ns in it!
+		res.reset();
+		met.nrej++;
+		return false;
+	}
+#endif
+	// Edits were collected in backtrace order; flip to read order
+	res.reverse();
+	assert(Edit::repOk(ned, (*rd_)));
+	assert_eq(score.score(), escore);
+	assert_leq(gaps, rdgap_ + rfgap_);
+	off = col;
+	assert_lt(col + (size_t)rfi_, (size_t)rff_);
+	// score.gaps_ = gaps;
+	res.alres.setScore(score);
+#if 0
+	res.alres.setShape(
+		refidx_,                  // ref id
+		off + rfi_ + rect_->refl, // 0-based ref offset
+		reflen_,                  // reference length
+		fw_,                      // aligned to Watson?
+		rdf_ - rdi_,              // read length
+		true,                     // pretrim soft?
+		0,                        // pretrim 5' end
+		0,                        // pretrim 3' end
+		true,                     // alignment trim soft?
+		fw_ ? trimBeg : trimEnd,  // alignment trim 5' end
+		fw_ ? trimEnd : trimBeg); // alignment trim 3' end
+#endif
+	// Count reference positions whose mask indicates an ambiguous base
+	size_t refns = 0;
+	for(size_t i = col; i <= origCol; i++) {
+		if((int)rf_[rfi_+i] > 15) {
+			refns++;
+		}
+	}
+	// res.alres.setRefNs(refns);
+	assert(Edit::repOk(ned, (*rd_), true, trimBeg, trimEnd));
+	assert(res.repOk());
+#ifndef NDEBUG
+	size_t gapsCheck = 0;
+	for(size_t i = 0; i < ned.size(); i++) {
+		if(ned[i].isGap()) gapsCheck++;
+	}
+	assert_eq(gaps, gapsCheck);
+	BTDnaString refstr;
+	for(size_t i = col; i <= origCol; i++) {
+		refstr.append(firsts5[(int)rf_[rfi_+i]]);
+	}
+	BTDnaString editstr;
+	Edit::toRef((*rd_), ned, editstr, true, trimBeg, trimEnd);
+	if(refstr != editstr) {
+		cerr << "Decoded nucleotides and edits don't match reference:" << endl;
+		cerr << "           score: " << score.score()
+		     << " (" << gaps << " gaps)" << endl;
+		cerr << "           edits: ";
+		Edit::print(cerr, ned);
+		cerr << endl;
+		cerr << "    decoded nucs: " << (*rd_) << endl;
+		cerr << "     edited nucs: " << editstr << endl;
+		cerr << "  reference nucs: " << refstr << endl;
+		assert(0);
+	}
+#endif
+	met.btsucc++; // DP backtraces succeeded
+	return true;
+}
diff --git a/aligner_swsse_loc_u8.cpp b/aligner_swsse_loc_u8.cpp
new file mode 100644
index 0000000..ae8c7a3
--- /dev/null
+++ b/aligner_swsse_loc_u8.cpp
@@ -0,0 +1,2269 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * aligner_sw_sse.cpp
+ *
+ * Versions of key alignment functions that use vector instructions to
+ * accelerate dynamic programming.  Based chiefly on the striped Smith-Waterman
+ * paper and implementation by Michael Farrar.  See:
+ *
+ * Farrar M. Striped Smith-Waterman speeds database searches six times over
+ * other SIMD implementations. Bioinformatics. 2007 Jan 15;23(2):156-61.
+ * http://sites.google.com/site/farrarmichael/smith-waterman
+ *
+ * While the paper describes an implementation of Smith-Waterman, we extend it
+ * to do end-to-end read alignment as well as local alignment.  The change
+ * required for this is minor: we simply let vmax be the maximum element in the
+ * score domain rather than the minimum.
+ *
+ * The vectorized dynamic programming implementation lacks some features that
+ * make it hard to adapt to solving the entire dynamic-programming alignment
+ * problem.  For instance:
+ *
+ * - It doesn't respect gap barriers on either end of the read
+ * - It just gives a maximum; not enough information to backtrace without
+ *   redoing some alignment
+ * - It's a little difficult to handle st_ and en_, especially st_.
+ * - The query profile mechanism makes handling of ambiguous reference bases a
+ *   little tricky (16 cols in query profile lookup table instead of 5)
+ *
+ * Given the drawbacks, it is tempting to use SSE dynamic programming as a
+ * filter rather than as an aligner per se.  Here are a few ideas for how it
+ * can be extended to handle more of the alignment problem:
+ *
+ * - Save calculated scores to a big array as we go.  We return to this array
+ *   to find and backtrace from good solutions.
+ */
+
+#include <limits>
+#include "aligner_sw.h"
+
+// static const size_t NBYTES_PER_REG  = 16;
+static const size_t NWORDS_PER_REG  = 16;
+// static const size_t NBITS_PER_WORD  = 8;
+static const size_t NBYTES_PER_WORD = 1;
+
+// In local mode, we start low (0) and go high (255).  Factoring in a query
+// profile involves unsigned saturating addition.  All query profile elements
+// should be expressed as a positive number; this is done by adding -min
+// where min is the smallest (negative) score in the query profile.
+
+typedef uint8_t TCScore;
+
+/**
+ * Build query profile look up tables for the read.  The query profile look
+ * up table is organized as a 1D array indexed by [i][j] where i is the
+ * reference character in the current DP column (0=A, 1=C, etc), and j is
+ * the segment of the query we're currently working on.
+ */
+void SwAligner::buildQueryProfileLocalSseU8(bool fw) {
+	// Build (once per orientation) the striped query profile and gap-barrier
+	// tables for the local-alignment u8 kernel.  Results are cached in
+	// sseU8fw_/sseU8rc_; the fwBuilt_/rcBuilt_ flags make this idempotent.
+	bool& done = fw ? sseU8fwBuilt_ : sseU8rcBuilt_;
+	if(done) {
+		return;
+	}
+	done = true;
+	const BTDnaString* rd = fw ? rdfw_ : rdrc_;
+	const BTString* qu = fw ? qufw_ : qurc_;
+	const size_t len = rd->length();
+	// Number of striped segments: ceil(len / 16) bytes per 128-bit register.
+	const size_t seglen = (len + (NWORDS_PER_REG-1)) / NWORDS_PER_REG;
+	// How many __m128i's are needed
+	size_t n128s =
+		64 +                    // slack bytes, for alignment?
+		(seglen * ALPHA_SIZE)   // query profile data
+		* 2;                    // & gap barrier data
+	assert_gt(n128s, 0);
+	SSEData& d = fw ? sseU8fw_ : sseU8rc_;
+	d.profbuf_.resizeNoCopy(n128s);
+	assert(!d.profbuf_.empty());
+	d.maxPen_      = d.maxBonus_ = 0;
+	d.lastIter_    = d.lastWord_ = 0;
+	// Profile and gap-barrier vectors are interleaved (even/odd), hence
+	// stride 2 through profbuf_.
+	d.qprofStride_ = d.gbarStride_ = 2;
+	d.bias_ = 0;
+	// Calculate bias: the most negative score anywhere in the profile.
+	// Adding -bias later keeps all profile entries non-negative, as required
+	// by the unsigned saturating arithmetic in the DP kernel.
+	for(size_t refc = 0; refc < ALPHA_SIZE; refc++) {
+		for(size_t i = 0; i < len; i++) {
+			int readc = (*rd)[i];
+			int readq = (*qu)[i];
+			// readq - 33: quality appears to be Phred+33 encoded — TODO confirm
+			int sc = sc_->score(readc, (int)(1 << refc), readq - 33);
+			if(sc < 0 && sc < d.bias_) {
+				d.bias_ = sc;
+			}
+		}
+	}
+	assert_leq(d.bias_, 0);
+	// Store bias as a non-negative magnitude from here on.
+	d.bias_ = -d.bias_;
+	// For each reference character A, C, G, T, N ...
+	for(size_t refc = 0; refc < ALPHA_SIZE; refc++) {
+		// For each segment ...
+		for(size_t i = 0; i < seglen; i++) {
+			size_t j = i;
+			// Even __m128i = query-profile bytes, odd __m128i = gap-barrier
+			// mask bytes for the same 16 striped rows.
+			uint8_t *qprofWords =
+				reinterpret_cast<uint8_t*>(d.profbuf_.ptr() + (refc * seglen * 2) + (i * 2));
+			uint8_t *gbarWords =
+				reinterpret_cast<uint8_t*>(d.profbuf_.ptr() + (refc * seglen * 2) + (i * 2) + 1);
+			// For each sub-word (byte) ...
+			for(size_t k = 0; k < NWORDS_PER_REG; k++) {
+				int sc = 0;
+				*gbarWords = 0;
+				if(j < len) {
+					int readc = (*rd)[j];
+					int readq = (*qu)[j];
+					sc = sc_->score(readc, (int)(1 << refc), readq - 33);
+					assert_range(0, 255, sc + d.bias_);
+					size_t j_from_end = len - j - 1;
+					if(j < (size_t)sc_->gapbar ||
+					   j_from_end < (size_t)sc_->gapbar)
+					{
+						// Inside the gap barrier: 0xff saturates any
+						// subtraction in the kernel, vetoing gaps here.
+						*gbarWords = 0xff;
+					}
+				}
+				if(refc == 0 && j == len-1) {
+					// Remember which 128-bit word and which smaller word has
+					// the final row
+					d.lastIter_ = i;
+					d.lastWord_ = k;
+				}
+				// Track worst penalty / best bonus seen, used elsewhere for
+				// score-range calculations.
+				if(sc < 0) {
+					if((size_t)(-sc) > d.maxPen_) {
+						d.maxPen_ = (size_t)(-sc);
+					}
+				} else {
+					if((size_t)sc > d.maxBonus_) {
+						d.maxBonus_ = (size_t)sc;
+					}
+				}
+				*qprofWords = (uint8_t)(sc + d.bias_);
+				gbarWords++;
+				qprofWords++;
+				j += seglen; // update offset into query
+			}
+		}
+	}
+}
+
+#ifndef NDEBUG
+/**
+ * Return true iff the cell has sane E/F/H values w/r/t its predecessors.
+ */
+static bool cellOkLocalU8(
+	SSEData& d,            // SSE state incl. full debug matrix d.mat_
+	size_t row,            // row of the cell to check
+	size_t col,            // column of the cell to check
+	int refc,              // reference character mask for this column
+	int readc,             // read character at this row
+	int readq,             // read quality (ASCII) at this row
+	const Scoring& sc)     // scoring scheme
+{
+	// Debug-only sanity check: verify that the E/F/H values stored for
+	// (row, col) are derivable from their predecessor cells under the scoring
+	// scheme.  Always returns true; failures fire asserts.
+	TCScore floorsc = 0;                 // local alignment: scores floor at 0
+	TCScore ceilsc = 255 - d.bias_ - 1;  // above this, cell may have saturated
+	TAlScore offsetsc = 0;
+	TAlScore sc_h_cur = (TAlScore)d.mat_.helt(row, col);
+	TAlScore sc_e_cur = (TAlScore)d.mat_.eelt(row, col);
+	TAlScore sc_f_cur = (TAlScore)d.mat_.felt(row, col);
+	if(sc_h_cur > floorsc) {
+		sc_h_cur += offsetsc;
+	}
+	if(sc_e_cur > floorsc) {
+		sc_e_cur += offsetsc;
+	}
+	if(sc_f_cur > floorsc) {
+		sc_f_cur += offsetsc;
+	}
+	// Gaps are disallowed within 'gapbar' rows of either end of the read.
+	bool gapsAllowed = true;
+	size_t rowFromEnd = d.mat_.nrow() - row - 1;
+	if(row < (size_t)sc.gapbar || rowFromEnd < (size_t)sc.gapbar) {
+		gapsAllowed = false;
+	}
+	// Which transitions could have produced the current cell's values?
+	bool e_left_trans = false, h_left_trans = false;
+	bool f_up_trans   = false, h_up_trans = false;
+	bool h_diag_trans = false;
+	if(gapsAllowed) {
+		TAlScore sc_h_left = floorsc;
+		TAlScore sc_e_left = floorsc;
+		TAlScore sc_h_up   = floorsc;
+		TAlScore sc_f_up   = floorsc;
+		// E must come from the left neighbor: either extending a read gap
+		// (E - extend) or opening one from H (H - open).
+		if(col > 0 && sc_e_cur > floorsc && sc_e_cur <= ceilsc) {
+			sc_h_left = d.mat_.helt(row, col-1) + offsetsc;
+			sc_e_left = d.mat_.eelt(row, col-1) + offsetsc;
+			e_left_trans = (sc_e_left > floorsc && sc_e_cur == sc_e_left - sc.readGapExtend());
+			h_left_trans = (sc_h_left > floorsc && sc_e_cur == sc_h_left - sc.readGapOpen());
+			assert(e_left_trans || h_left_trans);
+		}
+		// F must come from the neighbor above: extending or opening a ref gap.
+		if(row > 0 && sc_f_cur > floorsc && sc_f_cur <= ceilsc) {
+			sc_h_up = d.mat_.helt(row-1, col) + offsetsc;
+			sc_f_up = d.mat_.felt(row-1, col) + offsetsc;
+			f_up_trans = (sc_f_up > floorsc && sc_f_cur == sc_f_up - sc.refGapExtend());
+			h_up_trans = (sc_h_up > floorsc && sc_f_cur == sc_h_up - sc.refGapOpen());
+			assert(f_up_trans || h_up_trans);
+		}
+	} else {
+		// No gaps allowed here, so E and F must sit at the floor.
+		assert_geq(floorsc, sc_e_cur);
+		assert_geq(floorsc, sc_f_cur);
+	}
+	// H may also come from the diagonal (match/mismatch).
+	if(col > 0 && row > 0 && sc_h_cur > floorsc && sc_h_cur <= ceilsc) {
+		TAlScore sc_h_upleft = d.mat_.helt(row-1, col-1) + offsetsc;
+		TAlScore sc_diag = sc.score(readc, (int)refc, readq - 33);
+		h_diag_trans = sc_h_cur == sc_h_upleft + sc_diag;
+	}
+	// At least one explanation must hold (or the cell is at the floor, may
+	// have saturated, or is in the first row/column).
+	assert(
+		sc_h_cur <= floorsc ||
+		e_left_trans ||
+		h_left_trans ||
+		f_up_trans   ||
+		h_up_trans   ||
+		h_diag_trans ||
+		sc_h_cur > ceilsc ||
+		row == 0 ||
+		col == 0);
+	return true;
+}
+#endif /*ndef NDEBUG*/
+
+#ifdef NDEBUG
+
+// Release builds: the whole-vector assertion macros compile away to nothing.
+#define assert_all_eq0(x)
+#define assert_all_gt(x, y)
+#define assert_all_gt_lo(x)
+#define assert_all_lt(x, y)
+#define assert_all_lt_hi(x)
+
+#else
+
+// Debug builds: each macro evaluates a lane-wise predicate over all 16 bytes
+// of an __m128i and asserts on the movemask of the comparison result.
+// NOTE(review): _mm_cmpgt_epu8, _mm_cmpeq_epu8, _mm_srli_epu8 and
+// _mm_cmplt_epu8 are not standard SSE2 intrinsic names; presumably they are
+// project-provided wrappers/macros — confirm they are defined before use.
+
+// Assert every lane of x equals 0.
+#define assert_all_eq0(x) { \
+	__m128i z = _mm_setzero_si128(); \
+	__m128i tmp = _mm_setzero_si128(); \
+	z = _mm_xor_si128(z, z); \
+	tmp = _mm_cmpeq_epi16(x, z); \
+	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
+}
+
+// Assert every unsigned byte lane of x is > the corresponding lane of y.
+#define assert_all_gt(x, y) { \
+	__m128i tmp = _mm_cmpgt_epu8(x, y); \
+	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
+}
+
+// Assert every unsigned byte lane of x is > 0.
+#define assert_all_gt_lo(x) { \
+	__m128i z = _mm_setzero_si128(); \
+	__m128i tmp = _mm_setzero_si128(); \
+	z = _mm_xor_si128(z, z); \
+	tmp = _mm_cmpgt_epu8(x, z); \
+	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
+}
+
+// Assert every unsigned byte lane of x is < the corresponding lane of y
+// (checked via saturating subtraction: y - x must be nonzero in every lane).
+#define assert_all_lt(x, y) { \
+	__m128i z = _mm_setzero_si128(); \
+	z = _mm_xor_si128(z, z); \
+	__m128i tmp = _mm_subs_epu8(y, x); \
+	tmp = _mm_cmpeq_epi16(tmp, z); \
+	assert_eq(0x0000, _mm_movemask_epi8(tmp)); \
+}
+
+// Assert every unsigned byte lane of x is below the upper half of the range.
+#define assert_all_lt_hi(x) { \
+	__m128i z = _mm_setzero_si128(); \
+	__m128i tmp = _mm_setzero_si128(); \
+	z = _mm_cmpeq_epu8(z, z); \
+	z = _mm_srli_epu8(z, 1); \
+	tmp = _mm_cmplt_epu8(x, z); \
+	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
+}
+#endif
+
+/**
+ * Aligns by filling a dynamic programming matrix with the SSE-accelerated,
+ * banded DP approach of Farrar.  As it goes, it determines which cells we
+ * might backtrace from and tallies the best (highest-scoring) N backtrace
+ * candidate cells per diagonal.  Also returns the alignment score of the best
+ * alignment in the matrix.
+ *
+ * This routine does *not* maintain a matrix holding the entire matrix worth of
+ * scores, nor does it maintain any other dense O(mn) data structure, as this
+ * would quickly exhaust memory for queries longer than about 10,000 kb.
+ * Instead, in the fill stage it maintains two columns worth of scores at a
+ * time (current/previous, or right/left) - these take O(m) space.  When
+ * finished with the current column, it determines which cells from the
+ * previous column, if any, are candidates we might backtrace from to find a
+ * full alignment.  A candidate cell has a score that rises above the threshold
+ * and isn't improved upon by a match in the next column.  The best N
+ * candidates per diagonal are stored in a O(m + n) data structure.
+ */
+TAlScore SwAligner::alignGatherLoc8(int& flag, bool debug) {
+	// Striped (Farrar-style) local-alignment fill using 16 unsigned 8-bit
+	// lanes per SSE register.  On return, 'flag' is 0 on success, -1 if no
+	// cell reached the minimum score threshold, and -2 if the 8-bit score
+	// range may have saturated (caller should retry with the 16-bit kernel).
+	// When 'debug' is true, metrics and backtrace-candidate gathering are
+	// skipped so the fill can be re-run purely for verification.
+	assert_leq(rdf_, rd_->length());
+	assert_leq(rdf_, qu_->length());
+	assert_lt(rfi_, rff_);
+	assert_lt(rdi_, rdf_);
+	assert_eq(rd_->length(), qu_->length());
+	assert_geq(sc_->gapbar, 1);
+	assert_gt(minsc_, 0);
+	assert(repOk());
+#ifndef NDEBUG
+	for(size_t i = (size_t)rfi_; i < (size_t)rff_; i++) {
+		assert_range(0, 16, (int)rf_[i]);
+	}
+#endif
+
+	SSEData& d = fw_ ? sseU8fw_ : sseU8rc_;
+	SSEMetrics& met = extend_ ? sseU8ExtendMet_ : sseU8MateMet_;
+	if(!debug) met.dp++;
+	buildQueryProfileLocalSseU8(fw_);
+	assert(!d.profbuf_.empty());
+	assert_gt(d.bias_, 0);
+	assert_lt(d.bias_, 127);
+	
+	assert_gt(d.maxBonus_, 0);
+	size_t iter =
+		(dpRows() + (NWORDS_PER_REG-1)) / NWORDS_PER_REG; // iter = segLen
+	
+	// Now set up the score vectors.  We just need two columns worth, which
+	// we'll call "left" and "right".
+	d.vecbuf_.resize(ROWSTRIDE_2COL * iter * 2);
+	d.vecbuf_.zero();
+	__m128i *vbuf_l = d.vecbuf_.ptr();
+	__m128i *vbuf_r = d.vecbuf_.ptr() + (ROWSTRIDE_2COL * iter);
+	
+	// This is the data structure that holds candidate cells per diagonal.
+	const size_t ndiags = rff_ - rfi_ + dpRows() - 1;
+	if(!debug) {
+		btdiag_.init(ndiags, 2);
+	}
+	
+	// Data structure that holds checkpointed anti-diagonals
+	TAlScore perfectScore = sc_->perfectScore(dpRows());
+	bool checkpoint = true;
+	bool cpdebug = false;
+#ifndef NDEBUG
+	cpdebug = dpRows() < 1000;
+#endif
+	cper_.init(
+		dpRows(),      // # rows
+		rff_ - rfi_,   // # columns
+		cperPerPow2_,  // checkpoint every 1 << perpow2 diags (& next)
+		perfectScore,  // perfect score (for sanity checks)
+		true,          // matrix cells have 8-bit scores?
+		cperTri_,      // triangular mini-fills?
+		true,          // alignment is local?
+		cpdebug);      // save all cells for debugging?
+
+	// Many thanks to Michael Farrar for releasing his striped Smith-Waterman
+	// implementation:
+	//
+	//  http://sites.google.com/site/farrarmichael/smith-waterman
+	//
+	// Much of the implementation below is adapted from Michael's code.
+
+	// Each 8-bit constant below is duplicated into both bytes of a 16-bit
+	// word, inserted into lane 0, then shuffled to splat it across the
+	// entire 128-bit register.
+
+	// Set all elts to reference gap open penalty
+	__m128i rfgapo   = _mm_setzero_si128();
+	__m128i rfgape   = _mm_setzero_si128();
+	__m128i rdgapo   = _mm_setzero_si128();
+	__m128i rdgape   = _mm_setzero_si128();
+	__m128i vlo      = _mm_setzero_si128();
+	__m128i vhi      = _mm_setzero_si128();
+	__m128i vmax     = _mm_setzero_si128();
+	__m128i vcolmax  = _mm_setzero_si128();
+	__m128i vmaxtmp  = _mm_setzero_si128();
+	__m128i ve       = _mm_setzero_si128();
+	__m128i vf       = _mm_setzero_si128();
+	__m128i vh       = _mm_setzero_si128();
+	__m128i vhd      = _mm_setzero_si128();
+	__m128i vhdtmp   = _mm_setzero_si128();
+	__m128i vtmp     = _mm_setzero_si128();
+	__m128i vzero    = _mm_setzero_si128();
+	__m128i vbias    = _mm_setzero_si128();
+	__m128i vbiasm1  = _mm_setzero_si128();
+	__m128i vminsc   = _mm_setzero_si128();
+
+	int dup;
+
+	assert_gt(sc_->refGapOpen(), 0);
+	assert_leq(sc_->refGapOpen(), MAX_U8);
+	dup = (sc_->refGapOpen() << 8) | (sc_->refGapOpen() & 0x00ff);
+	rfgapo = _mm_insert_epi16(rfgapo, dup, 0);
+	rfgapo = _mm_shufflelo_epi16(rfgapo, 0);
+	rfgapo = _mm_shuffle_epi32(rfgapo, 0);
+	
+	// Set all elts to reference gap extension penalty
+	assert_gt(sc_->refGapExtend(), 0);
+	assert_leq(sc_->refGapExtend(), MAX_U8);
+	assert_leq(sc_->refGapExtend(), sc_->refGapOpen());
+	dup = (sc_->refGapExtend() << 8) | (sc_->refGapExtend() & 0x00ff);
+	rfgape = _mm_insert_epi16(rfgape, dup, 0);
+	rfgape = _mm_shufflelo_epi16(rfgape, 0);
+	rfgape = _mm_shuffle_epi32(rfgape, 0);
+
+	// Set all elts to read gap open penalty
+	assert_gt(sc_->readGapOpen(), 0);
+	assert_leq(sc_->readGapOpen(), MAX_U8);
+	dup = (sc_->readGapOpen() << 8) | (sc_->readGapOpen() & 0x00ff);
+	rdgapo = _mm_insert_epi16(rdgapo, dup, 0);
+	rdgapo = _mm_shufflelo_epi16(rdgapo, 0);
+	rdgapo = _mm_shuffle_epi32(rdgapo, 0);
+	
+	// Set all elts to read gap extension penalty
+	assert_gt(sc_->readGapExtend(), 0);
+	assert_leq(sc_->readGapExtend(), MAX_U8);
+	assert_leq(sc_->readGapExtend(), sc_->readGapOpen());
+	dup = (sc_->readGapExtend() << 8) | (sc_->readGapExtend() & 0x00ff);
+	rdgape = _mm_insert_epi16(rdgape, dup, 0);
+	rdgape = _mm_shufflelo_epi16(rdgape, 0);
+	rdgape = _mm_shuffle_epi32(rdgape, 0);
+	
+	// Set all elts to minimum score threshold.  Actually, to 1 less than the
+	// threshold so we can use gt instead of geq.
+	dup = (((int)minsc_ - 1) << 8) | (((int)minsc_ - 1) & 0x00ff);
+	vminsc = _mm_insert_epi16(vminsc, dup, 0);
+	vminsc = _mm_shufflelo_epi16(vminsc, 0);
+	vminsc = _mm_shuffle_epi32(vminsc, 0);
+
+	// bias - 1: used with cmpgt to test profile bytes for ">= bias", i.e.
+	// positions where a match (not mismatch) is possible.
+	dup = ((d.bias_ - 1) << 8) | ((d.bias_ - 1) & 0x00ff);
+	vbiasm1 = _mm_insert_epi16(vbiasm1, dup, 0);
+	vbiasm1 = _mm_shufflelo_epi16(vbiasm1, 0);
+	vbiasm1 = _mm_shuffle_epi32(vbiasm1, 0);
+	vhi = _mm_cmpeq_epi16(vhi, vhi); // all elts = 0xffff
+	vlo = _mm_xor_si128(vlo, vlo);   // all elts = 0
+	vmax = vlo;
+	
+	// Make a vector of bias offsets
+	dup = (d.bias_ << 8) | (d.bias_ & 0x00ff);
+	vbias = _mm_insert_epi16(vbias, dup, 0);
+	vbias = _mm_shufflelo_epi16(vbias, 0);
+	vbias = _mm_shuffle_epi32(vbias, 0);
+	
+	// Points to a long vector of __m128i where each element is a block of
+	// contiguous cells in the E, F or H matrix.  If the index % 3 == 0, then
+	// the block of cells is from the E matrix.  If index % 3 == 1, they're
+	// from the F matrix.  If index % 3 == 2, then they're from the H matrix.
+	// Blocks of cells are organized in the same interleaved manner as they are
+	// calculated by the Farrar algorithm.
+	const __m128i *pvScore; // points into the query profile
+
+	const size_t colstride = ROWSTRIDE_2COL * iter;
+	
+	// Initialize the H and E vectors in the first matrix column
+	__m128i *pvELeft = vbuf_l + 0; __m128i *pvERight = vbuf_r + 0;
+	/* __m128i *pvFLeft = vbuf_l + 1; */ __m128i *pvFRight = vbuf_r + 1;
+	__m128i *pvHLeft = vbuf_l + 2; __m128i *pvHRight = vbuf_r + 2;
+	
+	for(size_t i = 0; i < iter; i++) {
+		// start low in local mode
+		_mm_store_si128(pvERight, vlo); pvERight += ROWSTRIDE_2COL;
+		_mm_store_si128(pvHRight, vlo); pvHRight += ROWSTRIDE_2COL;
+	}
+	
+	assert_gt(sc_->gapbar, 0);
+	size_t nfixup = 0;
+	TAlScore matchsc = sc_->match(30);
+	// Max H score observed in the most recently completed column; used both
+	// for the candidate scan and the early-bail test below.
+	TAlScore leftmax = MIN_I64;
+
+	// Fill in the table as usual but instead of using the same gap-penalty
+	// vector for each iteration of the inner loop, load words out of a
+	// pre-calculated gap vector parallel to the query profile.  The pre-
+	// calculated gap vectors enforce the gap barrier constraint by making it
+	// infinitely costly to introduce a gap in barrier rows.
+	//
+	// AND use a separate loop to fill in the first row of the table, enforcing
+	// the st_ constraints in the process.  This is awkward because it
+	// separates the processing of the first row from the others and might make
+	// it difficult to use the first-row results in the next row, but it might
+	// be the simplest and least disruptive way to deal with the st_ constraint.
+	
+	// off = profile offset for the current column's ref char; lastoff = the
+	// previous column's, needed when scanning the left column for candidates.
+	size_t off = MAX_SIZE_T, lastoff;
+	bool bailed = false;
+	for(size_t i = (size_t)rfi_; i < (size_t)rff_; i++) {
+		// Swap left and right; vbuf_l is the vector on the left, which we
+		// generally load from, and vbuf_r is the vector on the right, which we
+		// generally store to.
+		swap(vbuf_l, vbuf_r);
+		pvELeft = vbuf_l + 0; pvERight = vbuf_r + 0;
+		/* pvFLeft = vbuf_l + 1; */ pvFRight = vbuf_r + 1;
+		pvHLeft = vbuf_l + 2; pvHRight = vbuf_r + 2;
+		
+		// Fetch this column's reference mask
+		const int refm = (int)rf_[i];
+		
+		// Fetch the appropriate query profile
+		lastoff = off;
+		off = (size_t)firsts5[refm] * iter * 2;
+		pvScore = d.profbuf_.ptr() + off; // even elts = query profile, odd = gap barrier
+		
+		// Load H vector from the final row of the previous column.
+		// ??? perhaps we should calculate the next iter's F instead of the
+		// current iter's?  The way we currently do it, seems like it will
+		// almost always require at least one fixup loop iter (to recalculate
+		// this topmost F).
+		vh = _mm_load_si128(pvHLeft + colstride - ROWSTRIDE_2COL);
+		
+		// Set all cells to low value
+		vf = _mm_xor_si128(vf, vf);
+		// vf now contains the vertical contribution
+
+		// Store cells in F, calculated previously
+		// No need to veto ref gap extensions, they're all 0x00s
+		_mm_store_si128(pvFRight, vf);
+		pvFRight += ROWSTRIDE_2COL;
+		
+		// Shift down so that topmost (least sig) cell gets 0
+		vh = _mm_slli_si128(vh, NBYTES_PER_WORD);
+		
+		// We pull out one loop iteration to make it easier to veto values in the top row
+		
+		// Load cells from E, calculated previously
+		ve = _mm_load_si128(pvELeft);
+		vhd = _mm_load_si128(pvHLeft);
+		assert_all_lt(ve, vhi);
+		pvELeft += ROWSTRIDE_2COL;
+		// ve now contains the horizontal contribution
+		
+		// Factor in query profile (matches and mismatches)
+		vh = _mm_adds_epu8(vh, pvScore[0]);
+		vh = _mm_subs_epu8(vh, vbias);
+		// vh now contains the diagonal contribution
+
+		vhdtmp = vhd;
+		vhd = _mm_subs_epu8(vhd, rdgapo);
+		vhd = _mm_subs_epu8(vhd, pvScore[1]); // veto some read gap opens
+		ve = _mm_subs_epu8(ve, rdgape);
+		ve = _mm_max_epu8(ve, vhd);
+
+		vh = _mm_max_epu8(vh, ve);
+		vf = vh;
+
+		// Update highest score so far
+		vcolmax = vh;
+		
+		// Save the new vH values
+		_mm_store_si128(pvHRight, vh);
+
+		vh = vhdtmp;
+		assert_all_lt(ve, vhi);
+		pvHRight += ROWSTRIDE_2COL;
+		pvHLeft += ROWSTRIDE_2COL;
+		
+		// Save E values
+		_mm_store_si128(pvERight, ve);
+		pvERight += ROWSTRIDE_2COL;
+		
+		// Update vf value
+		vf = _mm_subs_epu8(vf, rfgapo);
+		assert_all_lt(vf, vhi);
+		
+		pvScore += 2; // move on to next query profile
+
+		// For each character in the reference text:
+		size_t j;
+		for(j = 1; j < iter; j++) {
+			// Load cells from E, calculated previously
+			ve = _mm_load_si128(pvELeft);
+			vhd = _mm_load_si128(pvHLeft);
+			assert_all_lt(ve, vhi);
+			pvELeft += ROWSTRIDE_2COL;
+			
+			// Store cells in F, calculated previously
+			vf = _mm_subs_epu8(vf, pvScore[1]); // veto some ref gap extensions
+			_mm_store_si128(pvFRight, vf);
+			pvFRight += ROWSTRIDE_2COL;
+			
+			// Factor in query profile (matches and mismatches)
+			vh = _mm_adds_epu8(vh, pvScore[0]);
+			vh = _mm_subs_epu8(vh, vbias);
+			
+			// Update H, factoring in E and F
+			vh = _mm_max_epu8(vh, vf);
+
+			vhdtmp = vhd;
+			vhd = _mm_subs_epu8(vhd, rdgapo);
+			vhd = _mm_subs_epu8(vhd, pvScore[1]); // veto some read gap opens
+			ve = _mm_subs_epu8(ve, rdgape);
+			ve = _mm_max_epu8(ve, vhd);
+			
+			vh = _mm_max_epu8(vh, ve);
+			vtmp = vh;
+			
+			// Update highest score encountered this far
+			vcolmax = _mm_max_epu8(vcolmax, vh);
+			
+			// Save the new vH values
+			_mm_store_si128(pvHRight, vh);
+
+			vh = vhdtmp;
+
+			assert_all_lt(ve, vhi);
+			pvHRight += ROWSTRIDE_2COL;
+			pvHLeft += ROWSTRIDE_2COL;
+			
+			// Save E values
+			_mm_store_si128(pvERight, ve);
+			pvERight += ROWSTRIDE_2COL;
+			
+			// Update vf value
+			vtmp = _mm_subs_epu8(vtmp, rfgapo);
+			vf = _mm_subs_epu8(vf, rfgape);
+			assert_all_lt(vf, vhi);
+			vf = _mm_max_epu8(vf, vtmp);
+			
+			pvScore += 2; // move on to next query profile / gap veto
+		}
+		// pvHStore, pvELoad, pvEStore have all rolled over to the next column
+		pvFRight -= colstride; // reset to start of column
+		vtmp = _mm_load_si128(pvFRight);
+		
+		pvHRight -= colstride; // reset to start of column
+		vh = _mm_load_si128(pvHRight);
+		
+		pvScore = d.profbuf_.ptr() + off + 1; // reset veto vector
+		
+		// Farrar's "lazy F" fixup loop: propagate vertical (F) contributions
+		// that wrap around from the bottom of the stripe until F converges.
+		// vf from last row gets shifted down by one to overlay the first row
+		// rfgape has already been subtracted from it.
+		vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
+		
+		vf = _mm_subs_epu8(vf, *pvScore); // veto some ref gap extensions
+		vf = _mm_max_epu8(vtmp, vf);
+		// TODO: We're testing whether F changed.  Can't we just assume that F
+		// did change and instead check whether H changed?  Might save us from
+		// entering the fixup loop.
+		vtmp = _mm_subs_epu8(vf, vtmp);
+		vtmp = _mm_cmpeq_epi8(vtmp, vzero);
+		int cmp = _mm_movemask_epi8(vtmp);
+		
+		// If any element of vtmp is greater than H - gap-open...
+		j = 0;
+		while(cmp != 0xffff) {
+			// Store this vf
+			_mm_store_si128(pvFRight, vf);
+			pvFRight += ROWSTRIDE_2COL;
+			
+			// Update vh w/r/t new vf
+			vh = _mm_max_epu8(vh, vf);
+			
+			// Save vH values
+			_mm_store_si128(pvHRight, vh);
+			pvHRight += ROWSTRIDE_2COL;
+			
+			// Update highest score encountered so far.
+			vcolmax = _mm_max_epu8(vcolmax, vh);
+
+			pvScore += 2;
+			
+			assert_lt(j, iter);
+			if(++j == iter) {
+				pvFRight -= colstride;
+				vtmp = _mm_load_si128(pvFRight);   // load next vf ASAP
+				pvHRight -= colstride;
+				vh = _mm_load_si128(pvHRight);     // load next vh ASAP
+				pvScore = d.profbuf_.ptr() + off + 1;
+				j = 0;
+				vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
+			} else {
+				vtmp = _mm_load_si128(pvFRight);   // load next vf ASAP
+				vh = _mm_load_si128(pvHRight);     // load next vh ASAP
+			}
+			
+			// Update F with another gap extension
+			vf = _mm_subs_epu8(vf, rfgape);
+			vf = _mm_subs_epu8(vf, *pvScore); // veto some ref gap extensions
+			vf = _mm_max_epu8(vtmp, vf);
+			vtmp = _mm_subs_epu8(vf, vtmp);
+			vtmp = _mm_cmpeq_epi8(vtmp, vzero);
+			cmp = _mm_movemask_epi8(vtmp);
+			nfixup++;
+		}
+
+		// Now we'd like to know exactly which cells in the left column are
+		// candidates we might backtrace from.  First question is: did *any*
+		// elements in the column exceed the minimum score threshold?
+		if(!debug && leftmax >= minsc_) {
+			// Yes.  Next question is: which cells are candidates?  We have to
+			// allow matches in the right column to override matches above and
+			// to the left in the left column.
+			assert_gt(i - rfi_, 0);
+			pvHLeft  = vbuf_l + 2;
+			assert_lt(lastoff, MAX_SIZE_T);
+			pvScore = d.profbuf_.ptr() + lastoff; // even elts = query profile, odd = gap barrier
+			for(size_t k = 0; k < iter; k++) {
+				vh = _mm_load_si128(pvHLeft);
+				// Candidate cells must sit where the profile allows a match
+				// (profile byte >= bias, i.e. > bias-1).
+				vtmp = _mm_cmpgt_epi8(pvScore[0], vbiasm1);
+				int cmp = _mm_movemask_epi8(vtmp);
+				if(cmp != 0xffff) {
+					// At least one candidate in this mask.  Now iterate
+					// through vm/vh to evaluate individual cells.
+					for(size_t m = 0; m < NWORDS_PER_REG; m++) {
+						// Striped layout: lane m of segment k holds row
+						// k + m * iter.
+						size_t row = k + m * iter;
+						if(row >= dpRows()) {
+							break;
+						}
+						if(((TCScore *)&vtmp)[m] > 0 && ((TCScore *)&vh)[m] >= minsc_) {
+							TCScore sc = ((TCScore *)&vh)[m];
+							assert_geq(sc, minsc_);
+							// Add to data structure holding all candidates
+							size_t col = i - rfi_ - 1; // -1 b/c prev col
+							size_t frombot = dpRows() - row - 1;
+							DpBtCandidate cand(row, col, sc);
+							btdiag_.add(frombot + col, cand);
+						}
+					}
+				}
+				pvHLeft += ROWSTRIDE_2COL;
+				pvScore += 2;
+			}
+		}
+
+		// Save some elements to checkpoints
+		if(checkpoint) {
+			
+			__m128i *pvE = vbuf_r + 0;
+			__m128i *pvF = vbuf_r + 1;
+			__m128i *pvH = vbuf_r + 2;
+			size_t coli = i - rfi_;
+			if(coli < cper_.locol_) cper_.locol_ = coli;
+			if(coli > cper_.hicol_) cper_.hicol_ = coli;
+			if(cperTri_) {
+				// Checkpoint for triangular mini-fills
+				size_t rc_mod = coli & cper_.lomask_;
+				assert_lt(rc_mod, cper_.per_);
+				int64_t row = -rc_mod-1;
+				int64_t row_mod = row;
+				int64_t row_div = 0;
+				size_t idx = coli >> cper_.perpow2_;
+				size_t idxrow = idx * cper_.nrow_;
+				assert_eq(4, ROWSTRIDE_2COL);
+				bool done = false;
+				while(true) {
+					row += (cper_.per_ - 2);
+					row_mod += (cper_.per_ - 2);
+					// Save two adjacent anti-diagonal cells per period.
+					for(size_t j = 0; j < 2; j++) {
+						row++;
+						row_mod++;
+						if(row >= 0 && (size_t)row < cper_.nrow_) {
+							// Update row divided by iter_ and mod iter_
+							while(row_mod >= (int64_t)iter) {
+								row_mod -= (int64_t)iter;
+								row_div++;
+							}
+							size_t delt = idxrow + row;
+							// (row_mod << 6): byte offset of the segment's
+							// 4-vector (E/F/H/pad) group; row_div picks the lane.
+							size_t vecoff = (row_mod << 6) + row_div;
+							assert_lt(row_div, 16);
+							int16_t h_sc = ((uint8_t*)pvH)[vecoff];
+							int16_t e_sc = ((uint8_t*)pvE)[vecoff];
+							int16_t f_sc = ((uint8_t*)pvF)[vecoff];
+							assert_leq(h_sc, cper_.perf_);
+							assert_leq(e_sc, cper_.perf_);
+							assert_leq(f_sc, cper_.perf_);
+							CpQuad *qdiags = ((j == 0) ? cper_.qdiag1s_.ptr() : cper_.qdiag2s_.ptr());
+							qdiags[delt].sc[0] = h_sc;
+							qdiags[delt].sc[1] = e_sc;
+							qdiags[delt].sc[2] = f_sc;
+						} // if(row >= 0 && row < nrow_)
+						else if(row >= 0 && (size_t)row >= cper_.nrow_) {
+							done = true;
+							break;
+						}
+					} // for(size_t j = 0; j < 2; j++)
+					if(done) {
+						break;
+					}
+					idx++;
+					idxrow += cper_.nrow_;
+				} // while(true)
+			} else {
+				// Checkpoint for square mini-fills
+			
+				// If this is the first column, take this opportunity to
+				// pre-calculate the coordinates of the elements we're going to
+				// checkpoint.
+				if(coli == 0) {
+					size_t cpi    = cper_.per_-1;
+					size_t cpimod = cper_.per_-1;
+					size_t cpidiv = 0;
+					cper_.commitMap_.clear();
+					while(cpi < cper_.nrow_) {
+						while(cpimod >= iter) {
+							cpimod -= iter;
+							cpidiv++;
+						}
+						size_t vecoff = (cpimod << 6) + cpidiv;
+						cper_.commitMap_.push_back(vecoff);
+						cpi += cper_.per_;
+						cpimod += cper_.per_;
+					}
+				}
+				// Save all the rows
+				size_t rowoff = 0;
+				size_t sz = cper_.commitMap_.size();
+				for(size_t i = 0; i < sz; i++, rowoff += cper_.ncol_) {
+					size_t vecoff = cper_.commitMap_[i];
+					int16_t h_sc = ((uint8_t*)pvH)[vecoff];
+					//int16_t e_sc = ((uint8_t*)pvE)[vecoff];
+					int16_t f_sc = ((uint8_t*)pvF)[vecoff];
+					assert_leq(h_sc, cper_.perf_);
+					//assert_leq(e_sc, cper_.perf_);
+					assert_leq(f_sc, cper_.perf_);
+					CpQuad& dst = cper_.qrows_[rowoff + coli];
+					dst.sc[0] = h_sc;
+					//dst.sc[1] = e_sc;
+					dst.sc[2] = f_sc;
+				}
+				// Is this a column we'd like to checkpoint?
+				if((coli & cper_.lomask_) == cper_.lomask_) {
+					// Save the column using memcpys
+					assert_gt(coli, 0);
+					size_t wordspercol = cper_.niter_ * ROWSTRIDE_2COL;
+					size_t coloff = (coli >> cper_.perpow2_) * wordspercol;
+					__m128i *dst = cper_.qcols_.ptr() + coloff;
+					memcpy(dst, vbuf_r, sizeof(__m128i) * wordspercol);
+				}
+			}
+			if(cper_.debug_) {
+				// Save the column using memcpys
+				size_t wordspercol = cper_.niter_ * ROWSTRIDE_2COL;
+				size_t coloff = coli * wordspercol;
+				__m128i *dst = cper_.qcolsD_.ptr() + coloff;
+				memcpy(dst, vbuf_r, sizeof(__m128i) * wordspercol);
+			}
+		}
+
+		// Store column maximum vector in first element of tmp
+		vmax = _mm_max_epu8(vmax, vcolmax);
+
+		{
+			// Get single largest score in this column by a logarithmic
+			// shift-and-max reduction across the 16 byte lanes.
+			vmaxtmp = vcolmax;
+			vtmp = _mm_srli_si128(vmaxtmp, 8);
+			vmaxtmp = _mm_max_epu8(vmaxtmp, vtmp);
+			vtmp = _mm_srli_si128(vmaxtmp, 4);
+			vmaxtmp = _mm_max_epu8(vmaxtmp, vtmp);
+			vtmp = _mm_srli_si128(vmaxtmp, 2);
+			vmaxtmp = _mm_max_epu8(vmaxtmp, vtmp);
+			vtmp = _mm_srli_si128(vmaxtmp, 1);
+			vmaxtmp = _mm_max_epu8(vmaxtmp, vtmp);
+			int score = _mm_extract_epi16(vmaxtmp, 0);
+			score = score & 0x00ff;
+
+			// Could we have saturated?
+			if(score + d.bias_ >= 255) {
+				flag = -2; // yes
+				if(!debug) met.dpsat++;
+				return MIN_I64;
+			}
+			
+			if(score < minsc_) {
+				// Even a perfect match in every remaining column couldn't
+				// reach the threshold through this column -> give up early.
+				size_t ncolleft = rff_ - i - 1;
+				if(score + (TAlScore)ncolleft * matchsc < minsc_) {
+					// Bail!  There can't possibly be a valid alignment that
+					// passes through this column.
+					bailed = true;
+					break;
+				}
+			}
+			
+			// Remember this column's max for the left-column candidate scan
+			// on the next iteration.
+			leftmax = score;
+		}
+	}
+	
+	lastoff = off;
+	
+	// Now we'd like to know exactly which cells in the *rightmost* column are
+	// candidates we might backtrace from.  Did *any* elements exceed the
+	// minimum score threshold?
+	if(!debug && !bailed && leftmax >= minsc_) {
+		// Yes.  Next question is: which cells are candidates?  We have to
+		// allow matches in the right column to override matches above and
+		// to the left in the left column.
+		pvHLeft  = vbuf_r + 2;
+		assert_lt(lastoff, MAX_SIZE_T);
+		pvScore = d.profbuf_.ptr() + lastoff; // even elts = query profile, odd = gap barrier
+		for(size_t k = 0; k < iter; k++) {
+			vh = _mm_load_si128(pvHLeft);
+			vtmp = _mm_cmpgt_epi8(pvScore[0], vbiasm1);
+			int cmp = _mm_movemask_epi8(vtmp);
+			if(cmp != 0xffff) {
+				// At least one candidate in this mask.  Now iterate
+				// through vm/vh to evaluate individual cells.
+				for(size_t m = 0; m < NWORDS_PER_REG; m++) {
+					size_t row = k + m * iter;
+					if(row >= dpRows()) {
+						break;
+					}
+					if(((TCScore *)&vtmp)[m] > 0 && ((TCScore *)&vh)[m] >= minsc_) {
+						TCScore sc = ((TCScore *)&vh)[m];
+						assert_geq(sc, minsc_);
+						// Add to data structure holding all candidates
+						size_t col = rff_ - rfi_ - 1; // -1 b/c prev col
+						size_t frombot = dpRows() - row - 1;
+						DpBtCandidate cand(row, col, sc);
+						btdiag_.add(frombot + col, cand);
+					}
+				}
+			}
+			pvHLeft += ROWSTRIDE_2COL;
+			pvScore += 2;
+		}
+	}
+
+	// Find largest score in vmax
+	vtmp = _mm_srli_si128(vmax, 8);
+	vmax = _mm_max_epu8(vmax, vtmp);
+	vtmp = _mm_srli_si128(vmax, 4);
+	vmax = _mm_max_epu8(vmax, vtmp);
+	vtmp = _mm_srli_si128(vmax, 2);
+	vmax = _mm_max_epu8(vmax, vtmp);
+	vtmp = _mm_srli_si128(vmax, 1);
+	vmax = _mm_max_epu8(vmax, vtmp);
+	
+	// Update metrics
+	if(!debug) {
+		size_t ninner = (rff_ - rfi_) * iter;
+		met.col   += (rff_ - rfi_);             // DP columns
+		met.cell  += (ninner * NWORDS_PER_REG); // DP cells
+		met.inner += ninner;                    // DP inner loop iters
+		met.fixup += nfixup;                    // DP fixup loop iters
+	}
+	
+	int score = _mm_extract_epi16(vmax, 0);
+	score = score & 0x00ff;
+
+	flag = 0;
+	
+	// Could we have saturated?
+	if(score + d.bias_ >= 255) {
+		flag = -2; // yes
+		if(!debug) met.dpsat++;
+		return MIN_I64;
+	}
+
+	// Did we find a solution?
+	if(score == MIN_U8 || score < minsc_) {
+		flag = -1; // no
+		if(!debug) met.dpfail++;
+		return (TAlScore)score;
+	}
+	
+	// Now take all the backtrace candidates in the btdaig_ structure and
+	// dump them into the btncand_ array.  They'll be sorted later.
+	if(!debug) {
+		assert(!btdiag_.empty());
+		btdiag_.dump(btncand_);	
+		assert(!btncand_.empty());
+	}
+	
+	// Return largest score
+	if(!debug) met.dpsucc++;
+	return (TAlScore)score;
+}
+
+/**
+ * Solve the current alignment problem using SSE instructions that operate on 16
+ * unsigned 8-bit values packed into a single 128-bit register.
+ *
+ * This is the local-alignment variant of the Farrar striped Smith-Waterman
+ * filler: H and E cells in the first column are initialized to 0 and the
+ * unsigned saturating arithmetic clamps every cell at 0 from below.
+ *
+ * flag (out): 0 = filled normally; -1 = no cell reached minsc_;
+ *             -2 = the 8-bit scores may have saturated, so the result is
+ *             unusable (presumably the caller retries with a wider version --
+ *             confirm against callers).
+ * debug (in): when true, metrics are not updated and candidate gathering
+ *             bookkeeping is skipped.
+ * Returns the largest local-alignment score observed, or MIN_I64 on
+ * saturation.
+ */
+TAlScore SwAligner::alignNucleotidesLocalSseU8(int& flag, bool debug) {
+	assert_leq(rdf_, rd_->length());
+	assert_leq(rdf_, qu_->length());
+	assert_lt(rfi_, rff_);
+	assert_lt(rdi_, rdf_);
+	assert_eq(rd_->length(), qu_->length());
+	assert_geq(sc_->gapbar, 1);
+	assert(repOk());
+#ifndef NDEBUG
+	for(size_t i = (size_t)rfi_; i < (size_t)rff_; i++) {
+		assert_range(0, 16, (int)rf_[i]);
+	}
+#endif
+
+	// Select profile/matrix state and metrics bucket for this orientation
+	SSEData& d = fw_ ? sseU8fw_ : sseU8rc_;
+	SSEMetrics& met = extend_ ? sseU8ExtendMet_ : sseU8MateMet_;
+	if(!debug) met.dp++;
+	buildQueryProfileLocalSseU8(fw_);
+	assert(!d.profbuf_.empty());
+	assert_geq(d.bias_, 0);
+
+	assert_gt(d.maxBonus_, 0);
+	size_t iter =
+		(dpRows() + (NWORDS_PER_REG-1)) / NWORDS_PER_REG; // iter = segLen
+
+	// Scratch: duplicates an 8-bit value into both bytes of a 16-bit lane
+	// prior to broadcasting it across a whole vector
+	int dup;
+	
+	// Many thanks to Michael Farrar for releasing his striped Smith-Waterman
+	// implementation:
+	//
+	//  http://sites.google.com/site/farrarmichael/smith-waterman
+	//
+	// Much of the implementation below is adapted from Michael's code.
+
+	// Set all elts to reference gap open penalty
+	__m128i rfgapo   = _mm_setzero_si128();
+	__m128i rfgape   = _mm_setzero_si128();
+	__m128i rdgapo   = _mm_setzero_si128();
+	__m128i rdgape   = _mm_setzero_si128();
+	__m128i vlo      = _mm_setzero_si128();
+	__m128i vhi      = _mm_setzero_si128();
+	__m128i vmax     = _mm_setzero_si128();
+	__m128i vcolmax  = _mm_setzero_si128();
+	__m128i vmaxtmp  = _mm_setzero_si128();
+	__m128i ve       = _mm_setzero_si128();
+	__m128i vf       = _mm_setzero_si128();
+	__m128i vh       = _mm_setzero_si128();
+	__m128i vtmp     = _mm_setzero_si128();
+	__m128i vzero    = _mm_setzero_si128();
+	__m128i vbias    = _mm_setzero_si128();
+
+	assert_gt(sc_->refGapOpen(), 0);
+	assert_leq(sc_->refGapOpen(), MAX_U8);
+	dup = (sc_->refGapOpen() << 8) | (sc_->refGapOpen() & 0x00ff);
+	rfgapo = _mm_insert_epi16(rfgapo, dup, 0);
+	rfgapo = _mm_shufflelo_epi16(rfgapo, 0);
+	rfgapo = _mm_shuffle_epi32(rfgapo, 0);
+	
+	// Set all elts to reference gap extension penalty
+	assert_gt(sc_->refGapExtend(), 0);
+	assert_leq(sc_->refGapExtend(), MAX_U8);
+	assert_leq(sc_->refGapExtend(), sc_->refGapOpen());
+	dup = (sc_->refGapExtend() << 8) | (sc_->refGapExtend() & 0x00ff);
+	rfgape = _mm_insert_epi16(rfgape, dup, 0);
+	rfgape = _mm_shufflelo_epi16(rfgape, 0);
+	rfgape = _mm_shuffle_epi32(rfgape, 0);
+
+	// Set all elts to read gap open penalty
+	assert_gt(sc_->readGapOpen(), 0);
+	assert_leq(sc_->readGapOpen(), MAX_U8);
+	dup = (sc_->readGapOpen() << 8) | (sc_->readGapOpen() & 0x00ff);
+	rdgapo = _mm_insert_epi16(rdgapo, dup, 0);
+	rdgapo = _mm_shufflelo_epi16(rdgapo, 0);
+	rdgapo = _mm_shuffle_epi32(rdgapo, 0);
+	
+	// Set all elts to read gap extension penalty
+	assert_gt(sc_->readGapExtend(), 0);
+	assert_leq(sc_->readGapExtend(), MAX_U8);
+	assert_leq(sc_->readGapExtend(), sc_->readGapOpen());
+	dup = (sc_->readGapExtend() << 8) | (sc_->readGapExtend() & 0x00ff);
+	rdgape = _mm_insert_epi16(rdgape, dup, 0);
+	rdgape = _mm_shufflelo_epi16(rdgape, 0);
+	rdgape = _mm_shuffle_epi32(rdgape, 0);
+	
+	vhi = _mm_cmpeq_epi16(vhi, vhi); // all elts = 0xffff
+	vlo = _mm_xor_si128(vlo, vlo);   // all elts = 0
+	vmax = vlo;
+	
+	// Make a vector of bias offsets; the bias keeps profile scores unsigned
+	dup = (d.bias_ << 8) | (d.bias_ & 0x00ff);
+	vbias = _mm_insert_epi16(vbias, dup, 0);
+	vbias = _mm_shufflelo_epi16(vbias, 0);
+	vbias = _mm_shuffle_epi32(vbias, 0);
+	
+	// Points to a long vector of __m128i where each element is a block of
+	// contiguous cells in the E, F or H matrix.  If the index % 3 == 0, then
+	// the block of cells is from the E matrix.  If index % 3 == 1, they're
+	// from the F matrix.  If index % 3 == 2, then they're from the H matrix.
+	// Blocks of cells are organized in the same interleaved manner as they are
+	// calculated by the Farrar algorithm.
+	const __m128i *pvScore; // points into the query profile
+
+	d.mat_.init(dpRows(), rff_ - rfi_, NWORDS_PER_REG);
+	const size_t colstride = d.mat_.colstride();
+	//const size_t rowstride = d.mat_.rowstride();
+	assert_eq(ROWSTRIDE, colstride / iter);
+	
+	// Initialize the H and E vectors in the first matrix column
+	__m128i *pvHTmp = d.mat_.tmpvec(0, 0);
+	__m128i *pvETmp = d.mat_.evec(0, 0);
+	
+	for(size_t i = 0; i < iter; i++) {
+		_mm_store_si128(pvETmp, vlo);
+		_mm_store_si128(pvHTmp, vlo); // start low in local mode
+		pvETmp += ROWSTRIDE;
+		pvHTmp += ROWSTRIDE;
+	}
+	// These are swapped just before the innermost loop
+	__m128i *pvHStore = d.mat_.hvec(0, 0);
+	__m128i *pvHLoad  = d.mat_.tmpvec(0, 0);
+	__m128i *pvELoad  = d.mat_.evec(0, 0);
+	__m128i *pvEStore = d.mat_.evecUnsafe(0, 1);
+	__m128i *pvFStore = d.mat_.fvec(0, 0);
+	__m128i *pvFTmp   = NULL;
+	
+	assert_gt(sc_->gapbar, 0);
+	size_t nfixup = 0;
+	// Best per-column score gain: match bonus at quality 30; used in the
+	// early-bail bound below
+	TAlScore matchsc = sc_->match(30);
+	
+	// Fill in the table as usual but instead of using the same gap-penalty
+	// vector for each iteration of the inner loop, load words out of a
+	// pre-calculated gap vector parallel to the query profile.  The pre-
+	// calculated gap vectors enforce the gap barrier constraint by making it
+	// infinitely costly to introduce a gap in barrier rows.
+	//
+	// AND use a separate loop to fill in the first row of the table, enforcing
+	// the st_ constraints in the process.  This is awkward because it
+	// separates the processing of the first row from the others and might make
+	// it difficult to use the first-row results in the next row, but it might
+	// be the simplest and least disruptive way to deal with the st_ constraint.
+	
+	colstop_ = rff_ - rfi_;
+	lastsolcol_ = 0;
+	for(size_t i = (size_t)rfi_; i < (size_t)rff_; i++) {
+		assert(pvFStore == d.mat_.fvec(0, i - rfi_));
+		assert(pvHStore == d.mat_.hvec(0, i - rfi_));
+		
+		// Fetch this column's reference mask
+		const int refm = (int)rf_[i];
+		
+		// Fetch the appropriate query profile
+		size_t off = (size_t)firsts5[refm] * iter * 2;
+		pvScore = d.profbuf_.ptr() + off; // even elts = query profile, odd = gap barrier
+		
+		// Load H vector from the final row of the previous column
+		vh = _mm_load_si128(pvHLoad + colstride - ROWSTRIDE);
+		
+		// Set all cells to low value
+		vf = _mm_xor_si128(vf, vf);
+		
+		// Store cells in F, calculated previously
+		// No need to veto ref gap extensions, they're all 0x00s
+		_mm_store_si128(pvFStore, vf);
+		pvFStore += ROWSTRIDE;
+		
+		// Shift down so that topmost (least sig) cell gets 0
+		vh = _mm_slli_si128(vh, NBYTES_PER_WORD);
+		
+		// We pull out one loop iteration to make it easier to veto values in the top row
+		
+		// Load cells from E, calculated previously
+		ve = _mm_load_si128(pvELoad);
+		assert_all_lt(ve, vhi);
+		pvELoad += ROWSTRIDE;
+		
+		// Factor in query profile (matches and mismatches)
+		vh = _mm_adds_epu8(vh, pvScore[0]);
+		vh = _mm_subs_epu8(vh, vbias);
+		
+		// Update H, factoring in E and F
+		vh = _mm_max_epu8(vh, ve);
+		vh = _mm_max_epu8(vh, vf);
+		
+		// Update highest score so far
+		vcolmax = _mm_xor_si128(vcolmax, vcolmax);
+		vcolmax = _mm_max_epu8(vcolmax, vh);
+		
+		// Save the new vH values
+		_mm_store_si128(pvHStore, vh);
+		pvHStore += ROWSTRIDE;
+		
+		// Update vE value
+		vf = vh;
+		vh = _mm_subs_epu8(vh, rdgapo);
+		vh = _mm_subs_epu8(vh, pvScore[1]); // veto some read gap opens
+		ve = _mm_subs_epu8(ve, rdgape);
+		ve = _mm_max_epu8(ve, vh);
+		assert_all_lt(ve, vhi);
+		
+		// Load the next h value
+		vh = _mm_load_si128(pvHLoad);
+		pvHLoad += ROWSTRIDE;
+		
+		// Save E values
+		_mm_store_si128(pvEStore, ve);
+		pvEStore += ROWSTRIDE;
+		
+		// Update vf value
+		vf = _mm_subs_epu8(vf, rfgapo);
+		assert_all_lt(vf, vhi);
+		
+		pvScore += 2; // move on to next query profile
+
+		// For each character in the reference text:
+		size_t j;
+		for(j = 1; j < iter; j++) {
+			// Load cells from E, calculated previously
+			ve = _mm_load_si128(pvELoad);
+			assert_all_lt(ve, vhi);
+			pvELoad += ROWSTRIDE;
+			
+			// Store cells in F, calculated previously
+			vf = _mm_subs_epu8(vf, pvScore[1]); // veto some ref gap extensions
+			_mm_store_si128(pvFStore, vf);
+			pvFStore += ROWSTRIDE;
+			
+			// Factor in query profile (matches and mismatches)
+			vh = _mm_adds_epu8(vh, pvScore[0]);
+			vh = _mm_subs_epu8(vh, vbias);
+			
+			// Update H, factoring in E and F
+			vh = _mm_max_epu8(vh, ve);
+			vh = _mm_max_epu8(vh, vf);
+			
+			// Update highest score encountered this far
+			vcolmax = _mm_max_epu8(vcolmax, vh);
+			
+			// Save the new vH values
+			_mm_store_si128(pvHStore, vh);
+			pvHStore += ROWSTRIDE;
+			
+			// Update vE value
+			vtmp = vh;
+			vh = _mm_subs_epu8(vh, rdgapo);
+			vh = _mm_subs_epu8(vh, pvScore[1]); // veto some read gap opens
+			ve = _mm_subs_epu8(ve, rdgape);
+			ve = _mm_max_epu8(ve, vh);
+			assert_all_lt(ve, vhi);
+			
+			// Load the next h value
+			vh = _mm_load_si128(pvHLoad);
+			pvHLoad += ROWSTRIDE;
+			
+			// Save E values
+			_mm_store_si128(pvEStore, ve);
+			pvEStore += ROWSTRIDE;
+			
+			// Update vf value
+			vtmp = _mm_subs_epu8(vtmp, rfgapo);
+			vf = _mm_subs_epu8(vf, rfgape);
+			assert_all_lt(vf, vhi);
+			vf = _mm_max_epu8(vf, vtmp);
+			
+			pvScore += 2; // move on to next query profile / gap veto
+		}
+		// pvHStore, pvELoad, pvEStore have all rolled over to the next column
+		pvFTmp = pvFStore;
+		pvFStore -= colstride; // reset to start of column
+		vtmp = _mm_load_si128(pvFStore);
+		
+		pvHStore -= colstride; // reset to start of column
+		vh = _mm_load_si128(pvHStore);
+		
+		pvEStore -= colstride; // reset to start of column
+		ve = _mm_load_si128(pvEStore);
+		
+		pvHLoad = pvHStore;    // new pvHLoad = pvHStore
+		pvScore = d.profbuf_.ptr() + off + 1; // reset veto vector
+		
+		// vf from last row gets shifted down by one to overlay the first row
+		// rfgape has already been subtracted from it.
+		vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
+		
+		vf = _mm_subs_epu8(vf, *pvScore); // veto some ref gap extensions
+		vf = _mm_max_epu8(vtmp, vf);
+		vtmp = _mm_subs_epu8(vf, vtmp);
+		vtmp = _mm_cmpeq_epi8(vtmp, vzero);
+		int cmp = _mm_movemask_epi8(vtmp);
+		
+		// Lazy-F fixup loop (Farrar): repeat while the shifted-down F vector
+		// can still improve any H cell in this column.
+		// If any element of vtmp is greater than H - gap-open...
+		j = 0;
+		while(cmp != 0xffff) {
+			// Store this vf
+			_mm_store_si128(pvFStore, vf);
+			pvFStore += ROWSTRIDE;
+			
+			// Update vh w/r/t new vf
+			vh = _mm_max_epu8(vh, vf);
+			
+			// Save vH values
+			_mm_store_si128(pvHStore, vh);
+			pvHStore += ROWSTRIDE;
+			
+			// Update highest score encountered this far
+			vcolmax = _mm_max_epu8(vcolmax, vh);
+			
+			// Update E in case it can be improved using our new vh
+			vh = _mm_subs_epu8(vh, rdgapo);
+			vh = _mm_subs_epu8(vh, *pvScore); // veto some read gap opens
+			ve = _mm_max_epu8(ve, vh);
+			_mm_store_si128(pvEStore, ve);
+			pvEStore += ROWSTRIDE;
+			pvScore += 2;
+			
+			assert_lt(j, iter);
+			if(++j == iter) {
+				// Wrapped around the bottom of the column; shift F down one
+				// element and continue fixing up from the top
+				pvFStore -= colstride;
+				vtmp = _mm_load_si128(pvFStore);   // load next vf ASAP
+				pvHStore -= colstride;
+				vh = _mm_load_si128(pvHStore);     // load next vh ASAP
+				pvEStore -= colstride;
+				ve = _mm_load_si128(pvEStore);     // load next ve ASAP
+				pvScore = d.profbuf_.ptr() + off + 1;
+				j = 0;
+				vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
+			} else {
+				vtmp = _mm_load_si128(pvFStore);   // load next vf ASAP
+				vh = _mm_load_si128(pvHStore);     // load next vh ASAP
+				ve = _mm_load_si128(pvEStore);     // load next vh ASAP
+			}
+			
+			// Update F with another gap extension
+			vf = _mm_subs_epu8(vf, rfgape);
+			vf = _mm_subs_epu8(vf, *pvScore); // veto some ref gap extensions
+			vf = _mm_max_epu8(vtmp, vf);
+			vtmp = _mm_subs_epu8(vf, vtmp);
+			vtmp = _mm_cmpeq_epi8(vtmp, vzero);
+			cmp = _mm_movemask_epi8(vtmp);
+			nfixup++;
+		}
+
+#ifndef NDEBUG
+		if((rand() & 15) == 0) {
+			// This is a work-intensive sanity check; each time we finish filling
+			// a column, we check that each H, E, and F is sensible.
+			for(size_t k = 0; k < dpRows(); k++) {
+				assert(cellOkLocalU8(
+					d,
+					k,                   // row
+					i - rfi_,            // col
+					refm,                // reference mask
+					(int)(*rd_)[rdi_+k], // read char
+					(int)(*qu_)[rdi_+k], // read quality
+					*sc_));              // scoring scheme
+			}
+		}
+#endif
+
+		// Store column maximum vector in first element of tmp
+		vmax = _mm_max_epu8(vmax, vcolmax);
+		_mm_store_si128(d.mat_.tmpvec(0, i - rfi_), vcolmax);
+
+		{
+			// Get single largest score in this column via a horizontal max
+			// (log2(16) shift/max steps)
+			vmaxtmp = vcolmax;
+			vtmp = _mm_srli_si128(vmaxtmp, 8);
+			vmaxtmp = _mm_max_epu8(vmaxtmp, vtmp);
+			vtmp = _mm_srli_si128(vmaxtmp, 4);
+			vmaxtmp = _mm_max_epu8(vmaxtmp, vtmp);
+			vtmp = _mm_srli_si128(vmaxtmp, 2);
+			vmaxtmp = _mm_max_epu8(vmaxtmp, vtmp);
+			vtmp = _mm_srli_si128(vmaxtmp, 1);
+			vmaxtmp = _mm_max_epu8(vmaxtmp, vtmp);
+			int score = _mm_extract_epi16(vmaxtmp, 0);
+			score = score & 0x00ff;
+
+			// Could we have saturated?
+			if(score + d.bias_ >= 255) {
+				flag = -2; // yes
+				if(!debug) met.dpsat++;
+				return MIN_I64;
+			}
+			
+			if(score < minsc_) {
+				size_t ncolleft = rff_ - i - 1;
+				if(score + (TAlScore)ncolleft * matchsc < minsc_) {
+					// Bail!  We're guaranteed not to see a valid alignment in
+					// the rest of the matrix
+					colstop_ = (i+1) - rfi_;
+					break;
+				}
+			} else {
+				lastsolcol_ = i - rfi_;
+			}
+		}
+		
+		// pvELoad and pvHLoad are already where they need to be
+		
+		// Adjust the load and store vectors here.
+		pvHStore = pvHLoad + colstride;
+		pvEStore = pvELoad + colstride;
+		pvFStore = pvFTmp;
+	}
+
+	// Find largest score in vmax
+	vtmp = _mm_srli_si128(vmax, 8);
+	vmax = _mm_max_epu8(vmax, vtmp);
+	vtmp = _mm_srli_si128(vmax, 4);
+	vmax = _mm_max_epu8(vmax, vtmp);
+	vtmp = _mm_srli_si128(vmax, 2);
+	vmax = _mm_max_epu8(vmax, vtmp);
+	vtmp = _mm_srli_si128(vmax, 1);
+	vmax = _mm_max_epu8(vmax, vtmp);
+	
+	// Update metrics
+	if(!debug) {
+		size_t ninner = (rff_ - rfi_) * iter;
+		met.col   += (rff_ - rfi_);             // DP columns
+		met.cell  += (ninner * NWORDS_PER_REG); // DP cells
+		met.inner += ninner;                    // DP inner loop iters
+		met.fixup += nfixup;                    // DP fixup loop iters
+	}
+	
+	int score = _mm_extract_epi16(vmax, 0);
+	score = score & 0x00ff;
+
+	flag = 0;
+	
+	// Could we have saturated?
+	if(score + d.bias_ >= 255) {
+		flag = -2; // yes
+		if(!debug) met.dpsat++;
+		return MIN_I64;
+	}
+
+	// Did we find a solution?
+	if(score == MIN_U8 || score < minsc_) {
+		flag = -1; // no
+		if(!debug) met.dpfail++;
+		return (TAlScore)score;
+	}
+	
+	// Return largest score
+	if(!debug) met.dpsucc++;
+	return (TAlScore)score;
+}
+
+/**
+ * Given a filled-in DP table, populate the btncand_ list with candidate cells
+ * that might be at the ends of valid alignments.  No need to do this unless
+ * the maximum score returned by the align*() func is >= the minimum.
+ *
+ * We needn't consider cells that have no chance of reaching any of the core
+ * diagonals.  These are the cells that are more than 'maxgaps' cells away from
+ * a core diagonal.
+ *
+ * We need to be careful to consider that the rectangle might be truncated on
+ * one or both ends.
+ *
+ * The seed extend case looks like this:
+ *
+ *      |Rectangle|   0: seed diagonal
+ *      **OO0oo----   o: "RHS gap" diagonals
+ *      -**OO0oo---   O: "LHS gap" diagonals
+ *      --**OO0oo--   *: "LHS extra" diagonals
+ *      ---**OO0oo-   -: cells that can't possibly be involved in a valid    
+ *      ----**OO0oo      alignment that overlaps one of the core diagonals
+ *
+ * The anchor-to-left case looks like this:
+ *
+ *   |Anchor|  | ---- Rectangle ---- |
+ *   o---------OO0000000000000oo------  0: mate diagonal (also core diags!)
+ *   -o---------OO0000000000000oo-----  o: "RHS gap" diagonals
+ *   --o---------OO0000000000000oo----  O: "LHS gap" diagonals
+ *   ---oo--------OO0000000000000oo---  *: "LHS extra" diagonals
+ *   -----o--------OO0000000000000oo--  -: cells that can't possibly be
+ *   ------o--------OO0000000000000oo-     involved in a valid alignment that
+ *   -------o--------OO0000000000000oo     overlaps one of the core diagonals
+ *                     XXXXXXXXXXXXX
+ *                     | RHS Range |
+ *                     ^           ^
+ *                     rl          rr
+ *
+ * The anchor-to-right case looks like this:
+ *
+ *    ll          lr
+ *    v           v
+ *    | LHS Range |
+ *    XXXXXXXXXXXXX          |Anchor|
+ *  OO0000000000000oo--------o--------  0: mate diagonal (also core diags!)
+ *  -OO0000000000000oo--------o-------  o: "RHS gap" diagonals
+ *  --OO0000000000000oo--------o------  O: "LHS gap" diagonals
+ *  ---OO0000000000000oo--------oo----  *: "LHS extra" diagonals
+ *  ----OO0000000000000oo---------o---  -: cells that can't possibly be
+ *  -----OO0000000000000oo---------o--     involved in a valid alignment that
+ *  ------OO0000000000000oo---------o-     overlaps one of the core diagonals
+ *  | ---- Rectangle ---- |
+ */
+bool SwAligner::gatherCellsNucleotidesLocalSseU8(TAlScore best) {
+	// What's the minimum number of rows that can possibly be spanned by an
+	// alignment that meets the minimum score requirement?
+	assert(sse8succ_);
+	size_t bonus = (size_t)sc_->match(30);
+	const size_t ncol = lastsolcol_ + 1;
+	const size_t nrow = dpRows();
+	assert_gt(nrow, 0);
+	btncand_.clear();
+	btncanddone_.clear();
+	SSEData& d = fw_ ? sseU8fw_ : sseU8rc_;
+	SSEMetrics& met = extend_ ? sseU8ExtendMet_ : sseU8MateMet_;
+	assert(!d.profbuf_.empty());
+	//const size_t rowstride = d.mat_.rowstride();
+	//const size_t colstride = d.mat_.colstride();
+	size_t iter = (dpRows() + (NWORDS_PER_REG - 1)) / NWORDS_PER_REG;
+	assert_gt(iter, 0);
+	assert_geq(minsc_, 0);
+	assert_gt(bonus, 0);
+	// Each row adds at most 'bonus' to the score, so a cell must lie at row
+	// >= minrow to have any chance of reaching minsc_
+	size_t minrow = (size_t)(((minsc_ + bonus - 1) / bonus) - 1);
+	for(size_t j = 0; j < ncol; j++) {
+		// Establish the range of rows where a backtrace from the cell in this
+		// row/col is close enough to one of the core diagonals that it could
+		// conceivably count
+		size_t nrow_lo = MIN_SIZE_T;
+		size_t nrow_hi = nrow;
+		// First, check if there is a cell in this column with a score
+		// above the score threshold.  The per-column max vector was saved in
+		// tmpvec during the fill; reduce it with a horizontal max.
+		__m128i vmax = *d.mat_.tmpvec(0, j);
+		__m128i vtmp = _mm_srli_si128(vmax, 8);
+		vmax = _mm_max_epu8(vmax, vtmp);
+		vtmp = _mm_srli_si128(vmax, 4);
+		vmax = _mm_max_epu8(vmax, vtmp);
+		vtmp = _mm_srli_si128(vmax, 2);
+		vmax = _mm_max_epu8(vmax, vtmp);
+		vtmp = _mm_srli_si128(vmax, 1);
+		vmax = _mm_max_epu8(vmax, vtmp);
+		int score = _mm_extract_epi16(vmax, 0);
+		score = score & 0x00ff;
+#ifndef NDEBUG
+		{
+			// Start in upper vector row and move down
+			TAlScore max = 0;
+			__m128i *pvH = d.mat_.hvec(0, j);
+			for(size_t i = 0; i < iter; i++) {
+				for(size_t k = 0; k < NWORDS_PER_REG; k++) {
+					TAlScore sc = (TAlScore)((TCScore*)pvH)[k];
+					if(sc > max) {
+						max = sc;
+					}
+				}
+				pvH += ROWSTRIDE;
+			}
+			assert_eq(max, score);
+		}
+#endif
+		if((TAlScore)score < minsc_) {
+			// Scores in column aren't good enough
+			continue;
+		}
+		// Get pointer to first cell in column to examine:
+		__m128i *pvHorig = d.mat_.hvec(0, j);
+		__m128i *pvH     = pvHorig;
+		// Get pointer to the vector in the following column that corresponds
+		// to the cells diagonally down and to the right from the cells in pvH
+		__m128i *pvHSucc = (j < ncol-1) ? d.mat_.hvec(0, j+1) : NULL;
+		// Start in upper vector row and move down
+		for(size_t i = 0; i < iter; i++) {
+			if(pvHSucc != NULL) {
+				pvHSucc += ROWSTRIDE;
+				if(i == iter-1) {
+					pvHSucc = d.mat_.hvec(0, j+1);
+				}
+			}
+			// Which elements of this vector are exhaustively scored?
+			// Striped layout: element k of vector i holds row i + k*iter
+			size_t rdoff = i;
+			for(size_t k = 0; k < NWORDS_PER_REG; k++) {
+				// Is this row, col one that we can potentially backtrace from?
+				// I.e. are we close enough to a core diagonal?
+				if(rdoff >= nrow_lo && rdoff < nrow_hi) {
+					// This cell has been exhaustively scored
+					if(rdoff >= minrow) {
+						// ... and it could potentially score high enough
+						TAlScore sc = (TAlScore)((TCScore*)pvH)[k];
+						assert_leq(sc, best);
+						if(sc >= minsc_) {
+							// This is a potential solution.  Only keep it if
+							// the cell is a match and the next diagonal cell
+							// is not (i.e. it's a local maximum end point).
+							bool matchSucc = false;
+							int readc = (*rd_)[rdoff];
+							int refc = rf_[j + rfi_];
+							bool match = ((refc & (1 << readc)) != 0);
+							if(rdoff < dpRows()-1) {
+								int readcSucc = (*rd_)[rdoff+1];
+								int refcSucc = rf_[j + rfi_ + 1];
+								assert_range(0, 16, refcSucc);
+								matchSucc = ((refcSucc & (1 << readcSucc)) != 0);
+							}
+							if(match && !matchSucc) {
+								// Yes, this is legit
+								met.gathsol++;
+								btncand_.expand();
+								btncand_.back().init(rdoff, j, sc);
+							}
+						}
+					}
+				} else {
+					// Already saw every element in the vector that's been
+					// exhaustively scored
+					break;
+				}
+				rdoff += iter;
+			}
+			pvH += ROWSTRIDE;
+		}
+	}
+	if(!btncand_.empty()) {
+		d.mat_.initMasks();
+	}
+	return !btncand_.empty();
+}
+
+// Navigation macros for the striped SSE matrix layout.  A logical matrix row
+// maps to (rowelt, rowvec) where rowelt = row / nvecrow_ (element index
+// within a vector) and rowvec = row % nvecrow_ (vector index within the
+// column); see NEW_ROW_COL below.
+
+// Move 'vec' up one logical row.  When already at the top vector row
+// (rowvec == 0), wrap to the bottom vector row of the same column
+// (+colstride_ then -ROWSTRIDE nets to the column's last vector) and step
+// back one element within the vector (rowelt--).
+#define MOVE_VEC_PTR_UP(vec, rowvec, rowelt) { \
+	if(rowvec == 0) { \
+		rowvec += d.mat_.nvecrow_; \
+		vec += d.mat_.colstride_; \
+		rowelt--; \
+	} \
+	rowvec--; \
+	vec -= ROWSTRIDE; \
+}
+
+// Move 'vec' left one column; the (rowvec, rowelt) coordinates are unchanged
+#define MOVE_VEC_PTR_LEFT(vec, rowvec, rowelt) { vec -= d.mat_.colstride_; }
+
+// Move 'vec' diagonally up and to the left (compose the two moves above)
+#define MOVE_VEC_PTR_UPLEFT(vec, rowvec, rowelt) { \
+ 	MOVE_VEC_PTR_UP(vec, rowvec, rowelt); \
+ 	MOVE_VEC_PTR_LEFT(vec, rowvec, rowelt); \
+}
+
+// Shift the current, left, up and up-left pointers one column to the left
+#define MOVE_ALL_LEFT() { \
+	MOVE_VEC_PTR_LEFT(cur_vec, rowvec, rowelt); \
+	MOVE_VEC_PTR_LEFT(left_vec, left_rowvec, left_rowelt); \
+	MOVE_VEC_PTR_LEFT(up_vec, up_rowvec, up_rowelt); \
+	MOVE_VEC_PTR_LEFT(upleft_vec, upleft_rowvec, upleft_rowelt); \
+}
+
+// Shift the current, left, up and up-left pointers one row up
+#define MOVE_ALL_UP() { \
+	MOVE_VEC_PTR_UP(cur_vec, rowvec, rowelt); \
+	MOVE_VEC_PTR_UP(left_vec, left_rowvec, left_rowelt); \
+	MOVE_VEC_PTR_UP(up_vec, up_rowvec, up_rowelt); \
+	MOVE_VEC_PTR_UP(upleft_vec, upleft_rowvec, upleft_rowelt); \
+}
+
+// Shift the current, left, up and up-left pointers diagonally up-left
+#define MOVE_ALL_UPLEFT() { \
+	MOVE_VEC_PTR_UPLEFT(cur_vec, rowvec, rowelt); \
+	MOVE_VEC_PTR_UPLEFT(left_vec, left_rowvec, left_rowelt); \
+	MOVE_VEC_PTR_UPLEFT(up_vec, up_rowvec, up_rowelt); \
+	MOVE_VEC_PTR_UPLEFT(upleft_vec, upleft_rowvec, upleft_rowelt); \
+}
+
+// (Re)position cur/left/up/upleft vector pointers and their (rowvec, rowelt)
+// coordinates for logical matrix cell (row, col)
+#define NEW_ROW_COL(row, col) { \
+	rowelt = row / d.mat_.nvecrow_; \
+	rowvec = row % d.mat_.nvecrow_; \
+	eltvec = (col * d.mat_.colstride_) + (rowvec * ROWSTRIDE); \
+	cur_vec = d.mat_.matbuf_.ptr() + eltvec; \
+	left_vec = cur_vec; \
+	left_rowelt = rowelt; \
+	left_rowvec = rowvec; \
+	MOVE_VEC_PTR_LEFT(left_vec, left_rowvec, left_rowelt); \
+	up_vec = cur_vec; \
+	up_rowelt = rowelt; \
+	up_rowvec = rowvec; \
+	MOVE_VEC_PTR_UP(up_vec, up_rowvec, up_rowelt); \
+	upleft_vec = up_vec; \
+	upleft_rowelt = up_rowelt; \
+	upleft_rowvec = up_rowvec; \
+	MOVE_VEC_PTR_LEFT(upleft_vec, upleft_rowvec, upleft_rowelt); \
+}
+
+/**
+ * Given the dynamic programming table and a cell, trace backwards from the
+ * cell and install the edits and score/penalty in the appropriate fields
+ * of SwResult res, which contains an AlnRes.  The RandomSource is used to
+ * break ties among equally good ways of tracing back.
+ *
+ * Upon entering a cell, we check if the read/ref coordinates of the cell
+ * correspond to a cell we traversed constructing a previous alignment.  If so,
+ * we backtrack to the last decision point, mask out the path that led to the
+ * previously observed cell, and continue along a different path; or, if there
+ * are no more paths to try, we give up.
+ *
+ * An alignment found is subject to a filtering step designed to remove
+ * alignments that could spuriously trump a better alignment falling partially
+ * outside the rectangle.
+ *
+ *          1
+ *      67890123456   0: seed diagonal
+ *      **OO0oo----   o: right-hand "gap" diagonals: band of 'maxgap' diags
+ *      -**OO0oo---   O: left-hand "gap" diagonals: band of 'maxgap' diags
+ *      --**OO0oo--   *: "extra" diagonals: additional band of 'maxgap' diags
+ *      ---**OO0oo-   +: cells not in any of the above 
+ *      ----**OO0oo
+ *            |-|
+ *   Gotta touch one of these diags
+ *
+ * Basically, the filtering step removes alignments that do not at some point
+ * touch a cell labeled '0' or 'O' in the diagram above.
+ *
+ */
+bool SwAligner::backtraceNucleotidesLocalSseU8(
+	TAlScore       escore, // in: expected score
+	SwResult&      res,    // out: store results (edits and scores) here
+	size_t&        off,    // out: store diagonal projection of origin
+	size_t&        nbts,   // out: # backtracks
+	size_t         row,    // start in this row
+	size_t         col,    // start in this column
+	RandomSource&  rnd)    // random gen, to choose among equal paths
+{
+	assert_lt(row, dpRows());
+	assert_lt(col, (size_t)(rff_ - rfi_));
+	SSEData& d = fw_ ? sseU8fw_ : sseU8rc_;
+	SSEMetrics& met = extend_ ? sseU8ExtendMet_ : sseU8MateMet_;
+	met.bt++;
+	assert(!d.profbuf_.empty());
+	assert_lt(row, rd_->length());
+	btnstack_.clear(); // empty the backtrack stack
+	btcells_.clear();  // empty the cells-so-far list
+	AlnScore score; score.score_ = 0;
+	// score.gaps_ = score.ns_ = 0;
+	ASSERT_ONLY(size_t origCol = col);
+	size_t gaps = 0, readGaps = 0, refGaps = 0;
+	res.alres.reset();
+    EList<Edit>& ned = res.alres.ned();
+	assert(ned.empty());
+	assert_gt(dpRows(), row);
+	ASSERT_ONLY(size_t trimEnd = dpRows() - row - 1);
+	size_t trimBeg = 0;
+	size_t ct = SSEMatrix::H; // cell type
+	// Row and col in terms of where they fall in the SSE vector matrix
+	size_t rowelt, rowvec, eltvec;
+	size_t left_rowelt, up_rowelt, upleft_rowelt;
+	size_t left_rowvec, up_rowvec, upleft_rowvec;
+	__m128i *cur_vec, *left_vec, *up_vec, *upleft_vec;
+	NEW_ROW_COL(row, col);
+	while((int)row >= 0) {
+		met.btcell++;
+		nbts++;
+		int readc = (*rd_)[rdi_ + row];
+		int refm  = (int)rf_[rfi_ + col];
+		int readq = (*qu_)[row];
+		assert_leq(col, origCol);
+		// Get score in this cell
+		bool empty = false, reportedThru, canMoveThru, branch = false;
+		int cur = SSEMatrix::H;
+		if(!d.mat_.reset_[row]) {
+			d.mat_.resetRow(row);
+		}
+		reportedThru = d.mat_.reportedThrough(row, col);
+		canMoveThru = true;
+		if(reportedThru) {
+			canMoveThru = false;
+		} else {
+			empty = false;
+			if(row > 0) {
+				assert_gt(row, 0);
+				size_t rowFromEnd = d.mat_.nrow() - row - 1;
+				bool gapsAllowed = true;
+				if(row < (size_t)sc_->gapbar ||
+				   rowFromEnd < (size_t)sc_->gapbar)
+				{
+					gapsAllowed = false;
+				}
+				const int floorsc = 0;
+				const int offsetsc = 0;
+				// Move to beginning of column/row
+				if(ct == SSEMatrix::E) { // AKA rdgap
+					assert_gt(col, 0);
+					TAlScore sc_cur = ((TCScore*)(cur_vec + SSEMatrix::E))[rowelt] + offsetsc;
+					assert(gapsAllowed);
+					// Currently in the E matrix; incoming transition must come from the
+					// left.  It's either a gap open from the H matrix or a gap extend from
+					// the E matrix.
+					// TODO: save and restore origMask as well as mask
+					int origMask = 0, mask = 0;
+					// Get H score of cell to the left
+					TAlScore sc_h_left = ((TCScore*)(left_vec + SSEMatrix::H))[left_rowelt] + offsetsc;
+					if(sc_h_left > 0 && sc_h_left - sc_->readGapOpen() == sc_cur) {
+						mask |= (1 << 0);
+					}
+					// Get E score of cell to the left
+					TAlScore sc_e_left = ((TCScore*)(left_vec + SSEMatrix::E))[left_rowelt] + offsetsc;
+					if(sc_e_left > 0 && sc_e_left - sc_->readGapExtend() == sc_cur) {
+						mask |= (1 << 1);
+					}
+					origMask = mask;
+					assert(origMask > 0 || sc_cur <= sc_->match());
+					if(d.mat_.isEMaskSet(row, col)) {
+						mask = (d.mat_.masks_[row][col] >> 8) & 3;
+					}
+					if(mask == 3) {
+#if 1
+						// Pick H -> E cell
+						cur = SW_BT_OALL_READ_OPEN;
+						d.mat_.eMaskSet(row, col, 2); // might choose E later
+#else
+						if(rnd.nextU2()) {
+							// Pick H -> E cell
+							cur = SW_BT_OALL_READ_OPEN;
+							d.mat_.eMaskSet(row, col, 2); // might choose E later
+						} else {
+							// Pick E -> E cell
+							cur = SW_BT_RDGAP_EXTEND;
+							d.mat_.eMaskSet(row, col, 1); // might choose H later
+						}
+#endif
+						branch = true;
+					} else if(mask == 2) {
+						// I chose the E cell
+						cur = SW_BT_RDGAP_EXTEND;
+						d.mat_.eMaskSet(row, col, 0); // done
+					} else if(mask == 1) {
+						// I chose the H cell
+						cur = SW_BT_OALL_READ_OPEN;
+						d.mat_.eMaskSet(row, col, 0); // done
+					} else {
+						empty = true;
+						// It's empty, so the only question left is whether we should be
+						// allowed to terminate in this cell.  If it's got a valid score
+						// then we *shouldn't* be allowed to terminate here because that
+						// means it's part of a larger alignment that was already reported.
+						canMoveThru = (origMask == 0);
+					}
+					assert(!empty || !canMoveThru);
+				} else if(ct == SSEMatrix::F) { // AKA rfgap
+					assert_gt(row, 0);
+					assert(gapsAllowed);
+					TAlScore sc_h_up = ((TCScore*)(up_vec  + SSEMatrix::H))[up_rowelt] + offsetsc;
+					TAlScore sc_f_up = ((TCScore*)(up_vec  + SSEMatrix::F))[up_rowelt] + offsetsc;
+					TAlScore sc_cur  = ((TCScore*)(cur_vec + SSEMatrix::F))[rowelt] + offsetsc;
+					// Currently in the F matrix; incoming transition must come from above.
+					// It's either a gap open from the H matrix or a gap extend from the F
+					// matrix.
+					// TODO: save and restore origMask as well as mask
+					int origMask = 0, mask = 0;
+					// Get H score of cell above
+					if(sc_h_up > floorsc && sc_h_up - sc_->refGapOpen() == sc_cur) {
+						mask |= (1 << 0);
+					}
+					// Get F score of cell above
+					if(sc_f_up > floorsc && sc_f_up - sc_->refGapExtend() == sc_cur) {
+						mask |= (1 << 1);
+					}
+					origMask = mask;
+					assert(origMask > 0 || sc_cur <= sc_->match());
+					if(d.mat_.isFMaskSet(row, col)) {
+						mask = (d.mat_.masks_[row][col] >> 11) & 3;
+					}
+					if(mask == 3) {
+#if 1
+						// I chose the H cell
+						cur = SW_BT_OALL_REF_OPEN;
+						d.mat_.fMaskSet(row, col, 2); // might choose E later
+#else
+						if(rnd.nextU2()) {
+							// I chose the H cell
+							cur = SW_BT_OALL_REF_OPEN;
+							d.mat_.fMaskSet(row, col, 2); // might choose E later
+						} else {
+							// I chose the F cell
+							cur = SW_BT_RFGAP_EXTEND;
+							d.mat_.fMaskSet(row, col, 1); // might choose E later
+						}
+#endif
+						branch = true;
+					} else if(mask == 2) {
+						// I chose the F cell
+						cur = SW_BT_RFGAP_EXTEND;
+						d.mat_.fMaskSet(row, col, 0); // done
+					} else if(mask == 1) {
+						// I chose the H cell
+						cur = SW_BT_OALL_REF_OPEN;
+						d.mat_.fMaskSet(row, col, 0); // done
+					} else {
+						empty = true;
+						// It's empty, so the only question left is whether we should be
+						// allowed to terminate in this cell.  If it's got a valid score
+						// then we *shouldn't* be allowed to terminate here because that
+						// means it's part of a larger alignment that was already reported.
+						canMoveThru = (origMask == 0);
+					}
+					assert(!empty || !canMoveThru);
+				} else {
+					assert_eq(SSEMatrix::H, ct);
+					TAlScore sc_cur      = ((TCScore*)(cur_vec + SSEMatrix::H))[rowelt]    + offsetsc;
+					TAlScore sc_f_up     = ((TCScore*)(up_vec  + SSEMatrix::F))[up_rowelt] + offsetsc;
+					TAlScore sc_h_up     = ((TCScore*)(up_vec  + SSEMatrix::H))[up_rowelt] + offsetsc;
+					TAlScore sc_h_left   = col > 0 ? (((TCScore*)(left_vec   + SSEMatrix::H))[left_rowelt]   + offsetsc) : floorsc;
+					TAlScore sc_e_left   = col > 0 ? (((TCScore*)(left_vec   + SSEMatrix::E))[left_rowelt]   + offsetsc) : floorsc;
+					TAlScore sc_h_upleft = col > 0 ? (((TCScore*)(upleft_vec + SSEMatrix::H))[upleft_rowelt] + offsetsc) : floorsc;
+					TAlScore sc_diag     = sc_->score(readc, refm, readq - 33);
+					// TODO: save and restore origMask as well as mask
+					int origMask = 0, mask = 0;
+					if(gapsAllowed) {
+						if(sc_h_up     > floorsc && sc_cur == sc_h_up   - sc_->refGapOpen()) {
+							mask |= (1 << 0);
+						}
+						if(sc_h_left   > floorsc && sc_cur == sc_h_left - sc_->readGapOpen()) {
+							mask |= (1 << 1);
+						}
+						if(sc_f_up     > floorsc && sc_cur == sc_f_up   - sc_->refGapExtend()) {
+							mask |= (1 << 2);
+						}
+						if(sc_e_left   > floorsc && sc_cur == sc_e_left - sc_->readGapExtend()) {
+							mask |= (1 << 3);
+						}
+					}
+					if(sc_h_upleft > floorsc && sc_cur == sc_h_upleft + sc_diag) {
+						mask |= (1 << 4);
+					}
+					origMask = mask;
+					assert(origMask > 0 || sc_cur <= sc_->match());
+					if(d.mat_.isHMaskSet(row, col)) {
+						mask = (d.mat_.masks_[row][col] >> 2) & 31;
+					}
+					assert(gapsAllowed || mask == (1 << 4) || mask == 0);
+					int opts = alts5[mask];
+					int select = -1;
+					if(opts == 1) {
+						select = firsts5[mask];
+						assert_geq(mask, 0);
+						d.mat_.hMaskSet(row, col, 0);
+					} else if(opts > 1) {
+#if 1
+						if(       (mask & 16) != 0) {
+							select = 4; // H diag
+						} else if((mask & 1) != 0) {
+							select = 0; // H up
+						} else if((mask & 4) != 0) {
+							select = 2; // F up
+						} else if((mask & 2) != 0) {
+							select = 1; // H left
+						} else if((mask & 8) != 0) {
+							select = 3; // E left
+						}
+#else
+						select = randFromMask(rnd, mask);
+#endif
+						assert_geq(mask, 0);
+						mask &= ~(1 << select);
+						assert(gapsAllowed || mask == (1 << 4) || mask == 0);
+						d.mat_.hMaskSet(row, col, mask);
+						branch = true;
+					} else { /* No way to backtrack! */ }
+					if(select != -1) {
+						if(select == 4) {
+							cur = SW_BT_OALL_DIAG;
+						} else if(select == 0) {
+							cur = SW_BT_OALL_REF_OPEN;
+						} else if(select == 1) {
+							cur = SW_BT_OALL_READ_OPEN;
+						} else if(select == 2) {
+							cur = SW_BT_RFGAP_EXTEND;
+						} else {
+							assert_eq(3, select)
+							cur = SW_BT_RDGAP_EXTEND;
+						}
+					} else {
+						empty = true;
+						// It's empty, so the only question left is whether we should be
+						// allowed to terminate in this cell.  If it's got a valid score
+						// then we *shouldn't* be allowed to terminate here because that
+						// means it's part of a larger alignment that was already reported.
+						canMoveThru = (origMask == 0);
+					}
+				}
+				assert(!empty || !canMoveThru || ct == SSEMatrix::H);
+			}
+		}
+		d.mat_.setReportedThrough(row, col);
+		assert_eq(gaps, Edit::numGaps(ned));
+		assert_leq(gaps, rdgap_ + rfgap_);
+		// Cell was involved in a previously-reported alignment?
+		if(!canMoveThru) {
+			if(!btnstack_.empty()) {
+				// Remove all the cells from list back to and including the
+				// cell where the branch occurred
+				btcells_.resize(btnstack_.back().celsz);
+				// Pop record off the top of the stack
+				ned.resize(btnstack_.back().nedsz);
+				//aed.resize(btnstack_.back().aedsz);
+				row      = btnstack_.back().row;
+				col      = btnstack_.back().col;
+				gaps     = btnstack_.back().gaps;
+				readGaps = btnstack_.back().readGaps;
+				refGaps  = btnstack_.back().refGaps;
+				score    = btnstack_.back().score;
+				ct       = btnstack_.back().ct;
+				btnstack_.pop_back();
+				assert(!sc_->monotone || score.score() >= escore);
+				NEW_ROW_COL(row, col);
+				continue;
+			} else {
+				// No branch points to revisit; just give up
+				res.reset();
+				met.btfail++; // DP backtraces failed
+				return false;
+			}
+		}
+		assert(!reportedThru);
+		assert(!sc_->monotone || score.score() >= minsc_);
+		if(empty || row == 0) {
+			assert_eq(SSEMatrix::H, ct);
+			btcells_.expand();
+			btcells_.back().first = row;
+			btcells_.back().second = col;
+			// This cell is at the end of a legitimate alignment
+			trimBeg = row;
+			assert_eq(btcells_.size(), dpRows() - trimBeg - trimEnd + readGaps);
+			break;
+		}
+		if(branch) {
+			// Add a frame to the backtrack stack
+			btnstack_.expand();
+			btnstack_.back().init(
+				ned.size(),
+				0,               // aed.size()
+				btcells_.size(),
+				row,
+				col,
+				gaps,
+				readGaps,
+				refGaps,
+				score,
+				(int)ct);
+		}
+		btcells_.expand();
+		btcells_.back().first = row;
+		btcells_.back().second = col;
+		switch(cur) {
+			// Move up and to the left.  If the reference nucleotide in the
+			// source row mismatches the read nucleotide, penalize
+			// it and add a nucleotide mismatch.
+			case SW_BT_OALL_DIAG: {
+				assert_gt(row, 0); assert_gt(col, 0);
+				// Check for color mismatch
+				int readC = (*rd_)[row];
+				int refNmask = (int)rf_[rfi_+col];
+				assert_gt(refNmask, 0);
+				int m = matchesEx(readC, refNmask);
+				ct = SSEMatrix::H;
+				if(m != 1) {
+					Edit e(
+						(int)row,
+						mask2dna[refNmask],
+						"ACGTN"[readC],
+						EDIT_TYPE_MM);
+					assert(e.repOk());
+					assert(ned.empty() || ned.back().pos >= row);
+					ned.push_back(e);
+					int pen = QUAL2(row, col);
+					score.score_ -= pen;
+					assert(!sc_->monotone || score.score() >= escore);
+				} else {
+					// Reward a match
+					int64_t bonus = sc_->match(30);
+					score.score_ += bonus;
+					assert(!sc_->monotone || score.score() >= escore);
+				}
+				if(m == -1) {
+					// score.ns_++;
+				}
+				row--; col--;
+				MOVE_ALL_UPLEFT();
+				assert(VALID_AL_SCORE(score));
+				break;
+			}
+			// Move up.  Add an edit encoding the ref gap.
+			case SW_BT_OALL_REF_OPEN:
+			{
+				assert_gt(row, 0);
+				Edit e(
+					(int)row,
+					'-',
+					"ACGTN"[(int)(*rd_)[row]],
+					EDIT_TYPE_REF_GAP);
+				assert(e.repOk());
+				assert(ned.empty() || ned.back().pos >= row);
+				ned.push_back(e);
+				assert_geq(row, (size_t)sc_->gapbar);
+				assert_geq((int)(rdf_-rdi_-row-1), sc_->gapbar-1);
+				row--;
+				ct = SSEMatrix::H;
+				int pen = sc_->refGapOpen();
+				score.score_ -= pen;
+				assert(!sc_->monotone || score.score() >= minsc_);
+				gaps++; refGaps++;
+				assert_eq(gaps, Edit::numGaps(ned));
+				assert_leq(gaps, rdgap_ + rfgap_);
+				MOVE_ALL_UP();
+				break;
+			}
+			// Move up.  Add an edit encoding the ref gap.
+			case SW_BT_RFGAP_EXTEND:
+			{
+				assert_gt(row, 1);
+				Edit e(
+					(int)row,
+					'-',
+					"ACGTN"[(int)(*rd_)[row]],
+					EDIT_TYPE_REF_GAP);
+				assert(e.repOk());
+				assert(ned.empty() || ned.back().pos >= row);
+				ned.push_back(e);
+				assert_geq(row, (size_t)sc_->gapbar);
+				assert_geq((int)(rdf_-rdi_-row-1), sc_->gapbar-1);
+				row--;
+				ct = SSEMatrix::F;
+				int pen = sc_->refGapExtend();
+				score.score_ -= pen;
+				assert(!sc_->monotone || score.score() >= minsc_);
+				gaps++; refGaps++;
+				assert_eq(gaps, Edit::numGaps(ned));
+				assert_leq(gaps, rdgap_ + rfgap_);
+				MOVE_ALL_UP();
+				break;
+			}
+			case SW_BT_OALL_READ_OPEN:
+			{
+				assert_gt(col, 0);
+				Edit e(
+					(int)row+1,
+					mask2dna[(int)rf_[rfi_+col]],
+					'-',
+					EDIT_TYPE_READ_GAP);
+				assert(e.repOk());
+				assert(ned.empty() || ned.back().pos >= row);
+				ned.push_back(e);
+				assert_geq(row, (size_t)sc_->gapbar);
+				assert_geq((int)(rdf_-rdi_-row-1), sc_->gapbar-1);
+				col--;
+				ct = SSEMatrix::H;
+				int pen = sc_->readGapOpen();
+				score.score_ -= pen;
+				assert(!sc_->monotone || score.score() >= minsc_);
+				gaps++; readGaps++;
+				assert_eq(gaps, Edit::numGaps(ned));
+				assert_leq(gaps, rdgap_ + rfgap_);
+				MOVE_ALL_LEFT();
+				break;
+			}
+			case SW_BT_RDGAP_EXTEND:
+			{
+				assert_gt(col, 1);
+				Edit e(
+					(int)row+1,
+					mask2dna[(int)rf_[rfi_+col]],
+					'-',
+					EDIT_TYPE_READ_GAP);
+				assert(e.repOk());
+				assert(ned.empty() || ned.back().pos >= row);
+				ned.push_back(e);
+				assert_geq(row, (size_t)sc_->gapbar);
+				assert_geq((int)(rdf_-rdi_-row-1), sc_->gapbar-1);
+				col--;
+				ct = SSEMatrix::E;
+				int pen = sc_->readGapExtend();
+				score.score_ -= pen;
+				assert(!sc_->monotone || score.score() >= minsc_);
+				gaps++; readGaps++;
+				assert_eq(gaps, Edit::numGaps(ned));
+				assert_leq(gaps, rdgap_ + rfgap_);
+				MOVE_ALL_LEFT();
+				break;
+			}
+			default: throw 1;
+		}
+	} // while((int)row > 0)
+	assert_geq(col, 0);
+	assert_eq(SSEMatrix::H, ct);
+	// The number of cells in the backtrace should equal the number of read
+	// bases after trimming plus the number of gaps
+	assert_eq(btcells_.size(), dpRows() - trimBeg - trimEnd + readGaps);
+	// Check whether we went through a core diagonal and set 'reported' flag on
+	// each cell
+	bool overlappedCoreDiag = false;
+	for(size_t i = 0; i < btcells_.size(); i++) {
+		size_t rw = btcells_[i].first;
+		size_t cl = btcells_[i].second;
+		// Calculate the diagonal within the *trimmed* rectangle, i.e. the
+		// rectangle we dealt with in align, gather and backtrack.
+		int64_t diagi = cl - rw;
+		// Now adjust to the diagonal within the *untrimmed* rectangle by
+		// adding on the amount trimmed from the left.
+		diagi += rect_->triml;
+		if(diagi >= 0) {
+			size_t diag = (size_t)diagi;
+			if(diag >= rect_->corel && diag <= rect_->corer) {
+				overlappedCoreDiag = true;
+				break;
+			}
+		}
+#ifndef NDEBUG
+		//assert(!d.mat_.reportedThrough(rw, cl));
+		//d.mat_.setReportedThrough(rw, cl);
+		assert(d.mat_.reportedThrough(rw, cl));
+#endif
+	}
+	if(!overlappedCoreDiag) {
+		// Must overlap a core diagonal.  Otherwise, we run the risk of
+		// reporting an alignment that overlaps (and trumps) a higher-scoring
+		// alignment that lies partially outside the dynamic programming
+		// rectangle.
+		res.reset();
+		met.corerej++;
+		return false;
+	}
+	int readC = (*rd_)[rdi_+row];      // get last char in read
+	int refNmask = (int)rf_[rfi_+col]; // get last ref char ref involved in aln
+	assert_gt(refNmask, 0);
+	int m = matchesEx(readC, refNmask);
+	if(m != 1) {
+		Edit e((int)row, mask2dna[refNmask], "ACGTN"[readC], EDIT_TYPE_MM);
+		assert(e.repOk());
+		assert(ned.empty() || ned.back().pos >= row);
+		ned.push_back(e);
+		score.score_ -= QUAL2(row, col);
+		assert_geq(score.score(), minsc_);
+	} else {
+		score.score_ += sc_->match(30);
+	}
+	if(m == -1) {
+		// score.ns_++;
+	}
+#if 0
+	if(score.ns_ > nceil_) {
+		// Alignment has too many Ns in it!
+		res.reset();
+		met.nrej++;
+		return false;
+	}
+#endif
+	res.reverse();
+	assert(Edit::repOk(ned, (*rd_)));
+	assert_eq(score.score(), escore);
+	assert_leq(gaps, rdgap_ + rfgap_);
+	off = col;
+	assert_lt(col + (size_t)rfi_, (size_t)rff_);
+	// score.gaps_ = gaps;
+	res.alres.setScore(score);
+#if 0
+	res.alres.setShape(
+		refidx_,                  // ref id
+		off + rfi_ + rect_->refl, // 0-based ref offset
+		reflen_,                  // reference length
+		fw_,                      // aligned to Watson?
+		rdf_ - rdi_,              // read length
+		true,                     // pretrim soft?
+		0,                        // pretrim 5' end
+		0,                        // pretrim 3' end
+		true,                     // alignment trim soft?
+		fw_ ? trimBeg : trimEnd,  // alignment trim 5' end
+		fw_ ? trimEnd : trimBeg); // alignment trim 3' end
+	size_t refns = 0;
+	for(size_t i = col; i <= origCol; i++) {
+		if((int)rf_[rfi_+i] > 15) {
+			refns++;
+		}
+	}
+#endif
+	// res.alres.setRefNs(refns);
+	assert(Edit::repOk(ned, (*rd_), true, trimBeg, trimEnd));
+	assert(res.repOk());
+#ifndef NDEBUG
+	size_t gapsCheck = 0;
+	for(size_t i = 0; i < ned.size(); i++) {
+		if(ned[i].isGap()) gapsCheck++;
+	}
+	assert_eq(gaps, gapsCheck);
+	BTDnaString refstr;
+	for(size_t i = col; i <= origCol; i++) {
+		refstr.append(firsts5[(int)rf_[rfi_+i]]);
+	}
+	BTDnaString editstr;
+	Edit::toRef((*rd_), ned, editstr, true, trimBeg, trimEnd);
+	if(refstr != editstr) {
+		cerr << "Decoded nucleotides and edits don't match reference:" << endl;
+		cerr << "           score: " << score.score()
+		     << " (" << gaps << " gaps)" << endl;
+		cerr << "           edits: ";
+		Edit::print(cerr, ned);
+		cerr << endl;
+		cerr << "    decoded nucs: " << (*rd_) << endl;
+		cerr << "     edited nucs: " << editstr << endl;
+		cerr << "  reference nucs: " << refstr << endl;
+		assert(0);
+	}
+#endif
+	met.btsucc++; // DP backtraces succeeded
+	return true;
+}
diff --git a/aln_sink.h b/aln_sink.h
new file mode 100644
index 0000000..b76eec7
--- /dev/null
+++ b/aln_sink.h
@@ -0,0 +1,2427 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef ALN_SINK_H_
+#define ALN_SINK_H_
+
+#include <limits>
+#include <utility>
+#include <map>
+#include "read.h"
+#include "ds.h"
+#include "simple_func.h"
+#include "outq.h"
+#include "aligner_result.h"
+#include "hyperloglogplus.h"
+#include "timer.h"
+#include "taxonomy.h"
+
+
+// Forward decl
+template <typename index_t>
+class SeedResults;
+
+// Supported output formats for alignment reporting.
+enum {
+	OUTPUT_SAM = 1
+};
+
+
+// Per-taxon tallies accumulated as reads are classified; updated by
+// SpeciesMetrics::addSpeciesCounts and combined by SpeciesMetrics::merge.
+struct ReadCounts {
+	uint32_t n_reads;        // total reads assigned to this taxon
+	uint32_t sum_score;      // accumulated score (see SpeciesMetrics::addSpeciesCounts)
+	double summed_hit_len;   // total hit length summed over assigned reads
+	double weighted_reads;   // weighted read count (weight supplied by caller)
+	uint32_t n_unique_reads; // reads whose only assignment was this taxon
+};
+
+/**
+ * Metrics summarizing the species-level information we have: per-taxon
+ * read counts, per-taxon distinct k-mer cardinality estimates
+ * (HyperLogLog), and the observed sets of equally-good taxon IDs that
+ * feed the EM-based abundance estimation.
+ */
+struct SpeciesMetrics {
+    
+    // Sorted list of taxonomy IDs that a single read matched equally
+    // well; used as a map key to count how often each combination of
+    // taxa was observed together.
+    struct IDs {
+        EList<uint64_t, 5> ids;
+        // Order first by length, then element-wise, giving the strict
+        // weak ordering required for use as a std::map key.
+        bool operator<(const IDs& o) const {
+            if(ids.size() != o.ids.size()) return ids.size() < o.ids.size();
+            for(size_t i = 0; i < ids.size(); i++) {
+                assert_lt(i, o.ids.size());
+                if(ids[i] != o.ids[i]) return ids[i] < o.ids[i];
+            }
+            return false;
+        }
+        
+        IDs& operator=(const IDs& other) {
+            if(this == &other)
+                return *this;
+            
+            ids = other.ids;
+            return *this;
+        }
+    };
+
+
+	SpeciesMetrics():mutex_m() {
+	    reset();
+	}
+
+	// Clear all accumulated counts, k-mer sketches and observations.
+	void reset() {
+		species_counts.clear();
+		//for(map<uint32_t, HyperLogLogPlusMinus<uint64_t> >::iterator it = this->species_kmers.begin(); it != this->species_kmers.end(); ++it) {
+		//	it->second.reset();
+		//} //TODO: is this required?
+		species_kmers.clear();
+        num_non_leaves = 0;
+	}
+
+	// Overwrite this object's state with the given tables.
+	void init(
+              const map<uint64_t, ReadCounts>& species_counts_,
+              const map<uint64_t, HyperLogLogPlusMinus<uint64_t> >& species_kmers_,
+              const map<IDs, uint64_t>& observed_)
+	{
+		species_counts = species_counts_;
+		species_kmers = species_kmers_;
+        observed = observed_;
+        num_non_leaves = 0;
+    }
+
+	/**
+	 * Merge (add) the counters in the given SpeciesMetrics object
+	 * into this object.  This is the only safe way to update a
+	 * SpeciesMetrics shared by multiple threads.
+	 */
+	void merge(const SpeciesMetrics& met, bool getLock = false) {
+        ThreadSafe ts(&mutex_m, getLock);
+
+        // update species read count
+        for(map<uint64_t, ReadCounts>::const_iterator it = met.species_counts.begin(); it != met.species_counts.end(); ++it) {
+        	if (species_counts.find(it->first) == species_counts.end()) {
+        		species_counts[it->first] = it->second;
+        	} else {
+        		species_counts[it->first].n_reads += it->second.n_reads;
+        		species_counts[it->first].sum_score += it->second.sum_score;
+        		species_counts[it->first].summed_hit_len += it->second.summed_hit_len;
+        		species_counts[it->first].weighted_reads += it->second.weighted_reads;
+        		species_counts[it->first].n_unique_reads += it->second.n_unique_reads;
+        	}
+        }
+
+        // update species k-mers
+        for(map<uint64_t, HyperLogLogPlusMinus<uint64_t> >::const_iterator it = met.species_kmers.begin(); it != met.species_kmers.end(); ++it) {
+        	species_kmers[it->first].merge(&(it->second));
+        }
+
+        // update observed taxon-ID-set counts
+        for(map<IDs, uint64_t>::const_iterator itr = met.observed.begin(); itr != met.observed.end(); itr++) {
+            const IDs& ids = itr->first;
+            uint64_t count = itr->second;
+            
+            if(observed.find(ids) == observed.end()) {
+                observed[ids] = count;
+            } else {
+                observed[ids] += count;
+            }
+        }
+    }
+
+	/**
+	 * Record one classification result for the given taxon.
+	 *
+	 * NOTE(review): sum_score is incremented by 1, not by 'score', so it
+	 * currently mirrors n_reads; the field name and the otherwise-unused
+	 * magnitude of 'score' suggest 'sum_score += score' may have been
+	 * intended — verify against upstream before changing.
+	 *
+	 * For abundance analysis, taxon IDs of hits scoring at least
+	 * max_score are collected in cur_ids; once nresult IDs have been
+	 * gathered, the sorted set is tallied in 'observed' and cleared.
+	 */
+	void addSpeciesCounts(
+                          uint64_t taxID,
+                          int64_t score,
+                          int64_t max_score,
+                          double summed_hit_len,
+                          double weighted_read,
+                          uint32_t nresult)
+    {
+		species_counts[taxID].n_reads += 1;
+		species_counts[taxID].sum_score += 1;
+		species_counts[taxID].weighted_reads += weighted_read;
+		species_counts[taxID].summed_hit_len += summed_hit_len;
+		if(nresult == 1) {
+			species_counts[taxID].n_unique_reads += 1;
+		}
+
+        // Only consider good hits for abundance analysis
+        if(score >= max_score) {
+            cur_ids.ids.push_back(taxID);
+            if(cur_ids.ids.size() == nresult) {
+                cur_ids.ids.sort();
+                if(observed.find(cur_ids) == observed.end()) {
+                    observed[cur_ids] = 1;
+                } else {
+                    observed[cur_ids] += 1;
+                }
+                cur_ids.ids.clear();
+            }
+        }
+	}
+
+	/**
+	 * Add the k-mers of the given read region (starting at 'begin') to
+	 * the taxon's HyperLogLog sketch.
+	 *
+	 * NOTE(review): the loop condition 'i+32 < len' compares an absolute
+	 * position (i starts at 'begin') against a length; when begin > 0
+	 * the loop stops early.  Confirm whether 'i+32 < begin+len' was
+	 * intended.
+	 */
+	void addAllKmers(
+                     uint64_t taxID,
+                     const BTDnaString &btdna,
+                     size_t begin,
+                     size_t len) {
+#ifdef FLORIAN_DEBUG
+		cerr << "add all kmers for " << taxID << " from " << begin << " for " << len << ": " << string(btdna.toZBuf()).substr(begin,len) << endl;
+#endif
+		uint64_t kmer = btdna.int_kmer<uint64_t>(begin,begin+len);
+		species_kmers[taxID].add(kmer);
+		size_t i = begin;
+		while (i+32 < len) {
+			kmer = btdna.next_kmer(kmer,i);
+			species_kmers[taxID].add(kmer);
+			++i;
+		}
+	}
+
+	// Estimated number of distinct k-mers observed for the given taxon.
+	size_t nDistinctKmers(uint64_t taxID) {
+		return(species_kmers[taxID].cardinality());
+	}
+    
+	/**
+	 * One EM iteration.  E step: distribute each observed count over the
+	 * taxa in its ID set proportionally to the current estimates p
+	 * (non-leaf IDs are resolved to their leaf children via
+	 * 'ancestors').  M step: divide by genome length and renormalize.
+	 * Results are written into p_next.
+	 */
+    static void EM(
+                   const map<IDs, uint64_t>& observed,
+                   const map<uint64_t, EList<uint64_t> >& ancestors,
+                   const map<uint64_t, uint64_t>& tid_to_num,
+                   const EList<double>& p,
+                   EList<double>& p_next,
+                   const EList<size_t>& len)
+    {
+        assert_eq(p.size(), len.size());
+        
+        // E step
+        p_next.fill(0.0);
+        for(map<IDs, uint64_t>::const_iterator itr = observed.begin(); itr != observed.end(); itr++) {
+            const EList<uint64_t, 5>& ids = itr->first.ids;
+            uint64_t count = itr->second;
+            double psum = 0.0;
+            for(size_t i = 0; i < ids.size(); i++) {
+                uint64_t tid = ids[i];
+                // Leaves?
+                map<uint64_t, uint64_t>::const_iterator id_itr = tid_to_num.find(tid);
+                if(id_itr != tid_to_num.end()) {
+                    uint64_t num = id_itr->second;
+                    assert_lt(num, p.size());
+                    psum += p[num];
+                } else { // Ancestors
+                    map<uint64_t, EList<uint64_t> >::const_iterator a_itr = ancestors.find(tid);
+                    if(a_itr == ancestors.end())
+                        continue;
+                    const EList<uint64_t>& children = a_itr->second;
+                    for(size_t c = 0; c < children.size(); c++) {
+                        uint64_t c_tid = children[c];
+                        map<uint64_t, uint64_t>::const_iterator id_itr = tid_to_num.find(c_tid);
+                        if(id_itr == tid_to_num.end())
+                            continue;
+                        uint64_t c_num = id_itr->second;
+                        psum += p[c_num];
+                    }
+                }
+            }
+
+            // No probability mass for this ID set; nothing to distribute.
+            if(psum == 0.0) continue;
+            
+            for(size_t i = 0; i < ids.size(); i++) {
+                uint64_t tid = ids[i];
+                // Leaves?
+                map<uint64_t, uint64_t>::const_iterator id_itr = tid_to_num.find(tid);
+                if(id_itr != tid_to_num.end()) {
+                    uint64_t num = id_itr->second;
+                    assert_leq(p[num], psum);
+                    p_next[num] += (count * (p[num] / psum));
+                } else {
+                    map<uint64_t, EList<uint64_t> >::const_iterator a_itr = ancestors.find(tid);
+                    if(a_itr == ancestors.end())
+                        continue;
+                    const EList<uint64_t>& children = a_itr->second;
+                    for(size_t c = 0; c < children.size(); c++) {
+                        uint64_t c_tid = children[c];
+                        map<uint64_t, uint64_t>::const_iterator id_itr = tid_to_num.find(c_tid);
+                        if(id_itr == tid_to_num.end())
+                            continue;
+                        uint64_t c_num = id_itr->second;
+                        p_next[c_num] += (count * (p[c_num] / psum));
+                    }
+                }
+            }
+        }
+        
+        // M step
+        double sum = 0.0;
+        for(size_t i = 0; i < p_next.size(); i++) {
+            sum += (p_next[i] / len[i]);
+        }
+        for(size_t i = 0; i < p_next.size(); i++) {
+            p_next[i] = p_next[i] / len[i] / sum;
+        }
+    }
+    
+	/**
+	 * Estimate per-taxon abundances from the observed ID-set counts via
+	 * EM, accelerated with SQUAREM extrapolation.  Uses the index's
+	 * taxonomy tree to identify leaves and to resolve ancestor IDs to
+	 * leaf children, and the index's size table for genome lengths.
+	 * Fills 'abundance_len' (genome-length-normalized) and 'abundance'
+	 * (length weighted back out).
+	 */
+    void calculateAbundance(const Ebwt<uint64_t>& ebwt, uint8_t rank)
+    {
+        const map<uint64_t, TaxonomyNode>& tree = ebwt.tree();
+        
+        // Find leaves
+        set<uint64_t> leaves;
+        for(map<IDs, uint64_t>::iterator itr = observed.begin(); itr != observed.end(); itr++) {
+            const IDs& ids = itr->first;
+            for(size_t i = 0; i < ids.ids.size(); i++) {
+                uint64_t tid = ids.ids[i];
+                map<uint64_t, TaxonomyNode>::const_iterator tree_itr = tree.find(tid);
+                if(tree_itr == tree.end())
+                    continue;
+                const TaxonomyNode& node = tree_itr->second;
+                if(!node.leaf) {
+                    //if(tax_rank_num[node.rank] > tax_rank_num[rank]) {
+                        continue;
+                    //}
+                }
+                leaves.insert(tree_itr->first);
+            }
+        }
+        
+ 
+#ifdef DAEHWAN_DEBUG
+        cerr << "\t\tnumber of leaves: " << leaves.size() << endl;
+#endif
+        
+        // Find all descendants coming from the same ancestor
+        map<uint64_t, EList<uint64_t> > ancestors;
+        for(map<IDs, uint64_t>::iterator itr = observed.begin(); itr != observed.end(); itr++) {
+            const IDs& ids = itr->first;
+            for(size_t i = 0; i < ids.ids.size(); i++) {
+                uint64_t tid = ids.ids[i];
+                if(leaves.find(tid) != leaves.end())
+                    continue;
+                if(ancestors.find(tid) != ancestors.end())
+                    continue;
+                ancestors[tid].clear();
+                for(set<uint64_t> ::const_iterator leaf_itr = leaves.begin(); leaf_itr != leaves.end(); leaf_itr++) {
+                    uint64_t tid2 = *leaf_itr;
+                    assert(tree.find(tid2) != tree.end());
+                    assert(tree.find(tid2)->second.leaf);
+                    uint64_t temp_tid2 = tid2;
+                    // Walk up from the leaf; if we pass through 'tid',
+                    // the leaf is one of its descendants.
+                    while(true) {
+                        map<uint64_t, TaxonomyNode>::const_iterator tree_itr = tree.find(temp_tid2);
+                        if(tree_itr == tree.end())
+                            break;
+                        const TaxonomyNode& node = tree_itr->second;
+                        if(tid == node.parent_tid) {
+                            ancestors[tid].push_back(tid2);
+                        }
+                        if(temp_tid2 == node.parent_tid)
+                            break;
+                        temp_tid2 = node.parent_tid;
+                    }
+                }
+                ancestors[tid].sort();
+            }
+        }
+        
+#ifdef DAEHWAN_DEBUG
+        cerr << "\t\tnumber of ancestors: " << ancestors.size() << endl;
+        for(map<uint64_t, EList<uint64_t> >::const_iterator itr = ancestors.begin(); itr != ancestors.end(); itr++) {
+            uint64_t tid = itr->first;
+            const EList<uint64_t>& children = itr->second;
+            if(children.size() <= 0)
+                continue;
+            map<uint64_t, TaxonomyNode>::const_iterator tree_itr = tree.find(tid);
+            if(tree_itr == tree.end())
+                continue;
+            const TaxonomyNode& node = tree_itr->second;
+            cerr << "\t\t\t" << tid << ": " << children.size() << "\t" << get_tax_rank(node.rank) << endl;
+            cerr << "\t\t\t\t";
+            for(size_t i = 0; i < children.size(); i++) {
+                cerr << children[i];
+                if(i + 1 < children.size())
+                    cerr << ",";
+                if(i > 10) {
+                    cerr << " ...";
+                    break;
+                }
+            }
+            cerr << endl;
+        }
+        
+        uint64_t test_tid = 0, test_tid2 = 0;
+#endif
+        // Lengths of genomes (or contigs)
+        const map<uint64_t, uint64_t>& size_table = ebwt.size();
+        
+        // Initialize probabilities
+        map<uint64_t, uint64_t> tid_to_num; // taxonomic ID to corresponding element of a list
+        EList<double> p;
+        EList<size_t> len; // genome lengths
+        for(map<IDs, uint64_t>::iterator itr = observed.begin(); itr != observed.end(); itr++) {
+            const IDs& ids = itr->first;
+            uint64_t count = itr->second;
+            for(size_t i = 0; i < ids.ids.size(); i++) {
+                uint64_t tid = ids.ids[i];
+                if(leaves.find(tid) == leaves.end())
+                    continue;
+                
+#ifdef DAEHWAN_DEBUG
+                if((tid == test_tid || tid == test_tid2) &&
+                   count >= 100) {
+                    cerr << tid << ": " << count << "\t";
+                    for(size_t j = 0; j < ids.ids.size(); j++) {
+                        cerr << ids.ids[j];
+                        if(j + 1 < ids.ids.size())
+                            cerr << ",";
+                    }
+                    cerr << endl;
+                }
+#endif
+                
+                if(tid_to_num.find(tid) == tid_to_num.end()) {
+                    tid_to_num[tid] = p.size();
+                    p.push_back(1.0 / ids.ids.size() * count);
+                    map<uint64_t, uint64_t>::const_iterator size_itr = size_table.find(tid);
+                    if(size_itr != size_table.end()) {
+                        len.push_back(size_itr->second);
+                    } else {
+                        len.push_back(std::numeric_limits<size_t>::max());
+                    }
+                } else {
+                    uint64_t num = tid_to_num[tid];
+                    assert_lt(num, p.size());
+                    p[num] += (1.0 / ids.ids.size() * count);
+                }
+            }
+        }
+        
+        assert_eq(p.size(), len.size());
+        
+        {
+            // Normalize the initial estimates by genome length.
+            double sum = 0.0;
+            for(size_t i = 0; i < p.size(); i++) {
+                sum += (p[i] / len[i]);
+            }
+            for(size_t i = 0; i < p.size(); i++) {
+                p[i] = (p[i] / len[i]) / sum;
+            }
+        }
+        
+        EList<double> p_next; p_next.resizeExact(p.size());
+        EList<double> p_next2; p_next2.resizeExact(p.size());
+        EList<double> p_r; p_r.resizeExact(p.size());
+        EList<double> p_v; p_v.resizeExact(p.size());
+        size_t num_iteration = 0;
+        double diff = 0.0;
+        while(true) {
+#ifdef DAEHWAN_DEBUG
+            if(num_iteration % 50 == 0) {
+                if(test_tid != 0 || test_tid2 != 0)
+                    cerr << "iter " << num_iteration << endl;
+                if(test_tid != 0)
+                    cerr << "\t" << test_tid << ": " << p[tid_to_num[test_tid]] << endl;
+                if(test_tid2 != 0)
+                    cerr << "\t" << test_tid2 << ": " << p[tid_to_num[test_tid2]] << endl;
+            }
+#endif
+            
+            // Accelerated version of EM - SQUAREM iteration
+            //    Varadhan, R. & Roland, C. Scand. J. Stat. 35, 335–353 (2008).
+            //    Also, this algorithm is used in Sailfish - http://www.nature.com/nbt/journal/v32/n5/full/nbt.2862.html
+#if 1
+            EM(observed, ancestors, tid_to_num, p, p_next, len);
+            EM(observed, ancestors, tid_to_num, p_next, p_next2, len);
+            double sum_squared_r = 0.0, sum_squared_v = 0.0;
+            for(size_t i = 0; i < p.size(); i++) {
+                p_r[i] = p_next[i] - p[i];
+                sum_squared_r += (p_r[i] * p_r[i]);
+                p_v[i] = p_next2[i] - p_next[i] - p_r[i];
+                sum_squared_v += (p_v[i] * p_v[i]);
+            }
+            if(sum_squared_v > 0.0) {
+                double gamma = -sqrt(sum_squared_r / sum_squared_v);
+                for(size_t i = 0; i < p.size(); i++) {
+                    p_next2[i] = max(0.0, p[i] - 2 * gamma * p_r[i] + gamma * gamma * p_v[i]);
+                }
+                EM(observed, ancestors, tid_to_num, p_next2, p_next, len);
+            }
+            
+#else
+            EM(observed, ancestors, tid_to_num, p, p_next, len);
+#endif
+            
+            // Stop when estimates stabilize (L1 difference) or after a
+            // fixed iteration cap.
+            diff = 0.0;
+            for(size_t i = 0; i < p.size(); i++) {
+                diff += (p[i] > p_next[i] ? p[i] - p_next[i] : p_next[i] - p[i]);
+            }
+            if(diff < 0.0000000001) break;
+            if(++num_iteration >= 10000) break;
+            p = p_next;
+        }
+        
+        cerr << "Number of iterations in EM algorithm: " << num_iteration << endl;
+        cerr << "Probability diff. (P - P_prev) in the last iteration: " << diff << endl;
+        
+        {
+            // Calculate abundance normalized by genome size
+            abundance_len.clear();
+            double sum = 0.0;
+            for(map<uint64_t, uint64_t>::iterator itr = tid_to_num.begin(); itr != tid_to_num.end(); itr++) {
+                uint64_t tid = itr->first;
+                uint64_t num = itr->second;
+                assert_lt(num, p.size());
+                abundance_len[tid] = p[num];
+                sum += (p[num] * len[num]);
+            }
+            
+            // Calculate abundance without genome size taken into account
+            abundance.clear();
+            for(map<uint64_t, uint64_t>::iterator itr = tid_to_num.begin(); itr != tid_to_num.end(); itr++) {
+                uint64_t tid = itr->first;
+                uint64_t num = itr->second;
+                assert_lt(num, p.size());
+                abundance[tid] = (p[num] * len[num]) / sum;
+            }
+        }
+    }
+
+	map<uint64_t, ReadCounts> species_counts;                        // read count per species
+	map<uint64_t, HyperLogLogPlusMinus<uint64_t> > species_kmers;    // unique k-mer count per species
+    
+    map<IDs, uint64_t>     observed;       // count per observed combination of equally-good taxon IDs
+    IDs                    cur_ids;        // in-progress ID set for the read currently being recorded
+    uint32_t               num_non_leaves; // set to 0 in reset()/init(); not otherwise updated in this struct
+    map<uint64_t, double>  abundance;      // abundance without genome size taken into consideration
+    map<uint64_t, double>  abundance_len;  // abundance normalized by genome size
+
+	MUTEX_T mutex_m; // guards concurrent merge() updates
+};
+
+
+/**
+ * Metrics summarizing the work done by the reporter and summarizing
+ * the number of reads that align, that fail to align, and that align
+ * non-uniquely.
+ */
+struct ReportingMetrics {
+
+	ReportingMetrics():mutex_m() {  // zero all counters at construction (reset -> init(0,0,0,0))
+	    reset();
+	}
+
+	void reset() {
+		init(0, 0, 0, 0);
+	}
+
+	void init(
+		uint64_t nread_,
+		uint64_t npaired_,
+		uint64_t nunpaired_,
+		uint64_t nconcord_uni_)
+	{
+		nread         = nread_;
+		
+		npaired       = npaired_;
+		nunpaired     = nunpaired_;
+		
+		nconcord_uni  = nconcord_uni_;
+    }
+	
+	/**
+	 * Merge (add) the counters in the given ReportingMetrics object
+	 * into this object.  This is the only safe way to update a
+	 * ReportingMetrics shared by multiple threads.
+	 */
+	void merge(const ReportingMetrics& met, bool getLock = false) {  // getLock: acquire mutex_m for the duration of the merge
+        ThreadSafe ts(&mutex_m, getLock);
+		nread         += met.nread;
+
+		npaired       += met.npaired;
+		nunpaired     += met.nunpaired;
+
+		nconcord_uni  += met.nconcord_uni;
+    }
+
+	uint64_t  nread;         // # reads
+	uint64_t  npaired;       // # pairs
+	uint64_t  nunpaired;     // # unpaired reads
+	
+	// Paired
+	
+	// Concordant
+	uint64_t  nconcord_uni;  // # pairs with unique concordant alns
+		
+	MUTEX_T mutex_m;         // guards concurrent merge() calls when getLock is true
+};
+
+// Type for expressing numbers of hits
+typedef int64_t THitInt;
+
+/**
+ * Parameters affecting reporting of alignments, specifically -k & -a,
+ * -m & -M.
+ */
+struct ReportingParams {
+
+	explicit ReportingParams(THitInt khits_, bool compressed_)
+	{
+		init(khits_, compressed_);
+	}
+
+	void init(THitInt khits_, bool compressed_)
+	{
+		khits = khits_;     // -k (or high if -a)
+        if(compressed_) {
+            ihits = max<THitInt>(khits, 5) * 4;   // internal cap: 4x max(khits, 5) for a compressed index
+        } else {
+            ihits = max<THitInt>(khits, 5) * 40;  // 40x otherwise -- TODO confirm rationale for the 4/40 constants
+        }
+	}
+	
+#ifndef NDEBUG
+	/**
+	 * Check that reporting parameters are internally consistent.
+	 */
+	bool repOk() const {
+		assert_geq(khits, 1);
+		return true;
+	}
+#endif
+	
+	inline THitInt mult() const {  // reporting multiplicity: the -k value
+		return khits;
+	}
+
+	// Number of assignments to report
+	THitInt khits;
+    
+    // Number of internal assignments (ceiling on assignments tracked internally)
+    THitInt ihits;
+};
+
+/**
+ * A state machine keeping track of the number and type of alignments found so
+ * far.  Its purpose is to inform the caller as to what stage the alignment is
+ * in and what categories of alignment are still of interest.  This information
+ * should allow the caller to short-circuit some alignment work.  Another
+ * purpose is to tell the AlnSinkWrap how many and what type of alignment to
+ * report.
+ *
+ * TODO: This class does not keep accurate information about what
+ * short-circuiting took place.  If a read is identical to a previous read,
+ * there should be a way to query this object to determine what work, if any,
+ * has to be re-done for the new read.
+ */
+class ReportingState {
+
+public:
+
+	enum {
+		NO_READ = 1,        // haven't got a read yet
+		CONCORDANT_PAIRS,   // looking for concordant pairs
+		DONE                // finished looking
+	};
+
+	// Flags for different ways we can finish out a category of potential
+	// alignments.
+	
+	enum {
+		EXIT_DID_NOT_EXIT = 1,        // haven't finished
+		EXIT_DID_NOT_ENTER,           // never tried search
+		EXIT_SHORT_CIRCUIT_k,         // -k exceeded
+		EXIT_NO_ALIGNMENTS,           // none found
+		EXIT_WITH_ALIGNMENTS          // some found
+	};
+	
+	ReportingState(const ReportingParams& p) : p_(p) { reset(); }  // p is held by reference and must outlive this object
+	
+	/**
+	 * Set all state to uninitialized defaults.
+	 */
+	void reset() {
+		state_ = ReportingState::NO_READ;
+		paired_ = false;
+		nconcord_ = 0;
+		doneConcord_ = false;
+		exitConcord_ = ReportingState::EXIT_DID_NOT_ENTER;
+		done_ = false;
+	}
+	
+	/**
+	 * Return true iff this ReportingState has been initialized with a call to
+	 * nextRead() since the last time reset() was called.
+	 */
+	bool inited() const { return state_ != ReportingState::NO_READ; }
+
+	/**
+	 * Initialize state machine with a new read.  The state we start in depends
+	 * on whether it's paired-end or unpaired.
+	 */
+	void nextRead(bool paired);
+
+	/**
+	 * Caller uses this member function to indicate that one additional
+	 * concordant alignment has been found.
+	 */
+	bool foundConcordant();
+
+	/**
+	 * Caller uses this member function to indicate that one additional
+	 * unpaired alignment has been found (mate1 selects which mate).
+	 */
+	bool foundUnpaired(bool mate1);
+	
+	/**
+	 * Called to indicate that the aligner has finished searching for
+	 * alignments.  This gives us a chance to finalize our state.
+	 *
+	 * TODO: Keep track of short-circuiting information.
+	 */
+	void finish();
+	
+	/**
+	 * Populate given counters with the number of various kinds of alignments
+	 * to report for this read.  Concordant alignments are preferable to (and
+	 * mutually exclusive with) discordant alignments, and paired-end
+	 * alignments are preferable to unpaired alignments.
+	 *
+	 * The caller also needs some additional information for the case where a
+	 * pair or unpaired read aligns repetitively.  If the read is paired-end
+	 * and the paired-end has repetitive concordant alignments, that should be
+	 * reported, and 'pairMax' is set to true to indicate this.  If the read is
+	 * paired-end, does not have any concordant alignments, but does have
+	 * repetitive alignments for one or both mates, then that should be
+	 * reported, and 'unpair1Max' and 'unpair2Max' are set accordingly.
+	 *
+	 * Note that it's possible in the case of a paired-end read for the read to
+	 * have repetitive concordant alignments, but for one mate to have a unique
+	 * unpaired alignment.
+	 */
+	void getReport(uint64_t& nconcordAln) const; // # concordant alignments to report
+
+	/**
+	 * Return an integer representing the alignment state we're in.
+	 */
+	inline int state() const { return state_; }
+	
+	/**
+	 * If false, there's no need to solve any more dynamic programming problems
+	 * for finding opposite mates.
+	 */
+	inline bool doneConcordant() const { return doneConcord_; }
+	
+	/**
+	 * Return true iff all alignment stages have been exited.
+	 */
+	inline bool done() const { return done_; }
+
+	inline uint64_t numConcordant() const { return nconcord_; }  // # concordant alignments found so far
+
+	inline int exitConcordant() const { return exitConcord_; }   // EXIT_* flag describing how the concordant stage ended
+
+	/**
+	 * Return ReportingParams object governing this ReportingState.
+	 */
+	const ReportingParams& params() const {
+		return p_;
+	}
+
+protected:
+	const ReportingParams& p_;  // reporting parameters
+	int state_;          // state we're currently in
+	bool paired_;        // true iff read we're currently handling is paired
+	uint64_t nconcord_;  // # concordants found so far
+	bool doneConcord_;   // true iff we're no longer interested in concordants
+	int exitConcord_;    // flag indicating how we exited concordant state
+	bool done_;          // done with all alignments
+};
+
+/**
+ * Global hit sink for hits from the MultiSeed aligner.  Encapsulates
+ * all aspects of the MultiSeed aligner hitsink that are global to all
+ * threads.  This includes aspects relating to:
+ *
+ * (a) synchronized access to the output stream
+ * (b) the policy to be enforced by the per-thread wrapper
+ *
+ * TODO: Implement splitting up of alignments into separate files
+ * according to genomic coordinate.
+ */
+template <typename index_t>
+class AlnSink {
+
+	typedef EList<std::string> StrList;
+
+public:
+
+	explicit AlnSink(
+                     OutputQueue& oq,
+                     const StrList& refnames,
+                     bool quiet) :
+    oq_(oq),
+    refnames_(refnames),
+    quiet_(quiet)
+	{
+	}
+
+	/**
+	 * Destroy the HitSink object.
+	 */
+	virtual ~AlnSink() { }
+
+	/**
+	 * Called when the AlnSink is wrapped by a new AlnSinkWrap.  This helps us
+	 * keep track of whether the main lock or any of the per-stream locks will
+	 * be contended by multiple threads.
+	 */
+	void addWrapper() { numWrappers_++; }  // NOTE(review): numWrappers_ is never initialized by the ctor above -- this increments an indeterminate value; confirm
+
+	/**
+	 * Append a single hit to the given output stream.  If
+	 * synchronization is required, append() assumes the caller has
+	 * already grabbed the appropriate lock.
+	 */
+	virtual void append(
+		BTString&             o,
+		size_t                threadId,
+		const Read           *rd1,
+		const Read           *rd2,
+		const TReadId         rdid,
+		AlnRes               *rs1,
+		AlnRes               *rs2,
+		const AlnSetSumm&     summ,
+		const PerReadMetrics& prm,
+		SpeciesMetrics& sm,
+		bool report2,
+		size_t n_results) = 0;
+
+	/**
+	 * Report a given batch of hits for the given read or read pair.
+	 * Should be called just once per read pair.  Assumes all the
+	 * alignments are paired, split between rs1 and rs2.
+	 *
+	 * The caller hasn't decided which alignments get reported as primary
+	 * or secondary; that's up to the routine.  Because the caller might
+	 * want to know this, we use the pri1 and pri2 out arguments to
+	 * convey this.
+	 */
+	virtual void reportHits(
+		BTString&             o,              // write to this buffer
+		size_t                threadId,       // which thread am I?
+		const Read           *rd1,            // mate #1
+		const Read           *rd2,            // mate #2
+		const TReadId         rdid,           // read ID
+		const EList<size_t>&  select1,        // random subset of rd1s
+		const EList<size_t>*  select2,        // random subset of rd2s
+		EList<AlnRes>        *rs1,            // alignments for mate #1
+		EList<AlnRes>        *rs2,            // alignments for mate #2
+		bool                  maxed,          // true iff -m/-M exceeded (not used by this base implementation)
+		const AlnSetSumm&     summ,           // summary
+		const PerReadMetrics& prm,            // per-read metrics
+		SpeciesMetrics& sm,             // species metrics
+		bool                  getLock = true) // true iff lock held by caller
+	{
+		assert(rd1 != NULL || rd2 != NULL);
+		assert(rs1 != NULL || rs2 != NULL);
+
+        for(size_t i = 0; i < select1.size(); i++) {
+            AlnRes* r1 = ((rs1 != NULL) ? &rs1->get(select1[i]) : NULL);
+            AlnRes* r2 = ((rs2 != NULL) ? &rs2->get(select1[i]) : NULL);  // paired alignments are parallel lists, so select1 indexes both rs1 and rs2
+            append(o, threadId, rd1, rd2, rdid, r1, r2, summ, prm, sm, true, select1.size());
+        }
+	}
+
+	/**
+	 * Report an unaligned read.  Typically we do nothing, but we might
+	 * want to print a placeholder when output is chained.
+	 */
+	virtual void reportUnaligned(
+		BTString&             o,              // write to this string
+		size_t                threadId,       // which thread am I?
+		const Read           *rd1,            // mate #1
+		const Read           *rd2,            // mate #2
+		const TReadId         rdid,           // read ID
+		const AlnSetSumm&     summ,           // summary
+		const PerReadMetrics& prm,            // per-read metrics
+		bool                  report2,        // report alns for both mates?
+		bool                  getLock = true) // true iff lock held by caller
+	{
+		// FIXME: reportUnaligned does nothing
+		//append(o, threadId, rd1, rd2, rdid, NULL, NULL, summ, prm, NULL,report2);
+	}
+
+	/**
+	 * Print summary of how many reads aligned, failed to align and aligned
+	 * repetitively.  Write it to stderr.  Optionally write Hadoop counter
+	 * updates.
+	 */
+	void printAlSumm(
+		const ReportingMetrics& met,
+		size_t repThresh, // threshold for uniqueness, or max if no thresh
+		bool discord,     // looked for discordant alignments
+		bool mixed,       // looked for unpaired alignments where paired failed?
+		bool hadoopOut);  // output Hadoop counters?
+
+	/**
+	 * Called when all alignments are complete.  It is assumed that no
+	 * synchronization is necessary.
+	 */
+	void finish(
+		size_t repThresh,
+		bool discord,
+		bool mixed,
+		bool hadoopOut)
+	{
+		// Close output streams
+		if(!quiet_) {
+			printAlSumm(
+				met_,
+				repThresh,
+				discord,
+				mixed,
+				hadoopOut);
+		}
+	}
+
+#ifndef NDEBUG
+	/**
+	 * Check that hit sink is internally consistent.
+	 */
+	bool repOk() const { return true; }
+#endif
+	
+	//
+	// Related to reporting seed hits
+	//
+
+	/**
+	 * Given a Read and associated, filled-in SeedResults objects,
+	 * print a record summarizing the seed hits.
+	 */
+	void reportSeedSummary(
+		BTString&          o,
+		const Read&        rd,
+		TReadId            rdid,
+		size_t             threadId,
+		const SeedResults<index_t>& rs,
+		bool               getLock = true);
+
+	/**
+	 * Given a Read, print an empty record (all 0s).
+	 */
+	void reportEmptySeedSummary(
+		BTString&          o,
+		const Read&        rd,
+		TReadId            rdid,
+		size_t             threadId,
+		bool               getLock = true);
+
+	/**
+	 * Append a batch of unresolved seed alignment results (i.e. seed
+	 * alignments where all we know is the reference sequence aligned
+	 * to and its SA range, not where it falls in the reference
+	 * sequence) to the given output stream in Bowtie's seed-alignment
+	 * verbose-mode format.
+	 */
+	virtual void appendSeedSummary(
+		BTString&     o,
+		const Read&   rd,
+		const TReadId rdid,
+		size_t        seedsTried,
+		size_t        nonzero,
+		size_t        ranges,
+		size_t        elts,
+		size_t        seedsTriedFw,
+		size_t        nonzeroFw,
+		size_t        rangesFw,
+		size_t        eltsFw,
+		size_t        seedsTriedRc,
+		size_t        nonzeroRc,
+		size_t        rangesRc,
+		size_t        eltsRc);
+
+	/**
+	 * Merge given metrics in with ours by summing all individual metrics.
+	 */
+	void mergeMetrics(const ReportingMetrics& met, bool getLock = true) {
+		met_.merge(met, getLock);
+	}
+
+	/**
+	 * Return mutable reference to the shared OutputQueue.
+	 */
+	OutputQueue& outq() {
+		return oq_;
+	}
+
+protected:
+    
+	OutputQueue&       oq_;           // output queue
+	int                numWrappers_;  // # threads owning a wrapper for this HitSink (never zero-initialized here)
+	const StrList&     refnames_;     // reference names
+	bool               quiet_;        // true -> don't print alignment stats at the end
+	ReportingMetrics   met_;          // global repository of reporting metrics
+};
+
+/**
+ * Per-thread hit sink "wrapper" for the MultiSeed aligner.  Encapsulates
+ * aspects of the MultiSeed aligner hit sink that are per-thread.  This
+ * includes aspects relating to:
+ *
+ * (a) Enforcement of the reporting policy
+ * (b) Tallying of results
+ * (c) Storing of results for the previous read in case this allows us to
+ *     short-circuit some work for the next read (i.e. if it's identical)
+ *
+ * PHASED ALIGNMENT ASSUMPTION
+ *
+ * We make some assumptions about how alignment proceeds when we try to
+ * short-circuit work for identical reads.  Specifically, we assume that for
+ * each read the aligner proceeds in a series of stages (or perhaps just one
+ * stage).  In each stage, the aligner either:
+ *
+ * (a)  Finds no alignments, or
+ * (b)  Finds some alignments and short circuits out of the stage with some
+ *      random reporting involved (e.g. in -k and/or -M modes), or
+ * (c)  Finds all of the alignments in the stage
+ *
+ * In the event of (a), the aligner proceeds to the next stage and keeps
+ * trying; we can skip the stage entirely for the next read if it's identical.
+ * In the event of (b), or (c), the aligner stops and does not proceed to
+ * further stages.  In the event of (b), if the next read is identical we
+ * would like to tell the aligner to start again at the beginning of the stage
+ * that was short-circuited.
+ *
+ * In any event, the rs1_/rs2_/rs1u_/rs2u_ fields contain the alignments found
+ * in the last alignment stage attempted.
+ *
+ * HANDLING REPORTING LIMITS
+ *
+ * The user can specify reporting limits, like -k (specifies number of
+ * alignments to report out of those found) and -M (specifies a ceiling s.t. if
+ * there are more alignments than the ceiling, read is called repetitive and
+ * best found is reported).  Enforcing these limits is straightforward for
+ * unpaired alignments: if a new alignment causes us to exceed the -M ceiling,
+ * we can stop looking.
+ *
+ * The case where both paired-end and unpaired alignments are possible is
+ * trickier.  Once we have a number of unpaired alignments that exceeds the
+ * ceiling, we can stop looking *for unpaired alignments* - but we can't
+ * necessarily stop looking for paired-end alignments, since there may yet be
+ * more to find.  However, if the input read is not a pair, then we can stop at
+ * this point.  If the input read is a pair and we have a number of paired
+ * alignments that exceeds the -M ceiling, we can stop looking.
+ *
+ * CONCORDANT & DISCORDANT, PAIRED & UNPAIRED
+ *
+ * A note on paired-end alignment: Clearly, if an input read is
+ * paired-end and we find either concordant or discordant paired-end
+ * alignments for the read, then we would like to tally and report
+ * those alignments as such (and not as groups of 2 unpaired
+ * alignments).  And if we fail to find any paired-end alignments, but
+ * we do find some unpaired alignments for one mate or the other, then
+ * we should clearly tally and report those alignments as unpaired
+ * alignments (if the user so desires).
+ *
+ * The situation is murkier when there are no paired-end alignments,
+ * but there are unpaired alignments for *both* mates.  In this case,
+ * we might want to pick out zero or more pairs of mates and classify
+ * those pairs as discordant paired-end alignments.  And we might want
+ * to classify the remaining alignments as unpaired.  But how do we
+ * pick which pairs if any to call discordant?
+ *
+ * Because the most obvious use for discordant pairs is for identifying
+ * large-scale variation, like rearrangements or large indels, we would
+ * usually like to be conservative about what we call a discordant
+ * alignment.  If there's a good chance that one or the other of the
+ * two mates has a good alignment to another place on the genome, this
+ * compromises the evidence for the large-scale variant.  For this
+ * reason, Bowtie 2's policy is: if there are no paired-end alignments
+ * and there is *exactly one alignment each* for both mates, then the
+ * two alignments are paired and treated as a discordant paired-end
+ * alignment.  Otherwise, all alignments are treated as unpaired
+ * alignments.
+ *
+ * When both paired and unpaired alignments are discovered by the
+ * aligner, only the paired alignments are reported by default.  This
+ * is sensible considering relative likelihoods: if a good paired-end
+ * alignment is found, it is much more likely that the placement of
+ * the two mates implied by that paired alignment is correct than any
+ * placement implied by an unpaired alignment.
+ *
+ * 
+ */
+template <typename index_t>
+class AlnSinkWrap {
+public:
+
+	AlnSinkWrap(
+		AlnSink<index_t>& g,       // AlnSink being wrapped
+		const ReportingParams& rp, // Parameters governing reporting
+		size_t threadId,           // Thread ID
+        bool secondary = false) :  // Secondary alignments
+		g_(g),
+		rp_(rp),
+        threadid_(threadId),
+    	secondary_(secondary),
+		init_(false),
+		maxed1_(false),       // read is pair and we maxed out mate 1 unp alns
+		maxed2_(false),       // read is pair and we maxed out mate 2 unp alns
+		maxedOverall_(false), // alignments found so far exceed -m/-M ceiling
+		bestPair_(std::numeric_limits<TAlScore>::min()),
+		best2Pair_(std::numeric_limits<TAlScore>::min()),
+		bestUnp1_(std::numeric_limits<TAlScore>::min()),
+		best2Unp1_(std::numeric_limits<TAlScore>::min()),
+		bestUnp2_(std::numeric_limits<TAlScore>::min()),
+		best2Unp2_(std::numeric_limits<TAlScore>::min()),
+        bestSplicedPair_(0),
+        best2SplicedPair_(0),
+        bestSplicedUnp1_(0),
+        best2SplicedUnp1_(0),
+        bestSplicedUnp2_(0),
+        best2SplicedUnp2_(0),
+		rd1_(NULL),    // mate 1
+		rd2_(NULL),    // mate 2
+		rdid_(std::numeric_limits<TReadId>::max()), // read id
+		rs_(),         // mate 1 alignments for paired-end alignments
+		select_(),     // for selecting random subsets for mate 1
+		st_(rp)        // reporting state - what's left to do?
+	{
+		assert(rp_.repOk());
+	}
+
+	AlnSink<index_t>& getSink() {
+		return(g_);
+	}
+
+	/**
+	 * Initialize the wrapper with a new read pair and return an
+	 * integer >= -1 indicating which stage the aligner should start
+	 * at.  If -1 is returned, the aligner can skip the read entirely.
+	 * Checks if the new read pair is identical to the
+	 * previous pair.  If it is, then we return the id of the first
+	 * stage to run.
+	 */
+	int nextRead(
+		// One of the other of rd1, rd2 will = NULL if read is unpaired
+		const Read* rd1,      // new mate #1
+		const Read* rd2,      // new mate #2
+		TReadId rdid,         // read ID for new pair
+		bool qualitiesMatter);// aln policy distinguishes b/t quals?
+
+	/**
+	 * Inform global, shared AlnSink object that we're finished with
+	 * this read.  The global AlnSink is responsible for updating
+	 * counters, creating the output record, and delivering the record
+	 * to the appropriate output stream.
+	 */
+	void finishRead(
+		const SeedResults<index_t> *sr1, // seed alignment results for mate 1
+		const SeedResults<index_t> *sr2, // seed alignment results for mate 2
+		bool               exhaust1,     // mate 1 exhausted?
+		bool               exhaust2,     // mate 2 exhausted?
+		bool               nfilt1,       // mate 1 N-filtered?
+		bool               nfilt2,       // mate 2 N-filtered?
+		bool               scfilt1,      // mate 1 score-filtered?
+		bool               scfilt2,      // mate 2 score-filtered?
+		bool               lenfilt1,     // mate 1 length-filtered?
+		bool               lenfilt2,     // mate 2 length-filtered?
+		bool               qcfilt1,      // mate 1 qc-filtered?
+		bool               qcfilt2,      // mate 2 qc-filtered?
+		bool               sortByScore,  // prioritize alignments by score
+		RandomSource&      rnd,          // pseudo-random generator
+		ReportingMetrics&  met,          // reporting metrics
+		SpeciesMetrics&    smet,         // species metrics
+		const PerReadMetrics& prm,       // per-read metrics
+		bool suppressSeedSummary = true,
+		bool suppressAlignments = false);
+	
+	/**
+	 * Called by the aligner when a new unpaired or paired alignment is
+	 * discovered in the given stage.  This function checks whether the
+	 * addition of this alignment causes the reporting policy to be
+	 * violated (by meeting or exceeding the limits set by -k, -m, -M),
+	 * in which case true is returned immediately and the aligner is
+	 * short circuited.  Otherwise, the alignment is tallied and false
+	 * is returned.
+	 */
+	bool report(
+		int stage,
+        const AlnRes* rs);
+
+#ifndef NDEBUG
+	/**
+	 * Check that hit sink wrapper is internally consistent.
+	 */
+	bool repOk() const {
+		if(init_) {
+			assert(rd1_ != NULL);
+			assert_neq(std::numeric_limits<TReadId>::max(), rdid_);
+		}
+		return true;
+	}
+#endif
+	
+	/**
+	 * Return true iff no alignments have been reported to this wrapper
+	 * since the last call to nextRead().
+	 */
+	bool empty() const {
+		return rs_.empty();
+	}
+	
+	/**
+	 * Return true iff we have already encountered a number of alignments that
+	 * exceeds the -m/-M ceiling.  TODO: how does this distinguish between
+	 * pairs and mates?
+	 */
+	bool maxed() const {
+		return maxedOverall_;
+	}
+	
+	/**
+	 * Return true if the current read is paired.
+	 */
+	bool readIsPair() const {
+		return rd1_ != NULL && rd2_ != NULL;
+	}
+	
+	/**
+	 * Return true iff nextRead() has been called since the last time
+	 * finishRead() was called.
+	 */
+	bool inited() const { return init_; }
+
+	/**
+	 * Return a const ref to the ReportingState object associated with the
+	 * AlnSinkWrap.
+	 */
+	const ReportingState& state() const { return st_; }
+    
+    const ReportingParams& reportingParams() { return rp_;}
+	
+    SpeciesMetrics& speciesMetrics() { return g_.speciesMetrics(); }  // NOTE(review): AlnSink<index_t> declares no speciesMetrics() in this header -- this member compiles only while unused; confirm
+	
+	/**
+	 * Return true iff at least two alignments have been reported so far for an
+	 * unpaired read or mate 1.
+	 */
+	bool hasSecondBestUnp1() const {
+		return best2Unp1_ != std::numeric_limits<TAlScore>::min();
+	}
+
+	/**
+	 * Return true iff at least two alignments have been reported so far for
+	 * mate 2.
+	 */
+	bool hasSecondBestUnp2() const {
+		return best2Unp2_ != std::numeric_limits<TAlScore>::min();
+	}
+
+	/**
+	 * Return true iff at least two paired-end alignments have been reported so
+	 * far.
+	 */
+	bool hasSecondBestPair() const {
+		return best2Pair_ != std::numeric_limits<TAlScore>::min();
+	}
+	
+	/**
+	 * Get best score observed so far for an unpaired read or mate 1.
+	 */
+	TAlScore bestUnp1() const {
+		return bestUnp1_;
+	}
+
+	/**
+	 * Get second-best score observed so far for an unpaired read or mate 1.
+	 */
+	TAlScore secondBestUnp1() const {
+		return best2Unp1_;
+	}
+
+	/**
+	 * Get best score observed so far for mate 2.
+	 */
+	TAlScore bestUnp2() const {
+		return bestUnp2_;
+	}
+
+	/**
+	 * Get second-best score observed so far for mate 2.
+	 */
+	TAlScore secondBestUnp2() const {
+		return best2Unp2_;
+	}
+
+	/**
+	 * Get best score observed so far for paired-end read.
+	 */
+	TAlScore bestPair() const {
+		return bestPair_;
+	}
+
+	/**
+	 * Get second-best score observed so far for paired-end read.
+	 */
+	TAlScore secondBestPair() const {
+		return best2Pair_;
+	}
+    
+    
+    /**
+     * Expose the internal list of paired alignment results.
+     */
+    void getPair(const EList<AlnRes>*& rs) const { rs = &rs_; }
+
+protected:
+
+	/**
+	 * Return true iff the read in rd1/rd2 matches the last read handled, which
+	 * should still be in rd1_/rd2_.
+	 */
+	bool sameRead(
+		const Read* rd1,
+		const Read* rd2,
+		bool qualitiesMatter);
+
+	/**
+	 * Given that rs is already populated with alignments, consider the
+	 * alignment policy and make random selections where necessary.  E.g. if we
+	 * found 10 alignments and the policy is -k 2 -m 20, select 2 alignments at
+	 * random.  We "select" an alignment by setting the parallel entry in the
+	 * 'select' list to true.
+	 */
+	size_t selectAlnsToReport(
+		const EList<AlnRes>& rs,     // alignments to select from
+		uint64_t             num,    // number of alignments to select
+		EList<size_t>&       select, // list to put results in
+		RandomSource&        rnd)
+		const;
+
+	/**
+	 * rs1 (possibly together with rs2 if reads are paired) are populated with
+	 * alignments.  Here we prioritize them according to alignment score, and
+	 * some randomness to break ties.  Priorities are returned in the 'select'
+	 * list.
+	 */
+	size_t selectByScore(
+		const EList<AlnRes>* rs,    // alignments to select from (mate 1)
+		uint64_t             num,    // number of alignments to select
+		EList<size_t>&       select, // prioritized list to put results in
+		RandomSource&        rnd)
+		const;
+
+	AlnSink<index_t>& g_;     // global alignment sink
+	ReportingParams   rp_;    // reporting parameters: khits, mhits etc
+	size_t            threadid_; // thread ID
+    bool              secondary_; // allow for secondary alignments
+	bool              init_;  // whether we're initialized w/ read pair
+	bool              maxed1_; // true iff # unpaired mate-1 alns reported so far exceeded -m/-M
+	bool              maxed2_; // true iff # unpaired mate-2 alns reported so far exceeded -m/-M
+	bool              maxedOverall_; // true iff # paired-end alns reported so far exceeded -m/-M
+	TAlScore          bestPair_;     // greatest score so far for paired-end
+	TAlScore          best2Pair_;    // second-greatest score so far for paired-end
+	TAlScore          bestUnp1_;     // greatest score so far for unpaired/mate1
+	TAlScore          best2Unp1_;    // second-greatest score so far for unpaired/mate1
+	TAlScore          bestUnp2_;     // greatest score so far for mate 2
+	TAlScore          best2Unp2_;    // second-greatest score so far for mate 2
+    index_t           bestSplicedPair_;
+    index_t           best2SplicedPair_;
+    index_t           bestSplicedUnp1_;
+    index_t           best2SplicedUnp1_;
+    index_t           bestSplicedUnp2_;
+    index_t           best2SplicedUnp2_;
+	const Read*       rd1_;   // mate #1
+	const Read*       rd2_;   // mate #2
+	TReadId           rdid_;  // read ID (potentially used for ordering)
+	EList<AlnRes>     rs_;   // paired alignments for mate #1
+	EList<size_t>     select_; // parallel to rs1_/rs2_ - which to report
+	ReportingState    st_;      // reporting state - what's left to do?
+	
+	EList<std::pair<TAlScore, size_t> > selectBuf_;  // scratch buffer for score-based selection
+	BTString obuf_;                                  // scratch output buffer
+};
+
+/**
+ * An AlnSink concrete subclass for printing SAM alignments.  The user might
+ * want to customize SAM output in various ways.  We encapsulate all these
+ * customizations, and some of the key printing routines, in the SamConfig
+ * class in sam.h/sam.cpp.
+ */
+template <typename index_t>
+class AlnSinkSam : public AlnSink<index_t> {
+
+	typedef EList<std::string> StrList;
+
+public:
+
+	AlnSinkSam(
+               Ebwt<index_t>*   ebwt,
+               OutputQueue&     oq,           // output queue
+               const StrList&   refnames,     // reference names
+               bool             quiet) :
+    AlnSink<index_t>(oq,
+                     refnames,
+                     quiet),
+    ebwt_(ebwt)
+    { }
+	
+	virtual ~AlnSinkSam() { }
+
+	/**
+	 * Append a single alignment result, which might be paired or
+	 * unpaired, to the given output stream in Bowtie's verbose-mode
+	 * format.  If the alignment is paired-end, print mate1's alignment
+	 * then mate2's alignment.
+	 */
+	virtual void append(
+		BTString&     o,           // write output to this string
+		size_t        threadId,    // which thread am I?
+		const Read*   rd1,         // mate #1
+		const Read*   rd2,         // mate #2
+		const TReadId rdid,        // read ID
+		AlnRes* rs1,               // alignments for mate #1
+		AlnRes* rs2,               // alignments for mate #2
+		const AlnSetSumm& summ,    // summary
+		const PerReadMetrics& prm, // per-read metrics
+		SpeciesMetrics& sm,  // species metrics
+		bool report2,              // report alns for both mates
+		size_t n_results)          // number of results for read
+	{
+		assert(rd1 != NULL || rd2 != NULL);
+        appendMate(*ebwt_, o, *rd1, rd2, rdid, rs1, rs2, summ, prm, sm, n_results);  // NOTE(review): *rd1 is dereferenced unconditionally, yet the assert permits rd1 == NULL when rd2 != NULL; confirm callers always pass rd1
+	}
+
+protected:
+
+	/**
+	 * Append a single per-mate alignment result to the given output
+	 * stream.  If the alignment is part of a pair, information about
+	 * the opposite mate and its alignment are given in rdo/rso.
+	 */
+	void appendMate(
+                    Ebwt<index_t>& ebwt,
+                    BTString&     o,
+                    const Read&   rd,
+                    const Read*   rdo,
+                    const TReadId rdid,
+                    AlnRes* rs,
+                    AlnRes* rso,
+                    const AlnSetSumm& summ,
+                    const PerReadMetrics& prm, // per-read metrics
+                    SpeciesMetrics& sm,   // species metrics
+                    size_t n_results);
+
+
+    Ebwt<index_t>*   ebwt_;    // index; dereferenced (unchecked) in append()
+	BTDnaString      dseq_;    // buffer for decoded read sequence
+	BTString         dqual_;   // buffer for decoded quality sequence
+};
+
+// Write num/denom to 'os' as a percentage with two decimal places and a
+// trailing '%'.  A zero denominator is rendered as 0.00% instead of
+// dividing by zero.  Returns 'os' to allow chaining.
+static inline std::ostream& printPct(
+							  std::ostream& os,
+							  uint64_t num,
+							  uint64_t denom)
+{
+	const double pct =
+		(denom == 0) ? 0.0 : (100.0 * (double)num / (double)denom);
+	os << fixed << setprecision(2) << pct << '%';
+	return os;
+}
+
+/**
+ * Print a friendly summary of:
+ *
+ *  1. How many reads were aligned and had one or more alignments
+ *     reported
+ *  2. How many reads exceeded the -m or -M ceiling and therefore had
+ *     their alignments suppressed or sampled
+ *  3. How many reads failed to align entirely
+ *
+ * Optionally print a series of Hadoop streaming-style counter updates
+ * with similar information.
+ *
+ * NOTE(review): the entire body is wrapped in #if 0, so this function is
+ * currently a no-op and all parameters are unused; the code below is kept
+ * from bowtie2 for reference.
+ */
+template <typename index_t>
+void AlnSink<index_t>::printAlSumm(
+								   const ReportingMetrics& met,
+								   size_t repThresh,   // threshold for uniqueness, or max if no thresh
+								   bool discord,       // looked for discordant alignments
+								   bool mixed,         // looked for unpaired alignments where paired failed?
+								   bool hadoopOut)     // output Hadoop counters?
+{
+	// NOTE: there's a filtering step at the very beginning, so everything
+	// being reported here is post filtering
+#if 0
+	bool canRep = repThresh != MAX_SIZE_T;
+	if(hadoopOut) {
+		cerr << "reporter:counter:Centrifuge,Reads processed," << met.nread << endl;
+	}
+	uint64_t totread = met.nread;
+	if(totread > 0) {
+		cerr << "" << met.nread << " reads (or pairs); of these:" << endl;
+	} else {
+		assert_eq(0, met.npaired);
+		assert_eq(0, met.nunpaired);
+		cerr << "" << totread << " reads (or pairs)" << endl;
+	}
+	if(totread > 0) {
+		// Concordants
+		cerr << "    " << met.nconcord << " (";
+		printPct(cerr, met.nconcord, met.npaired);
+		cerr << ") classified 0 times" << endl;
+		
+        // Print the number that aligned concordantly exactly once
+        cerr << "    " << met.nconcord_uni << " (";
+        printPct(cerr, met.nconcord_uni, met.npaired);
+        cerr << ") classified exactly 1 time" << endl;
+        
+#if 0
+        // Print the number that aligned concordantly more than once
+        cerr << "    " << met.nconcord_uni2 << " (";
+        printPct(cerr, met.nconcord_uni2, met.npaired);
+        cerr << ") classified >1 times" << endl;
+#endif
+	}
+    
+#if 0
+	uint64_t totunpair = met.nunpaired;
+	uint64_t tot_al_cand = totunpair + totpair*2;
+	uint64_t tot_al =
+	(met.nconcord_uni + met.nconcord_rep)*2 +
+	(met.ndiscord)*2 +
+	met.nunp_0_uni +
+	met.nunp_0_rep + 
+	met.nunp_uni +
+	met.nunp_rep;
+	assert_leq(tot_al, tot_al_cand);
+	printPct(cerr, tot_al, tot_al_cand);
+#endif
+	cerr << " overall classification rate" << endl;
+#endif
+}
+
+/**
+ * Return true iff the read pair in rd1/rd2 is identical to the previous
+ * pair handled, which should still be in rd1_/rd2_.  Mates are compared
+ * by sequence and, when qualitiesMatter is true, by quality string too.
+ */
+template <typename index_t>
+bool AlnSinkWrap<index_t>::sameRead(
+									// One of the other of rd1, rd2 will = NULL if read is unpaired
+									const Read* rd1,      // new mate #1
+									const Read* rd2,      // new mate #2
+									bool qualitiesMatter) // aln policy distinguishes b/t quals?
+{
+	// Sink never saw a read before -> trivially not the same.
+	if(rd1_ == NULL && rd2_ == NULL) {
+		return false;
+	}
+	// Paired-ness must agree between old and new read.
+	if((rd1_ == NULL) != (rd1 == NULL) ||
+	   (rd2_ == NULL) != (rd2 == NULL))
+	{
+		return false;
+	}
+	// Compare mate 1; both absent counts as a match.
+	if(rd1 != NULL) {
+		assert(rd1_ != NULL);
+		if(!Read::same(
+					   rd1->patFw,  // new seq
+					   rd1->qual,   // new quals
+					   rd1_->patFw, // old seq
+					   rd1_->qual,  // old quals
+					   qualitiesMatter))
+		{
+			return false;
+		}
+	}
+	// Compare mate 2; both absent counts as a match.
+	if(rd2 != NULL) {
+		if(!Read::same(
+					   rd2->patFw,  // new seq
+					   rd2->qual,   // new quals
+					   rd2_->patFw, // old seq
+					   rd2_->qual,  // old quals
+					   qualitiesMatter))
+		{
+			return false;
+		}
+	}
+	return true;
+}
+
+/**
+ * Initialize the wrapper with a new read pair and return an integer >= -1
+ * indicating which stage the aligner should start at.  If -1 is returned,
+ * the aligner can skip the read entirely.  This implementation always
+ * starts from stage 0.
+ */
+template <typename index_t>
+int AlnSinkWrap<index_t>::nextRead(
+								   // One of the other of rd1, rd2 will = NULL if read is unpaired
+								   const Read* rd1,      // new mate #1
+								   const Read* rd2,      // new mate #2
+								   TReadId rdid,         // read ID for new pair
+								   bool qualitiesMatter) // aln policy distinguishes b/t quals?
+{
+	assert(!init_);
+	assert(rd1 != NULL || rd2 != NULL);
+	init_ = true;
+	// Remember the new mates (NULL mates stay NULL) so the next call can
+	// compare against them.
+	rd1_ = rd1;
+	rd2_ = rd2;
+	rdid_ = rdid;
+	// Reset per-read bookkeeping; caller must now align the read.
+	maxed1_ = false;
+	maxed2_ = false;
+	maxedOverall_ = false;
+	// Best/second-best scores start at the smallest representable value.
+	bestPair_ = best2Pair_ = std::numeric_limits<THitInt>::min();
+	bestUnp1_ = best2Unp1_ = std::numeric_limits<THitInt>::min();
+	bestUnp2_ = best2Unp2_ = std::numeric_limits<THitInt>::min();
+	bestSplicedPair_ = best2SplicedPair_ = 0;
+	bestSplicedUnp1_ = best2SplicedUnp1_ = 0;
+	bestSplicedUnp2_ = best2SplicedUnp2_ = 0;
+	rs_.clear();                // clear out paired-end alignments
+	st_.nextRead(readIsPair()); // reset reporting state machine
+	assert(empty());
+	assert(!maxed());
+	// Start from the first stage
+	return 0;
+}
+
+/**
+ * Inform global, shared AlnSink object that we're finished with this read.
+ * The global AlnSink is responsible for updating counters, creating the output
+ * record, and delivering the record to the appropriate output stream.
+ *
+ * Flow here: optionally emit seed summaries for each mate, then ask the
+ * ReportingState how many concordant ("classification") results to
+ * report, select that many (by score or at random), and hand them to the
+ * global sink.  The large #if 0 region below is bowtie2's unpaired-mate
+ * accounting, disabled in centrifuge.
+ *
+ * Metrics updated:
+ *
+ * 	uint64_t nread;         // # reads processed
+ *  uint64_t npaired;       // # paired reads
+ *  uint64_t nunpaired;     // # unpaired reads
+ *  uint64_t nconcord_uni;  // # reads with a (non-maxed) reported result
+ */
+template <typename index_t>
+void AlnSinkWrap<index_t>::finishRead(
+									  const SeedResults<index_t> *sr1, // seed alignment results for mate 1
+									  const SeedResults<index_t> *sr2, // seed alignment results for mate 2
+									  bool               exhaust1,     // mate 1 exhausted?
+									  bool               exhaust2,     // mate 2 exhausted?
+									  bool               nfilt1,       // mate 1 N-filtered?
+									  bool               nfilt2,       // mate 2 N-filtered?
+									  bool               scfilt1,      // mate 1 score-filtered?
+									  bool               scfilt2,      // mate 2 score-filtered?
+									  bool               lenfilt1,     // mate 1 length-filtered?
+									  bool               lenfilt2,     // mate 2 length-filtered?
+									  bool               qcfilt1,      // mate 1 qc-filtered?
+									  bool               qcfilt2,      // mate 2 qc-filtered?
+									  bool               sortByScore,  // prioritize alignments by score
+									  RandomSource&      rnd,          // pseudo-random generator
+									  ReportingMetrics&  met,          // reporting metrics
+									  SpeciesMetrics&    smet,         // species metrics
+									  const PerReadMetrics& prm,       // per-read metrics
+									  bool suppressSeedSummary,        // = true
+									  bool suppressAlignments)         // = false
+{
+	obuf_.clear();
+	// Reserve this read's slot in the output queue so records are emitted
+	// in read order even with multiple worker threads.
+	OutputQueueMark qqm(g_.outq(), obuf_, rdid_, threadid_);
+	assert(init_);
+	if(!suppressSeedSummary) {
+		if(sr1 != NULL) {
+			assert(rd1_ != NULL);
+			// Mate exists and has non-empty SeedResults
+			g_.reportSeedSummary(obuf_, *rd1_, rdid_, threadid_, *sr1, true);
+		} else if(rd1_ != NULL) {
+			// Mate exists but has NULL SeedResults
+			g_.reportEmptySeedSummary(obuf_, *rd1_, rdid_, true);
+		}
+		if(sr2 != NULL) {
+			assert(rd2_ != NULL);
+			// Mate exists and has non-empty SeedResults
+			g_.reportSeedSummary(obuf_, *rd2_, rdid_, threadid_, *sr2, true);
+		} else if(rd2_ != NULL) {
+			// Mate exists but has NULL SeedResults
+			g_.reportEmptySeedSummary(obuf_, *rd2_, rdid_, true);
+		}
+	}
+
+	// TODO FB: Consider counting species here, and allow to disable counting
+
+	if(!suppressAlignments) {
+		// Ask the ReportingState what to report
+		st_.finish();
+		uint64_t nconcord = 0;
+		bool pairMax = false;
+		st_.getReport(nconcord);
+		assert_leq(nconcord, rs_.size());
+		assert_gt(rp_.khits, 0);
+		met.nread++;
+
+		if(readIsPair()) {
+			met.npaired++;
+		} else {
+			met.nunpaired++;
+		}
+		// Report concordant paired-end alignments if possible
+		if(nconcord > 0) {
+			AlnSetSumm concordSumm(rd1_, rd2_, &rs_);
+			// Possibly select a random subset
+			size_t off;
+			if(sortByScore) {
+				// Sort by score then pick from low to high
+				off = selectByScore(&rs_, nconcord, select_, rnd);
+			} else {
+				// Select subset randomly
+				off = selectAlnsToReport(rs_, nconcord, select_, rnd);
+			}
+			assert_lt(off, rs_.size());
+			_unused(off); // make production build happy
+			assert(!select_.empty());
+			g_.reportHits(
+						  obuf_,
+						  threadid_,
+						  rd1_,
+						  rd2_,
+						  rdid_,
+						  select_,
+						  NULL,
+						  &rs_,
+						  NULL,
+						  pairMax,
+						  concordSumm,
+                          prm,
+						  smet);
+			// pairMax is never set above, so only the 'else' branch runs;
+			// the commented counters are bowtie2 leftovers.
+			if(pairMax) {
+				// met.nconcord_rep++;
+			} else {
+				met.nconcord_uni++;
+				assert(!rs_.empty());
+				if(rs_.size() == 1) {
+					// met.nconcord_uni1++;
+				} else {
+					// met.nconcord_uni2++;
+				}
+			}
+			init_ = false;
+			// write read to file
+			//g_.outq().finishRead(obuf_, rdid_, threadid_);
+			return;
+		}
+		
+#if 0
+		// Update counters given that one mate didn't align
+		if(readIsPair()) {
+			met.nconcord_0++;
+		}
+		if(rd1_ != NULL) {
+			if(nunpair1 > 0) {
+				// Update counters
+				if(readIsPair()) {
+					if(unpair1Max) met.nunp_0_rep++;
+					else {
+						met.nunp_0_uni++;
+						assert(!rs1u_.empty());
+						if(rs1u_.size() == 1) {
+							met.nunp_0_uni1++;
+						} else {
+							met.nunp_0_uni2++;
+						}
+					}
+				} else {
+					if(unpair1Max) met.nunp_rep++;
+					else {
+						met.nunp_uni++;
+						assert(!rs1u_.empty());
+						if(rs1u_.size() == 1) {
+							met.nunp_uni1++;
+						} else {
+							met.nunp_uni2++;
+						}
+					}
+				}
+			} else if(unpair1Max) {
+				// Update counters
+				if(readIsPair())   met.nunp_0_rep++;
+				else               met.nunp_rep++;
+			} else {
+				// Update counters
+				if(readIsPair())   met.nunp_0_0++;
+				else               met.nunp_0++;
+			}
+		}
+		if(rd2_ != NULL) {
+			if(nunpair2 > 0) {
+				// Update counters
+				if(readIsPair()) {
+					if(unpair2Max) met.nunp_0_rep++;
+					else {
+						assert(!rs2u_.empty());
+						met.nunp_0_uni++;
+						if(rs2u_.size() == 1) {
+							met.nunp_0_uni1++;
+						} else {
+							met.nunp_0_uni2++;
+						}
+					}
+				} else {
+					if(unpair2Max) met.nunp_rep++;
+					else {
+						assert(!rs2u_.empty());
+						met.nunp_uni++;
+						if(rs2u_.size() == 1) {
+							met.nunp_uni1++;
+						} else {
+							met.nunp_uni2++;
+						}
+					}
+				}
+			} else if(unpair2Max) {
+				// Update counters
+				if(readIsPair())   met.nunp_0_rep++;
+				else               met.nunp_rep++;
+			} else {
+				// Update counters
+				if(readIsPair())   met.nunp_0_0++;
+				else               met.nunp_0++;
+			}
+		}
+        
+#endif
+	} // if(suppress alignments)
+	init_ = false;
+	return;
+}
+
+/**
+ * Called by the aligner when a new alignment is discovered in the given
+ * stage.  The alignment is recorded, the best/second-best scores are
+ * updated, and the reporting-state machine decides whether the aligner
+ * can short-circuit (return value true) or should keep searching.
+ */
+template <typename index_t>
+bool AlnSinkWrap<index_t>::report(int stage,
+								  const AlnRes* rs)
+{
+	assert(init_);
+	assert(rs != NULL);
+	// Tally it with the state machine and keep a copy of the result.
+	st_.foundConcordant();
+	rs_.push_back(*rs);
+	// Maintain the two highest alignment scores seen for this read.
+	const TAlScore score = rs->score();
+	if(score > bestPair_) {
+		best2Pair_ = bestPair_;
+		bestPair_ = score;
+	} else if(score > best2Pair_) {
+		best2Pair_ = score;
+	}
+	// true -> reporting policy satisfied, aligner may stop early.
+	return st_.done();
+}
+
+/**
+ * rs1 (possibly together with rs2 if reads are paired) are populated with
+ * alignments.  Here we prioritize them according to alignment score, and
+ * some randomness to break ties.  Priorities are returned in the 'select'
+ * list.  Returns the index of the representative (primary) alignment.
+ */
+template <typename index_t>
+size_t AlnSinkWrap<index_t>::selectByScore(
+										   const EList<AlnRes>* rs,    // alignments to select from (mate 1)
+										   uint64_t             num,    // number of alignments to select
+										   EList<size_t>&       select, // prioritized list to put results in
+										   RandomSource&        rnd)
+const
+{
+	assert(init_);
+	assert(repOk());
+	assert_gt(num, 0);
+	assert(rs != NULL);
+	size_t sz = rs->size(); // sz = # alignments found
+	// NOTE(review): this assert says num <= sz always holds; the clamp just
+	// below is the release-build safety net for the same condition.
+	assert_leq(num, sz);
+	if(sz < num) {
+		num = sz;
+	}
+	// num = # to select
+	if(sz < 1) {
+		return 0;
+	}
+	select.resize((size_t)num);
+	// Use 'selectBuf_' as a temporary list for sorting purposes.  The
+	// const_cast is needed because this method is const but uses the
+	// member buffer as scratch space.
+	EList<std::pair<TAlScore, size_t> >& buf =
+	const_cast<EList<std::pair<TAlScore, size_t> >& >(selectBuf_);
+	buf.resize(sz);
+	// Sort by score.  If reads are pairs, sort by sum of mate scores.
+	for(size_t i = 0; i < sz; i++) {
+        buf[i].first = (*rs)[i].score();
+		buf[i].second = i; // original offset
+	}
+	buf.sort(); buf.reverse(); // sort in descending order by score
+	
+	// Randomize streaks of alignments that are equal by score, so that
+	// ties are broken uniformly rather than by original order.
+	size_t streak = 0;
+	for(size_t i = 1; i < buf.size(); i++) {
+		if(buf[i].first == buf[i-1].first) {
+			if(streak == 0) { streak = 1; }
+			streak++;
+		} else {
+			if(streak > 1) {
+				assert_geq(i, streak);
+				buf.shufflePortion(i-streak, streak, rnd);
+			}
+			streak = 0;
+		}
+	}
+	// Shuffle a tie streak that runs to the end of the list.
+	if(streak > 1) {
+		buf.shufflePortion(buf.size() - streak, streak, rnd);
+	}
+	
+	for(size_t i = 0; i < num; i++) { select[i] = buf[i].second; }
+    
+	// Without secondary alignments, truncate 'select' after the first
+	// score drop so only top-scoring alignments are reported.
+    if(!secondary_) {
+        assert_geq(buf.size(), select.size());
+        for(size_t i = 0; i + 1 < select.size(); i++) {
+            if(buf[i].first != buf[i+1].first) {
+                select.resize(i+1);
+                break;
+            }
+        }
+    }
+    
+	// Returns index of the representative alignment, but in 'select' also
+	// returns the indexes of the next best selected alignments in order by
+	// score.
+	return selectBuf_[0].second;
+}
+
+/**
+ * Given that rs is already populated with alignments, consider the
+ * alignment policy and make random selections where necessary.  E.g. if we
+ * found 10 alignments and the policy is -k 2 -m 20, select 2 alignments at
+ * random.  We "select" an alignment by writing its index into the parallel
+ * 'select' list.
+ *
+ * Return the "representative" alignment.  This is simply the first one
+ * selected.  That will also be what SAM calls the "primary" alignment.
+ */
+template <typename index_t>
+size_t AlnSinkWrap<index_t>::selectAlnsToReport(
+												const EList<AlnRes>& rs,     // alignments to select from
+												uint64_t             num,    // number of alignments to select
+												EList<size_t>&       select, // list to put results in
+												RandomSource&        rnd)
+const
+{
+	assert(init_);
+	assert(repOk());
+	assert_gt(num, 0);
+	// Clamp the request to however many alignments actually exist.
+	const size_t sz = rs.size();
+	if(sz < num) {
+		num = sz;
+	}
+	if(sz < 1) {
+		return 0;
+	}
+	select.resize((size_t)num);
+	// Single candidate: no randomness needed.
+	if(sz == 1) {
+		assert_eq(1, num);
+		select[0] = 0;
+		return 0;
+	}
+	// Pick a random starting offset, then take 'num' consecutive entries,
+	// wrapping around to index 0 at the end of the list.
+	uint32_t off = rnd.nextU32() % (uint32_t)sz;
+	const uint32_t offOrig = off;
+	for(size_t i = 0; i < num; i++) {
+		select[i] = off;
+		off = (off + 1 == sz) ? 0 : (off + 1);
+	}
+	return offOrig;
+}
+
+// Field-emission helpers used by the summary printers below.  They assume
+// locals named 'field', 'firstfield', 'buf' and an output string 'o' are in
+// scope at the expansion site.
+#define NOT_SUPPRESSED !suppress_[field++]
+// Emit a tab separator before every field except the first.
+#define BEGIN_FIELD { \
+if(firstfield) firstfield = false; \
+else o.append('\t'); \
+}
+// NOTE(review): expansion is identical to BEGIN_FIELD; kept for
+// compatibility with code inherited from bowtie2.
+#define WRITE_TAB { \
+if(firstfield) firstfield = false; \
+else o.append('\t'); \
+}
+// Format the integer x into 'buf' (base 10) and append it to stream o.
+#define WRITE_NUM(o, x) { \
+itoa10(x, buf); \
+o.append(buf); \
+}
+
+/**
+ * Print a seed summary to the first output stream in the outs_ list.
+ * NOTE(review): the body is compiled out (#if 0), so this is currently a
+ * no-op and every parameter is unused; the call below shows the intended
+ * behavior inherited from bowtie2.
+ */
+template <typename index_t>
+void AlnSink<index_t>::reportSeedSummary(
+										 BTString&          o,
+										 const Read&        rd,
+										 TReadId            rdid,
+										 size_t             threadId,
+										 const SeedResults<index_t>& rs,
+										 bool               getLock)
+{
+#if 0
+	appendSeedSummary(
+					  o,                     // string to write to
+					  rd,                    // read
+					  rdid,                  // read id
+					  rs.numOffs()*2,        // # seeds tried
+					  rs.nonzeroOffsets(),   // # seeds with non-empty results
+					  rs.numRanges(),        // # ranges for all seed hits
+					  rs.numElts(),          // # elements for all seed hits
+					  rs.numOffs(),          // # seeds tried from fw read
+					  rs.nonzeroOffsetsFw(), // # seeds with non-empty results from fw read
+					  rs.numRangesFw(),      // # ranges for seed hits from fw read
+					  rs.numEltsFw(),        // # elements for seed hits from fw read
+					  rs.numOffs(),          // # seeds tried from rc read
+					  rs.nonzeroOffsetsRc(), // # seeds with non-empty results from fw read
+					  rs.numRangesRc(),      // # ranges for seed hits from fw read
+					  rs.numEltsRc());       // # elements for seed hits from fw read
+#endif
+}
+
+/**
+ * Print an empty seed summary to the first output stream in the outs_ list.
+ * Emits a summary row with every count set to 0 — used for reads that were
+ * filtered out before seed alignment.  threadId and getLock are accepted
+ * for interface parity but unused here.
+ */
+template <typename index_t>
+void AlnSink<index_t>::reportEmptySeedSummary(
+											  BTString&          o,
+											  const Read&        rd,
+											  TReadId            rdid,
+											  size_t             threadId,
+											  bool               getLock)
+{
+	appendSeedSummary(
+					  o,                     // string to append to
+					  rd,                    // read
+					  rdid,                  // read id
+					  0,                     // # seeds tried
+					  0,                     // # seeds with non-empty results
+					  0,                     // # ranges for all seed hits
+					  0,                     // # elements for all seed hits
+					  0,                     // # seeds tried from fw read
+					  0,                     // # seeds with non-empty results from fw read
+					  0,                     // # ranges for seed hits from fw read
+					  0,                     // # elements for seed hits from fw read
+					  0,                     // # seeds tried from rc read
+					  0,                     // # seeds with non-empty results from fw read
+					  0,                     // # ranges for seed hits from fw read
+					  0);                    // # elements for seed hits from fw read
+}
+
+/**
+ * Append the given string to 's'.  If chopws = true, append only up to and
+ * not including the first space or tab.  Useful for printing reference
+ * names.
+ */
+template<typename T>
+static inline void printUptoWs(
+							   BTString& s,
+							   const T& str,
+							   bool chopws)
+{
+	const size_t len = str.length();
+	size_t i = 0;
+	while(i < len) {
+		// Stop at the first whitespace character when chopping.
+		if(chopws && (str[i] == ' ' || str[i] == '\t')) {
+			break;
+		}
+		s.append(str[i]);
+		i++;
+	}
+}
+
+/**
+ * Append a batch of unresolved seed alignment summary results (i.e.
+ * seed alignments where all we know is the reference sequence aligned
+ * to and its SA range, not where it falls in the reference
+ * sequence) to the given output stream in Bowtie's seed-summary
+ * verbose-mode format.
+ *
+ * The seed summary format is:
+ *
+ *  - One line per read
+ *  - A typical line consists of a set of tab-delimited fields:
+ *
+ *    1. Read name
+ *    2. Total number of seeds extracted from the read
+ *    3. Total number of seeds that aligned to the reference at
+ *       least once (always <= field 2)
+ *    4. Total number of distinct BW ranges found in all seed hits
+ *       (always >= field 3)
+ *    5. Total number of distinct BW elements found in all seed
+ *       hits (always >= field 4)
+ *    6-9.:   Like 2-5. but just for seeds extracted from the
+ *            forward representation of the read
+ *    10-13.: Like 2-5. but just for seeds extracted from the
+ *            reverse-complement representation of the read
+ *
+ *    Note that fields 6 and 10 should add to field 2, 7 and 11
+ *    should add to 3, etc.
+ *
+ *  - Lines for reads that are filtered out for any reason (e.g. too
+ *    many Ns) have columns 2 through 13 set to 0.
+ */
+template <typename index_t>
+void AlnSink<index_t>::appendSeedSummary(
+										 BTString&     o,
+										 const Read&   rd,
+										 const TReadId rdid,
+										 size_t        seedsTried,
+										 size_t        nonzero,
+										 size_t        ranges,
+										 size_t        elts,
+										 size_t        seedsTriedFw,
+										 size_t        nonzeroFw,
+										 size_t        rangesFw,
+										 size_t        eltsFw,
+										 size_t        seedsTriedRc,
+										 size_t        nonzeroRc,
+										 size_t        rangesRc,
+										 size_t        eltsRc)
+{
+	// 'buf' and 'firstfield' are consumed by the BEGIN_FIELD / WRITE_NUM
+	// macros defined above.
+	char buf[1024];
+	bool firstfield = true;
+	//
+	// Read name
+	//
+	BEGIN_FIELD;
+	printUptoWs(o, rd.name, true);
+	
+	//
+	// Total number of seeds tried
+	//
+	BEGIN_FIELD;
+	WRITE_NUM(o, seedsTried);
+	
+	//
+	// Total number of seeds tried where at least one range was found.
+	//
+	BEGIN_FIELD;
+	WRITE_NUM(o, nonzero);
+	
+	//
+	// Total number of ranges found
+	//
+	BEGIN_FIELD;
+	WRITE_NUM(o, ranges);
+	
+	//
+	// Total number of elements found
+	//
+	BEGIN_FIELD;
+	WRITE_NUM(o, elts);
+	
+	//
+	// The same four numbers, but only for seeds extracted from the
+	// forward read representation.
+	//
+	BEGIN_FIELD;
+	WRITE_NUM(o, seedsTriedFw);
+	
+	BEGIN_FIELD;
+	WRITE_NUM(o, nonzeroFw);
+	
+	BEGIN_FIELD;
+	WRITE_NUM(o, rangesFw);
+	
+	BEGIN_FIELD;
+	WRITE_NUM(o, eltsFw);
+	
+	//
+	// The same four numbers, but only for seeds extracted from the
+	// reverse complement read representation.
+	//
+	BEGIN_FIELD;
+	WRITE_NUM(o, seedsTriedRc);
+	
+	BEGIN_FIELD;
+	WRITE_NUM(o, nonzeroRc);
+	
+	BEGIN_FIELD;
+	WRITE_NUM(o, rangesRc);
+	
+	BEGIN_FIELD;
+	WRITE_NUM(o, eltsRc);
+	
+	o.append('\n');
+}
+
+/**
+ * Append a single classification record for one read to 'o'.  The emitted
+ * tab-separated fields (in order) are: read name (trimmed of any /1, /2 or
+ * /3 suffix and truncated at the first whitespace), sequence ID or tax
+ * rank, taxonomy ID, score, second-best score, summed hit length, total
+ * query length, and the number of results for this read.  Also folds this
+ * result into the per-species metrics ('sm').
+ */
+template <typename index_t>
+void AlnSinkSam<index_t>::appendMate(
+                                     Ebwt<index_t>& ebwt,
+									 BTString&      o,           // append to this string
+									 const Read&    rd,
+									 const Read*    rdo,
+									 const TReadId  rdid,
+									 AlnRes* rs,
+									 AlnRes* rso,
+									 const AlnSetSumm& summ,
+									 const PerReadMetrics& prm,
+									 SpeciesMetrics& sm,
+									 size_t n_results)
+{
+	// Nothing to report for this mate.
+	if(rs == NULL) {
+		return;
+	}
+	// Scratch buffer for itoa10 conversions below.
+	char buf[1024];
+
+    // QNAME: drop a trailing /1, /2 or /3 mate suffix, then copy up to the
+    // first whitespace character.
+    size_t namelen = rd.name.length();
+    if(namelen >= 2 &&
+       rd.name[namelen-2] == '/' &&
+       (rd.name[namelen-1] == '1' || rd.name[namelen-1] == '2' || rd.name[namelen-1] == '3'))
+    {
+        namelen -= 2;
+    }
+    for(size_t i = 0; i < namelen; i++) {
+        if(isspace(rd.name[i])) {
+            break;
+        }
+        o.append(rd.name[i]);
+    }
+    o.append('\t');
+
+	// Weight this result by 1/n_results so multi-match reads contribute
+	// fractionally to each species' abundance.
+	sm.addSpeciesCounts(
+                        rs->taxID(),
+                        rs->score(),
+                        rs->max_score(),
+                        rs->summedHitLen(),
+                        1.0 / n_results,
+                        (uint32_t)n_results);
+
+	// only count k-mers if the read is unique
+    if (n_results == 1) {
+		for (size_t i = 0; i< rs->nReadPositions(); ++i) {
+			sm.addAllKmers(rs->taxID(),
+                           rs->isFw()? rd.patFw : rd.patRc,
+                           rs->readPositions(i).first,
+                           rs->readPositions(i).second);
+		}
+	}
+
+//    (sc[rs->speciesID_])++;
+    
+    // Look up the result's node in the taxonomy tree; if it's absent we
+    // conservatively treat it as a leaf.
+    const std::map<uint64_t, TaxonomyNode>& tree = ebwt.tree();
+    bool leaf = true;
+    std::map<uint64_t, TaxonomyNode>::const_iterator itr = tree.find(rs->taxID());
+    if(itr != tree.end()) {
+        const TaxonomyNode& node = itr->second;
+        leaf = node.leaf;
+    }
+
+    // unique ID: sequence ID for a leaf node, otherwise the rank name
+    if(leaf) {
+        o.append(rs->uid().c_str());
+    } else {
+        o.append(get_tax_rank_string(rs->taxRank()));
+    }
+    o.append('\t');
+    
+    // tax ID: encoded as two 32-bit halves; the high half, when non-zero,
+    // is printed after a '.' separator.
+    uint64_t tid = rs->taxID();
+    uint64_t tid1 = tid & 0xffffffff;
+    uint64_t tid2 = tid >> 32;
+    itoa10<int64_t>(tid1, buf);
+    o.append(buf);
+    if(tid2 > 0) {
+        o.append(".");
+        itoa10<int64_t>(tid2, buf);
+        o.append(buf);
+    }
+    o.append('\t');
+    
+    // score
+    itoa10<int64_t>(rs->score(), buf);
+    o.append(buf);
+    o.append('\t');
+    
+    // second best score (0 when no valid second-best exists)
+    if(summ.secbest().valid()) {
+        itoa10<int64_t>(summ.secbest().score(), buf);
+    } else {
+        itoa10<int64_t>(0, buf);
+    }
+    o.append(buf);
+    o.append('\t');
+    
+    // hit length
+    itoa10<int64_t>(rs->summedHitLen(), buf);
+    o.append(buf);
+    o.append('\t');
+    
+    // query length: both mates combined when the read is paired
+    size_t rdlen = rd.patFw.length() + (rdo != NULL ? rdo->patFw.length() : 0);
+    itoa10<size_t>(rdlen, buf);
+    o.append(buf);
+    o.append('\t');
+
+    // number of results
+    itoa10<int64_t>(n_results, buf);
+    o.append(buf);
+    o.append('\n');
+
+}
+
+// #include <iomanip>
+
+/**
+ * Initialize state machine with a new read.  Resets the machine to the
+ * CONCORDANT_PAIRS stage with zero alignments tallied, and records whether
+ * the read is paired-end.
+ */
+void ReportingState::nextRead(bool paired) {
+    paired_ = paired;
+    // Restart in the first (and, for this classifier, only) stage.
+    state_ = CONCORDANT_PAIRS;
+    nconcord_ = 0;
+    doneConcord_ = false;
+    done_ = false;
+    exitConcord_ = ReportingState::EXIT_DID_NOT_EXIT;
+}
+
+/**
+ * Caller uses this member function to indicate that one additional
+ * concordant alignment has been found.  Returns done() so the caller can
+ * short-circuit when the reporting policy is satisfied.
+ */
+bool ReportingState::foundConcordant() {
+    assert_geq(state_, ReportingState::CONCORDANT_PAIRS);
+    assert(!doneConcord_);
+    nconcord_++;
+    // NOTE(review): the assert above means this branch is unreachable in
+    // debug builds; it can only execute in release builds where asserts
+    // compile away.
+    if(doneConcord_) {
+        // If we're finished looking for concordant alignments, do we have to
+        // continue on to search for unpaired alignments?  Only if our exit
+        // from the concordant stage is EXIT_SHORT_CIRCUIT_M.  If it's
+        // EXIT_SHORT_CIRCUIT_k or EXIT_WITH_ALIGNMENTS, we can skip unpaired.
+        assert_neq(ReportingState::EXIT_NO_ALIGNMENTS, exitConcord_);
+    }
+    return done();
+}
+
+/**
+ * Caller uses this member function to indicate that one additional unpaired
+ * mate alignment has been found for the specified mate.  In centrifuge
+ * unpaired alignments are not tallied separately, so 'mate1' is unused and
+ * this simply reports whether the policy is already satisfied.
+ */
+bool ReportingState::foundUnpaired(bool mate1) {
+    return done();
+}
+
+/**
+ * Called to indicate that the aligner has finished searching for
+ * alignments.  This gives us a chance to finalize our state.
+ *
+ * TODO: Keep track of short-circuiting information.
+ */
+void ReportingState::finish() {
+    if(!doneConcord_) {
+        // The concordant stage never short-circuited; classify its exit
+        // based on whether any alignments were found.
+        doneConcord_ = true;
+        if(nconcord_ > 0) {
+            exitConcord_ = ReportingState::EXIT_WITH_ALIGNMENTS;
+        } else {
+            exitConcord_ = ReportingState::EXIT_NO_ALIGNMENTS;
+        }
+    }
+    assert_gt(exitConcord_, EXIT_DID_NOT_EXIT);
+    done_ = true;
+    assert(done());
+}
+
+
+/**
+ * Populate given counters with the number of various kinds of alignments
+ * to report for this read.  Concordant alignments are preferable to (and
+ * mutually exclusive with) discordant alignments, and paired-end
+ * alignments are preferable to unpaired alignments.
+ *
+ * The caller also needs some additional information for the case where a
+ * pair or unpaired read aligns repetitively.  If the read is paired-end
+ * and the paired-end has repetitive concordant alignments, that should be
+ * reported, and 'pairMax' is set to true to indicate this.  If the read is
+ * paired-end, does not have any conordant alignments, but does have
+ * repetitive alignments for one or both mates, then that should be
+ * reported, and 'unpair1Max' and 'unpair2Max' are set accordingly.
+ *
+ * Note that it's possible in the case of a paired-end read for the read to
+ * have repetitive concordant alignments, but for one mate to have a unique
+ * unpaired alignment.
+ */
+void ReportingState::getReport(uint64_t& nconcordAln) const // # concordant alignments to report
+{
+    nconcordAln = 0;
+    assert_gt(p_.khits, 0);
+    // Do we have 1 or more concordant alignments to report?
+    if(exitConcord_ == ReportingState::EXIT_SHORT_CIRCUIT_k) {
+        // k at random
+        assert_geq(nconcord_, (uint64_t)p_.khits);
+        nconcordAln = p_.khits;
+        return;
+    } else if(exitConcord_ == ReportingState::EXIT_WITH_ALIGNMENTS) {
+        assert_gt(nconcord_, 0);
+        // <= k at random
+        nconcordAln = min<uint64_t>(nconcord_, p_.khits);
+        return;
+    }
+}
+
+#if 0
+/**
+ * Given the number of alignments in a category, check whether we
+ * short-circuited out of the category.  Set the done and exit arguments to
+ * indicate whether and how we short-circuited.
+ *
+ * NOTE(review): disabled (#if 0) — retained from bowtie2 for reference;
+ * centrifuge's reporting state never calls it.
+ */
+inline void ReportingState::areDone(
+                                    uint64_t cnt,    // # alignments in category
+                                    bool& done,      // out: whether we short-circuited out of category
+                                    int& exit) const // out: if done, how we short-circuited (-k? -m? etc)
+{
+    assert(!done);
+    // Have we exceeded the -k limit?
+    assert_gt(p_.khits, 0);
+    assert_gt(p_.mhits, 0);
+    if(cnt >= (uint64_t)p_.khits && !p_.mhitsSet()) {
+        done = true;
+        exit = ReportingState::EXIT_SHORT_CIRCUIT_k;
+    }
+    // Have we exceeded the -m or -M limit?
+    else if(p_.mhitsSet() && cnt > (uint64_t)p_.mhits) {
+        done = true;
+        assert(p_.msample);
+        exit = ReportingState::EXIT_SHORT_CIRCUIT_M;
+    }
+}
+#endif
+
+#endif /*ndef ALN_SINK_H_*/
diff --git a/alphabet.cpp b/alphabet.cpp
new file mode 100644
index 0000000..7613557
--- /dev/null
+++ b/alphabet.cpp
@@ -0,0 +1,440 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdint.h>
+#include <cassert>
+#include <string>
+#include "alphabet.h"
+
+using namespace std;
+
+/**
+ * Mapping from ASCII characters to DNA categories:
+ *
+ * 0 = invalid - error
+ * 1 = DNA
+ * 2 = IUPAC (ambiguous DNA)
+ * 3 = not an error, but unmatchable; alignments containing this
+ *     character are invalid
+ *
+ * All 256 byte values are covered; '-' (gap) is the only category-3
+ * character, and anything not annotated below is category 0 (invalid).
+ */
+uint8_t asc2dnacat[] = {
+	/*   0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/*  16 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/*  32 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0,
+	       /*                                        - */
+	/*  48 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/*  64 */ 0, 1, 2, 1, 2, 0, 0, 1, 2, 0, 0, 2, 0, 2, 2, 0,
+	       /*    A  B  C  D        G  H        K     M  N */
+	/*  80 */ 0, 0, 2, 2, 1, 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0,
+	       /*       R  S  T     V  W  X  Y */
+	/*  96 */ 0, 1, 2, 1, 2, 0, 0, 1, 2, 0, 0, 2, 0, 2, 2, 0,
+	       /*    a  b  c  d        g  h        k     m  n */
+	/* 112 */ 0, 0, 2, 2, 1, 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0,
+	       /*       r  s  t     v  w  x  y */
+	/* 128 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 144 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 160 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 176 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 192 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 208 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 224 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 240 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+// Population count (number of set bits) for each 5-bit value 0..31; used
+// to count how many alternative bases an ambiguity mask encodes.
+int mask2popcnt[] = {
+	0, 1, 1, 2, 1, 2, 2, 3,
+	1, 2, 2, 3, 2, 3, 3, 4,
+	1, 2, 2, 3, 2, 3, 3, 4,
+	2, 3, 3, 4, 3, 4, 4, 5
+};
+
+/**
+ * Mapping from 4-bit nucleotide masks to ASCII IUPAC characters.
+ * Bit 0 = A, bit 1 = C, bit 2 = G, bit 3 = T; mask 0 is invalid ('?').
+ * Entry 16 is a second 'N' used for the "exclusive N" case.
+ */
+char mask2dna[] = {
+	'?', // 0
+	'A', // 1
+	'C', // 2
+	'M', // 3
+	'G', // 4
+	'R', // 5
+	'S', // 6
+	'V', // 7
+	'T', // 8
+	'W', // 9
+	'Y', // 10
+	'H', // 11
+	'K', // 12
+	'D', // 13
+	'B', // 14
+	'N', // 15 (inclusive N)
+	'N'  // 16 (exclusive N)
+};
+
+/**
+ * Mapping from ASCII characters for (possibly ambiguous) nucleotides into
+ * 4-bit masks; the inverse of mask2dna.  Both cases are handled; anything
+ * unrecognized maps to 0.
+ */
+uint8_t asc2dnamask[] = {
+	/*   0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/*  16 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/*  32 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/*  48 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/*  64 */ 0, 1,14, 2,13, 0, 0, 4,11, 0, 0,12, 0, 3,15, 0,
+	       /*    A  B  C  D        G  H        K     M  N */
+	/*  80 */ 0, 0, 5, 6, 8, 0, 7, 9, 0,10, 0, 0, 0, 0, 0, 0,
+	       /*       R  S  T     V  W     Y */
+	/*  96 */ 0, 1,14, 2,13, 0, 0, 4,11, 0, 0,12, 0, 3,15, 0,
+	       /*    a  b  c  d        g  h        k     m  n */
+	/* 112 */ 0, 0, 5, 6, 8, 0, 7, 9, 0,10, 0, 0, 0, 0, 0, 0,
+	       /*       r  s  t     v  w     y */
+	/* 128 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 144 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 160 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 176 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 192 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 208 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 224 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 240 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+/**
+ * Convert a pair of DNA masks to a color mask: dnamasks2colormask[m1][m2]
+ * is the set of colors consistent with some base drawn from mask m1
+ * followed by some base drawn from mask m2 (bit i set = color i possible).
+ */
+uint8_t dnamasks2colormask[16][16] = {
+	         /* 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 */
+	/*  0 */ {  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0 },
+	/*  1 */ {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
+	/*  2 */ {  0,  2,  1,  3,  8, 10,  9, 11,  4,  6,  5,  7, 12, 14, 13, 15 },
+	/*  3 */ {  0,  3,  3,  3, 12, 15, 15, 15, 12, 15, 15, 15, 12, 15, 15, 15 },
+	/*  4 */ {  0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15 },
+	/*  5 */ {  0,  5, 10, 15,  5,  5, 15, 15, 10, 15, 10, 15, 15, 15, 15, 15 },
+	/*  6 */ {  0,  6,  9, 15,  9, 15,  9, 15,  6,  6, 15, 15, 15, 15, 15, 15 },
+	/*  7 */ {  0,  7, 11, 15, 13, 15, 15, 15, 14, 15, 15, 15, 15, 15, 15, 15 },
+	/*  8 */ {  0,  8,  4, 12,  2, 10,  6, 14,  1,  9,  5, 13,  3, 11,  7, 15 },
+	/*  9 */ {  0,  9,  6, 15,  6, 15,  6, 15,  9,  9, 15, 15, 15, 15, 15, 15 },
+	/* 10 */ {  0, 10,  5, 15, 10, 10, 15, 15,  5, 15,  5, 15, 15, 15, 15, 15 },
+	/* 11 */ {  0, 11,  7, 15, 14, 15, 15, 15, 13, 15, 15, 15, 15, 15, 15, 15 },
+	/* 12 */ {  0, 12, 12, 12,  3, 15, 15, 15,  3, 15, 15, 15,  3, 15, 15, 15 },
+	/* 13 */ {  0, 13, 14, 15,  7, 15, 15, 15, 11, 15, 15, 15, 15, 15, 15, 15 },
+	/* 14 */ {  0, 14, 13, 15, 11, 15, 15, 15,  7, 15, 15, 15, 15, 15, 15, 15 },
+	/* 15 */ {  0, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15 }
+};
+
+/**
+ * Mapping from ASCII nucleotide characters (including IUPAC ambiguity
+ * codes) to their ASCII complements; '-' maps to itself, anything
+ * unrecognized maps to 0.  Lowercase input complements to uppercase.
+ */
+char asc2dnacomp[] = {
+	/*   0 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	/*  16 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	/*  32 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,'-',  0,  0,
+	/*  48 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	/*  64 */ 0,'T','V','G','H',  0,  0,'C','D',  0,  0,'M',  0,'K','N',  0,
+	       /*    A   B   C   D           G   H           K       M   N */
+	/*  80 */ 0,  0,'Y','S','A',  0,'B','W',  0,'R',  0,  0,  0,  0,  0,  0,
+	       /*        R   S   T       V   W       Y */
+	/*  96 */ 0,'T','V','G','H',  0,  0,'C','D',  0,  0,'M',  0,'K','N',  0,
+	        /*   a   b   c   d           g   h           k       m   n */
+	/* 112 */ 0,  0,'Y','S','A',  0,'B','W',  0,'R',  0,  0,  0,  0,  0,  0,
+	       /*        r   s   t       v   w       y */
+	/* 128 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	/* 144 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	/* 160 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	/* 176 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	/* 192 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	/* 208 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	/* 224 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	/* 240 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+};
+
+/**
+ * Mapping from ASCII color characters to ASCII DNA characters:
+ * '0'->'A', '1'->'C', '2'->'G', '3'->'T', '4' and '.' -> 'N',
+ * '-' -> '-'; anything else maps to 0.
+ */
+char col2dna[] = {
+	/*   0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	/*  16 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	/*  32 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,'-','N',  0,
+	       /*                                                     -   . */
+	/*  48 */'A','C','G','T','N',  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	       /* 0   1   2   3   4  */
+	/*  64 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	/*  80 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	/*  96 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	/* 112 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	/* 128 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	/* 144 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	/* 160 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	/* 176 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	/* 192 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	/* 208 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	/* 224 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	/* 240 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+};
+
+/**
+ * Mapping from ASCII DNA characters to ASCII color characters:
+ * A/a->'0', C/c->'1', G/g->'2', T/t->'3', N/n->'.', '-'->'-';
+ * anything else maps to 0.
+ */
+char dna2col[] = {
+	/*   0 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	/*  16 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	/*  32 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,'-',  0,  0,
+	/*  48 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	/*  64 */ 0,'0',  0,'1',  0,  0,  0,'2',  0,  0,  0,  0,  0,  0,'.',  0,
+	       /*    A       C               G                           N */
+	/*  80 */ 0,  0,  0,  0,'3',  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	       /*                T */
+	/*  96 */ 0,'0',  0,'1',  0,  0,  0,'2',  0,  0,  0,  0,  0,  0,'.',  0,
+	       /*    a       c               g                           n */
+	/* 112 */ 0,  0,  0,  0,'3',  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	       /*                t */
+	/* 128 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	/* 144 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	/* 160 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	/* 176 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	/* 192 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	/* 208 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	/* 224 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	/* 240 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+};
+
+/**
+ * Mapping from ASCII DNA characters (including IUPAC ambiguity codes) to
+ * strings listing the colors each is consistent with, '|'-separated
+ * ("?" = invalid, "." = any color / N).  Lowercase rows mirror the
+ * uppercase rows; the 'a' and 'q' entries were fixed to match 'A' and 'Q'.
+ */
+const char* dna2colstr[] = {
+	/*   0 */ "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",
+	/*  16 */ "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",
+	/*  32 */ "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "-",  "?",  "?",
+	/*  48 */ "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",
+	/*  64 */ "?",  "0","1|2|3","1","0|2|3","?",  "?",  "2","0|1|3","?",  "?", "2|3", "?", "0|1", ".",  "?",
+	/*               A     B     C     D                 G     H                 K           M     N */
+	/*  80 */ "?",  "?", "0|2","1|2", "3",  "?","0|1|2","0|3","?", "1|3", "?",  "?",  "?",  "?",  "?",  "?",
+	/*                     R     S     T           V     W           Y */
+	/*  96 */ "?",  "0","1|2|3","1","0|2|3","?",  "?",  "2","0|1|3","?",  "?", "2|3", "?", "0|1", ".",  "?",
+	/*               a     b     c     d                 g     h                 k           m     n */
+	/* 112 */ "?",  "?", "0|2","1|2", "3",  "?","0|1|2","0|3","?", "1|3", "?",  "?",  "?",  "?",  "?",  "?",
+	/*                     r     s     t           v     w           y */
+	/* 128 */ "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",
+	/* 144 */ "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",
+	/* 160 */ "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",
+	/* 176 */ "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",
+	/* 192 */ "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",
+	/* 208 */ "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",
+	/* 224 */ "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",
+	/* 240 */ "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?",  "?"
+};
+
+/**
+ * Mapping from ASCII characters to color categories:
+ *
+ * 0 = invalid - error
+ * 1 = valid color
+ * 2 = IUPAC (ambiguous DNA) - there is no such thing for colors to my
+ *     knowledge
+ * 3 = not an error, but unmatchable; alignments containing this
+ *     character are invalid
+ *
+ * '0'-'3' are category 1; '4', '.' and '-' are category 3 (unmatchable).
+ */
+uint8_t asc2colcat[] = {
+	/*   0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/*  16 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/*  32 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0,
+	       /*                                        -  . */
+	/*  48 */ 1, 1, 1, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	       /* 0  1  2  3  4  */
+	/*  64 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/*  80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/*  96 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 112 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 128 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 144 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 160 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 176 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 192 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 208 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 224 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 240 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+/**
+ * Set the category for all IUPAC codes.  By default they're in
+ * category 2 (IUPAC), but sometimes we'd like to put them in category
+ * 3 (unmatchable), for example.
+ *
+ * Mutates the global asc2dnacat table; both cases of each code are set.
+ * 'X'/'x' are included here even though X is not a standard IUPAC code.
+ */
+void setIupacsCat(uint8_t cat) {
+	assert(cat < 4);
+	asc2dnacat[(int)'B'] = asc2dnacat[(int)'b'] =
+	asc2dnacat[(int)'D'] = asc2dnacat[(int)'d'] =
+	asc2dnacat[(int)'H'] = asc2dnacat[(int)'h'] =
+	asc2dnacat[(int)'K'] = asc2dnacat[(int)'k'] =
+	asc2dnacat[(int)'M'] = asc2dnacat[(int)'m'] =
+	asc2dnacat[(int)'N'] = asc2dnacat[(int)'n'] =
+	asc2dnacat[(int)'R'] = asc2dnacat[(int)'r'] =
+	asc2dnacat[(int)'S'] = asc2dnacat[(int)'s'] =
+	asc2dnacat[(int)'V'] = asc2dnacat[(int)'v'] =
+	asc2dnacat[(int)'W'] = asc2dnacat[(int)'w'] =
+	asc2dnacat[(int)'X'] = asc2dnacat[(int)'x'] =
+	asc2dnacat[(int)'Y'] = asc2dnacat[(int)'y'] = cat;
+}
+
+/// For converting from ASCII to the Dna5 code where A=0, C=1, G=2,
+/// T=3, N=4.  Both cases map; any other character maps to 0 (same as A).
+uint8_t asc2dna[] = {
+	/*   0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/*  16 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/*  32 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/*  48 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/*  64 */ 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0,
+	       /*    A     C           G                    N */
+	/*  80 */ 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	       /*             T */
+	/*  96 */ 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0,
+	       /*    a     c           g                    n */
+	/* 112 */ 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	       /*             t */
+	/* 128 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 144 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 160 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 176 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 192 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 208 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 224 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 240 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+/// Convert an ascii char representing a base or a color to a 2-bit
+/// code: 0=A,0; 1=C,1; 2=G,2; 3=T,3; 4=N,.  Accepts both nucleotide
+/// characters and color digits (plus '-' and '.' as N).
+uint8_t asc2dnaOrCol[] = {
+	/*   0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/*  16 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/*  32 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 0,
+	/*                                               -  . */
+	/*  48 */ 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/*        0  1  2  3 */
+	/*  64 */ 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0,
+	/*           A     C           G                    N */
+	/*  80 */ 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/*                    T */
+	/*  96 */ 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0,
+	/*           a     c           g                    n */
+	/* 112 */ 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/*                    t */
+	/* 128 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 144 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 160 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 176 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 192 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 208 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 224 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 240 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+/// For converting from an ASCII color character to a 0-4 code:
+/// '0'-'3' -> 0-3, '-' and '.' -> 4; anything else maps to 0.
+uint8_t asc2col[] = {
+	/*   0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/*  16 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/*  32 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 0,
+	       /*                                        -  . */
+	/*  48 */ 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	       /* 0  1  2  3 */
+	/*  64 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/*  80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/*  96 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 112 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 128 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 144 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 160 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 176 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 192 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 208 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 224 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	/* 240 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+/**
+ * Convert a nucleotide and a color to the paired nucleotide.  Indexed
+ * first by nucleotide then by color.  Note that this is exactly the
+ * same as the dinuc2color array.
+ */
+uint8_t nuccol2nuc[5][5] = {
+	/*       B  G  O  R  . */
+	/* A */ {0, 1, 2, 3, 4},
+	/* C */ {1, 0, 3, 2, 4},
+	/* G */ {2, 3, 0, 1, 4},
+	/* T */ {3, 2, 1, 0, 4},
+	/* N */ {4, 4, 4, 4, 4}
+};
+
+/**
+ * Convert a pair of nucleotides to a color.  Indexed by the two 2-bit
+ * (plus 4=N) base codes; a color is the XOR of the two base codes.
+ */
+uint8_t dinuc2color[5][5] = {
+	/* A */ {0, 1, 2, 3, 4},
+	/* C */ {1, 0, 3, 2, 4},
+	/* G */ {2, 3, 0, 1, 4},
+	/* T */ {3, 2, 1, 0, 4},
+	/* N */ {4, 4, 4, 4, 4}
+};
+
+/// Convert bit encoded DNA char to its complement (A<->T, C<->G, N->N)
+int dnacomp[5] = {
+	3, 2, 1, 0, 4
+};
+
+// All DNA/IUPAC characters ordered by mask value; '!' marks invalid mask 0
+const char *iupacs = "!ACMGRSVTWYHKDBN!acmgrsvtwyhkdbn";
+
+// Convert a 4-bit mask into the corresponding IUPAC character
+char mask2iupac[16] = {
+	-1, // 0000 -> invalid; NOTE(review): -1 stored in char, which may be unsigned on some platforms
+	'A', // 0001
+	'C', // 0010
+	'M', // 0011
+	'G', // 0100
+	'R', // 0101
+	'S', // 0110
+	'V', // 0111
+	'T', // 1000
+	'W', // 1001
+	'Y', // 1010
+	'H', // 1011
+	'K', // 1100
+	'D', // 1101
+	'B', // 1110
+	'N', // 1111
+};
+
+// Map each 4-bit mask to its reverse-complement mask (bit i <-> bit 3-i)
+int maskcomp[16] = {
+	0,  // 0000 (!) -> 0000 (!)
+	8,  // 0001 (A) -> 1000 (T)
+	4,  // 0010 (C) -> 0100 (G)
+	12, // 0011 (M) -> 1100 (K)
+	2,  // 0100 (G) -> 0010 (C)
+	10, // 0101 (R) -> 1010 (Y)
+	6,  // 0110 (S) -> 0110 (S)
+	14, // 0111 (V) -> 1110 (B)
+	1,  // 1000 (T) -> 0001 (A)
+	9,  // 1001 (W) -> 1001 (W)
+	5,  // 1010 (Y) -> 0101 (R)
+	13, // 1011 (H) -> 1101 (D)
+	3,  // 1100 (K) -> 0011 (M)
+	11, // 1101 (D) -> 1011 (H)
+	7,  // 1110 (B) -> 0111 (V)
+	15, // 1111 (N) -> 1111 (N)
+};
+
diff --git a/alphabet.h b/alphabet.h
new file mode 100644
index 0000000..340942e
--- /dev/null
+++ b/alphabet.h
@@ -0,0 +1,199 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef ALPHABETS_H_
+#define ALPHABETS_H_
+
+#include <stdexcept>
+#include <string>
+#include <sstream>
+#include <stdint.h>
+#include "assert_helpers.h"
+
+using namespace std;
+
+/// Convert an ascii char to a DNA category.  Categories are:
+/// 0 -> invalid
+/// 1 -> unambiguous a, c, g or t
+/// 2 -> ambiguous
+/// 3 -> unmatchable
+extern uint8_t asc2dnacat[];
+/// Convert masks to ambiguous nucleotides
+extern char mask2dna[];
+/// Convert ambiguous ASCII nucleotide to mask
+extern uint8_t asc2dnamask[];
+/// Convert mask to # of alternatives in the mask
+extern int mask2popcnt[];
+/// Convert an ascii char to a 2-bit base: 0=A, 1=C, 2=G, 3=T, 4=N
+extern uint8_t asc2dna[];
+/// Convert an ascii char representing a base or a color to a 2-bit
+/// code: 0=A,0; 1=C,1; 2=G,2; 3=T,3; 4=N,.
+extern uint8_t asc2dnaOrCol[];
+/// Convert a pair of DNA masks to a color mask
+extern uint8_t dnamasks2colormask[16][16];
+
+/// Convert an ascii char to a color category.  Categories are:
+/// 0 -> invalid
+/// 1 -> unambiguous 0, 1, 2 or 3
+/// 2 -> ambiguous (not applicable for colors)
+/// 3 -> unmatchable
+extern uint8_t asc2colcat[];
+/// Convert an ascii color char to a 0-4 code: '0'-'3'=0-3, '.'/'-'=4
+extern uint8_t asc2col[];
+/// Convert an ascii char to its DNA complement, including IUPACs
+extern char asc2dnacomp[];
+
+/// Convert a pair of 2-bit (and 4=N) encoded DNA bases to a color
+extern uint8_t dinuc2color[5][5];
+/// Convert a 2-bit nucleotide (and 4=N) and a color to the
+/// corresponding 2-bit nucleotide
+extern uint8_t nuccol2nuc[5][5];
+/// Convert a 4-bit mask into an IUPAC code
+extern char mask2iupac[16];
+
+/// Convert an ascii color to an ascii dna char
+extern char col2dna[];
+/// Convert an ascii dna char to an ascii color char
+extern char dna2col[];
+/// Convert an ascii dna char to a string listing its consistent colors
+extern const char* dna2colstr[];
+
+/// Convert bit encoded DNA char to its complement
+extern int dnacomp[5];
+
+/// String of all DNA and IUPAC characters
+extern const char *iupacs;
+
+/// Map from masks to their reverse-complement masks
+extern int maskcomp[16];
+
+// NOTE(review): all of these predicates index a 256-entry table with
+// (int)c; if 'char' is signed, byte values >= 128 index negatively —
+// callers are assumed to pass 7-bit ASCII.  TODO confirm.
+/**
+ * Return true iff c is a Dna character.
+ */
+static inline bool isDna(char c) {
+	return asc2dnacat[(int)c] > 0;
+}
+
+/**
+ * Return true iff c is a color character.
+ */
+static inline bool isColor(char c) {
+	return asc2colcat[(int)c] > 0;
+}
+
+/**
+ * Return true iff c is an ambiguous Dna character (IUPAC, category 2).
+ */
+static inline bool isAmbigNuc(char c) {
+	return asc2dnacat[(int)c] == 2;
+}
+
+/**
+ * Return true iff c is an ambiguous color character (category 2;
+ * currently no color is in that category).
+ */
+static inline bool isAmbigColor(char c) {
+	return asc2colcat[(int)c] == 2;
+}
+
+/**
+ * Return true iff c is an ambiguous character, interpreting c as a color
+ * when 'color' is true and as a nucleotide otherwise.
+ */
+static inline bool isAmbig(char c, bool color) {
+	return (color ? asc2colcat[(int)c] : asc2dnacat[(int)c]) == 2;
+}
+
+/**
+ * Return true iff c is an unambiguous DNA character (A/C/G/T, either case).
+ */
+static inline bool isUnambigNuc(char c) {
+	return asc2dnacat[(int)c] == 1;
+}
+
+/**
+ * Return the DNA complement of the given ASCII char.  Only unambiguous
+ * A/C/G/T (either case) are complemented; any other character (including
+ * IUPAC codes — see asc2dnacomp for those) is returned unchanged.
+ */
+static inline char comp(char c) {
+	switch(c) {
+	case 'a': return 't';
+	case 'A': return 'T';
+	case 'c': return 'g';
+	case 'C': return 'G';
+	case 'g': return 'c';
+	case 'G': return 'C';
+	case 't': return 'a';
+	case 'T': return 'A';
+	default: return c;
+	}
+}
+
+/**
+ * Return the reverse complement of a bit-encoded nucleotide (0-4).
+ * NOTE(review): only the upper bound is asserted; a negative c would
+ * index out of bounds.
+ */
+static inline int compDna(int c) {
+	assert_leq(c, 4);
+	return dnacomp[c];
+}
+
+/**
+ * Return true iff c is an unambiguous Dna character (same test as
+ * isUnambigNuc above).
+ */
+static inline bool isUnambigDna(char c) {
+	return asc2dnacat[(int)c] == 1;
+}
+
+/**
+ * Return true iff c is an unambiguous color character (0,1,2,3).
+ */
+static inline bool isUnambigColor(char c) {
+	return asc2colcat[(int)c] == 1;
+}
+
+/// Convert a pair of 2-bit (and 4=N) encoded DNA bases to a color
+/// NOTE(review): duplicate of the declaration earlier in this header;
+/// harmless, but could be removed.
+extern uint8_t dinuc2color[5][5];
+
+/**
+ * Decode a not-necessarily-ambiguous nucleotide.  Writes the 2-bit codes
+ * of the bases c is consistent with into alts[] (which must have room for
+ * 4 entries) and the count into num.  Only uppercase IUPAC characters are
+ * accepted; anything else (including lowercase) throws runtime_error.
+ */
+static inline void decodeNuc(char c , int& num, int *alts) {
+	switch(c) {
+	case 'A': alts[0] = 0; num = 1; break;
+	case 'C': alts[0] = 1; num = 1; break;
+	case 'G': alts[0] = 2; num = 1; break;
+	case 'T': alts[0] = 3; num = 1; break;
+	case 'M': alts[0] = 0; alts[1] = 1; num = 2; break;
+	case 'R': alts[0] = 0; alts[1] = 2; num = 2; break;
+	case 'W': alts[0] = 0; alts[1] = 3; num = 2; break;
+	case 'S': alts[0] = 1; alts[1] = 2; num = 2; break;
+	case 'Y': alts[0] = 1; alts[1] = 3; num = 2; break;
+	case 'K': alts[0] = 2; alts[1] = 3; num = 2; break;
+	case 'V': alts[0] = 0; alts[1] = 1; alts[2] = 2; num = 3; break;
+	case 'H': alts[0] = 0; alts[1] = 1; alts[2] = 3; num = 3; break;
+	case 'D': alts[0] = 0; alts[1] = 2; alts[2] = 3; num = 3; break;
+	case 'B': alts[0] = 1; alts[1] = 2; alts[2] = 3; num = 3; break;
+	case 'N': alts[0] = 0; alts[1] = 1; alts[2] = 2; alts[3] = 3; num = 4; break;
+	default: {
+		std::cerr << "Bad IUPAC code: " << c << ", (int: " << (int)c << ")" << std::endl;
+		throw std::runtime_error("");
+	}
+	}
+}
+
+extern void setIupacsCat(uint8_t cat);
+
+#endif /*ALPHABETS_H_*/
diff --git a/assert_helpers.h b/assert_helpers.h
new file mode 100644
index 0000000..6a2fe97
--- /dev/null
+++ b/assert_helpers.h
@@ -0,0 +1,283 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef ASSERT_HELPERS_H_
+#define ASSERT_HELPERS_H_
+
+#include <stdexcept>
+#include <string>
+#include <cassert>
+#include <iostream>
+
+/**
+ * Exception thrown by the rt_assert* macros below, which remain enabled
+ * in release (NDEBUG) builds, unlike the plain assert_* macros.
+ */
+class ReleaseAssertException : public std::runtime_error {
+public:
+	ReleaseAssertException(const std::string& msg = "") : std::runtime_error(msg) {}
+};
+
+/**
+ * Macros for release-enabled assertions, and helper macros to make
+ * all assertion error messages more helpful.
+ */
+// ASSERT_ONLY(x) expands to x only in debug builds; use it to declare
+// variables referenced solely from assertions.
+#ifndef NDEBUG
+#define ASSERT_ONLY(...) __VA_ARGS__
+#else
+#define ASSERT_ONLY(...)
+#endif
+
+// Release-enabled truth assertion: throws ReleaseAssertException on failure.
+#define rt_assert(b)  \
+	if(!(b)) { \
+		std::cerr << "rt_assert at " << __FILE__ << ":" << __LINE__ << std::endl; \
+		throw ReleaseAssertException(); \
+	}
+#define rt_assert_msg(b,msg)  \
+	if(!(b)) { \
+		std::cerr << msg <<  " at " << __FILE__ << ":" << __LINE__ << std::endl; \
+		throw ReleaseAssertException(msg); \
+	}
+
+// Release-enabled equality assertion; prints both values in decimal and hex.
+#define rt_assert_eq(ex,ac)  \
+	if(!((ex) == (ac))) { \
+		std::cerr << "rt_assert_eq: expected (" << (ex) << ", 0x" << std::hex << (ex) << std::dec << ") got (" << (ac) << ", 0x" << std::hex << (ac) << std::dec << ")" << std::endl; \
+		std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \
+		throw ReleaseAssertException(); \
+	}
+#define rt_assert_eq_msg(ex,ac,msg)  \
+	if(!((ex) == (ac))) { \
+		std::cerr << "rt_assert_eq: " << msg <<  ": (" << (ex) << ", 0x" << std::hex << (ex) << std::dec << ") got (" << (ac) << ", 0x" << std::hex << (ac) << std::dec << ")" << std::endl; \
+		std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \
+		throw ReleaseAssertException(msg); \
+	}
+
+// Debug-only equality assertion; compiled out when NDEBUG is defined.
+#ifndef NDEBUG
+#define assert_eq(ex,ac)  \
+	if(!((ex) == (ac))) { \
+		std::cerr << "assert_eq: expected (" << (ex) << ", 0x" << std::hex << (ex) << std::dec << ") got (" << (ac) << ", 0x" << std::hex << (ac) << std::dec << ")" << std::endl; \
+		std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \
+		assert(0); \
+	}
+#define assert_eq_msg(ex,ac,msg)  \
+	if(!((ex) == (ac))) { \
+		std::cerr << "assert_eq: " << msg <<  ": (" << (ex) << ", 0x" << std::hex << (ex) << std::dec << ") got (" << (ac) << ", 0x" << std::hex << (ac) << std::dec << ")" << std::endl; \
+		std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \
+		assert(0); \
+	}
+#else
+#define assert_eq(ex,ac)
+#define assert_eq_msg(ex,ac,msg)
+#endif
+
+// Release-enabled inequality assertion.
+#define rt_assert_neq(ex,ac)  \
+	if(!((ex) != (ac))) { \
+		std::cerr << "rt_assert_neq: expected not (" << (ex) << ", 0x" << std::hex << (ex) << std::dec << ") got (" << (ac) << ", 0x" << std::hex << (ac) << std::dec << ")" << std::endl; \
+		std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \
+		throw ReleaseAssertException(); \
+	}
+#define rt_assert_neq_msg(ex,ac,msg)  \
+	if(!((ex) != (ac))) { \
+		std::cerr << "rt_assert_neq: " << msg << ": (" << (ex) << ", 0x" << std::hex << (ex) << std::dec << ") got (" << (ac) << ", 0x" << std::hex << (ac) << std::dec << ")" << std::endl; \
+		std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \
+		throw ReleaseAssertException(msg); \
+	}
+
+// Debug-only inequality assertion; compiled out when NDEBUG is defined.
+#ifndef NDEBUG
+#define assert_neq(ex,ac)  \
+	if(!((ex) != (ac))) { \
+		std::cerr << "assert_neq: expected not (" << (ex) << ", 0x" << std::hex << (ex) << std::dec << ") got (" << (ac) << ", 0x" << std::hex << (ac) << std::dec << ")" << std::endl; \
+		std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \
+		assert(0); \
+	}
+#define assert_neq_msg(ex,ac,msg)  \
+	if(!((ex) != (ac))) { \
+		std::cerr << "assert_neq: " << msg << ": (" << (ex) << ", 0x" << std::hex << (ex) << std::dec << ") got (" << (ac) << ", 0x" << std::hex << (ac) << std::dec << ")" << std::endl; \
+		std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \
+		assert(0); \
+	}
+#else
+#define assert_neq(ex,ac)
+#define assert_neq_msg(ex,ac,msg)
+#endif
+
+// Release-enabled greater-than assertion.
+#define rt_assert_gt(a,b) \
+	if(!((a) > (b))) { \
+		std::cerr << "rt_assert_gt: expected (" << (a) << ") > (" << (b) << ")" << std::endl; \
+		std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \
+		throw ReleaseAssertException(); \
+	}
+#define rt_assert_gt_msg(a,b,msg) \
+	if(!((a) > (b))) { \
+		std::cerr << "rt_assert_gt: " << msg << ": (" << (a) << ") > (" << (b) << ")" << std::endl; \
+		std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \
+		throw ReleaseAssertException(msg); \
+	}
+
+// Debug-only greater-than assertion; compiled out when NDEBUG is defined.
+#ifndef NDEBUG
+#define assert_gt(a,b) \
+	if(!((a) > (b))) { \
+		std::cerr << "assert_gt: expected (" << (a) << ") > (" << (b) << ")" << std::endl; \
+		std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \
+		assert(0); \
+	}
+#define assert_gt_msg(a,b,msg) \
+	if(!((a) > (b))) { \
+		std::cerr << "assert_gt: " << msg << ": (" << (a) << ") > (" << (b) << ")" << std::endl; \
+		std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \
+		assert(0); \
+	}
+#else
+#define assert_gt(a,b)
+#define assert_gt_msg(a,b,msg)
+#endif
+
+// Release-enabled greater-or-equal assertion.
+#define rt_assert_geq(a,b) \
+	if(!((a) >= (b))) { \
+		std::cerr << "rt_assert_geq: expected (" << (a) << ") >= (" << (b) << ")" << std::endl; \
+		std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \
+		throw ReleaseAssertException(); \
+	}
+#define rt_assert_geq_msg(a,b,msg) \
+	if(!((a) >= (b))) { \
+		std::cerr << "rt_assert_geq: " << msg << ": (" << (a) << ") >= (" << (b) << ")" << std::endl; \
+		std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \
+		throw ReleaseAssertException(msg); \
+	}
+
+// Debug-only greater-or-equal assertion; compiled out when NDEBUG is defined.
+#ifndef NDEBUG
+#define assert_geq(a,b) \
+	if(!((a) >= (b))) { \
+		std::cerr << "assert_geq: expected (" << (a) << ") >= (" << (b) << ")" << std::endl; \
+		std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \
+		assert(0); \
+	}
+#define assert_geq_msg(a,b,msg) \
+	if(!((a) >= (b))) { \
+		std::cerr << "assert_geq: " << msg << ": (" << (a) << ") >= (" << (b) << ")" << std::endl; \
+		std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \
+		assert(0); \
+	}
+#else
+#define assert_geq(a,b)
+#define assert_geq_msg(a,b,msg)
+#endif
+
/*
 * Assertions that 'a' < 'b'.  rt_* variants throw ReleaseAssertException in
 * all builds; assert_* variants compile out under NDEBUG.  Two fixes versus
 * the original: (1) macro arguments are now parenthesized in the expansion,
 * matching the gt/geq/leq macros in this header -- the bare form mis-parsed
 * compound arguments, e.g. assert_lt(c ? x : y, b) expanded to
 * `c ? x : (y < b)` because `<` binds tighter than `?:`; (2) each macro is
 * wrapped in do { ... } while(0) so it acts as a single statement and is
 * safe in un-braced if/else bodies.
 */
#define rt_assert_lt(a,b) \
do { \
	if(!((a) < (b))) { \
		std::cerr << "rt_assert_lt: expected (" << (a) << ") < (" << (b) << ")" << std::endl; \
		std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \
		throw ReleaseAssertException(); \
	} \
} while(0)
#define rt_assert_lt_msg(a,b,msg) \
do { \
	if(!((a) < (b))) { \
		std::cerr << "rt_assert_lt: " << msg << ": (" << (a) << ") < (" << (b) << ")" << std::endl; \
		std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \
		throw ReleaseAssertException(msg); \
	} \
} while(0)

#ifndef NDEBUG
#define assert_lt(a,b) \
do { \
	if(!((a) < (b))) { \
		std::cerr << "assert_lt: expected (" << (a) << ") < (" << (b) << ")" << std::endl; \
		std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \
		assert(0); \
	} \
} while(0)
#define assert_lt_msg(a,b,msg) \
do { \
	if(!((a) < (b))) { \
		std::cerr << "assert_lt: " << msg << ": (" << (a) << ") < (" << (b) << ")" << std::endl; \
		std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \
		assert(0); \
	} \
} while(0)
#else
#define assert_lt(a,b)
#define assert_lt_msg(a,b,msg)
#endif
+
/*
 * Assertions that 'a' <= 'b'.  rt_* variants throw ReleaseAssertException in
 * all builds; assert_* variants compile out under NDEBUG.  Wrapped in
 * do { ... } while(0) so each macro is a single statement (the bare-if form
 * breaks under un-braced if/else via the dangling-else ambiguity).
 */
#define rt_assert_leq(a,b) \
do { \
	if(!((a) <= (b))) { \
		std::cerr << "rt_assert_leq: expected (" << (a) << ") <= (" << (b) << ")" << std::endl; \
		std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \
		throw ReleaseAssertException(); \
	} \
} while(0)
#define rt_assert_leq_msg(a,b,msg) \
do { \
	if(!((a) <= (b))) { \
		std::cerr << "rt_assert_leq: " << msg << ": (" << (a) << ") <= (" << (b) << ")" << std::endl; \
		std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \
		throw ReleaseAssertException(msg); \
	} \
} while(0)

#ifndef NDEBUG
#define assert_leq(a,b) \
do { \
	if(!((a) <= (b))) { \
		std::cerr << "assert_leq: expected (" << (a) << ") <= (" << (b) << ")" << std::endl; \
		std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \
		assert(0); \
	} \
} while(0)
#define assert_leq_msg(a,b,msg) \
do { \
	if(!((a) <= (b))) { \
		std::cerr << "assert_leq: " << msg << ": (" << (a) << ") <= (" << (b) << ")" << std::endl; \
		std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \
		assert(0); \
	} \
} while(0)
#else
#define assert_leq(a,b)
#define assert_leq_msg(a,b,msg)
#endif
+
#ifndef NDEBUG
#define assert_in(c, s) assert_in2(c, s, __FILE__, __LINE__)
/**
 * Debug helper behind assert_in: assert that character 'c' occurs somewhere
 * in the NUL-terminated string 'str'; on failure, print a diagnostic with
 * the caller's file/line and abort via assert(0).
 */
static inline void assert_in2(char c, const char *str, const char *file, int line) {
	// Linear scan; bail out as soon as the character is found.
	for(const char *p = str; *p != '\0'; p++) {
		if(*p == c) {
			return;
		}
	}
	std::cerr << "assert_in: (" << c << ") not in  (" << str << ")" << std::endl;
	std::cerr << file << ":" << line << std::endl;
	assert(0);
}
#else
#define assert_in(c, s)
#endif
+
#ifndef NDEBUG
#define assert_range(b, e, v) assert_range_helper(b, e, v, __FILE__, __LINE__)
/**
 * Debug helper behind assert_range: assert that 'val' lies in the closed
 * interval [begin, end]; on failure, print a diagnostic with the caller's
 * file/line and abort via assert(0).  Only operator< and operator> are
 * required of T, exactly as before.
 */
template<typename T>
inline static void assert_range_helper(const T& begin,
                                       const T& end,
                                       const T& val,
                                       const char *file,
                                       int line)
{
	// Guard clause: value inside the interval means nothing to report.
	if(!(val < begin) && !(val > end)) {
		return;
	}
	std::cerr << "assert_range: (" << val << ") not in  ["
	          << begin << ", " << end << "]" << std::endl;
	std::cerr << file << ":" << line << std::endl;
	assert(0);
}
#else
#define assert_range(b, e, v)
#endif
+
// define a macro to indicate variables that are only required for asserts
// used to make production build happy, i.e. disable "warning: variable ‘x’ set but not used [-Wunused-but-set-variable]"
// The inner parentheses around x are required for compound arguments:
// without them, _unused(v + 1) expanded to ((void)v + 1), which is invalid
// (the cast binds tighter than '+').
#define _unused(x) ((void)(x))
+
+#endif /*ASSERT_HELPERS_H_*/
diff --git a/binary_sa_search.h b/binary_sa_search.h
new file mode 100644
index 0000000..4bb6eb7
--- /dev/null
+++ b/binary_sa_search.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef BINARY_SA_SEARCH_H_
+#define BINARY_SA_SEARCH_H_
+
+#include <stdint.h>
+#include <iostream>
+#include <limits>
+#include "alphabet.h"
+#include "assert_helpers.h"
+#include "ds.h"
+#include "btypes.h"
+
+/**
+ * Do a binary search using the suffix of 'host' beginning at offset
+ * 'qry' as the query and 'sa' as an already-lexicographically-sorted
+ * list of suffixes of host.  'sa' may be all suffixes of host or just
+ * a subset.  Returns the index in sa of the smallest suffix of host
+ * that is larger than qry, or length(sa) if all suffixes of host are
+ * less than qry.
+ *
+ * We use the Manber and Myers optimization of maintaining a pair of
+ * counters for the longest lcp observed so far on the left- and right-
+ * hand sides and using the min of the two as a way of skipping over
+ * characters at the beginning of a new round.
+ *
+ * Returns maximum value if the query suffix matches an element of sa.
+ */
template<typename TStr, typename TSufElt> inline
TIndexOffU binarySASearch(
	const TStr& host,            // text whose suffixes are compared
	TIndexOffU qry,              // offset of the query suffix within 'host'
	const EList<TSufElt>& sa)    // sorted sample-suffix offsets into 'host'
{
	TIndexOffU lLcp = 0, rLcp = 0; // greatest observed LCPs on left and right
	// 'r' is sa.size()+1 because there are sa.size()+1 possible answers
	// (the gaps before/between/after the samples); window element m maps to
	// sample sa[m-1], so valid return values are 0..sa.size().
	TIndexOffU l = 0, r = (TIndexOffU)sa.size()+1; // binary-search window
	TIndexOffU hostLen = (TIndexOffU)host.length();
	while(true) {
		assert_gt(r, l);
		TIndexOffU m = (l+r) >> 1;
		if(m == l) {
			// Binary-search window has closed: we have an answer
			if(m > 0 && sa[m-1] == qry) {
				return std::numeric_limits<TIndexOffU>::max(); // qry matches
			}
			assert_leq(m, sa.size());
			return m; // Return index of right-hand suffix
		}
		assert_gt(m, 0);
		TIndexOffU suf = sa[m-1];
		if(suf == qry) {
			return std::numeric_limits<TIndexOffU>::max(); // query matches an elt of sa
		}
		// Manber-Myers optimization: the first min(lLcp, rLcp) characters of
		// the two suffixes are already known to agree, so the character
		// comparison below can resume from that offset.
		TIndexOffU lcp = min(lLcp, rLcp);
#ifndef NDEBUG
		// Sanity check -- presumably sstr_suf_upto_neq() reports whether the
		// two suffixes differ within their first 'lcp' characters; confirm
		// against its definition if this ever fires.
		if(sstr_suf_upto_neq(host, qry, host, suf, lcp)) {
			assert(0);
		}
#endif
		// Keep advancing lcp, but stop when query mismatches host or
		// when the counter falls off either the query or the suffix
		while(suf+lcp < hostLen && qry+lcp < hostLen && host[suf+lcp] == host[qry+lcp]) {
			lcp++;
		}
		// Fell off the end of either the query or the sa elt?
		bool fell = (suf+lcp == hostLen || qry+lcp == hostLen);
		if((fell && qry+lcp == hostLen) || (!fell && host[suf+lcp] < host[qry+lcp])) {
			// Query is greater than sa elt: either the query suffix is a
			// proper prefix of the sample (shorter sorts first), or its
			// character at the first mismatch is larger.
			l = m;                 // update left bound
			lLcp = max(lLcp, lcp); // update left lcp
		}
		else if((fell && suf+lcp == hostLen) || (!fell && host[suf+lcp] > host[qry+lcp])) {
			// Query is less than sa elt
			r = m;                 // update right bound
			rLcp = max(rLcp, lcp); // update right lcp
		} else {
			assert(false); // Must be one or the other!
		}
	}
	// Shouldn't get here
	assert(false);
	return std::numeric_limits<TIndexOffU>::max();
}
+
+#endif /*BINARY_SA_SEARCH_H_*/
diff --git a/bitpack.h b/bitpack.h
new file mode 100644
index 0000000..b6a7cf4
--- /dev/null
+++ b/bitpack.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef BITPACK_H_
+#define BITPACK_H_
+
+#include <stdint.h>
+#include "assert_helpers.h"
+
+/**
+ * Routines for marshalling 2-bit values into and out of 8-bit or
+ * 32-bit hosts
+ */
+
+static inline void pack_2b_in_8b(const int two, uint8_t& eight, const int off) {
+	assert_lt(two, 4);
+	assert_lt(off, 4);
+	eight |= (two << (off*2));
+}
+
+static inline int unpack_2b_from_8b(const uint8_t eight, const int off) {
+	assert_lt(off, 4);
+	return ((eight >> (off*2)) & 0x3);
+}
+
+static inline void pack_2b_in_32b(const int two, uint32_t& thirty2, const int off) {
+	assert_lt(two, 4);
+	assert_lt(off, 16);
+	thirty2 |= (two << (off*2));
+}
+
+static inline int unpack_2b_from_32b(const uint32_t thirty2, const int off) {
+	assert_lt(off, 16);
+	return ((thirty2 >> (off*2)) & 0x3);
+}
+
+#endif /*BITPACK_H_*/
diff --git a/blockwise_sa.h b/blockwise_sa.h
new file mode 100644
index 0000000..8bc8f4a
--- /dev/null
+++ b/blockwise_sa.h
@@ -0,0 +1,1120 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef BLOCKWISE_SA_H_
+#define BLOCKWISE_SA_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include "assert_helpers.h"
+#include "diff_sample.h"
+#include "multikey_qsort.h"
+#include "random_source.h"
+#include "binary_sa_search.h"
+#include "zbox.h"
+#include "alphabet.h"
+#include "timer.h"
+#include "ds.h"
+#include "mem_ids.h"
+#include "word_io.h"
+
+using namespace std;
+
+// Helpers for printing verbose messages
+
#ifndef VMSG_NL
// Emit a verbose message plus trailing newline through this->verbose(str);
// only usable inside member functions of classes that provide both the
// verbose() accessor and the verbose(string) logger.  Wrapped in
// do { ... } while(0) so the macro is a single statement and is safe in
// un-braced if/else bodies (the bare-if form had the dangling-else bug).
#define VMSG_NL(...) \
do { \
	if(this->verbose()) { \
		stringstream tmp; \
		tmp << __VA_ARGS__ << endl; \
		this->verbose(tmp.str()); \
	} \
} while(0)
#endif
+
#ifndef VMSG
// Same as VMSG_NL but without the trailing newline.  do/while(0) wrapping
// makes the macro behave as a single statement (safe in un-braced if/else).
#define VMSG(...) \
do { \
	if(this->verbose()) { \
		stringstream tmp; \
		tmp << __VA_ARGS__; \
		this->verbose(tmp.str()); \
	} \
} while(0)
#endif
+
+/**
+ * Abstract parent class for blockwise suffix-array building schemes.
+ */
+template<typename TStr>
+class BlockwiseSA {
+public:
+	BlockwiseSA(const TStr& __text,
+	            TIndexOffU __bucketSz,
+                int  __nthreads = 1,
+	            bool __sanityCheck = false,
+	            bool __passMemExc = false,
+	            bool __verbose = false,
+	            ostream& __logger = cout) :
+	_text(__text),
+	_bucketSz(max<TIndexOffU>(__bucketSz, 2u)),
+    _nthreads(__nthreads),
+	_sanityCheck(__sanityCheck),
+	_passMemExc(__passMemExc),
+	_verbose(__verbose),
+	_itrBucket(EBWTB_CAT),
+    _itrBucketIdx(0),
+	_itrBucketPos(OFF_MASK),
+	_itrPushedBackSuffix(OFF_MASK),
+	_logger(__logger)
+	{
+    }
+
+	virtual ~BlockwiseSA() { }
+
+	/**
+	 * Get the next suffix; compute the next bucket if necessary.
+	 */
+    virtual TIndexOffU nextSuffix() = 0;
+
+	/**
+	 * Return true iff the next call to nextSuffix will succeed.
+	 */
+	bool hasMoreSuffixes() {
+		if(_itrPushedBackSuffix != OFF_MASK) return true;
+		try {
+			_itrPushedBackSuffix = nextSuffix();
+		} catch(out_of_range& e) {
+			assert_eq(OFF_MASK, _itrPushedBackSuffix);
+			return false;
+		}
+		return true;
+	}
+
+	/**
+	 * Reset the suffix iterator so that the next call to nextSuffix()
+	 * returns the lexicographically-first suffix.
+	 */
+	void resetSuffixItr() {
+		_itrBucket.clear();
+        _itrBucketIdx = 0;
+		_itrBucketPos = OFF_MASK;
+		_itrPushedBackSuffix = OFF_MASK;
+		reset();
+		assert(suffixItrIsReset());
+	}
+
+	/**
+	 * Returns true iff the next call to nextSuffix() returns the
+	 * lexicographically-first suffix.
+	 */
+	bool suffixItrIsReset() {
+		return _itrBucketIdx                       == 0 &&
+               _itrBucket.size()                   == 0 &&
+		       _itrBucketPos                       == OFF_MASK &&
+		       _itrPushedBackSuffix                == OFF_MASK &&
+		       isReset();
+	}
+
+	const TStr& text()  const { return _text; }
+	TIndexOffU bucketSz() const { return _bucketSz; }
+	bool sanityCheck()  const { return _sanityCheck; }
+	bool verbose()      const { return _verbose; }
+	ostream& log()      const { return _logger; }
+	size_t size()       const { return _text.length()+1; }
+
+protected:
+	/// Reset back to the first block
+	virtual void reset() = 0;
+	/// Return true iff reset to the first block
+	virtual bool isReset() = 0;
+
+	/**
+	 * Grab the next block of sorted suffixes.  The block is guaranteed
+	 * to have at most _bucketSz elements.
+	 */
+	virtual void nextBlock(int cur_block, int tid = 0) = 0;
+	/// Return true iff more blocks are available
+	virtual bool hasMoreBlocks() const = 0;
+	/// Optionally output a verbose message
+	void verbose(const string& s) const {
+		if(this->verbose()) {
+			this->log() << s.c_str();
+			this->log().flush();
+		}
+	}
+
+	const TStr&        _text;        /// original string
+	const TIndexOffU   _bucketSz;    /// target maximum bucket size
+    const int          _nthreads;    /// number of threads
+	const bool         _sanityCheck; /// whether to perform sanity checks
+	const bool         _passMemExc;  /// true -> pass on memory exceptions
+	const bool         _verbose;     /// be talkative
+	EList<TIndexOffU>  _itrBucket;   /// current bucket
+    TIndexOffU         _itrBucketIdx;
+	TIndexOffU         _itrBucketPos;/// offset into current bucket
+	TIndexOffU         _itrPushedBackSuffix; /// temporary slot for lookahead
+	ostream&           _logger;      /// write log messages here
+};
+
+/**
+ * Abstract parent class for a blockwise suffix array builder that
+ * always doles out blocks in lexicographical order.
+ */
+template<typename TStr>
+class InorderBlockwiseSA : public BlockwiseSA<TStr> {
+public:
+	InorderBlockwiseSA(const TStr& __text,
+	                   TIndexOffU __bucketSz,
+                       int  __nthreads = 1,
+	                   bool __sanityCheck = false,
+	   	               bool __passMemExc = false,
+	                   bool __verbose = false,
+	                   ostream& __logger = cout) :
+	BlockwiseSA<TStr>(__text, __bucketSz, __nthreads, __sanityCheck, __passMemExc, __verbose, __logger)
+	{}
+};
+
+/**
+ * Build the SA a block at a time according to the scheme outlined in
+ * Karkkainen's "Fast BWT" paper.
+ */
+template<typename TStr>
+class KarkkainenBlockwiseSA : public InorderBlockwiseSA<TStr> {
+public:
+	typedef DifferenceCoverSample<TStr> TDC;
+
+	KarkkainenBlockwiseSA(const TStr& __text,
+	                      TIndexOffU __bucketSz,
+                          int      __nthreads,
+	                      uint32_t __dcV,
+	                      uint32_t __seed = 0,
+	      	              bool __sanityCheck = false,
+	   	                  bool __passMemExc = false,
+	      	              bool __verbose = false,
+                          string base_fname = "",
+	      	              ostream& __logger = cout) :
+	InorderBlockwiseSA<TStr>(__text, __bucketSz, __nthreads, __sanityCheck, __passMemExc, __verbose, __logger),
+	_sampleSuffs(EBWTB_CAT), _cur(0), _dcV(__dcV), _dc(EBWTB_CAT), _built(false), _base_fname(base_fname), _bigEndian(currentlyBigEndian())
+	{ _randomSrc.init(__seed); reset(); }
+
+	~KarkkainenBlockwiseSA()
+    {
+        if(_threads.size() > 0) {
+            for (size_t tid = 0; tid < _threads.size(); tid++) {
+                _threads[tid]->join();
+                delete _threads[tid];
+            }
+        }
+    }
+
+	/**
+	 * Allocate an amount of memory that simulates the peak memory
+	 * usage of the DifferenceCoverSample with the given text and v.
+	 * Throws bad_alloc if it's not going to fit in memory.  Returns
+	 * the approximate number of bytes the Cover takes at all times.
+	 */
+	static size_t simulateAllocs(const TStr& text, TIndexOffU bucketSz) {
+		size_t len = text.length();
+		// _sampleSuffs and _itrBucket are in memory at the peak
+		size_t bsz = bucketSz;
+		size_t sssz = len / max<TIndexOffU>(bucketSz-1, 1);
+		AutoArray<TIndexOffU> tmp(bsz + sssz + (1024 * 1024 /*out of caution*/), EBWT_CAT);
+		return bsz;
+	}
+    
+    static void nextBlock_Worker(void *vp) {
+        pair<KarkkainenBlockwiseSA*, int> param = *(pair<KarkkainenBlockwiseSA*, int>*)vp;
+        KarkkainenBlockwiseSA* sa = param.first;
+        int tid = param.second;
+        while(true) {
+            size_t cur = 0;
+            {
+                ThreadSafe ts(&sa->_mutex, sa->_nthreads > 1);
+                cur = sa->_cur;
+                if(cur > sa->_sampleSuffs.size()) break;
+                sa->_cur++;
+            }
+            sa->nextBlock((int)cur, tid);
+            // Write suffixes into a file
+            std::ostringstream number; number << cur;
+            const string fname = sa->_base_fname + "." + number.str() + ".sa";
+            ofstream sa_file(fname.c_str(), ios::binary);
+            if(!sa_file.good()) {
+                cerr << "Could not open file for writing a reference graph: \"" << fname << "\"" << endl;
+                throw 1;
+            }
+            const EList<TIndexOffU>& bucket = sa->_itrBuckets[tid];
+            writeIndex<TIndexOffU>(sa_file, bucket.size(), sa->_bigEndian);
+            for(size_t i = 0; i < bucket.size(); i++) {
+                writeIndex<TIndexOffU>(sa_file, bucket[i], sa->_bigEndian);
+            }
+            sa_file.close();
+            sa->_itrBuckets[tid].clear();
+            sa->_done[cur] = true;
+        }
+    }
+    
+    /**
+     * Get the next suffix; compute the next bucket if necessary.
+     */
+    virtual TIndexOffU nextSuffix() {
+        // Launch threads if not
+        if(this->_nthreads > 1) {
+            if(_threads.size() == 0) {
+                _done.resize(_sampleSuffs.size() + 1);
+                _done.fill(false);
+                _itrBuckets.resize(this->_nthreads);
+                for(int tid = 0; tid < this->_nthreads; tid++) {
+                    _tparams.expand();
+                    _tparams.back().first = this;
+                    _tparams.back().second = tid;
+                    _threads.push_back(new tthread::thread(nextBlock_Worker, (void*)&_tparams.back()));
+                }
+                assert_eq(_threads.size(), (size_t)this->_nthreads);
+            }
+        }
+        if(this->_itrPushedBackSuffix != OFF_MASK) {
+            TIndexOffU tmp = this->_itrPushedBackSuffix;
+            this->_itrPushedBackSuffix = OFF_MASK;
+            return tmp;
+        }
+        while(this->_itrBucketPos >= this->_itrBucket.size() ||
+              this->_itrBucket.size() == 0)
+        {
+            if(!hasMoreBlocks()) {
+                throw out_of_range("No more suffixes");
+            }
+            if(this->_nthreads == 1) {
+                nextBlock((int)_cur);
+                _cur++;
+            } else {
+                while(!_done[this->_itrBucketIdx]) {
+#if defined(_TTHREAD_WIN32_)
+                    Sleep(1);
+#elif defined(_TTHREAD_POSIX_)
+                    const static timespec ts = {0, 1000000};  // 1 millisecond
+                    nanosleep(&ts, NULL);
+#endif
+                }
+                // Read suffixes from a file
+                std::ostringstream number; number << this->_itrBucketIdx;
+                const string fname = _base_fname + "." + number.str() + ".sa";
+                ifstream sa_file(fname.c_str(), ios::binary);
+                if(!sa_file.good()) {
+                    cerr << "Could not open file for reading a reference graph: \"" << fname << "\"" << endl;
+                    throw 1;
+                }
+                size_t numSAs = readIndex<TIndexOffU>(sa_file, _bigEndian);
+                this->_itrBucket.resizeExact(numSAs);
+                for(size_t i = 0; i < numSAs; i++) {
+                    this->_itrBucket[i] = readIndex<TIndexOffU>(sa_file, _bigEndian);
+                }
+                sa_file.close();
+                std::remove(fname.c_str());
+            }
+            this->_itrBucketIdx++;
+            this->_itrBucketPos = 0;
+        }
+        return this->_itrBucket[this->_itrBucketPos++];
+    }
+
+	/// Defined in blockwise_sa.cpp
+	virtual void nextBlock(int cur_block, int tid = 0);
+
+	/// Defined in blockwise_sa.cpp
+	virtual void qsort(EList<TIndexOffU>& bucket);
+
+	/// Return true iff more blocks are available
+	virtual bool hasMoreBlocks() const {
+        return this->_itrBucketIdx <= _sampleSuffs.size();
+	}
+
+	/// Return the difference-cover period
+	uint32_t dcV() const { return _dcV; }
+
+protected:
+
+	/**
+	 * Initialize the state of the blockwise suffix sort.  If the
+	 * difference cover sample and the sample set have not yet been
+	 * built, build them.  Then reset the block cursor to point to
+	 * the first block.
+	 */
+	virtual void reset() {
+		if(!_built) {
+			build();
+		}
+		assert(_built);
+		_cur = 0;
+	}
+
+	/// Return true iff we're about to dole out the first bucket
+	virtual bool isReset() {
+		return _cur == 0;
+	}
+
+private:
+
+	/**
+	 * Calculate the difference-cover sample and sample suffixes.
+	 */
+	void build() {
+		// Calculate difference-cover sample
+		assert(_dc.get() == NULL);
+		if(_dcV != 0) {
+			_dc.init(new TDC(this->text(), _dcV, this->verbose(), this->sanityCheck()));
+			_dc.get()->build(this->_nthreads);
+		}
+		// Calculate sample suffixes
+		if(this->bucketSz() <= this->text().length()) {
+			VMSG_NL("Building samples");
+			buildSamples();
+		} else {
+			VMSG_NL("Skipping building samples since text length " <<
+			        this->text().length() << " is less than bucket size: " <<
+			        this->bucketSz());
+		}
+		_built = true;
+	}
+
+	/**
+	 * Calculate the lcp between two suffixes using the difference
+	 * cover as a tie-breaker.  If the tie-breaker is employed, then
+	 * the calculated lcp may be an underestimate.
+	 *
+	 * Defined in blockwise_sa.cpp
+	 */
+	inline bool tieBreakingLcp(TIndexOffU aOff,
+	                           TIndexOffU bOff,
+	                           TIndexOffU& lcp,
+	                           bool& lcpIsSoft);
+
+	/**_randomSrc
+	 * Compare two suffixes using the difference-cover sample.
+	 */
+	inline bool suffixCmp(TIndexOffU cmp,
+	                      TIndexOffU i,
+	                      int64_t& j,
+	                      int64_t& k,
+	                      bool& kSoft,
+	                      const EList<TIndexOffU>& z);
+
+	void buildSamples();
+
+	EList<TIndexOffU>  _sampleSuffs; /// sample suffixes
+	TIndexOffU         _cur;         /// offset to 1st elt of next block
+	const uint32_t     _dcV;         /// difference-cover periodicity
+	PtrWrap<TDC>       _dc;          /// queryable difference-cover data
+	bool               _built;       /// whether samples/DC have been built
+	RandomSource       _randomSrc;   /// source of pseudo-randoms
+    
+
+    MUTEX_T                 _mutex;       /// synchronization of output message
+    string                  _base_fname;  /// base file name for storing SA blocks
+    bool                    _bigEndian;   /// bigEndian?
+    EList<tthread::thread*> _threads;     /// thread list
+    EList<pair<KarkkainenBlockwiseSA*, int> > _tparams;
+    ELList<TIndexOffU>      _itrBuckets;  /// buckets
+    EList<bool>             _done;        /// is a block processed?
+};
+
+/**
+ * Qsort the set of suffixes whose offsets are in 'bucket'.
+ */
+template<typename TStr>
+inline void KarkkainenBlockwiseSA<TStr>::qsort(EList<TIndexOffU>& bucket) {
+	const TStr& t = this->text();
+	TIndexOffU *s = bucket.ptr();
+	size_t slen = bucket.size();
+	TIndexOffU len = (TIndexOffU)t.length();
+	if(_dc.get() != NULL) {
+		// Use the difference cover as a tie-breaker if we have it
+		VMSG_NL("  (Using difference cover)");
+		// Extract the 'host' array because it's faster to work
+		// with than the EList<> container
+		const uint8_t *host = (const uint8_t *)t.buf();
+		assert(_dc.get() != NULL);
+		mkeyQSortSufDcU8(t, host, len, s, slen, *_dc.get(), 4,
+		                 this->verbose(), this->sanityCheck());
+	} else {
+		VMSG_NL("  (Not using difference cover)");
+		// We don't have a difference cover - just do a normal
+		// suffix sort
+		mkeyQSortSuf(t, s, slen, 4,
+		             this->verbose(), this->sanityCheck());
+	}
+}
+
+/**
+ * Qsort the set of suffixes whose offsets are in 'bucket'.  This
+ * specialization for packed strings does not attempt to extract and
+ * operate directly on the host string; the fact that the string is
+ * packed means that the array cannot be sorted directly.
+ */
+template<>
+inline void KarkkainenBlockwiseSA<S2bDnaString>::qsort(
+	EList<TIndexOffU>& bucket)
+{
+	const S2bDnaString& t = this->text();
+	TIndexOffU *s = bucket.ptr();
+	size_t slen = bucket.size();
+	size_t len = t.length();
+	if(_dc.get() != NULL) {
+		// Use the difference cover as a tie-breaker if we have it
+		VMSG_NL("  (Using difference cover)");
+		// Can't use the text's 'host' array because the backing
+		// store for the packed string is not one-char-per-elt.
+		mkeyQSortSufDcU8(t, t, len, s, slen, *_dc.get(), 4,
+		                 this->verbose(), this->sanityCheck());
+	} else {
+		VMSG_NL("  (Not using difference cover)");
+		// We don't have a difference cover - just do a normal
+		// suffix sort
+		mkeyQSortSuf(t, s, slen, 4,
+		             this->verbose(), this->sanityCheck());
+	}
+}
+
// Per-thread argument bundle for BinarySorting_worker: each worker scans the
// text range [begin, end) and tallies, into its own private bucketSzs /
// bucketReps arrays, how the suffixes distribute over the sample buckets.
template<typename TStr>
struct BinarySortingParam {
    const TStr*              t;           // text being indexed (not owned)
    const EList<TIndexOffU>* sampleSuffs; // sorted sample-suffix offsets (not owned)
    EList<TIndexOffU>        bucketSzs;   // out: per-bucket suffix counts
    EList<TIndexOffU>        bucketReps;  // out: one representative suffix per bucket
    size_t                   begin;       // first text offset this worker scans
    size_t                   end;         // one past the last text offset scanned
};
+
// Worker: bin every suffix in [param->begin, param->end) into the bucket
// determined by a binary search over the sorted sample suffixes, counting
// bucket sizes and remembering a representative suffix per bucket.
template<typename TStr>
static void BinarySorting_worker(void *vp)
{
    BinarySortingParam<TStr>* param = (BinarySortingParam<TStr>*)vp;
    const TStr& t = *(param->t);
    size_t len = t.length();
    const EList<TIndexOffU>& sampleSuffs = *(param->sampleSuffs);
    EList<TIndexOffU>& bucketSzs = param->bucketSzs;
    EList<TIndexOffU>& bucketReps = param->bucketReps;
    // Only needed by the assert_lt below; compiled out under NDEBUG
    ASSERT_ONLY(size_t numBuckets = bucketSzs.size());
    size_t begin = param->begin;
    size_t end = param->end;
    // Iterate through every suffix in the text, determine which
    // bucket it falls into by doing a binary search across the
    // sorted list of samples, and increment a counter associated
    // with that bucket.  Also, keep one representative for each
    // bucket so that we can split it later.  We loop in ten
    // stretches so that we can print out a helpful progress
    // message.  (This step can take a long time.)
    for(TIndexOffU i = begin; i < end && i < len; i++) {
        TIndexOffU r = binarySASearch(t, i, sampleSuffs);
        if(r == std::numeric_limits<TIndexOffU>::max()) continue; // r was one of the samples
        assert_lt(r, numBuckets);
        bucketSzs[r]++;
        assert_lt(bucketSzs[r], len);
        // NOTE(review): '(i & 100)' is a bitwise AND with 0x64 (zero for many
        // i), so this looks like it was meant to be '(i % 100)'.  Behavior is
        // still correct -- any suffix works as a representative -- but
        // confirm the intended update cadence before changing it.
        if(bucketReps[r] == OFF_MASK || (i & 100) == 0) {
            bucketReps[r] = i; // clobbers previous one, but that's OK
        }
    }
}
+
+/**
+ * Select a set of bucket-delineating sample suffixes such that no
+ * bucket is greater than the requested upper limit.  Some care is
+ * taken to make each bucket's size close to the limit without
+ * going over.
+ */
+template<typename TStr>
+void KarkkainenBlockwiseSA<TStr>::buildSamples() {
+	const TStr& t = this->text();
+    TIndexOffU bsz = this->bucketSz()-1; // subtract 1 to leave room for sample
+	size_t len = this->text().length();
+	// Prepare _sampleSuffs array
+	_sampleSuffs.clear();
+	TIndexOffU numSamples = (TIndexOffU)((len/bsz)+1)<<1; // ~len/bsz x 2
+	assert_gt(numSamples, 0);
+	VMSG_NL("Reserving space for " << numSamples << " sample suffixes");
+	if(this->_passMemExc) {
+		_sampleSuffs.resizeExact(numSamples);
+		// Randomly generate samples.  Allow duplicates for now.
+		VMSG_NL("Generating random suffixes");
+		for(size_t i = 0; i < numSamples; i++) {
+#ifdef BOWTIE_64BIT_INDEX         
+			_sampleSuffs[i] = (TIndexOffU)(_randomSrc.nextU64() % len); 
+#else
+			_sampleSuffs[i] = (TIndexOffU)(_randomSrc.nextU32() % len); 
+#endif
+		}
+	} else {
+		try {
+			_sampleSuffs.resizeExact(numSamples);
+			// Randomly generate samples.  Allow duplicates for now.
+			VMSG_NL("Generating random suffixes");
+			for(size_t i = 0; i < numSamples; i++) {
+#ifdef BOWTIE_64BIT_INDEX
+				_sampleSuffs[i] = (TIndexOffU)(_randomSrc.nextU64() % len); 
+#else
+				_sampleSuffs[i] = (TIndexOffU)(_randomSrc.nextU32() % len); 
+#endif                
+			}
+		} catch(bad_alloc &e) {
+			if(this->_passMemExc) {
+				throw e; // rethrow immediately
+			} else {
+				cerr << "Could not allocate sample suffix container of " << (numSamples * OFF_SIZE) << " bytes." << endl
+				     << "Please try using a smaller number of blocks by specifying a larger --bmax or" << endl
+				     << "a smaller --bmaxdivn" << endl;
+				throw 1;
+			}
+		}
+	}
+	// Remove duplicates; very important to do this before the call to
+	// mkeyQSortSuf so that it doesn't try to calculate lexicographical
+	// relationships between very long, identical strings, which takes
+	// an extremely long time in general, and causes the stack to grow
+	// linearly with the size of the input
+	{
+		Timer timer(cout, "QSorting sample offsets, eliminating duplicates time: ", this->verbose());
+		VMSG_NL("QSorting " << _sampleSuffs.size() << " sample offsets, eliminating duplicates");
+		_sampleSuffs.sort();
+		size_t sslen = _sampleSuffs.size();
+		for(size_t i = 0; i < sslen-1; i++) {
+			if(_sampleSuffs[i] == _sampleSuffs[i+1]) {
+				_sampleSuffs.erase(i--);
+				sslen--;
+			}
+		}
+	}
+	// Multikey quicksort the samples
+	{
+		Timer timer(cout, "  Multikey QSorting samples time: ", this->verbose());
+		VMSG_NL("Multikey QSorting " << _sampleSuffs.size() << " samples");
+		this->qsort(_sampleSuffs);
+	}
+	// Calculate bucket sizes
+	VMSG_NL("Calculating bucket sizes");
+	int limit = 5;
+	// Iterate until all buckets are less than
+	while(--limit >= 0) {
+        TIndexOffU numBuckets = (TIndexOffU)_sampleSuffs.size()+1;
+        AutoArray<tthread::thread*> threads(this->_nthreads);
+        EList<BinarySortingParam<TStr> > tparams;
+        for(int tid = 0; tid < this->_nthreads; tid++) {
+            // Calculate bucket sizes by doing a binary search for each
+            // suffix and noting where it lands
+            tparams.expand();
+            try {
+                // Allocate and initialize containers for holding bucket
+                // sizes and representatives.
+                tparams.back().bucketSzs.resizeExact(numBuckets);
+                tparams.back().bucketReps.resizeExact(numBuckets);
+                tparams.back().bucketSzs.fillZero();
+                tparams.back().bucketReps.fill(OFF_MASK);
+            } catch(bad_alloc &e) {
+                if(this->_passMemExc) {
+                    throw e; // rethrow immediately
+                } else {
+                    cerr << "Could not allocate sizes, representatives (" << ((numBuckets*8)>>10) << " KB) for blocks." << endl
+                    << "Please try using a smaller number of blocks by specifying a larger --bmax or a" << endl
+                    << "smaller --bmaxdivn." << endl;
+                    throw 1;
+                }
+            }
+            tparams.back().t = &t;
+            tparams.back().sampleSuffs = &_sampleSuffs;
+            tparams.back().begin = (tid == 0 ? 0 : len / this->_nthreads * tid);
+            tparams.back().end = (tid + 1 == this->_nthreads ? len : len / this->_nthreads * (tid + 1));
+            if(this->_nthreads == 1) {
+                BinarySorting_worker<TStr>((void*)&tparams.back());
+            } else {
+                threads[tid] = new tthread::thread(BinarySorting_worker<TStr>, (void*)&tparams.back());
+            }
+        }
+        
+        if(this->_nthreads > 1) {
+            for (int tid = 0; tid < this->_nthreads; tid++) {
+                threads[tid]->join();
+            }
+        }
+        
+        EList<TIndexOffU>& bucketSzs = tparams[0].bucketSzs;
+        EList<TIndexOffU>& bucketReps = tparams[0].bucketReps;
+        for(int tid = 1; tid < this->_nthreads; tid++) {
+            for(size_t j = 0; j < numBuckets; j++) {
+                bucketSzs[j] += tparams[tid].bucketSzs[j];
+                if(bucketReps[j] == OFF_MASK) {
+                    bucketReps[j] = tparams[tid].bucketReps[j];
+                }
+            }
+        }
+		// Check for large buckets and mergeable pairs of small buckets
+		// and split/merge as necessary
+		TIndexOff added = 0;
+		TIndexOff merged = 0;
+		assert_eq(bucketSzs.size(), numBuckets);
+		assert_eq(bucketReps.size(), numBuckets);
+		{
+			Timer timer(cout, "  Splitting and merging time: ", this->verbose());
+			VMSG_NL("Splitting and merging");
+			for(TIndexOffU i = 0; i < numBuckets; i++) {
+				TIndexOffU mergedSz = bsz + 1;
+				assert(bucketSzs[(size_t)i] == 0 || bucketReps[(size_t)i] != OFF_MASK);
+				if(i < numBuckets-1) {
+					mergedSz = bucketSzs[(size_t)i] + bucketSzs[(size_t)i+1] + 1;
+				}
+				// Merge?
+				if(mergedSz <= bsz) {
+					bucketSzs[(size_t)i+1] += (bucketSzs[(size_t)i]+1);
+					// The following may look strange, but it's necessary
+					// to ensure that the merged bucket has a representative
+					bucketReps[(size_t)i+1] = _sampleSuffs[(size_t)i+added];
+					_sampleSuffs.erase((size_t)i+added);
+					bucketSzs.erase((size_t)i);
+					bucketReps.erase((size_t)i);
+					i--; // might go to -1 but ++ will overflow back to 0
+					numBuckets--;
+					merged++;
+					assert_eq(numBuckets, _sampleSuffs.size()+1-added);
+					assert_eq(numBuckets, bucketSzs.size());
+				}
+				// Split?
+				else if(bucketSzs[(size_t)i] > bsz) {
+					// Add an additional sample from the bucketReps[]
+					// set accumulated in the binarySASearch loop; this
+					// effectively splits the bucket
+					_sampleSuffs.insert(bucketReps[(size_t)i], (TIndexOffU)(i + (added++)));
+				}
+			}
+		}
+		if(added == 0) {
+			//if(this->verbose()) {
+			//	cout << "Final bucket sizes:" << endl;
+			//	cout << "  (begin): " << bucketSzs[0] << " (" << (int)(bsz - bucketSzs[0]) << ")" << endl;
+			//	for(uint32_t i = 1; i < numBuckets; i++) {
+			//		cout << "  " << bucketSzs[i] << " (" << (int)(bsz - bucketSzs[i]) << ")" << endl;
+			//	}
+			//}
+			break;
+		}
+		// Otherwise, continue until no more buckets need to be
+		// split
+		VMSG_NL("Split " << added << ", merged " << merged << "; iterating...");
+	}
+	// Do *not* force a do-over
+//	if(limit == 0) {
+//		VMSG_NL("Iterated too many times; trying again...");
+//		buildSamples();
+//	}
+	VMSG_NL("Avg bucket size: " << ((double)(len-_sampleSuffs.size()) / (_sampleSuffs.size()+1)) << " (target: " << bsz << ")");
+}
+
+/**
+ * Do a simple LCP calculation on two strings: return the length of the
+ * longest common prefix of the suffixes of t beginning at aOff and bOff.
+ * Runs in O(lcp) character comparisons; either offset may equal t.length(),
+ * in which case the corresponding suffix is empty and 0 is returned.
+ */
+template<typename T> inline
+static TIndexOffU suffixLcp(const T& t, TIndexOffU aOff, TIndexOffU bOff) {
+	TIndexOffU c = 0;
+	size_t len = t.length();
+	assert_leq(aOff, len);
+	assert_leq(bOff, len);
+	// Walk forward until one suffix is exhausted or the characters differ
+	while(aOff + c < len && bOff + c < len && t[aOff + c] == t[bOff + c]) c++;
+	return c;
+}
+
+/**
+ * Calculate the lcp between two suffixes using the difference
+ * cover as a tie-breaker.  If the tie-breaker is employed, then
+ * the calculated lcp may be an underestimate.  If the tie-breaker is
+ * employed, lcpIsSoft will be set to true (otherwise, false).
+ *
+ * Returns true iff the suffix at aOff sorts lexicographically before
+ * the suffix at bOff.  On return, lcp holds the (possibly soft) LCP.
+ */
+template<typename TStr> inline
+bool KarkkainenBlockwiseSA<TStr>::tieBreakingLcp(TIndexOffU aOff,
+                                                 TIndexOffU bOff,
+                                                 TIndexOffU& lcp,
+                                                 bool& lcpIsSoft)
+{
+	const TStr& t = this->text();
+	TIndexOffU c = 0;
+	TIndexOffU tlen = (TIndexOffU)t.length();
+	assert_leq(aOff, tlen);
+	assert_leq(bOff, tlen);
+	assert(_dc.get() != NULL);
+	// Max characters we need to compare before the difference cover can
+	// break the tie for this pair of offsets
+	uint32_t dcDist = _dc.get()->tieBreakOff(aOff, bOff);
+	lcpIsSoft = false; // hard until proven soft
+	while(c < dcDist &&    // we haven't hit the tie breaker
+	      c < tlen-aOff && // we haven't fallen off of LHS suffix
+	      c < tlen-bOff && // we haven't fallen off of RHS suffix
+	      t[aOff+c] == t[bOff+c]) // we haven't hit a mismatch
+		c++;
+	lcp = c;
+	if(c == tlen-aOff) {
+		// Fell off LHS (a), a is greater
+		return false;
+	} else if(c == tlen-bOff) {
+		// Fell off RHS (b), b is greater
+		return true;
+	} else if(c == dcDist) {
+		// Hit a tie-breaker element; lcp is now only a lower bound
+		lcpIsSoft = true;
+		assert_neq(dcDist, 0xffffffff);
+		return _dc.get()->breakTie(aOff+c, bOff+c) < 0;
+	} else {
+		// Plain character mismatch decides the comparison
+		assert_neq(t[aOff+c], t[bOff+c]);
+		return t[aOff+c] < t[bOff+c];
+	}
+}
+
+/**
+ * Lookup a suffix LCP in the given z array; if the element is not
+ * filled in then calculate it from scratch.
+ *
+ * @param t    text
+ * @param zOff index into the Z array
+ * @param off  offset of the reference suffix within t
+ * @param z    Z array (may be shorter than needed; see fallback below)
+ * @return LCP of the suffixes at off+zOff and off
+ */
+template<typename T>
+static TIndexOffU lookupSuffixZ(
+	const T& t,
+	TIndexOffU zOff,
+	TIndexOffU off,
+	const EList<TIndexOffU>& z)
+{
+	if(zOff < z.size()) {
+		// Precomputed entry available; sanity-check it in debug builds
+		TIndexOffU ret = z[zOff];
+		assert_eq(ret, suffixLcp(t, off + zOff, off));
+		return ret;
+	}
+	// Past the end of the Z array; compute the LCP directly
+	assert_leq(off + zOff, t.length());
+	return suffixLcp(t, off + zOff, off);
+}
+
+/**
+ * true -> i < cmp
+ * false -> i > cmp
+ *
+ * Compare the text suffix at offset i against the sample (bookend)
+ * suffix at offset cmp, reusing Z-box / previous-match state carried
+ * in j, k and kSoft across successive calls (the SMALLERSUFFIXES
+ * scheme from the "Fast BWT" paper; see nextBlock below).
+ *
+ * @param cmp   offset of the sample suffix
+ * @param i     offset of the text suffix being classified
+ * @param j     in/out: offset of the furthest-extending previous LHS match
+ * @param k     in/out: rightmost text position covered by a previous match
+ * @param kSoft in/out: true iff k derives from a soft (underestimated) LCP
+ * @param z     Z array computed for the suffix at cmp
+ */
+template<typename TStr> inline
+bool KarkkainenBlockwiseSA<TStr>::suffixCmp(
+	TIndexOffU cmp,
+	TIndexOffU i,
+	int64_t& j,
+	int64_t& k,
+	bool& kSoft,
+	const EList<TIndexOffU>& z)
+{
+	const TStr& t = this->text();
+	TIndexOffU len = (TIndexOffU)t.length();
+	// i is not covered by any previous match
+	TIndexOffU l; // LCP of suffixes at i and cmp (may be an underestimate)
+	if((int64_t)i > k) {
+		k = i; // so that i + lHi == kHi
+		l = 0; // erase any previous l
+		kSoft = false;
+		// To be extended
+	}
+	// i is covered by a previous match
+	else /* i <= k */ {
+		assert_gt((int64_t)i, j);
+		TIndexOffU zIdx = (TIndexOffU)(i-j);
+		assert_leq(zIdx, len-cmp);
+		if(zIdx < _dcV || _dc.get() == NULL) {
+			// Go as far as the Z-box says
+			l = lookupSuffixZ(t, zIdx, cmp, z);
+			if(i + l > len) {
+				l = len-i;
+			}
+			assert_leq(i + l, len);
+			// Possibly to be extended
+		} else {
+			// But we're past the point of no-more-Z-boxes; fall back on
+			// the difference-cover tie-breaker
+			bool ret = tieBreakingLcp(i, cmp, l, kSoft);
+			// Sanity-check tie-breaker
+			if(this->sanityCheck()) {
+				if(ret) assert(sstr_suf_lt(t, i, t, cmp, false));
+				else    assert(sstr_suf_gt(t, i, t, cmp, false));
+			}
+			j = i;
+			k = i + l;
+			if(this->sanityCheck()) {
+				if(kSoft) { assert_leq(l, suffixLcp(t, i, cmp)); }
+				else      { assert_eq (l, suffixLcp(t, i, cmp)); }
+			}
+			return ret;
+		}
+	}
+
+	// Z box extends exactly as far as previous match (or there
+	// is neither a Z box nor a previous match)
+	if((int64_t)(i + l) == k) {
+		// Extend
+		while(l < len-cmp && k < (int64_t)len && t[(size_t)(cmp+l)] == t[(size_t)k]) {
+			k++; l++;
+		}
+		j = i; // update furthest-extending LHS
+		kSoft = false;
+		assert_eq(l, suffixLcp(t, i, cmp));
+	}
+	// Z box extends further than previous match
+	else if((int64_t)(i + l) > k) {
+		l = (TIndexOffU)(k - i); // point to just after previous match
+		j = i; // update furthest-extending LHS
+		if(kSoft) {
+			// Previous match was soft; extend to make the LCP exact again
+			while(l < len-cmp && k < (int64_t)len && t[(size_t)(cmp+l)] == t[(size_t)k]) {
+				k++; l++;
+			}
+			kSoft = false;
+			assert_eq(l, suffixLcp(t, i, cmp));
+		} else assert_eq(l, suffixLcp(t, i, cmp));
+	}
+
+	// Check that calculated lcp matches actual lcp
+	if(this->sanityCheck()) {
+		if(!kSoft) {
+			// l should exactly match lcp
+			assert_eq(l, suffixLcp(t, i, cmp));
+		} else {
+			// l is an underestimate of LCP
+			assert_leq(l, suffixLcp(t, i, cmp));
+		}
+	}
+	assert_leq(l+i, len);
+	assert_leq(l, len-cmp);
+
+	// i and cmp should not be the same suffix
+	assert(l != len-cmp || i+l != len);
+
+	// Now we're ready to do a comparison on the next char
+	if(l+i != len && (
+	   l == len-cmp || // departure from paper algorithm:
+	                   // falling off pattern implies
+	                   // pattern is *greater* in our case
+	   t[i + l] < t[cmp + l]))
+	{
+		// Case 2: Text suffix is less than upper sample suffix
+#ifndef NDEBUG
+		if(this->sanityCheck()) {
+			assert(sstr_suf_lt(t, i, t, cmp, false));
+		}
+#endif
+		return true; // suffix at i is less than suffix at cmp
+	}
+	else {
+		// Case 3: Text suffix is greater than upper sample suffix
+#ifndef NDEBUG
+		if(this->sanityCheck()) {
+			assert(sstr_suf_gt(t, i, t, cmp, false));
+		}
+#endif
+		return false; // suffix at i is greater than suffix at cmp
+	}
+}
+
+/**
+ * Retrieve the next block.  This is the most performance-critical part
+ * of the blockwise suffix sorting process.
+ *
+ * Fills the per-thread bucket (or the single shared bucket when
+ * _nthreads == 1) with every text suffix that falls strictly between
+ * the (cur_block-1)-th and cur_block-th sample suffixes, multikey
+ * quicksorts it, then appends the right-hand bookend (or the '$'
+ * suffix, offset len, for the final block).  Console output is
+ * serialized through _mutex when running multithreaded.
+ */
+template<typename TStr>
+void KarkkainenBlockwiseSA<TStr>::nextBlock(int cur_block, int tid) {
+#ifndef NDEBUG
+    if(this->_nthreads > 1) {
+        assert_lt(tid, this->_itrBuckets.size());
+    }
+#endif
+    EList<TIndexOffU>& bucket = (this->_nthreads > 1 ? this->_itrBuckets[tid] : this->_itrBucket);
+    {
+        ThreadSafe ts(&_mutex, this->_nthreads > 1);
+        VMSG_NL("Getting block " << (cur_block+1) << " of " << _sampleSuffs.size()+1);
+    }
+	assert(_built);
+	assert_gt(_dcV, 3);
+	assert_leq(cur_block, _sampleSuffs.size());
+	const TStr& t = this->text();
+	TIndexOffU len = (TIndexOffU)t.length();
+	// Set up the bucket
+	bucket.clear();
+	TIndexOffU lo = OFF_MASK, hi = OFF_MASK; // bookend sample suffixes; OFF_MASK = "none"
+	if(_sampleSuffs.size() == 0) {
+		// Special case: if _sampleSuffs is 0, then multikey-quicksort
+		// everything
+        {
+            ThreadSafe ts(&_mutex, this->_nthreads > 1);
+            VMSG_NL("  No samples; assembling all-inclusive block");
+        }
+		assert_eq(0, cur_block);
+		try {
+			if(bucket.capacity() < this->bucketSz()) {
+				bucket.reserveExact(len+1);
+			}
+			bucket.resize(len);
+			for(TIndexOffU i = 0; i < len; i++) {
+				bucket[i] = i;
+			}
+		} catch(bad_alloc &e) {
+			if(this->_passMemExc) {
+				throw e; // rethrow immediately
+			} else {
+				cerr << "Could not allocate a master suffix-array block of " << ((len+1) * 4) << " bytes" << endl
+				     << "Please try using a larger number of blocks by specifying a smaller --bmax or" << endl
+				     << "a larger --bmaxdivn" << endl;
+				throw 1;
+			}
+		}
+	} else {
+		try {
+            {
+                ThreadSafe ts(&_mutex, this->_nthreads > 1);
+                VMSG_NL("  Reserving size (" << this->bucketSz() << ") for bucket " << (cur_block+1));
+            }
+			// BTL: Add a +100 fudge factor; there seem to be instances
+			// where a bucket ends up having one more elt than bucketSz()
+			if(bucket.size() < this->bucketSz()+100) {
+				bucket.reserveExact(this->bucketSz()+100);
+			}
+		} catch(bad_alloc &e) {
+			if(this->_passMemExc) {
+				throw e; // rethrow immediately
+			} else {
+				cerr << "Could not allocate a suffix-array block of " << ((this->bucketSz()+1) * 4) << " bytes" << endl;
+				cerr << "Please try using a larger number of blocks by specifying a smaller --bmax or" << endl
+				     << "a larger --bmaxdivn" << endl;
+				throw 1;
+			}
+		}
+		// Select upper and lower bounds from _sampleSuffs[] and
+		// calculate the Z array up to the difference-cover periodicity
+		// for both.  Be careful about first/last buckets.
+		EList<TIndexOffU> zLo(EBWTB_CAT), zHi(EBWTB_CAT);
+		assert_geq(cur_block, 0);
+		assert_leq((size_t)cur_block, _sampleSuffs.size());
+		bool first = (cur_block == 0);
+		bool last  = ((size_t)cur_block == _sampleSuffs.size());
+		try {
+			// Timer timer(cout, "  Calculating Z arrays time: ", this->verbose());
+            {
+                ThreadSafe ts(&_mutex, this->_nthreads > 1);
+                VMSG_NL("  Calculating Z arrays for bucket " << (cur_block+1));
+            }
+			if(!last) {
+				// Not the last bucket
+				assert_lt(cur_block, _sampleSuffs.size());
+				hi = _sampleSuffs[cur_block];
+				zHi.resizeExact(_dcV);
+				zHi.fillZero();
+				assert_eq(zHi[0], 0);
+				calcZ(t, hi, zHi, this->verbose(), this->sanityCheck());
+			}
+			if(!first) {
+				// Not the first bucket
+				assert_gt(cur_block, 0);
+				assert_leq(cur_block, _sampleSuffs.size());
+				lo = _sampleSuffs[cur_block-1];
+				zLo.resizeExact(_dcV);
+				zLo.fillZero();
+				assert_gt(_dcV, 3);
+				assert_eq(zLo[0], 0);
+				calcZ(t, lo, zLo, this->verbose(), this->sanityCheck());
+			}
+		} catch(bad_alloc &e) {
+			if(this->_passMemExc) {
+				throw e; // rethrow immediately
+			} else {
+				cerr << "Could not allocate a z-array of " << (_dcV * 4) << " bytes" << endl;
+				cerr << "Please try using a larger number of blocks by specifying a smaller --bmax or" << endl
+				     << "a larger --bmaxdivn" << endl;
+				throw 1;
+			}
+		}
+
+		// This is the most critical loop in the algorithm; this is where
+		// we iterate over all suffixes in the text and pick out those that
+		// fall into the current bucket.
+		//
+		// This loop is based on the SMALLERSUFFIXES function outlined on
+		// p7 of the "Fast BWT" paper
+		//
+		int64_t kHi = -1, kLo = -1;
+		int64_t jHi = -1, jLo = -1;
+		bool kHiSoft = false, kLoSoft = false;
+		assert_eq(0, bucket.size());
+		{
+			// Timer timer(cout, "  Block accumulator loop time: ", this->verbose());
+            {
+                ThreadSafe ts(&_mutex, this->_nthreads > 1);
+                VMSG_NL("  Entering block accumulator loop for bucket " << (cur_block+1) << ":");
+            }
+			// Outer loop exists only to emit a progress message every ~10%
+			TIndexOffU lenDiv10 = (len + 9) / 10;
+			for(TIndexOffU iten = 0, ten = 0; iten < len; iten += lenDiv10, ten++) {
+                TIndexOffU itenNext = iten + lenDiv10;
+                {
+                    ThreadSafe ts(&_mutex, this->_nthreads > 1);
+                    if(ten > 0) VMSG_NL("  bucket " << (cur_block+1) << ": " << (ten * 10) << "%");
+                }
+                for(TIndexOffU i = iten; i < itenNext && i < len; i++) {
+                    assert_lt(jLo, (TIndexOff)i); assert_lt(jHi, (TIndexOff)i);
+                    // Advance the upper-bound comparison by one character
+                    if(i == hi || i == lo) continue; // equal to one of the bookends
+                    if(hi != OFF_MASK && !suffixCmp(hi, i, jHi, kHi, kHiSoft, zHi)) {
+                        continue; // not in the bucket
+                    }
+                    if(lo != OFF_MASK && suffixCmp(lo, i, jLo, kLo, kLoSoft, zLo)) {
+                        continue; // not in the bucket
+                    }
+                    // In the bucket! - add it
+                    assert_lt(i, len);
+                    try {
+                        bucket.push_back(i);
+                    } catch(bad_alloc &e) {
+                        cerr << "Could not append element to block of " << ((bucket.size()) * OFF_SIZE) << " bytes" << endl;
+                        if(this->_passMemExc) {
+                            throw e; // rethrow immediately
+                        } else {
+                            cerr << "Please try using a larger number of blocks by specifying a smaller --bmax or" << endl
+                            << "a larger --bmaxdivn" << endl;
+                            throw 1;
+                        }
+                    }
+                    // Not necessarily true; we allow overflowing buckets
+                    // since we can't guarantee that a good set of sample
+                    // suffixes can be found in a reasonable amount of time
+                    //assert_lt(bucket.size(), this->bucketSz());
+                }
+            } // end loop over all suffixes of t
+            {
+                ThreadSafe ts(&_mutex, this->_nthreads > 1);
+                VMSG_NL("  bucket " << (cur_block+1) << ": 100%");
+            }
+		}
+	} // end else clause of if(_sampleSuffs.size() == 0)
+	// Sort the bucket
+	if(bucket.size() > 0) {
+		Timer timer(cout, "  Sorting block time: ", this->verbose());
+        {
+            ThreadSafe ts(&_mutex, this->_nthreads > 1);
+            VMSG_NL("  Sorting block of length " << bucket.size() << " for bucket " << (cur_block+1));
+        }
+		this->qsort(bucket);
+	}
+	if(hi != OFF_MASK) {
+		// Not the final bucket; throw in the sample on the RHS
+		bucket.push_back(hi);
+	} else {
+		// Final bucket; throw in $ suffix
+		bucket.push_back(len);
+	}
+    {
+        ThreadSafe ts(&_mutex, this->_nthreads > 1);
+        VMSG_NL("Returning block of " << bucket.size() << " for bucket " << (cur_block+1));
+    }
+}
+
+#endif /*BLOCKWISE_SA_H_*/
diff --git a/bt2_idx.cpp b/bt2_idx.cpp
new file mode 100644
index 0000000..8c82306
--- /dev/null
+++ b/bt2_idx.cpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <string>
+#include <stdexcept>
+#include <iostream>
+#include <fstream>
+#include <stdlib.h>
+#include "bt2_idx.h"
+
+using namespace std;
+
+const std::string gEbwt_ext("cf");
+
+/**
+ * Try to find the Centrifuge index specified by the user.  First try
+ * the exact path given by the user, then try the provided string
+ * appended onto "$CENTRIFUGE_INDEXES/".  Existence is probed by
+ * attempting to open the first index file, "<base>.1.cf".  Throws 1
+ * (after printing an error) if neither location works.
+ *
+ * @param cmdline      command line (currently unused here)
+ * @param ebwtFileBase index basename supplied by the user
+ * @param verbose      if true, print each path as it is tried
+ * @return adjusted basename under which the index files were found
+ */
+string adjustEbwtBase(const string& cmdline,
+					  const string& ebwtFileBase,
+					  bool verbose = false)
+{
+	string str = ebwtFileBase;
+	ifstream in;
+	if(verbose) cout << "Trying " << str.c_str() << endl;
+	in.open((str + ".1." + gEbwt_ext).c_str(), ios_base::in | ios::binary);
+	if(!in.is_open()) {
+		if(verbose) cout << "  didn't work" << endl;
+		in.close();
+		// Fall back on the $CENTRIFUGE_INDEXES environment variable
+		if(getenv("CENTRIFUGE_INDEXES") != NULL) {
+			str = string(getenv("CENTRIFUGE_INDEXES")) + "/" + ebwtFileBase;
+			if(verbose) cout << "Trying " << str.c_str() << endl;
+			in.open((str + ".1." + gEbwt_ext).c_str(), ios_base::in | ios::binary);
+			if(!in.is_open()) {
+				if(verbose) cout << "  didn't work" << endl;
+				in.close();
+			} else {
+				if(verbose) cout << "  worked" << endl;
+			}
+		}
+	}
+	if(!in.is_open()) {
+		cerr << "Could not locate a Centrifuge index corresponding to basename \"" << ebwtFileBase.c_str() << "\"" << endl;
+		throw 1;
+	}
+	return str;
+}
+
+// Holds the most recent I/O error message; presumably paired with an
+// extern declaration in a header -- confirm against bt2_idx.h
+string gLastIOErrMsg;
+
+// Numeric code per taxonomy rank, indexed up to RANK_MAX (from taxonomy.h).
+// NOTE(review): zero-initialized static storage; assumed to be populated
+// elsewhere before use -- confirm at the fill site
+uint8_t tax_rank_num[RANK_MAX];
diff --git a/bt2_idx.h b/bt2_idx.h
new file mode 100644
index 0000000..b265361
--- /dev/null
+++ b/bt2_idx.h
@@ -0,0 +1,3940 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef EBWT_H_
+#define EBWT_H_
+
+#include <stdint.h>
+#include <string.h>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <memory>
+#include <fcntl.h>
+#include <math.h>
+#include <errno.h>
+#include <stdexcept>
+#include <sys/stat.h>
+#include <map>
+#include <set>
+#ifdef BOWTIE_MM
+#include <sys/mman.h>
+#include <sys/shm.h>
+#endif
+#include "shmem.h"
+#include "alphabet.h"
+#include "assert_helpers.h"
+#include "bitpack.h"
+#include "blockwise_sa.h"
+#include "endian_swap.h"
+#include "word_io.h"
+#include "random_source.h"
+#include "ref_read.h"
+#include "threading.h"
+#include "str_util.h"
+#include "mm.h"
+#include "timer.h"
+#include "reference.h"
+#include "search_globals.h"
+#include "ds.h"
+#include "random_source.h"
+#include "mem_ids.h"
+#include "btypes.h"
+#include "taxonomy.h"
+
+#ifdef POPCNT_CAPABILITY
+#include "processor_support.h"
+#endif
+
+using namespace std;
+
+// From ccnt_lut.cpp, automatically generated by gen_lookup_tables.pl
+extern uint8_t cCntLUT_4[4][4][256];
+extern uint8_t cCntLUT_4_rev[4][4][256];
+
+// One 64-bit mask per 2-bit nucleotide code: 0 -> all-ones, 1 -> 10
+// repeated, 2 -> 01 repeated, 3 -> all-zeros.  Presumably used for
+// word-parallel base matching/counting over packed BWT words -- confirm
+// at the use site.
+static const uint64_t c_table[4] = {
+    0xffffffffffffffff,
+    0xaaaaaaaaaaaaaaaa,
+    0x5555555555555555,
+    0x0000000000000000
+};
+
+#ifndef VMSG_NL
+// Emit a verbose message followed by a newline; no-op unless the
+// enclosing object's verbose() returns true.  Usable only inside
+// members of classes providing verbose() accessors/sinks.
+#define VMSG_NL(...) \
+if(this->verbose()) { \
+	stringstream tmp; \
+	tmp << __VA_ARGS__ << endl; \
+	this->verbose(tmp.str()); \
+}
+#endif
+
+#ifndef VMSG
+// Same as VMSG_NL but without the trailing newline.
+#define VMSG(...) \
+if(this->verbose()) { \
+	stringstream tmp; \
+	tmp << __VA_ARGS__; \
+	this->verbose(tmp.str()); \
+}
+#endif
+
+/**
+ * Flags describing type of Ebwt.  Values are distinct bits (2, 4) so
+ * they can be combined in a single flags word.
+ */
+enum EBWT_FLAGS {
+	EBWT_COLOR = 2,     // true -> Ebwt is colorspace
+	EBWT_ENTIRE_REV = 4 // true -> reverse Ebwt is the whole
+	                    // concatenated string reversed, rather than
+						// each stretch reversed
+};
+
+/**
+ * Extended Burrows-Wheeler transform header.  This together with the
+ * actual data arrays and other text-specific parameters defined in
+ * class Ebwt constitute the entire Ebwt.
+ *
+ * All derived quantities (ftab/eftab/offs lengths, side geometry) are
+ * computed once in init() from the five primary parameters.
+ */
+template <typename index_t = uint32_t>
+class EbwtParams {
+
+public:
+	EbwtParams() { }
+
+	EbwtParams(
+		index_t len,
+		int32_t lineRate,
+		int32_t offRate,
+		int32_t ftabChars,
+		bool color,
+		bool entireReverse)
+	{
+		init(len, lineRate, offRate, ftabChars, color, entireReverse);
+	}
+
+	EbwtParams(const EbwtParams& eh) {
+		init(eh._len, eh._lineRate, eh._offRate,
+		     eh._ftabChars, eh._color, eh._entireReverse);
+	}
+
+	// (Re)derive every field from the primary parameters.
+	void init(
+		index_t len,
+		int32_t lineRate,
+		int32_t offRate,
+		int32_t ftabChars,
+		bool color,
+		bool entireReverse)
+	{
+		_color = color;
+		_entireReverse = entireReverse;
+		_len = len;
+		_bwtLen = _len + 1;
+		_sz = (len+3)/4;       // 4 characters packed per byte
+		_bwtSz = (len/4 + 1);
+		_lineRate = lineRate;
+		_origOffRate = offRate;
+		_offRate = offRate;
+		_offMask = std::numeric_limits<index_t>::max() << _offRate;
+		_ftabChars = ftabChars;
+		_eftabLen = _ftabChars*2;
+		_eftabSz = _eftabLen*sizeof(index_t);
+		_ftabLen = (1 << (_ftabChars*2))+1;
+		_ftabSz = _ftabLen*sizeof(index_t);
+		// One sampled SA offset per 2^_offRate BWT rows (rounded up)
+		_offsLen = (_bwtLen + (1 << _offRate) - 1) >> _offRate;
+		_offsSz = _offsLen*sizeof(index_t);
+		_lineSz = 1 << _lineRate;
+		_sideSz = _lineSz * 1 /* lines per side */;
+		_sideBwtSz = _sideSz - (sizeof(index_t) * 4);
+		_sideBwtLen = _sideBwtSz*4;
+		_numSides = (_bwtSz+(_sideBwtSz)-1)/(_sideBwtSz);
+		_numLines = _numSides * 1 /* lines per side */;
+		_ebwtTotLen = _numSides * _sideSz;
+		_ebwtTotSz = _ebwtTotLen;
+		assert(repOk());
+	}
+
+	index_t len() const           { return _len; }
+	index_t lenNucs() const       { return _len + (_color ? 1 : 0); }
+	index_t bwtLen() const        { return _bwtLen; }
+	index_t sz() const            { return _sz; }
+	index_t bwtSz() const         { return _bwtSz; }
+	int32_t lineRate() const      { return _lineRate; }
+	int32_t origOffRate() const   { return _origOffRate; }
+	int32_t offRate() const       { return _offRate; }
+	index_t offMask() const       { return _offMask; }
+	int32_t ftabChars() const     { return _ftabChars; }
+	index_t eftabLen() const      { return _eftabLen; }
+	index_t eftabSz() const       { return _eftabSz; }
+	index_t ftabLen() const       { return _ftabLen; }
+	index_t ftabSz() const        { return _ftabSz; }
+	index_t offsLen() const       { return _offsLen; }
+	index_t offsSz() const        { return _offsSz; }
+	index_t lineSz() const        { return _lineSz; }
+	index_t sideSz() const        { return _sideSz; }
+	index_t sideBwtSz() const     { return _sideBwtSz; }
+	index_t sideBwtLen() const    { return _sideBwtLen; }
+	index_t numSides() const      { return _numSides; }
+	index_t numLines() const      { return _numLines; }
+	index_t ebwtTotLen() const    { return _ebwtTotLen; }
+	index_t ebwtTotSz() const     { return _ebwtTotSz; }
+	bool color() const            { return _color; }
+	bool entireReverse() const    { return _entireReverse; }
+
+	/**
+	 * Set a new suffix-array sampling rate, which involves updating
+	 * rate, mask, sample length, and sample size.
+	 */
+	void setOffRate(int __offRate) {
+		_offRate = __offRate;
+		_offMask = std::numeric_limits<index_t>::max() << _offRate;
+		_offsLen = (_bwtLen + (1 << _offRate) - 1) >> _offRate;
+		_offsSz = _offsLen*sizeof(index_t);
+	}
+
+#ifndef NDEBUG
+	/// Check that this EbwtParams is internally consistent
+	bool repOk() const {
+		// assert_gt(_len, 0);
+		assert_gt(_lineRate, 3);
+		assert_geq(_offRate, 0);
+		assert_leq(_ftabChars, 16);
+		assert_geq(_ftabChars, 1);
+        assert_lt(_lineRate, 32);
+		assert_lt(_ftabChars, 32);
+		assert_eq(0, _ebwtTotSz % _lineSz);
+		return true;
+	}
+#endif
+
+	/**
+	 * Pretty-print the header contents to the given output stream.
+	 */
+	void print(ostream& out) const {
+		out << "Headers:" << endl
+		    << "    len: "          << _len << endl
+		    << "    bwtLen: "       << _bwtLen << endl
+		    << "    sz: "           << _sz << endl
+		    << "    bwtSz: "        << _bwtSz << endl
+		    << "    lineRate: "     << _lineRate << endl
+		    << "    offRate: "      << _offRate << endl
+		    << "    offMask: 0x"    << hex << _offMask << dec << endl
+		    << "    ftabChars: "    << _ftabChars << endl
+		    << "    eftabLen: "     << _eftabLen << endl
+		    << "    eftabSz: "      << _eftabSz << endl
+		    << "    ftabLen: "      << _ftabLen << endl
+		    << "    ftabSz: "       << _ftabSz << endl
+		    << "    offsLen: "      << _offsLen << endl
+		    << "    offsSz: "       << _offsSz << endl
+		    << "    lineSz: "       << _lineSz << endl
+		    << "    sideSz: "       << _sideSz << endl
+		    << "    sideBwtSz: "    << _sideBwtSz << endl
+		    << "    sideBwtLen: "   << _sideBwtLen << endl
+		    << "    numSides: "     << _numSides << endl
+		    << "    numLines: "     << _numLines << endl
+		    << "    ebwtTotLen: "   << _ebwtTotLen << endl
+		    << "    ebwtTotSz: "    << _ebwtTotSz << endl
+		    << "    color: "        << _color << endl
+		    << "    reverse: "      << _entireReverse << endl;
+	}
+
+	// Fields are public; init()/setOffRate() keep them mutually consistent.
+	index_t _len;          // length of the original text
+	index_t _bwtLen;       // length of the BWT (_len + 1)
+	index_t _sz;           // bytes needed for 2-bit-packed text
+	index_t _bwtSz;        // bytes needed for 2-bit-packed BWT
+	int32_t _lineRate;     // log2 of cache-line size
+	int32_t _origOffRate;  // offRate as originally built
+	int32_t _offRate;      // log2 of SA sampling period (current)
+	index_t _offMask;      // mask selecting sampled rows
+	int32_t _ftabChars;    // chars per ftab lookup
+	index_t _eftabLen;
+	index_t _eftabSz;
+	index_t _ftabLen;
+	index_t _ftabSz;
+	index_t _offsLen;      // number of sampled SA offsets
+	index_t _offsSz;
+	index_t _lineSz;       // bytes per line
+	index_t _sideSz;       // bytes per side
+	index_t _sideBwtSz;    // BWT payload bytes per side
+	index_t _sideBwtLen;   // BWT chars per side
+	index_t _numSides;
+	index_t _numLines;
+	index_t _ebwtTotLen;
+	index_t _ebwtTotSz;
+	bool     _color;
+	bool     _entireReverse;
+};
+
+/**
+ * Exception to throw when a file-related error occurs.
+ */
+class EbwtFileOpenException : public std::runtime_error {
+public:
+	EbwtFileOpenException(const std::string& msg = "") :
+		std::runtime_error(msg) { }
+};
+
+/**
+ * Calculate size of file with given name.  Returns 0 if the file
+ * cannot be opened (note: indistinguishable from a genuinely empty
+ * file); otherwise returns end position minus begin position in bytes.
+ */
+static inline int64_t fileSize(const char* name) {
+	std::ifstream f;
+	f.open(name, std::ios_base::binary | std::ios_base::in);
+	if (!f.good() || f.eof() || !f.is_open()) { return 0; }
+	f.seekg(0, std::ios_base::beg);
+	std::ifstream::pos_type begin_pos = f.tellg();
+	f.seekg(0, std::ios_base::end);
+	return static_cast<int64_t>(f.tellg() - begin_pos);
+}
+
+/**
+ * Encapsulates a location in the bwt text in terms of the side it
+ * occurs in and its offset within the side.
+ *
+ * A default-constructed SideLocus is invalid (_bp == -1); call
+ * initFromRow() or initFromTopBot() before use.
+ */
+template <typename index_t = uint32_t>
+struct SideLocus {
+	SideLocus() :
+	_sideByteOff(0),
+	_sideNum(0),
+	_charOff(0),
+	_by(-1),
+	_bp(-1) { }
+
+	/**
+	 * Construct from row and other relevant information about the Ebwt.
+	 */
+	SideLocus(index_t row, const EbwtParams<index_t>& ep, const uint8_t* ebwt) {
+		initFromRow(row, ep, ebwt);
+	}
+
+	/**
+	 * Init two SideLocus objects from a top/bot pair, using the result
+	 * from one call to initFromRow to possibly avoid a second call.
+	 */
+	static void initFromTopBot(
+		index_t top,
+		index_t bot,
+		const EbwtParams<index_t>& ep,
+		const uint8_t* ebwt,
+		SideLocus& ltop,
+		SideLocus& lbot)
+	{
+		const index_t sideBwtLen = ep._sideBwtLen;
+		assert_gt(bot, top);
+		ltop.initFromRow(top, ep, ebwt);
+		index_t spread = bot - top;
+		// Many cache misses on the following lines
+		if(ltop._charOff + spread < sideBwtLen) {
+			// bot lands in the same side as top; derive its fields
+			// arithmetically instead of a second full initFromRow()
+			lbot._charOff = ltop._charOff + spread;
+			lbot._sideNum = ltop._sideNum;
+			lbot._sideByteOff = ltop._sideByteOff;
+			lbot._by = (int)(lbot._charOff >> 2);
+			assert_lt(lbot._by, (int)ep._sideBwtSz);
+			lbot._bp = lbot._charOff & 3;
+		} else {
+			lbot.initFromRow(bot, ep, ebwt);
+		}
+	}
+
+	/**
+	 * Calculate SideLocus based on a row and other relevant
+	 * information about the shape of the Ebwt.
+	 */
+	void initFromRow(index_t row, const EbwtParams<index_t>& ep, const uint8_t* ebwt) {
+		const index_t sideSz      = ep._sideSz;
+		// Side length is hard-coded for now; this allows the compiler
+		// to do clever things to accelerate / and %.
+		_sideNum                  = row / ep._sideBwtLen;
+		assert_lt(_sideNum, ep._numSides);
+		_charOff                  = row % ep._sideBwtLen;
+		_sideByteOff              = _sideNum * sideSz;
+		assert_leq(row, ep._len);
+		assert_leq(_sideByteOff + sideSz, ep._ebwtTotSz);
+		// Tons of cache misses on the next line
+		_by = (int)(_charOff >> 2); // byte within side
+		assert_lt(_by, (int)ep._sideBwtSz);
+		_bp = _charOff & 3;  // bit-pair within byte
+	}
+	
+	/**
+	 * Transform this SideLocus to refer to the next side (i.e. the one
+	 * corresponding to the next side downstream).  Set all cursors to
+	 * point to the beginning of the side.
+	 */
+	void nextSide(const EbwtParams<index_t>& ep) {
+		assert(valid());
+		_sideByteOff += ep.sideSz();
+		_sideNum++;
+		_by = _bp = _charOff = 0;
+		assert(valid());
+	}
+
+	/**
+	 * Return true iff this is an initialized SideLocus
+	 */
+	bool valid() const {
+		if(_bp != -1) {
+			return true;
+		}
+		return false;
+	}
+	
+	/**
+	 * Convert locus to BW row it corresponds to.
+	 */
+    index_t toBWRow() const;
+	
+#ifndef NDEBUG
+	/**
+	 * Check that SideLocus is internally consistent and consistent
+	 * with the (provided) EbwtParams.
+	 */
+	bool repOk(const EbwtParams<index_t>& ep) const {
+		ASSERT_ONLY(index_t row = toBWRow());
+		assert_leq(row, ep._len);
+		assert_range(-1, 3, _bp);
+		assert_range(0, (int)ep._sideBwtSz, _by);
+		return true;
+	}
+#endif
+
+	/// Make this look like an invalid SideLocus
+	void invalidate() {
+		_bp = -1;
+	}
+
+	/**
+	 * Return a read-only pointer to the beginning of the top side.
+	 */
+	const uint8_t *side(const uint8_t* ebwt) const {
+		return ebwt + _sideByteOff;
+	}
+    
+    /**
+	 * Return a read-only pointer to the beginning of the next side,
+	 * or NULL if this is already the last side.
+	 */
+	const uint8_t *next_side(const EbwtParams<index_t>& ep, const uint8_t* ebwt) const {
+        if(_sideByteOff + ep._sideSz < ep._ebwtTotSz) {
+            return ebwt + _sideByteOff + ep._sideSz;
+        } else {
+            return NULL;
+        }
+	}
+    
+	index_t _sideByteOff; // offset of top side within ebwt[]
+	index_t _sideNum;     // index of side
+	index_t _charOff;     // character offset within side
+	int32_t _by;          // byte within side (not adjusted for bw sides)
+	int32_t _bp;          // bitpair within byte (not adjusted for bw sides)
+};
+
+/**
+ * Convert locus to BW row it corresponds to.
+ */
+template <typename index_t>
+inline index_t SideLocus<index_t>::toBWRow() const {
+    if(sizeof(index_t) == 8) {
+        return _sideNum * (512 - 16 * sizeof(index_t)) + _charOff;
+    } else {
+        return _sideNum * (256 - 16 * sizeof(index_t)) + _charOff;
+    }
+}
+
+template <>
+inline uint64_t SideLocus<uint64_t>::toBWRow() const {
+    return _sideNum * (512 - 16 * sizeof(uint64_t)) + _charOff;
+}
+
+template <>
+inline uint32_t SideLocus<uint32_t>::toBWRow() const {
+    return _sideNum * (256 - 16 * sizeof(uint32_t)) + _charOff;
+}
+
+template <>
+inline uint16_t SideLocus<uint16_t>::toBWRow() const {
+    return _sideNum * (256 - 16 * sizeof(uint16_t)) + _charOff;
+}
+
+#ifdef POPCNT_CAPABILITY   // wrapping of "struct"
+struct USE_POPCNT_GENERIC {
+#endif
+    // Use this standard bit-bashing population count
+    inline static int pop64(uint64_t x) {
+        // Lots of cache misses on following lines (>10K)
+        x = x - ((x >> 1) & 0x5555555555555555llu);
+        x = (x & 0x3333333333333333llu) + ((x >> 2) & 0x3333333333333333llu);
+        x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0Fllu;
+        x = x + (x >> 8);
+        x = x + (x >> 16);
+        x = x + (x >> 32);
+        return (int)(x & 0x3Fllu);
+    }
+#ifdef POPCNT_CAPABILITY  // wrapping a "struct"
+};
+#endif
+
#ifdef POPCNT_CAPABILITY
// Hardware population count using the x86-64 POPCNT instruction.
// Selected at runtime only when ProcessorSupport::POPCNTenabled()
// reports the CPU supports it (see _usePOPCNTinstruction).
struct USE_POPCNT_INSTRUCTION {
    inline static int pop64(uint64_t x) {
        int64_t count;
        // "=&r" marks the output early-clobber so the compiler won't
        // allocate it to the same register as the input.
        asm ("popcntq %[x],%[count]\n": [count] "=&r" (count): [x] "r" (x));
        return (int)count;
    }
};
#endif
+
+/**
+ * Tricky-bit-bashing bitpair counting for given two-bit value (0-3)
+ * within a 64-bit argument.
+ */
+#ifdef POPCNT_CAPABILITY
+template<typename Operation>
+#endif
+inline static int countInU64(int c, uint64_t dw) {
+    uint64_t c0 = c_table[c];
+	uint64_t x0 = dw ^ c0;
+    uint64_t x1 = (x0 >> 1);
+    uint64_t x2 = x1 & (0x5555555555555555);
+    uint64_t x3 = x0 & x2;
+#ifdef POPCNT_CAPABILITY
+    uint64_t tmp = Operation().pop64(x3);
+#else
+    uint64_t tmp = pop64(x3);
+#endif
+    return (int) tmp;
+}
+
+// Forward declarations for Ebwt class
+class EbwtSearchParams;
+
+/**
+ * Extended Burrows-Wheeler transform data.
+ *
+ * An Ebwt may be transferred to and from RAM with calls to
+ * evictFromMemory() and loadIntoMemory().  By default, a newly-created
+ * Ebwt is not loaded into memory; if the user would like to use a
+ * newly-created Ebwt to answer queries, they must first call
+ * loadIntoMemory().
+ */
+template <class index_t = uint32_t>
+class Ebwt {
+public:
	// Shared ctor-initializer fragment expanded by every Ebwt
	// constructor.  Records endianness and user options, seeds the
	// z-offsets with sentinel "unset" values, and default-constructs
	// the containers with the EBWT_CAT memory category.
	#define Ebwt_INITS \
	    _toBigEndian(currentlyBigEndian()), \
	    _overrideOffRate(overrideOffRate), \
	    _verbose(verbose), \
	    _passMemExc(passMemExc), \
	    _sanity(sanityCheck), \
	    fw_(fw), \
	    _in1(NULL), \
	    _in2(NULL), \
	    _zOff(std::numeric_limits<index_t>::max()), \
	    _zEbwtByteOff(std::numeric_limits<index_t>::max()), \
	    _zEbwtBpOff(-1), \
	    _nPat(0), \
	    _nFrag(0), \
	    _plen(EBWT_CAT), \
	    _rstarts(EBWT_CAT), \
	    _fchr(EBWT_CAT), \
	    _ftab(EBWT_CAT), \
	    _eftab(EBWT_CAT), \
        _offw(false), \
	    _offs(EBWT_CAT), \
        _offsw(EBWT_CAT), \
	    _ebwt(EBWT_CAT), \
	    _useMm(false), \
	    useShmem_(false), \
	    _refnames(EBWT_CAT), \
	    mmFile1_(NULL), \
	    mmFile2_(NULL), \
        _compressed(false)
+
	/// Construct an Ebwt by reading a previously-built index.  Unless
	/// 'skipLoading' is set, the header (and optionally SA sample,
	/// ftab and rstarts) is read from the <in>.1/.2 files; the <in>.3
	/// file -- holding the UID->taxonomy-ID conversion table, the
	/// taxonomy tree, the name table and the genome-size table -- is
	/// always read.
	Ebwt(const string& in,
	     int color,
		 int needEntireReverse,
	     bool fw,
	     int32_t overrideOffRate, // = -1,
	     int32_t offRatePlus, // = -1,
	     bool useMm, // = false,
	     bool useShmem, // = false,
	     bool mmSweep, // = false,
	     bool loadNames, // = false,
		 bool loadSASamp, // = true,
		 bool loadFtab, // = true,
		 bool loadRstarts, // = true,
	     bool verbose, // = false,
	     bool startVerbose, // = false,
	     bool passMemExc, // = false,
	     bool sanityCheck, // = false)
		 bool skipLoading = false) : 
	     Ebwt_INITS
	{
		assert(!useMm || !useShmem);
        
#ifdef POPCNT_CAPABILITY
        // Prefer the hardware POPCNT instruction when the CPU has it
        ProcessorSupport ps;
        _usePOPCNTinstruction = ps.POPCNTenabled();
#endif
        
		packed_ = false;
		_useMm = useMm;
		useShmem_ = useShmem;
		_in1Str = in + ".1." + gEbwt_ext;
		_in2Str = in + ".2." + gEbwt_ext;
		
		if(!skipLoading) {
			readIntoMemory(
						   color,       // expect index to be colorspace?
						   fw ? -1 : needEntireReverse, // need REF_READ_REVERSE
						   loadSASamp,  // load the SA sample portion?
						   loadFtab,    // load the ftab & eftab?
						   loadRstarts, // load the rstarts array?
						   true,        // stop after loading the header portion?
						   &_eh,        // params
						   mmSweep,     // mmSweep
						   loadNames,   // loadNames
						   startVerbose); // startVerbose
			// If the offRate has been overridden, reflect that in the
			// _eh._offRate field
			if(offRatePlus > 0 && _overrideOffRate == -1) {
				_overrideOffRate = _eh._offRate + offRatePlus;
			}
			if(_overrideOffRate > _eh._offRate) {
				_eh.setOffRate(_overrideOffRate);
				assert_eq(_overrideOffRate, _eh._offRate);
			}
			assert(repOk());
		}
        
        // Read conversion table, genome size table, and taxonomy tree
        string in3Str = in + ".3." + gEbwt_ext;
        if(verbose || startVerbose) cerr << "Opening \"" << in3Str.c_str() << "\"" << endl;
        ifstream in3(in3Str.c_str(), ios::binary);
        if(!in3.good()) {
            // NOTE(review): an open failure only warns; the reads below
            // then yield empty tables -- confirm this is intended
            cerr << "Could not open index file " << in3Str.c_str() << endl;
        }
        
        initial_tax_rank_num();
        
        set<uint64_t> leaves;
        size_t num_cids = 0; // number of compressed sequences
        _uid_to_tid.clear();
        // Consume (and discard) the endianness sentinel written by the
        // build-time constructor (a 32-bit value of 1).
        readU32(in3, this->toBe());
        uint64_t nref = readIndex<uint64_t>(in3, this->toBe());
        if(nref > 0) {
            while(!in3.eof()) {
                string uid;
                uint64_t tid;
                // Each record: NUL/newline-terminated UID string
                // followed by its 64-bit taxonomy ID
                while(true) {
                    char c = '\0';
                    in3 >> c;
                    if(c == '\0' || c == '\n') break;
                    uid.push_back(c);
                }
                // "cid"-prefixed UIDs mark compressed sequences
                if(uid.find("cid") == 0) {
                    num_cids++;
                }
                tid = readIndex<uint64_t>(in3, this->toBe());
                _uid_to_tid.expand();
                _uid_to_tid.back().first = uid;
                _uid_to_tid.back().second = tid;
                leaves.insert(tid);
                if(nref == _uid_to_tid.size()) break;
            }
            assert_eq(nref, _uid_to_tid.size());
        }
        
        // Heuristic: many "cid" entries => treat index as compressed
        if(num_cids >= 10) {
            this->_compressed = true;
        }
        
        // Taxonomy tree: tid -> (parent tid, rank); nodes whose tid
        // appeared in the conversion table are flagged as leaves
        _tree.clear();
        uint64_t ntid = readIndex<uint64_t>(in3, this->toBe());
        if(ntid > 0) {
            while(!in3.eof()) {
                TaxonomyNode node;
                uint64_t tid = readIndex<uint64_t>(in3, this->toBe());
                node.parent_tid = readIndex<uint64_t>(in3, this->toBe());
                node.rank = readIndex<uint16_t>(in3, this->toBe());
                node.leaf = (leaves.find(tid) != leaves.end());
                _tree[tid] = node;
                if(ntid == _tree.size()) break;
            }
            assert_eq(ntid, _tree.size());
        }
        
        // Name table: tid -> scientific name ('@' appears to stand in
        // for spaces in the stored names -- restore them here)
        _name.clear();
        uint64_t nname = readIndex<uint64_t>(in3, this->toBe());
        if(nname > 0) {
            string name;
            while(!in3.eof()) {
                uint64_t tid = readIndex<uint64_t>(in3, this->toBe());
                in3 >> name;
                in3.seekg(1, ios_base::cur);
                assert(_name.find(tid) == _name.end());
                std::replace(name.begin(), name.end(), '@', ' ');
                _name[tid] = name;
                if(_name.size() == nname)
                    break;
            }
        }
        
        // Size table: tid -> genome size
        _size.clear();
        uint64_t nsize = readIndex<uint64_t>(in3, this->toBe());
        if(nsize > 0) {
            while(!in3.eof()) {
                uint64_t tid = readIndex<uint64_t>(in3, this->toBe());
                uint64_t size = readIndex<uint64_t>(in3, this->toBe());
                assert(_size.find(tid) == _size.end());
                _size[tid] = size;
                if(_size.size() == nsize)
                    break;
            }
        }
        
        // Calculate average genome size
        if(!this->_offw) { // Skip if there are many sequences (e.g. >64K)
            for(map<uint64_t, TaxonomyNode>::const_iterator tree_itr = _tree.begin(); tree_itr != _tree.end(); tree_itr++) {
                uint64_t tid = tree_itr->first;
                const TaxonomyNode& node = tree_itr->second;
                if(node.rank == RANK_SPECIES || node.rank == RANK_GENUS || node.rank == RANK_FAMILY ||
                   node.rank == RANK_ORDER || node.rank == RANK_CLASS || node.rank == RANK_PHYLUM) {
                    size_t sum = 0, count = 0;
                    for(map<uint64_t, uint64_t>::const_iterator size_itr = _size.begin(); size_itr != _size.end(); size_itr++) {
                        uint64_t c_tid = size_itr->first;
                        map<uint64_t, TaxonomyNode>::const_iterator tree_itr2 = _tree.find(c_tid);
                        if(tree_itr2 == _tree.end())
                            continue;
                        
                        assert(tree_itr2 != _tree.end());
                        const TaxonomyNode& c_node = tree_itr2->second;
                        if((c_node.rank == RANK_UNKNOWN && c_node.leaf) ||
                           tax_rank_num[c_node.rank] < tax_rank_num[RANK_SPECIES]) {
                            // Climb toward the root; count this genome
                            // against 'tid' if the walk reaches it
                            c_tid = c_node.parent_tid;
                            while(true) {
                                if(c_tid == tid) {
                                    sum += size_itr->second;
                                    count += 1;
                                    break;
                                }
                                tree_itr2 = _tree.find(c_tid);
                                if(tree_itr2 == _tree.end())
                                    break;
                                // Self-parenting node == root; stop
                                if(c_tid == tree_itr2->second.parent_tid)
                                    break;
                                c_tid = tree_itr2->second.parent_tid;
                            }
                        }
                    }
                    if(count > 0) {
                        _size[tid] = sum / count;
                    }
                }
            }
        }
        _paths.buildPaths(_uid_to_tid, _tree);
        
        in3.close();
	}
+	
	/// Construct an Ebwt from the given header parameters and string
	/// vector, optionally using a blockwise suffix sorter with the
	/// given 'bmax' and 'dcv' parameters.  The string vector is
	/// ultimately joined and the joined string is passed to buildToDisk().
	/// NOTE: this overload only initializes the header (_eh) and the
	/// packed flag; it performs no I/O itself.
	Ebwt(
		 bool packed,
		 int color,
		 int needEntireReverse,
		 int32_t lineRate,
		 int32_t offRate,
		 int32_t ftabChars,
		 const string& file,   // base filename for EBWT files
		 bool fw,
		 int dcv,
		 EList<RefRecord>& szs,
		 index_t sztot,
		 const RefReadInParams& refparams,
		 uint32_t seed,
		 int32_t overrideOffRate = -1,
		 bool verbose = false,
		 bool passMemExc = false,
		 bool sanityCheck = false) :
	Ebwt_INITS,
	_eh(
		joinedLen(szs),
		lineRate,
		offRate,
		ftabChars,
		color,
		refparams.reverse == REF_READ_REVERSE)
	{
#ifdef POPCNT_CAPABILITY
        // Prefer the hardware POPCNT instruction when the CPU has it
        ProcessorSupport ps;
        _usePOPCNTinstruction = ps.POPCNTenabled();
#endif
		packed_ = packed;
	}
+
+	/// Construct an Ebwt from the given header parameters and string
+	/// vector, optionally using a blockwise suffix sorter with the
+	/// given 'bmax' and 'dcv' parameters.  The string vector is
+	/// ultimately joined and the joined string is passed to buildToDisk().
+	template<typename TStr>
+	Ebwt(
+         TStr& s,
+         bool packed,
+         int color,
+         int needEntireReverse,
+         int32_t lineRate,
+         int32_t offRate,
+         int32_t ftabChars,
+         const string& file,   // base filename for EBWT files
+         bool fw,
+         bool useBlockwise,
+         index_t bmax,
+         index_t bmaxSqrtMult,
+         index_t bmaxDivN,
+         int dcv,
+         int nthreads,
+         EList<FileBuf*>& is,
+         EList<RefRecord>& szs,
+         index_t sztot,
+         const string& conversion_table_fname,
+         const string& taxonomy_fname,
+         const string& name_table_fname,
+         const string& size_table_fname,
+         const RefReadInParams& refparams,
+         uint32_t seed,
+         int32_t overrideOffRate = -1,
+         bool doSaFile = false,
+         bool doBwtFile = false,
+         int kmer_size = 0,
+         bool verbose = false,
+         bool passMemExc = false,
+         bool sanityCheck = false) :
+    Ebwt_INITS,
+    _eh(
+        joinedLen(szs),
+        lineRate,
+        offRate,
+        ftabChars,
+        color,
+        refparams.reverse == REF_READ_REVERSE)
+	{
+#ifdef POPCNT_CAPABILITY
+        ProcessorSupport ps;
+        _usePOPCNTinstruction = ps.POPCNTenabled();
+#endif
+		_in1Str = file + ".1." + gEbwt_ext;
+		_in2Str = file + ".2." + gEbwt_ext;
+		packed_ = packed;
+		// Open output files
+		ofstream fout1(_in1Str.c_str(), ios::binary);
+		if(!fout1.good()) {
+			cerr << "Could not open index file for writing: \"" << _in1Str.c_str() << "\"" << endl
+			     << "Please make sure the directory exists and that permissions allow writing by" << endl
+			     << "Bowtie." << endl;
+			throw 1;
+		}
+		ofstream fout2(_in2Str.c_str(), ios::binary);
+		if(!fout2.good()) {
+			cerr << "Could not open index file for writing: \"" << _in2Str.c_str() << "\"" << endl
+			     << "Please make sure the directory exists and that permissions allow writing by" << endl
+			     << "Bowtie." << endl;
+			throw 1;
+		}
+        _inSaStr = file + ".sa";
+        _inBwtStr = file + ".bwt";
+        ofstream *saOut = NULL, *bwtOut = NULL;
+        if(doSaFile) {
+            saOut = new ofstream(_inSaStr.c_str(), ios::binary);
+            if(!saOut->good()) {
+                cerr << "Could not open suffix-array file for writing: \"" << _inSaStr.c_str() << "\"" << endl
+                << "Please make sure the directory exists and that permissions allow writing by" << endl
+                << "Bowtie." << endl;
+                throw 1;
+            }
+        }
+        if(doBwtFile) {
+            bwtOut = new ofstream(_inBwtStr.c_str(), ios::binary);
+            if(!bwtOut->good()) {
+                cerr << "Could not open suffix-array file for writing: \"" << _inBwtStr.c_str() << "\"" << endl
+                << "Please make sure the directory exists and that permissions allow writing by" << endl
+                << "Bowtie." << endl;
+                throw 1;
+            }
+        }
+		// Build
+		initFromVector<TStr>(
+							 s,
+							 is,
+							 szs,
+							 sztot,
+							 refparams,
+							 fout1,
+							 fout2,
+                             saOut,
+                             bwtOut,
+                             kmer_size,
+                             file,
+                             conversion_table_fname,
+                             taxonomy_fname,
+                             name_table_fname,
+                             size_table_fname,
+							 useBlockwise,
+							 bmax,
+							 bmaxSqrtMult,
+							 bmaxDivN,
+							 dcv,
+                             nthreads,
+							 seed,
+							 verbose);
+		// Close output files
+		fout1.flush();
+		int64_t tellpSz1 = (int64_t)fout1.tellp();
+		VMSG_NL("Wrote " << fout1.tellp() << " bytes to primary EBWT file: " << _in1Str.c_str());
+		fout1.close();
+		bool err = false;
+		if(tellpSz1 > fileSize(_in1Str.c_str())) {
+			err = true;
+			cerr << "Index is corrupt: File size for " << _in1Str.c_str() << " should have been " << tellpSz1
+			     << " but is actually " << fileSize(_in1Str.c_str()) << "." << endl;
+		}
+		fout2.flush();
+		int64_t tellpSz2 = (int64_t)fout2.tellp();
+		VMSG_NL("Wrote " << fout2.tellp() << " bytes to secondary EBWT file: " << _in2Str.c_str());
+		fout2.close();
+		if(tellpSz2 > fileSize(_in2Str.c_str())) {
+			err = true;
+			cerr << "Index is corrupt: File size for " << _in2Str.c_str() << " should have been " << tellpSz2
+			     << " but is actually " << fileSize(_in2Str.c_str()) << "." << endl;
+		}
+        if(saOut != NULL) {
+            // Check on suffix array output file size
+            int64_t tellpSzSa = (int64_t)saOut->tellp();
+            VMSG_NL("Wrote " << tellpSzSa << " bytes to suffix-array file: " << _inSaStr.c_str());
+            saOut->close();
+            if(tellpSzSa > fileSize(_inSaStr.c_str())) {
+                err = true;
+                cerr << "Index is corrupt: File size for " << _inSaStr.c_str() << " should have been " << tellpSzSa
+                << " but is actually " << fileSize(_inSaStr.c_str()) << "." << endl;
+            }
+        }
+        if(bwtOut != NULL) {
+            // Check on suffix array output file size
+            int64_t tellpSzBwt = (int64_t)bwtOut->tellp();
+            VMSG_NL("Wrote " << tellpSzBwt << " bytes to BWT file: " << _inBwtStr.c_str());
+            bwtOut->close();
+            if(tellpSzBwt > fileSize(_inBwtStr.c_str())) {
+                err = true;
+                cerr << "Index is corrupt: File size for " << _inBwtStr.c_str() << " should have been " << tellpSzBwt
+                << " but is actually " << fileSize(_inBwtStr.c_str()) << "." << endl;
+            }
+        }
+		if(err) {
+			cerr << "Please check if there is a problem with the disk or if disk is full." << endl;
+			throw 1;
+		}
+		// Reopen as input streams
+		VMSG_NL("Re-opening _in1 and _in2 as input streams");
+		if(_sanity) {
+			VMSG_NL("Sanity-checking Bt2");
+			assert(!isInMemory());
+			readIntoMemory(
+				color,                       // colorspace?
+				fw ? -1 : needEntireReverse, // 1 -> need the reverse to be reverse-of-concat
+				true,                        // load SA sample (_offs[])?
+				true,                        // load ftab (_ftab[] & _eftab[])?
+				true,                        // load r-starts (_rstarts[])?
+				false,                       // just load header?
+				NULL,                        // Params object to fill
+				false,                       // mm sweep?
+				true,                        // load names?
+				false);                      // verbose startup?
+			// sanityCheckAll(refparams.reverse);
+			evictFromMemory();
+			assert(!isInMemory());
+		}
+		VMSG_NL("Returning from Ebwt constructor");
+	}
+	
+	/**
+	 * Static constructor for a pair of forward/reverse indexes for the
+	 * given reference string.
+	 */
+	template<typename TStr>
+	static pair<Ebwt*, Ebwt*>
+	fromString(
+		const char* str,
+		bool packed,
+		int color,
+		int reverse,
+		bool bigEndian,
+		int32_t lineRate,
+		int32_t offRate,
+		int32_t ftabChars,
+		const string& file,
+		bool useBlockwise,
+		index_t bmax,
+		index_t bmaxSqrtMult,
+		index_t bmaxDivN,
+		int dcv,
+		uint32_t seed,
+		bool verbose,
+		bool autoMem,
+		bool sanity)
+	{
+		EList<std::string> strs(EBWT_CAT);
+		strs.push_back(std::string(str));
+		return fromStrings<TStr>(
+			strs,
+			packed,
+			color,
+			reverse,
+			bigEndian,
+			lineRate,
+			offRate,
+			ftabChars,
+			file,
+			useBlockwise,
+			bmax,
+			bmaxSqrtMult,
+			bmaxDivN,
+			dcv,
+			seed,
+			verbose,
+			autoMem,
+			sanity);
+	}
+	
+	/**
+	 * Static constructor for a pair of forward/reverse indexes for the
+	 * given list of reference strings.
+	 */
+	template<typename TStr>
+	static pair<Ebwt*, Ebwt*>
+	fromStrings(
+		const EList<std::string>& strs,
+		bool packed,
+		int color,
+		int reverse,
+		bool bigEndian,
+		int32_t lineRate,
+		int32_t offRate,
+		int32_t ftabChars,
+		const string& file,
+		bool useBlockwise,
+		index_t bmax,
+		index_t bmaxSqrtMult,
+		index_t bmaxDivN,
+		int dcv,
+		uint32_t seed,
+		bool verbose,
+		bool autoMem,
+		bool sanity)
+	{
+        assert(!strs.empty());
+		EList<FileBuf*> is(EBWT_CAT);
+		RefReadInParams refparams(color, REF_READ_FORWARD, false, false);
+		// Adapt sequence strings to stringstreams open for input
+		auto_ptr<stringstream> ss(new stringstream());
+		for(index_t i = 0; i < strs.size(); i++) {
+			(*ss) << ">" << i << endl << strs[i] << endl;
+		}
+		auto_ptr<FileBuf> fb(new FileBuf(ss.get()));
+		assert(!fb->eof());
+		assert(fb->get() == '>');
+		ASSERT_ONLY(fb->reset());
+		assert(!fb->eof());
+		is.push_back(fb.get());
+		// Vector for the ordered list of "records" comprising the input
+		// sequences.  A record represents a stretch of unambiguous
+		// characters in one of the input sequences.
+		EList<RefRecord> szs(EBWT_CAT);
+		std::pair<index_t, index_t> sztot;
+		sztot = BitPairReference::szsFromFasta(is, file, bigEndian, refparams, szs, sanity);
+		// Construct Ebwt from input strings and parameters
+		Ebwt<index_t> *ebwtFw = new Ebwt<index_t>(
+												  TStr(),
+												  packed,
+												  refparams.color ? 1 : 0,
+												  -1,           // fw
+												  lineRate,
+												  offRate,      // suffix-array sampling rate
+												  ftabChars,    // number of chars in initial arrow-pair calc
+												  file,         // basename for .?.ebwt files
+												  true,         // fw?
+												  useBlockwise, // useBlockwise
+												  bmax,         // block size for blockwise SA builder
+												  bmaxSqrtMult, // block size as multiplier of sqrt(len)
+												  bmaxDivN,     // block size as divisor of len
+												  dcv,          // difference-cover period
+												  is,           // list of input streams
+												  szs,          // list of reference sizes
+												  sztot.first,  // total size of all unambiguous ref chars
+												  refparams,    // reference read-in parameters
+												  seed,         // pseudo-random number generator seed
+												  -1,           // override offRate
+												  verbose,      // be talkative
+												  autoMem,      // pass exceptions up to the toplevel so that we can adjust memory settings automatically
+												  sanity);      // verify results and internal consistency
+		refparams.reverse = reverse;
+		szs.clear();
+		sztot = BitPairReference::szsFromFasta(is, file, bigEndian, refparams, szs, sanity);
+		// Construct Ebwt from input strings and parameters
+		Ebwt<index_t> *ebwtBw = new Ebwt<index_t>(
+												  TStr(),
+												  packed,
+												  refparams.color ? 1 : 0,
+												  reverse == REF_READ_REVERSE,
+												  lineRate,
+												  offRate,      // suffix-array sampling rate
+												  ftabChars,    // number of chars in initial arrow-pair calc
+												  file + ".rev",// basename for .?.ebwt files
+												  false,        // fw?
+												  useBlockwise, // useBlockwise
+												  bmax,         // block size for blockwise SA builder
+												  bmaxSqrtMult, // block size as multiplier of sqrt(len)
+												  bmaxDivN,     // block size as divisor of len
+												  dcv,          // difference-cover period
+												  is,           // list of input streams
+												  szs,          // list of reference sizes
+												  sztot.first,  // total size of all unambiguous ref chars
+												  refparams,    // reference read-in parameters
+												  seed,         // pseudo-random number generator seed
+												  -1,           // override offRate
+												  verbose,      // be talkative
+												  autoMem,      // pass exceptions up to the toplevel so that we can adjust memory settings automatically
+												  sanity);      // verify results and internal consistency
+		return make_pair(ebwtFw, ebwtBw);
+	}
+	
	/// Return true iff the Ebwt uses the packed (2-bit) string
	/// representation for the joined reference
	bool isPacked() { return packed_; }

	/**
	 * Write the rstarts array given the szs array for the reference.
	 * Defined out of line.
	 */
	void szsToDisk(const EList<RefRecord>& szs, ostream& os, int reverse);
+	
+	/**
+	 * Helper for the constructors above.  Takes a vector of text
+	 * strings and joins them into a single string with a call to
+	 * joinToDisk, which does a join (with padding) and writes some of
+	 * the resulting data directly to disk rather than keep it in
+	 * memory.  It then constructs a suffix-array producer (what kind
+	 * depends on 'useBlockwise') for the resulting sequence.  The
+	 * suffix-array producer can then be used to obtain chunks of the
+	 * joined string's suffix array.
+	 */
+	template <typename TStr>
+	void initFromVector(TStr& s,
+						EList<FileBuf*>& is,
+	                    EList<RefRecord>& szs,
+	                    index_t sztot,
+	                    const RefReadInParams& refparams,
+	                    ofstream& out1,
+	                    ofstream& out2,
+                        ofstream* saOut,
+                        ofstream* bwtOut,
+                        int kmer_size,
+                        const string& base_fname,
+                        const string& conversion_table_fname,
+                        const string& taxonomy_fname,
+                        const string& size_table_fname,
+                        const string& name_table_fname,
+	                    bool useBlockwise,
+	                    index_t bmax,
+	                    index_t bmaxSqrtMult,
+	                    index_t bmaxDivN,
+	                    int dcv,
+                        int nthreads,
+	                    uint32_t seed,
+						bool verbose)
+	{
+		// Compose text strings into single string
+		VMSG_NL("Calculating joined length");
+		index_t jlen;
+		jlen = joinedLen(szs);
+		assert_geq(jlen, sztot);
+		VMSG_NL("Writing header");
+		writeFromMemory(true, out1, out2);
+		try {
+			VMSG_NL("Reserving space for joined string");
+			s.resize(jlen);
+			VMSG_NL("Joining reference sequences");
+			if(refparams.reverse == REF_READ_REVERSE) {
+				{
+					Timer timer(cout, "  Time to join reference sequences: ", _verbose);
+					joinToDisk(is, szs, sztot, refparams, s, out1, out2);
+				} {
+					Timer timer(cout, "  Time to reverse reference sequence: ", _verbose);
+					EList<RefRecord> tmp(EBWT_CAT);
+					s.reverse();
+					reverseRefRecords(szs, tmp, false, verbose);
+					szsToDisk(tmp, out1, refparams.reverse);
+				}
+			} else {
+				Timer timer(cout, "  Time to join reference sequences: ", _verbose);
+				joinToDisk(is, szs, sztot, refparams, s, out1, out2);
+				szsToDisk(szs, out1, refparams.reverse);
+			}
+			// Joined reference sequence now in 's'
+		} catch(bad_alloc& e) {
+			// If we throw an allocation exception in the try block,
+			// that means that the joined version of the reference
+			// string itself is too larger to fit in memory.  The only
+			// alternatives are to tell the user to give us more memory
+			// or to try again with a packed representation of the
+			// reference (if we haven't tried that already).
+			cerr << "Could not allocate space for a joined string of " << jlen << " elements." << endl;
+			if(!isPacked() && _passMemExc) {
+				// Pass the exception up so that we can retry using a
+				// packed string representation
+				throw e;
+			}
+			// There's no point passing this exception on.  The fact
+			// that we couldn't allocate the joined string means that
+			// --bmax is irrelevant - the user should re-run with
+			// ebwt-build-packed
+			if(isPacked()) {
+				cerr << "Please try running bowtie-build on a computer with more memory." << endl;
+			} else {
+				cerr << "Please try running bowtie-build in packed mode (-p/--packed) or in automatic" << endl
+				     << "mode (-a/--auto), or try again on a computer with more memory." << endl;
+			}
+			if(sizeof(void*) == 4) {
+				cerr << "If this computer has more than 4 GB of memory, try using a 64-bit executable;" << endl
+				     << "this executable is 32-bit." << endl;
+			}
+			throw 1;
+		}
+        
+        this->_offw = this->_nPat > std::numeric_limits<uint16_t>::max();
+        
+        std::set<string> uids;
+        for(size_t i = 0; i < _refnames.size(); i++) {
+            const string& refname = _refnames[i];
+            string uid = get_uid(refname);
+            uids.insert(uid);
+        }
+        std::map<string, uint64_t> uid_to_tid; // map from unique id to taxonomy id
+        {
+            ifstream table_file(conversion_table_fname.c_str(), ios::in);
+            if(table_file.is_open()) {
+                while(!table_file.eof()) {
+                    string uid;
+                    table_file >> uid;
+                    if(uid.length() == 0 || uid[0] == '#') continue;
+                    string stid;
+                    table_file >> stid;
+                    uint64_t tid = get_tid(stid);
+                    if(uids.find(uid) == uids.end()) continue;
+                    if(uid_to_tid.find(uid) != uid_to_tid.end()) {
+						if(uid_to_tid.find(uid) != uid_to_tid.end()) {
+							cerr << "Warning: Diverging taxonomy IDs for " << uid << " in " << conversion_table_fname << ": "
+                                 << uid_to_tid[uid] << " and " << tid << ". Taking first. " << endl;
+						}
+                        continue;
+                    }
+                    uid_to_tid[uid] = tid;
+                }
+                table_file.close();
+            } else {
+                cerr << "Error: " << conversion_table_fname << " doesn't exist!" << endl;
+                throw 1;
+            }
+        }
+        // Open output stream for the '.3.cf' file which will hold conversion table and taxonomy tree
+        string fname3 = base_fname + ".3." + gEbwt_ext;
+        ofstream fout3(fname3.c_str(), ios::binary);
+        if(!fout3.good()) {
+            cerr << "Could not open index file for writing: \"" << fname3 << "\"" << endl
+            << "Please make sure the directory exists and that permissions allow writing by Centrifuge" << endl;
+            throw 1;
+        }
+        std::set<uint64_t> tids;
+        writeIndex<int32_t>(fout3, 1, this->toBe()); // endianness sentinel
+        writeIndex<uint64_t>(fout3, _refnames.size(), this->toBe());
+        for(size_t i = 0; i < _refnames.size(); i++) {
+            const string& refname = _refnames[i];
+            string uid = get_uid(refname);
+            for(size_t c = 0; c < uid.length(); c++) {
+                fout3 << uid[c];
+            }
+            fout3 << '\0';
+            if(uid_to_tid.find(uid) != uid_to_tid.end()) {
+                uint64_t tid = uid_to_tid[uid];
+                writeIndex<uint64_t>(fout3, tid, this->toBe());
+                tids.insert(tid);
+            } else {
+                cerr << "Warning: taxomony id doesn't exists for " << uid << "!" << endl;
+                writeIndex<uint64_t>(fout3, 0, this->toBe());
+            }
+        }
+
+        // Read taxonomy
+        {
+            TaxonomyTree tree = read_taxonomy_tree(taxonomy_fname);
+            std::set<uint64_t> tree_color;
+
+            for(std::set<uint64_t>::iterator itr = tids.begin(); itr != tids.end(); itr++) {
+                uint64_t tid = *itr;
+                if(tree.find(tid) == tree.end()) {
+                    cerr << "Warning: Taxonomy ID " << tid << " is not in the provided taxonomy tree (" << taxonomy_fname << ")!" << endl;
+
+                }
+                while(tree.find(tid) != tree.end()) {
+                    uint64_t parent_tid = tree[tid].parent_tid;
+                    tree_color.insert(tid);
+                    if(parent_tid == tid) break;
+                    tid = parent_tid;
+                }
+            }
+            writeIndex<uint64_t>(fout3, tree_color.size(), this->toBe());
+            for(std::set<uint64_t>::iterator itr = tree_color.begin(); itr != tree_color.end(); itr++) {
+                uint64_t tid = *itr;
+                writeIndex<uint64_t>(fout3, tid, this->toBe());
+                assert(tree.find(tid) != tree.end());
+                const TaxonomyNode& node = tree[tid];
+                writeIndex<uint64_t>(fout3, node.parent_tid, this->toBe());
+                writeIndex<uint16_t>(fout3, node.rank, this->toBe());
+            }
+        
+            // Read name table
+            _name.clear();
+            if(name_table_fname != "") {
+                ifstream table_file(name_table_fname.c_str(), ios::in);
+                if(table_file.is_open()) {
+                    char line[1024];
+                    while(!table_file.eof()) {
+                        line[0] = 0;
+                        table_file.getline(line, sizeof(line));
+                        if(line[0] == 0 || line[0] == '#') continue;
+                        if(!strstr(line, "scientific name")) continue;
+                        istringstream cline(line);
+                        uint64_t tid;
+                        char dummy;
+                        string scientific_name;
+                        cline >> tid >> dummy >> scientific_name;
+                        if(tree_color.find(tid) == tree_color.end()) continue;
+                        string temp;
+                        while(true) {
+                            cline >> temp;
+                            if(temp == "|") break;
+                            scientific_name.push_back('@');
+                            scientific_name += temp;
+                        }
+                        _name[tid] = scientific_name;
+                    }
+                    table_file.close();
+                } else {
+                    cerr << "Error: " << name_table_fname << " doesn't exist!" << endl;
+                    throw 1;
+                }
+            }
+            
+            writeIndex<uint64_t>(fout3, _name.size(), this->toBe());
+            for(std::map<uint64_t, string>::const_iterator itr = _name.begin(); itr != _name.end(); itr++) {
+                writeIndex<uint64_t>(fout3, itr->first, this->toBe());
+                fout3 << itr->second << endl;
+            }
+        }
+        
+        // Read size table
+        {
+            _size.clear();
+            
+            // Calculate contig (or genome) sizes corresponding to each taxonomic ID
+            for(size_t i = 0; i < _refnames.size(); i++) {
+                string uid = get_uid(_refnames[i]);
+                if(uid_to_tid.find(uid) == uid_to_tid.end())
+                    continue;
+                uint64_t tid = uid_to_tid[uid];
+                uint64_t contig_size = plen()[i];
+                if(_size.find(tid) == _size.end()) {
+                    _size[tid] = contig_size;
+                } else {
+                    _size[tid] += contig_size;
+                }
+            }
+            
+            if(size_table_fname != "") {
+                ifstream table_file(size_table_fname.c_str(), ios::in);
+                if(table_file.is_open()) {
+                    while(!table_file.eof()) {
+                        string stid;
+                        table_file >> stid;
+                        if(stid.length() == 0 || stid[0] == '#') continue;
+                        uint64_t tid = get_tid(stid);
+                        uint64_t size;
+                        table_file >> size;
+                        _size[tid] = size;
+                    }
+                    table_file.close();
+                } else {
+                    cerr << "Error: " << size_table_fname << " doesn't exist!" << endl;
+                    throw 1;
+                }
+            }
+            
+            writeIndex<uint64_t>(fout3, _size.size(), this->toBe());
+            for(std::map<uint64_t, uint64_t>::const_iterator itr = _size.begin(); itr != _size.end(); itr++) {
+                writeIndex<uint64_t>(fout3, itr->first, this->toBe());
+                writeIndex<uint64_t>(fout3, itr->second, this->toBe());
+            }
+        }
+        
+        fout3.close();
+    
+		// Succesfully obtained joined reference string
+		assert_geq(s.length(), jlen);
+		if(bmax != (index_t)OFF_MASK) {
+			VMSG_NL("bmax according to bmax setting: " << bmax);
+		}
+		else if(bmaxSqrtMult != (index_t)OFF_MASK) {
+			bmax *= bmaxSqrtMult;
+			VMSG_NL("bmax according to bmaxSqrtMult setting: " << bmax);
+		}
+		else if(bmaxDivN != (index_t)OFF_MASK) {
+			bmax = max<uint32_t>((uint32_t)(jlen / bmaxDivN), 1);
+			VMSG_NL("bmax according to bmaxDivN setting: " << bmax);
+		}
+		else {
+			bmax = (uint32_t)sqrt(s.length());
+			VMSG_NL("bmax defaulted to: " << bmax);
+		}
+		int iter = 0;
+		bool first = true;
+		streampos out1pos = out1.tellp();
+		streampos out2pos = out2.tellp();
+		// Look for bmax/dcv parameters that work.
+		while(true) {
+			if(!first && bmax < 40 && _passMemExc) {
+				cerr << "Could not find approrpiate bmax/dcv settings for building this index." << endl;
+				if(!isPacked()) {
+					// Throw an exception exception so that we can
+					// retry using a packed string representation
+					throw bad_alloc();
+				} else {
+					cerr << "Already tried a packed string representation." << endl;
+				}
+				cerr << "Please try indexing this reference on a computer with more memory." << endl;
+				if(sizeof(void*) == 4) {
+					cerr << "If this computer has more than 4 GB of memory, try using a 64-bit executable;" << endl
+						 << "this executable is 32-bit." << endl;
+				}
+				throw 1;
+			}
+			if(!first) {
+				out1.seekp(out1pos);
+				out2.seekp(out2pos);
+			}
+			if(dcv > 4096) dcv = 4096;
+			if((iter % 6) == 5 && dcv < 4096 && dcv != 0) {
+				dcv <<= 1; // double difference-cover period
+			} else {
+				bmax -= (bmax >> 2); // reduce by 25%
+			}
+			VMSG("Using parameters --bmax " << bmax);
+			if(dcv == 0) {
+				VMSG_NL(" and *no difference cover*");
+			} else {
+				VMSG_NL(" --dcv " << dcv);
+			}
+			iter++;
+			try {
+				{
+					VMSG_NL("  Doing ahead-of-time memory usage test");
+					// Make a quick-and-dirty attempt to force a bad_alloc iff
+					// we would have thrown one eventually as part of
+					// constructing the DifferenceCoverSample
+					dcv <<= 1;
+					index_t sz = (index_t)DifferenceCoverSample<TStr>::simulateAllocs(s, dcv >> 1);
+					AutoArray<uint8_t> tmp(sz, EBWT_CAT);
+					dcv >>= 1;
+					// Likewise with the KarkkainenBlockwiseSA
+					sz = (index_t)KarkkainenBlockwiseSA<TStr>::simulateAllocs(s, bmax);
+                    if(nthreads > 1) sz *= (nthreads + 1);
+					AutoArray<uint8_t> tmp2(sz, EBWT_CAT);
+					// Now throw in the 'ftab' and 'isaSample' structures
+					// that we'll eventually allocate in buildToDisk
+					AutoArray<index_t> ftab(_eh._ftabLen * 2, EBWT_CAT);
+					AutoArray<uint8_t> side(_eh._sideSz, EBWT_CAT);
+					// Grab another 20 MB out of caution
+					AutoArray<uint32_t> extra(20*1024*1024, EBWT_CAT);
+					// If we made it here without throwing bad_alloc, then we
+					// passed the memory-usage stress test
+					VMSG("  Passed!  Constructing with these parameters: --bmax " << bmax << " --dcv " << dcv);
+					if(isPacked()) {
+						VMSG(" --packed");
+					}
+					VMSG_NL("");
+				}
+				VMSG_NL("Constructing suffix-array element generator");
+				KarkkainenBlockwiseSA<TStr> bsa(s, bmax, nthreads, dcv, seed, _sanity, _passMemExc, _verbose, base_fname);
+				assert(bsa.suffixItrIsReset());
+				assert_eq(bsa.size(), s.length()+1);
+				VMSG_NL("Converting suffix-array elements to index image");
+				buildToDisk(bsa, s, out1, out2, saOut, bwtOut, szs, kmer_size);
+				out1.flush(); out2.flush();
+                bool failed = out1.fail() || out2.fail();
+                if(saOut != NULL) {
+                    saOut->flush();
+                    failed = failed || saOut->fail();
+                }
+                if(bwtOut != NULL) {
+                    bwtOut->flush();
+                    failed = failed || bwtOut->fail();
+                }
+				break;
+			} catch(bad_alloc& e) {
+				if(_passMemExc) {
+					VMSG_NL("  Ran out of memory; automatically trying more memory-economical parameters.");
+				} else {
+					cerr << "Out of memory while constructing suffix array.  Please try using a smaller" << endl
+						 << "number of blocks by specifying a smaller --bmax or a larger --bmaxdivn" << endl;
+					throw 1;
+				}
+			}
+			first = false;
+		}
+		assert(repOk());
+		// Now write reference sequence names on the end
+		assert_eq(this->_refnames.size(), this->_nPat);
+		for(index_t i = 0; i < this->_refnames.size(); i++) {
+			out1 << this->_refnames[i].c_str() << endl;
+		}
+		out1 << '\0';
+		out1.flush(); out2.flush();
+		if(out1.fail() || out2.fail()) {
+			cerr << "An error occurred writing the index to disk.  Please check if the disk is full." << endl;
+			throw 1;
+		}
+		VMSG_NL("Returning from initFromVector");
+	}
+	
+	/**
+	 * Compute the length that the joined string of the given fragment
+	 * list will have.  Only the fragment lengths matter here; how the
+	 * fragments map back to the input sequences is irrelevant.
+	 */
+	index_t joinedLen(EList<RefRecord>& szs) {
+		index_t total = 0;
+		for(size_t k = 0; k < szs.size(); k++) {
+			total += (index_t)szs[k].len;
+		}
+		return total;
+	}
+
+	/// Destruct an Ebwt: release all owned arrays, free any
+	/// shared-memory segments, and close the two index file handles.
+	~Ebwt() {
+		// Release the array wrappers first.
+		// NOTE(review): offs()/offsw()/ebwt() are read *after* the
+		// corresponding reset() calls below; confirm that reset() leaves
+		// the underlying shared-memory pointer retrievable when
+		// useShmem_ is set, otherwise the FREE_SHARED branches can
+		// never fire.
+		_fchr.reset();
+		_ftab.reset();
+		_eftab.reset();
+		_plen.reset();
+		_rstarts.reset();
+		_offs.reset();
+        _offsw.reset();
+		_ebwt.reset();
+		if(offs() != NULL && useShmem_) {
+			FREE_SHARED(offs());
+		}
+        if(offsw() != NULL && useShmem_) {
+            FREE_SHARED(offsw());
+        }
+		if(ebwt() != NULL && useShmem_) {
+			FREE_SHARED(ebwt());
+		}
+		// Close the '.1' and '.2' index files if they were opened
+		if (_in1 != NULL) fclose(_in1);
+		if (_in2 != NULL) fclose(_in2);
+	}
+
+	/// Accessors
+	// Index geometry / header values
+	inline const EbwtParams<index_t>& eh() const     { return _eh; }
+	index_t    zOff() const         { return _zOff; }
+	index_t    zEbwtByteOff() const { return _zEbwtByteOff; }
+	int        zEbwtBpOff() const   { return _zEbwtBpOff; }
+	index_t    nPat() const        { return _nPat; }
+	index_t    nFrag() const       { return _nFrag; }
+	// Raw array accessors (non-const); offs() is the 16-bit SA sample,
+	// offsw() the 32-bit variant — which one is populated depends on
+	// the _offw flag (see tryOffset()).
+	inline index_t*   fchr()              { return _fchr.get(); }
+	inline index_t*   ftab()              { return _ftab.get(); }
+	inline index_t*   eftab()             { return _eftab.get(); }
+	inline uint16_t*   offs()              { return _offs.get(); }
+    inline uint32_t*   offsw()             { return _offsw.get(); }
+	inline index_t*   plen()              { return _plen.get(); }
+	inline index_t*   rstarts()           { return _rstarts.get(); }
+	inline uint8_t*    ebwt()              { return _ebwt.get(); }
+	// Const counterparts of the above
+	inline const index_t* fchr() const    { return _fchr.get(); }
+	inline const index_t* ftab() const    { return _ftab.get(); }
+	inline const index_t* eftab() const   { return _eftab.get(); }
+    inline const uint16_t* offs() const    { return _offs.get(); }
+    inline const uint32_t* offsw() const    { return _offsw.get(); }
+	inline const index_t* plen() const    { return _plen.get(); }
+	inline const index_t* rstarts() const { return _rstarts.get(); }
+	inline const uint8_t*  ebwt() const    { return _ebwt.get(); }
+	// Flags
+	bool        toBe() const         { return _toBigEndian; }
+	bool        verbose() const      { return _verbose; }
+	bool        sanityCheck() const  { return _sanity; }
+	EList<string>& refnames()        { return _refnames; }
+	bool        fw() const           { return fw_; }
+    
+    // Taxonomy-related accessors (centrifuge-specific)
+    const EList<pair<string, uint64_t> >&   uid_to_tid() const { return _uid_to_tid; }
+    const TaxonomyTree& tree() const { return _tree; }
+    const TaxonomyPathTable&                paths() const { return _paths; }
+    const std::map<uint64_t, string>&       name() const { return _name; }
+    const std::map<uint64_t, uint64_t>&     size() const { return _size; }
+    bool                                    compressed() const { return _compressed; }
+    
+    
+#ifdef POPCNT_CAPABILITY
+    // Whether to use the POPCNT instruction in the counting routines
+    bool _usePOPCNTinstruction;
+#endif
+
+	/**
+	 * Returns true iff the index contains the given string (exactly).  The
+	 * given string must contain only unambiguous characters.  TODO:
+	 * support skipping of ambiguous characters.
+	 *
+	 * top/bot, if non-NULL, presumably receive the matching BW range —
+	 * definition is elsewhere in this file; confirm before relying on it.
+	 */
+	bool contains(
+		const BTDnaString& str,
+		index_t *top = NULL,
+		index_t *bot = NULL) const;
+
+	/**
+	 * Convenience overload of contains() for C strings: wraps the
+	 * argument in a BTDnaString and delegates to the overload above.
+	 * The string must contain only unambiguous characters.
+	 */
+	bool contains(
+		const char *str,
+		index_t *top = NULL,
+		index_t *bot = NULL) const
+	{
+		const BTDnaString dna(str, true);
+		return contains(dna, top, bot);
+	}
+	
+	/// Return true iff the Ebwt is currently in memory.  The ebwt()
+	/// array is the sentinel: if it is loaded, the index counts as
+	/// resident.
+	bool isInMemory() const {
+		if(ebwt() == NULL) {
+			// Not resident: all other core arrays should be absent too
+			assert(ftab() == NULL);
+			assert(eftab() == NULL);
+			assert(fchr() == NULL);
+			assert(offs() == NULL);
+            assert(offsw() == NULL);
+			// assert(rstarts() == NULL); // FIXME FB: Assertion fails when calling centrifuge-build-bin-debug
+			assert_eq(_zEbwtByteOff, (index_t)OFF_MASK);
+			assert_eq(_zEbwtBpOff, -1);
+			return false;
+		}
+		// Resident.  Note: _offs, _ftab, _eftab and _rstarts may have
+		// been skipped during loading depending on whether this is the
+		// reverse index and what algorithm is being used, so they are
+		// not asserted here.
+		assert(_eh.repOk());
+		assert(fchr() != NULL);
+		assert_neq(_zEbwtByteOff, (index_t)OFF_MASK);
+		assert_neq(_zEbwtBpOff, -1);
+		return true;
+	}
+
+	/// Return true iff the Ebwt is currently stored on disk rather
+	/// than loaded in memory (the logical complement of isInMemory()).
+	bool isEvicted() const {
+		return isInMemory() == false;
+	}
+
+	/**
+	 * Load this Ebwt into memory by reading it in from the _in1 and
+	 * _in2 streams.  Thin wrapper around readIntoMemory() that never
+	 * stops after the header, never mmap-sweeps, and passes no params
+	 * struct.
+	 */
+	void loadIntoMemory(
+		int color,
+		int needEntireReverse,
+		bool loadSASamp,
+		bool loadFtab,
+		bool loadRstarts,
+		bool loadNames,
+		bool verbose)
+	{
+		readIntoMemory(
+			color,       // expect index to be colorspace?
+			needEntireReverse, // require reverse index to be concatenated reference reversed
+			loadSASamp,  // load the SA sample portion?
+			loadFtab,    // load the ftab (_ftab[] and _eftab[])?
+			loadRstarts, // load the r-starts (_rstarts[])?
+			false,       // stop after loading the header portion?
+			NULL,        // params
+			false,       // mmSweep
+			loadNames,   // loadNames
+			verbose);    // startVerbose
+	}
+
+	/**
+	 * Frees memory associated with the Ebwt.  After this call
+	 * isInMemory() returns false.
+	 */
+	void evictFromMemory() {
+		assert(isInMemory());
+		_fchr.free();
+		_ftab.free();
+		_eftab.free();
+		_rstarts.free();
+		_offs.free(); // might not be under control of APtrWrap
+        _offsw.free(); // might not be under control of APtrWrap
+		_ebwt.free(); // might not be under control of APtrWrap
+		// Keep plen; it's small and the client may want to seq it
+		// even when the others are evicted.
+		//_plen  = NULL;
+		// Reset the '$'-position markers so isInMemory() reports false
+		_zEbwtByteOff = (index_t)OFF_MASK;
+		_zEbwtBpOff = -1;
+	}
+
+	/**
+	 * Turn a substring of 'seq' starting at offset 'off' and having
+	 * length equal to the index's 'ftabChars' into an int usable as an
+	 * index into the ftab array.  Characters are packed two bits at a
+	 * time in the order they would be consumed during a search.
+	 * Returns the maximum index_t value if an ambiguous character
+	 * (code > 3) is encountered.
+	 */
+	index_t ftabSeqToInt(
+		const BTDnaString& seq,
+		index_t off,
+		bool rev) const
+	{
+		const int nchars = _eh._ftabChars;
+		const index_t lo = off;
+		const index_t hi = lo + nchars;
+		assert_leq(hi, seq.length());
+		index_t packed = 0;
+		for(int k = 0; k < nchars; k++) {
+			bool forward = fw();
+			if(rev) forward = !forward;
+			// Consume right-to-left for BWT, left-to-right for BWT'
+			int c = (forward ? seq[lo + k] : seq[hi - k - 1]);
+			if(c > 3) {
+				// Ambiguous character: no ftab entry exists
+				return std::numeric_limits<index_t>::max();
+			}
+			assert_range(0, 3, c);
+			packed = (packed << 2) | c;
+		}
+		return packed;
+	}
+	
+	/**
+	 * Non-static facade for the static ftabHi() below, supplying this
+	 * index's own ftab/eftab arrays and geometry.
+	 */
+	index_t ftabHi(index_t i) const {
+		return Ebwt<index_t>::ftabHi(
+			ftab(), eftab(),
+			_eh._len, _eh._ftabLen, _eh._eftabLen,
+			i);
+	}
+
+	/**
+	 * Get "high interpretation" of ftab entry at index i.  A regular
+	 * entry interprets as itself; an extended entry (value > len)
+	 * interprets as the second of the two corresponding words in the
+	 * eftab.
+	 *
+	 * Static so the question can be asked before the Ebwt is fully
+	 * initialized.
+	 */
+	static index_t ftabHi(
+		const index_t *ftab,
+		const index_t *eftab,
+		index_t len,
+		index_t ftabLen,
+		index_t eftabLen,
+		index_t i)
+	{
+		assert_lt(i, ftabLen);
+		if(ftab[i] > len) {
+			// Extended entry: bit-flipped value indexes into eftab
+			const index_t ext = ftab[i] ^ (index_t)OFF_MASK;
+			assert_lt(ext*2+1, eftabLen);
+			return eftab[ext*2+1];
+		}
+		return ftab[i];
+	}
+
+	/**
+	 * Non-static facade for the static ftabLo() below, supplying this
+	 * index's own ftab/eftab arrays and geometry.
+	 */
+	index_t ftabLo(index_t i) const {
+		return Ebwt<index_t>::ftabLo(
+			ftab(), eftab(),
+			_eh._len, _eh._ftabLen, _eh._eftabLen,
+			i);
+	}
+	
+	/**
+	 * Get low bound of the ftab range for the substring of 'seq'
+	 * starting at 'off' (forward orientation).
+	 */
+	index_t ftabLo(const BTDnaString& seq, index_t off) const {
+		const index_t key = ftabSeqToInt(seq, off, false);
+		return ftabLo(key);
+	}
+
+	/**
+	 * Get high bound of the ftab range for the substring of 'seq'
+	 * starting at 'off' (forward orientation).
+	 */
+	index_t ftabHi(const BTDnaString& seq, index_t off) const {
+		const index_t key = ftabSeqToInt(seq, off, false);
+		return ftabHi(key);
+	}
+	
+	/**
+	 * Look up the BW range [top, bot) for the ftab-length substring of
+	 * 'seq' starting at 'off', extracting characters forward or
+	 * backward depending on 'rev'.  Returns false (leaving top/bot
+	 * untouched) if the substring contains an ambiguous character.
+	 */
+	bool
+	ftabLoHi(
+		const BTDnaString& seq, // sequence to extract from
+		index_t off,             // offset into seq to begin extracting
+		bool rev,               // reverse while extracting
+		index_t& top,
+		index_t& bot) const
+	{
+		const index_t key = ftabSeqToInt(seq, off, rev);
+		if(key == std::numeric_limits<index_t>::max()) {
+			// Ambiguous character encountered; no range exists
+			return false;
+		}
+		top = ftabHi(key);
+		bot = ftabLo(key+1);
+		assert_geq(bot, top);
+		return true;
+	}
+	
+	/**
+	 * Get "low interpretation" of ftab entry at index i.  A regular
+	 * entry interprets as itself; an extended entry (value > len)
+	 * interprets as the first of the two corresponding words in the
+	 * eftab.
+	 *
+	 * Static so the question can be asked before the Ebwt is fully
+	 * initialized.
+	 */
+	static index_t ftabLo(
+		const index_t *ftab,
+		const index_t *eftab,
+		index_t len,
+		index_t ftabLen,
+		index_t eftabLen,
+		index_t i)
+	{
+		assert_lt(i, ftabLen);
+		if(ftab[i] > len) {
+			// Extended entry: bit-flipped value indexes into eftab
+			const index_t ext = ftab[i] ^ (index_t)OFF_MASK;
+			assert_lt(ext*2+1, eftabLen);
+			return eftab[ext*2];
+		}
+		return ftab[i];
+	}
+
+	/**
+	 * Try to resolve the reference offset of the BW element 'elt'.
+	 * Returns the offset if 'elt' falls on a sampled row (or is the
+	 * '$' row, which maps to 0); otherwise returns OFF_MASK to signal
+	 * that the caller must walk left to a sampled row.
+	 */
+	index_t tryOffset(index_t elt) const {
+#ifndef NDEBUG
+        // Exactly one of the 16-bit / 32-bit SA samples is in use
+        if(this->_offw) {
+            assert(offsw() != NULL);
+        } else {
+            assert(offs() != NULL);
+        }
+#endif
+		if(elt == _zOff) return 0;
+		if((elt & _eh._offMask) != elt) {
+			// Not a sampled row; caller must walk left
+			return (index_t)OFF_MASK;
+		}
+		const index_t slot = elt >> _eh._offRate;
+		assert_lt(slot, _eh._offsLen);
+		// Read from whichever sample width this index uses
+		const index_t off = this->_offw ? (index_t)offsw()[slot]
+		                                : (index_t)offs()[slot];
+		assert_neq((index_t)OFF_MASK, off);
+		return off;
+	}
+
+	/**
+	 * Try to resolve the reference offset of the BW element 'elt' such
+	 * that the offset returned is at the right-hand side of the
+	 * forward reference substring involved in the hit.  Returns
+	 * OFF_MASK if the offset cannot be resolved immediately.
+	 */
+	index_t tryOffset(
+		index_t elt,
+		bool fw,
+		index_t hitlen) const
+	{
+		index_t off = tryOffset(elt);
+		if(off == (index_t)OFF_MASK || fw) {
+			// Unresolved, or already on the forward strand
+			return off;
+		}
+		// Mirror the offset onto the forward strand and shift to the
+		// right-hand end of the hit
+		assert_lt(off, _eh._len);
+		off = _eh._len - off - 1;
+		assert_geq(off, hitlen-1);
+		off -= (hitlen-1);
+		assert_lt(off, _eh._len);
+		return off;
+	}
+
+	/**
+	 * Walk 'steps' steps to the left and return the row arrived at.
+	 * (Defined elsewhere in this file.)
+	 */
+	index_t walkLeft(index_t row, index_t steps) const;
+
+	/**
+	 * Resolve the reference offset of the BW element 'elt', walking
+	 * left to a sampled row if necessary.  (Defined elsewhere.)
+	 */
+	index_t getOffset(index_t row) const;
+
+	/**
+	 * Resolve the reference offset of the BW element 'elt' such that
+	 * the offset returned is at the right-hand side of the forward
+	 * reference substring involved in the hit.  (Defined elsewhere.)
+	 */
+	index_t getOffset(
+		index_t elt,
+		bool fw,
+		index_t hitlen) const;
+
+	/**
+	 * When using read() to create an Ebwt, a couple of fields that are
+	 * neither constructor parameters nor stored in the file must be
+	 * derived: _zEbwtByteOff and _zEbwtBpOff are computed here from
+	 * _zOff (the row holding the '$' marker).
+	 */
+	void postReadInit(EbwtParams<index_t>& eh) {
+		// Locate which side _zOff falls in and its position inside it
+		const index_t whichSide   = _zOff / eh._sideBwtLen;
+		const index_t charInSide  = _zOff % eh._sideBwtLen;
+		_zEbwtByteOff = charInSide >> 2;      // 4 bp per byte
+		assert_lt(_zEbwtByteOff, eh._sideBwtSz);
+		_zEbwtBpOff = charInSide & 3;         // bitpair within the byte
+		assert_lt(_zEbwtBpOff, 4);
+		_zEbwtByteOff += whichSide * eh._sideSz;
+		assert(repOk(eh)); // Ebwt should be fully initialized now
+	}
+
+	/**
+	 * Given basename of an Ebwt index, read and return its flag.
+	 * (Defined elsewhere in this file.)
+	 */
+	static int32_t readFlags(const string& instr);
+
+	/**
+	 * Pretty-print the Ebwt to the given output stream.  Delegates to
+	 * the two-argument overload using this index's own params.
+	 */
+	void print(ostream& out) const {
+		print(out, _eh);
+	}
+	
+	/**
+	 * Pretty-print the Ebwt and given EbwtParams to the given output
+	 * stream.
+	 *
+	 * Note: this used to be followed by a detailed dump of the zOff /
+	 * plen / rstarts / ebwt / fchr / ftab / eftab / offs fields, but
+	 * that code sat after an unconditional 'return' and was therefore
+	 * unreachable; it has been removed.  Only the EbwtParams summary
+	 * is printed, which preserves the previous behavior exactly.
+	 */
+	void print(ostream& out, const EbwtParams<index_t>& eh) const {
+		eh.print(out); // print params
+	}
+
+	// Building (all defined elsewhere in this file)
+	template <typename TStr> static TStr join(EList<TStr>& l, uint32_t seed);
+	template <typename TStr> static TStr join(EList<FileBuf*>& l, EList<RefRecord>& szs, index_t sztot, const RefReadInParams& refparams, uint32_t seed);
+	template <typename TStr> void joinToDisk(EList<FileBuf*>& l, EList<RefRecord>& szs, index_t sztot, const RefReadInParams& refparams, TStr& ret, ostream& out1, ostream& out2);
+	template <typename TStr> void buildToDisk(InorderBlockwiseSA<TStr>& sa, const TStr& s, ostream& out1, ostream& out2, ostream* saOut, ostream* bwtOut, const EList<RefRecord>& szs, int kmer_size);
+
+	// I/O
+	void readIntoMemory(int color, int needEntireRev, bool loadSASamp, bool loadFtab, bool loadRstarts, bool justHeader, EbwtParams<index_t> *params, bool mmSweep, bool loadNames, bool startVerbose);
+	void writeFromMemory(bool justHeader, ostream& out1, ostream& out2) const;
+	void writeFromMemory(bool justHeader, const string& out1, const string& out2) const;
+
+	// Sanity checking
+	void sanityCheckUpToSide(int upToSide) const;
+	void sanityCheckAll(int reverse) const;
+	void restore(SString<char>& s) const;
+	void checkOrigs(const EList<SString<char> >& os, bool color, bool mirror) const;
+
+	// Searching and reporting
+	void joinedToTextOff(index_t qlen, index_t off, index_t& tidx, index_t& textoff, index_t& tlen, bool rejectStraddle, bool& straddled) const;
+
+// Sanity-check macro: each per-character count must fit within one
+// side's worth of BWT characters
+#define WITHIN_BWT_LEN(x) \
+	assert_leq(x[0], this->_eh._sideBwtLen); \
+	assert_leq(x[1], this->_eh._sideBwtLen); \
+	assert_leq(x[2], this->_eh._sideBwtLen); \
+	assert_leq(x[3], this->_eh._sideBwtLen)
+
+// Sanity-check macro: each cumulative count must not exceed the
+// corresponding fchr[] boundary
+#define WITHIN_FCHR(x) \
+	assert_leq(x[0], this->fchr()[1]); \
+	assert_leq(x[1], this->fchr()[2]); \
+	assert_leq(x[2], this->fchr()[3]); \
+	assert_leq(x[3], this->fchr()[4])
+
+// As WITHIN_FCHR, but allows one extra 'A' to account for the '$'
+// marker being stored as an 'A'
+#define WITHIN_FCHR_DOLLARA(x) \
+	assert_leq(x[0], this->fchr()[1]+1); \
+	assert_leq(x[1], this->fchr()[2]); \
+	assert_leq(x[2], this->fchr()[3]); \
+	assert_leq(x[3], this->fchr()[4])
+
+	/**
+	 * Count all occurrences of character c from the beginning of the
+	 * forward side to <by,bp> and add in the occ[] count up to the side
+	 * break just prior to the side.  The result is an absolute BW row
+	 * (cumulative count plus fchr[c]).
+	 *
+	 * A Bowtie 2 side is shaped like:
+	 *
+	 * XXXXXXXXXXXXXXXX [A] [C] [G] [T]
+	 * --------48------ -4- -4- -4- -4-  (numbers in bytes)
+	 */
+	inline index_t countBt2Side(const SideLocus<index_t>& l, int c) const {
+        assert_range(0, 3, c);
+        assert_range(0, (int)this->_eh._sideBwtSz-1, (int)l._by);
+        assert_range(0, 3, (int)l._bp);
+        const uint8_t *side = l.side(this->ebwt());
+        // Occurrences of c within this side, up to (but not including)
+        // position <by,bp>; countUpTo is defined elsewhere in the file
+        index_t cCnt = countUpTo(l, c);
+        assert_leq(cCnt, l.toBWRow());
+        assert_leq(cCnt, this->_eh._sideBwtLen);
+        if(c == 0 && l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) {
+            // Adjust for the fact that we represented $ with an 'A', but
+            // shouldn't count it as an 'A' here
+            if((l._sideByteOff + l._by > _zEbwtByteOff) ||
+               (l._sideByteOff + l._by == _zEbwtByteOff && l._bp > _zEbwtBpOff))
+            {
+                cCnt--; // Adjust for '$' looking like an 'A'
+            }
+        }
+        index_t ret;
+        // Now factor in the occ[] count at the side break, stored in
+        // the four index_t words following the BWT bytes of the side
+        const uint8_t *acgt8 = side + _eh._sideBwtSz;
+        const index_t *acgt = reinterpret_cast<const index_t*>(acgt8);
+        assert_leq(acgt[0], this->_eh._numSides * this->_eh._sideBwtLen); // b/c it's used as padding
+        assert_leq(acgt[1], this->_eh._len);
+        assert_leq(acgt[2], this->_eh._len);
+        assert_leq(acgt[3], this->_eh._len);
+        ret = acgt[c] + cCnt + this->fchr()[c];
+#ifndef NDEBUG
+        assert_leq(ret, this->fchr()[c+1]); // can't have jumpded into next char's section
+        if(c == 0) {
+            assert_leq(cCnt, this->_eh._sideBwtLen);
+        } else {
+            assert_leq(ret, this->_eh._bwtLen);
+        }
+#endif
+        return ret;
+	}
+
+	/**
+	 * Count all occurrences of all four nucleotides up to the starting
+	 * point (which must be in a forward side) given by 'l' storing the
+	 * result in 'cntsUpto', then count nucleotide occurrences within the
+	 * range of length 'num' storing the result in 'cntsIn'.  Also, keep
+	 * track of the characters occurring within the range by setting
+	 * 'masks' accordingly (masks[1][10] == true -> 11th character is a
+	 * 'C', and masks[0][10] == masks[2][10] == masks[3][10] == false.
+	 *
+	 * NOTE(review): cntsIn is asserted against before this function
+	 * writes to it; presumably the caller zero-initializes both count
+	 * arrays — confirm against call sites.
+	 */
+	inline void countBt2SideRange(
+		SideLocus<index_t>& l,        // top locus
+		index_t num,        // number of elts in range to tall
+		index_t* cntsUpto,  // A/C/G/T counts up to top
+		index_t* cntsIn,    // A/C/G/T counts within range
+		EList<bool> *masks) const // masks indicating which range elts = A/C/G/T
+	{
+		assert_gt(num, 0);
+		assert_range(0, (int)this->_eh._sideBwtSz-1, (int)l._by);
+		assert_range(0, 3, (int)l._bp);
+		// Tally all four characters up to the locus within this side
+		countUpToEx(l, cntsUpto);
+		WITHIN_FCHR_DOLLARA(cntsUpto);
+		WITHIN_BWT_LEN(cntsUpto);
+		const uint8_t *side = l.side(this->ebwt());
+		if(l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) {
+			// Adjust for the fact that we represented $ with an 'A', but
+			// shouldn't count it as an 'A' here
+			if((l._sideByteOff + l._by > _zEbwtByteOff) ||
+			   (l._sideByteOff + l._by == _zEbwtByteOff && l._bp > _zEbwtBpOff))
+			{
+				cntsUpto[0]--; // Adjust for '$' looking like an 'A'
+			}
+		}
+		// Now factor in the occ[] count at the side break
+		const index_t *acgt = reinterpret_cast<const index_t*>(side + _eh._sideBwtSz);
+		assert_leq(acgt[0], this->fchr()[1] + this->_eh.sideBwtLen());
+		assert_leq(acgt[1], this->fchr()[2]-this->fchr()[1]);
+		assert_leq(acgt[2], this->fchr()[3]-this->fchr()[2]);
+		assert_leq(acgt[3], this->fchr()[4]-this->fchr()[3]);
+		assert_leq(acgt[0], this->_eh._len + this->_eh.sideBwtLen());
+		assert_leq(acgt[1], this->_eh._len);
+		assert_leq(acgt[2], this->_eh._len);
+		assert_leq(acgt[3], this->_eh._len);
+		cntsUpto[0] += (acgt[0] + this->fchr()[0]);
+		cntsUpto[1] += (acgt[1] + this->fchr()[1]);
+		cntsUpto[2] += (acgt[2] + this->fchr()[2]);
+		cntsUpto[3] += (acgt[3] + this->fchr()[3]);
+		// One mask entry per element of the range, per character
+		masks[0].resize(num);
+		masks[1].resize(num);
+		masks[2].resize(num);
+		masks[3].resize(num);
+		WITHIN_FCHR_DOLLARA(cntsUpto);
+		WITHIN_FCHR_DOLLARA(cntsIn);
+		// 'cntsUpto' is complete now.
+		// Walk forward until we've tallied the entire 'In' range
+		index_t nm = 0;
+		// Rest of this side
+		nm += countBt2SideRange2(l, true, num - nm, cntsIn, masks, nm);
+		assert_eq(nm, cntsIn[0] + cntsIn[1] + cntsIn[2] + cntsIn[3]);
+		assert_leq(nm, num);
+		SideLocus<index_t> lcopy = l;
+		while(nm < num) {
+			// Subsequent sides, if necessary
+			lcopy.nextSide(this->_eh);
+			nm += countBt2SideRange2(lcopy, false, num - nm, cntsIn, masks, nm);
+			WITHIN_FCHR_DOLLARA(cntsIn);
+			assert_leq(nm, num);
+			assert_eq(nm, cntsIn[0] + cntsIn[1] + cntsIn[2] + cntsIn[3]);
+		}
+		assert_eq(num, cntsIn[0] + cntsIn[1] + cntsIn[2] + cntsIn[3]);
+		WITHIN_FCHR_DOLLARA(cntsIn);
+	}
+
+	/**
+	 * Count all occurrences of all four characters from the beginning
+	 * of the forward side to <by,bp>, storing the four results in
+	 * 'arrs', and add in the occ[] counts up to the side break just
+	 * prior to the side.
+	 *
+	 * A forward side is shaped like:
+	 *
+	 * [A] [C] XXXXXXXXXXXXXXXX
+	 * -4- -4- --------56------ (numbers in bytes)
+	 *         ^
+	 *         Side ptr (result from SideLocus.side())
+	 *
+	 * And following it is a reverse side shaped like:
+	 * 
+	 * [G] [T] XXXXXXXXXXXXXXXX
+	 * -4- -4- --------56------ (numbers in bytes)
+	 *         ^
+	 *         Side ptr (result from SideLocus.side())
+	 *
+	 */
+	inline void countBt2SideEx(const SideLocus<index_t>& l, index_t* arrs) const {
+		assert_range(0, (int)this->_eh._sideBwtSz-1, (int)l._by);
+		assert_range(0, 3, (int)l._bp);
+		// Tally all four characters within this side up to <by,bp>;
+		// countUpToEx is defined elsewhere in the file
+		countUpToEx(l, arrs);
+		if(l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) {
+			// Adjust for the fact that we represented $ with an 'A', but
+			// shouldn't count it as an 'A' here
+			if((l._sideByteOff + l._by > _zEbwtByteOff) ||
+			   (l._sideByteOff + l._by == _zEbwtByteOff && l._bp > _zEbwtBpOff))
+			{
+				arrs[0]--; // Adjust for '$' looking like an 'A'
+			}
+		}
+		WITHIN_FCHR(arrs);
+		WITHIN_BWT_LEN(arrs);
+		// Now factor in the occ[] count at the side break, stored in
+		// the last four index_t words of the side
+		const uint8_t *side = l.side(this->ebwt());
+		const uint8_t *acgt16 = side + this->_eh._sideSz - sizeof(index_t) * 4;
+		const index_t *acgt = reinterpret_cast<const index_t*>(acgt16);
+		assert_leq(acgt[0], this->fchr()[1] + this->_eh.sideBwtLen());
+		assert_leq(acgt[1], this->fchr()[2]-this->fchr()[1]);
+		assert_leq(acgt[2], this->fchr()[3]-this->fchr()[2]);
+		assert_leq(acgt[3], this->fchr()[4]-this->fchr()[3]);
+		assert_leq(acgt[0], this->_eh._len + this->_eh.sideBwtLen());
+		assert_leq(acgt[1], this->_eh._len);
+		assert_leq(acgt[2], this->_eh._len);
+		assert_leq(acgt[3], this->_eh._len);
+		arrs[0] += (acgt[0] + this->fchr()[0]);
+		arrs[1] += (acgt[1] + this->fchr()[1]);
+		arrs[2] += (acgt[2] + this->fchr()[2]);
+		arrs[3] += (acgt[3] + this->fchr()[3]);
+		WITHIN_FCHR(arrs);
+	}
+
+    /**
+	 * Counts the number of occurrences of character 'c' in the given Ebwt
+	 * side up to (but not including) the given byte/bitpair (by/bp).
+	 *
+	 * This is a performance-critical function.  This is the top search-
+	 * related hit in the time profile.
+	 *
+	 * Function gets 11.09% in profile
+	 */
+	inline index_t countUpTo(const SideLocus<index_t>& l, int c) const {
+		// Count occurrences of c in each 64-bit (using bit trickery);
+		// Someday countInU64() and pop() functions should be
+		// vectorized/SSE-ized in case that helps.
+        bool usePOPCNT = false;
+		index_t cCnt = 0;
+		const uint8_t *side = l.side(this->ebwt());
+		int i = 0;
+#ifdef POPCNT_CAPABILITY
+        if(_usePOPCNTinstruction) {
+            usePOPCNT = true;
+            // Number of whole bytes to cover, rounding up when a
+            // partial byte at l._by must be included (l._bp > 0)
+            int by = l._by + (l._bp > 0 ? 1 : 0);
+            for(; i < by; i += 8) {
+                if(i + 8 < by) {
+                    // Whole 8-byte word lies inside the range
+                    cCnt += countInU64<USE_POPCNT_INSTRUCTION>(c, *(uint64_t*)&side[i]);
+                } else {
+                    // Final, possibly partial word: shift out the bytes
+                    // and bitpairs beyond <by,bp> before counting
+                    index_t by_shift = 8 - (by - i);
+                    index_t bp_shift = (l._bp > 0 ? 4 - l._bp : 0);
+                    index_t shift = (by_shift << 3) + (bp_shift << 1);
+                    uint64_t side_i = *(uint64_t*)&side[i];
+                    side_i = (_toBigEndian ? side_i >> shift : side_i << shift);
+                    index_t cCnt_add = countInU64<USE_POPCNT_INSTRUCTION>(c, side_i);
+                    // Shifted-in zero bitpairs look like 'A' (code 0);
+                    // subtract one per shifted-out bitpair
+                    if(c == 0) cCnt_add -= (shift >> 1);
+#ifndef NDEBUG
+                    // Cross-check the shifted count against the LUT path
+                    index_t cCnt_temp = 0;
+                    for(int j = i; j < l._by; j++) {
+                        cCnt_temp += cCntLUT_4[0][c][side[j]];
+                    }
+                    if(l._bp > 0) {
+                        cCnt_temp += cCntLUT_4[(int)l._bp][c][side[l._by]];
+                    }
+                    assert_eq(cCnt_add, cCnt_temp);
+#endif
+                    cCnt += cCnt_add;
+                    break;
+                }
+            }
+        } else {
+            for(; i + 7 < l._by; i += 8) {
+                cCnt += countInU64<USE_POPCNT_GENERIC>(c, *(uint64_t*)&side[i]);
+            }
+        }
+#else
+        for(; i + 7 < l._by; i += 8) {
+            cCnt += countInU64(c, *(uint64_t*)&side[i]);
+        }
+#endif
+        
+        if(!usePOPCNT) {
+            // Count occurences of c in the rest of the side (using LUT)
+            for(; i < l._by; i++) {
+                cCnt += cCntLUT_4[0][c][side[i]];
+            }
+            
+            // Count occurences of c in the rest of the byte
+            if(l._bp > 0) {
+                cCnt += cCntLUT_4[(int)l._bp][c][side[i]];
+            }
+        }
+        
+		return cCnt;
+	}
+    
+    /**
+	 * Counts the number of occurrences of character 'c' in the given Ebwt
+	 * side down to the given byte/bitpair (by/bp).
+	 *
+	 * Walks backward from the last BWT byte of the side in 8-byte words,
+	 * then byte by byte, then handles the final (possibly partial) byte
+	 * at l._by via the reverse LUT.
+	 */
+	inline index_t countDownTo(const SideLocus<index_t>& l, int c) const {
+		// Count occurrences of c in each 64-bit (using bit trickery);
+		// Someday countInU64() and pop() functions should be
+		// vectorized/SSE-ized in case that helps.
+		index_t cCnt = 0;
+		const uint8_t *side = l.side(this->ebwt());
+		// Last BWT byte: the trailing 4*sizeof(index_t) bytes of the
+		// 64-byte side hold occ[] counters, not BWT bitpairs
+		int i = 64 - 4 * sizeof(index_t) - 1;
+#ifdef POPCNT_CAPABILITY
+        if ( _usePOPCNTinstruction) {
+            for(; i - 7 > l._by; i -= 8) {
+                cCnt += countInU64<USE_POPCNT_INSTRUCTION>(c, *(uint64_t*)&side[i-7]);
+            }
+        }
+        else {
+            // BUGFIX: bound was 'i + 7 > l._by', which keeps looping after
+            // the word [i-7, i] overlaps l._by, overcounts, and can drive
+            // 'i' negative so side[i-7] reads out of bounds.  Use the same
+            // bound as the POPCNT branch above.
+            for(; i - 7 > l._by; i -= 8) {
+                cCnt += countInU64<USE_POPCNT_GENERIC>(c, *(uint64_t*)&side[i-7]);
+            }
+        }
+#else
+        // BUGFIX: same bound correction as above (was 'i + 7 > l._by')
+        for(; i - 7 > l._by; i -= 8) {
+            cCnt += countInU64(c, *(uint64_t*)&side[i-7]);
+        }
+#endif
+		// Count occurences of c in the rest of the side (using LUT)
+		for(; i > l._by; i--) {
+			cCnt += cCntLUT_4_rev[0][c][side[i]];
+		}
+		// Count occurences of c in the rest of the byte
+		if(l._bp > 0) {
+			cCnt += cCntLUT_4_rev[4-(int)l._bp][c][side[i]];
+		} else {
+            cCnt += cCntLUT_4_rev[0][c][side[i]];
+        }
+		return cCnt;
+	}
+
+    /**
+     * Tricky-bit-bashing bitpair counting for given two-bit value (0-3)
+     * within a 64-bit argument.
+     *
+     * For each character code k, XORs the word with the per-character
+     * constant c_table[k] so matching bitpairs become 00, then isolates
+     * the bitpairs that are zero and popcounts them into arrs[k].
+     *
+     * Function gets 2.32% in profile
+     */
+#ifdef POPCNT_CAPABILITY
+    template<typename Operation>
+#endif
+    inline static void countInU64Ex(uint64_t dw, index_t* arrs) {
+        // --- character code 0 ('A') ---
+        uint64_t c0 = c_table[0];
+        uint64_t x0 = dw ^ c0;
+        uint64_t x1 = (x0 >> 1);
+        uint64_t x2 = x1 & (0x5555555555555555llu);
+        uint64_t x3 = x0 & x2;
+#ifdef POPCNT_CAPABILITY
+        uint64_t tmp = Operation().pop64(x3);
+#else
+        uint64_t tmp = pop64(x3);
+#endif
+        arrs[0] += (uint32_t) tmp;
+        
+        // --- character code 1 ('C') ---
+        c0 = c_table[1];
+        x0 = dw ^ c0;
+        x1 = (x0 >> 1);
+        x2 = x1 & (0x5555555555555555llu);
+        x3 = x0 & x2;
+#ifdef POPCNT_CAPABILITY
+        tmp = Operation().pop64(x3);
+#else
+        tmp = pop64(x3);
+#endif
+        arrs[1] += (uint32_t) tmp;
+        
+        // --- character code 2 ('G') ---
+        c0 = c_table[2];
+        x0 = dw ^ c0;
+        x1 = (x0 >> 1);
+        x2 = x1 & (0x5555555555555555llu);
+        x3 = x0 & x2;
+#ifdef POPCNT_CAPABILITY
+        tmp = Operation().pop64(x3);
+#else
+        tmp = pop64(x3);
+#endif
+        arrs[2] += (uint32_t) tmp;
+        
+        // --- character code 3 ('T') ---
+        c0 = c_table[3];
+        x0 = dw ^ c0;
+        x1 = (x0 >> 1);
+        x2 = x1 & (0x5555555555555555llu);
+        x3 = x0 & x2;
+#ifdef POPCNT_CAPABILITY
+        tmp = Operation().pop64(x3);
+#else
+        tmp = pop64(x3);
+#endif
+        arrs[3] += (uint32_t) tmp;
+    }
+
+	/**
+	 * Counts the number of occurrences of all four nucleotides in the
+	 * given side up to (but not including) the given byte/bitpair (by/bp).
+	 * Count for 'a' goes in arrs[0], 'c' in arrs[1], etc.
+	 */
+	inline void countUpToEx(const SideLocus<index_t>& l, index_t* arrs) const {
+		int i = 0;
+		// Count occurrences of each nucleotide in each 64-bit word using
+		// bit trickery; note: this seems does not seem to lend a
+		// significant boost to performance in practice.  If you comment
+		// out this whole loop (which won't affect correctness - it will
+		// just cause the following loop to take up the slack) then runtime
+		// does not change noticeably. Someday the countInU64() and pop()
+		// functions should be vectorized/SSE-ized in case that helps.
+		const uint8_t *side = l.side(this->ebwt());
+#ifdef POPCNT_CAPABILITY
+        if (_usePOPCNTinstruction) {
+            for(; i+7 < l._by; i += 8) {
+                countInU64Ex<USE_POPCNT_INSTRUCTION>(*(uint64_t*)&side[i], arrs);
+            }
+        }
+        else {
+            for(; i+7 < l._by; i += 8) {
+                countInU64Ex<USE_POPCNT_GENERIC>(*(uint64_t*)&side[i], arrs);
+            }
+        }
+#else
+        for(; i+7 < l._by; i += 8) {
+            countInU64Ex(*(uint64_t*)&side[i], arrs);
+        }
+#endif
+		// Count occurences of nucleotides in the rest of the side (using LUT)
+		// Many cache misses on following lines (~20K)
+		for(; i < l._by; i++) {
+			arrs[0] += cCntLUT_4[0][0][side[i]];
+			arrs[1] += cCntLUT_4[0][1][side[i]];
+			arrs[2] += cCntLUT_4[0][2][side[i]];
+			arrs[3] += cCntLUT_4[0][3][side[i]];
+		}
+		// Count occurences of c in the rest of the byte; LUT row l._bp
+		// tallies only the first l._bp bitpairs of the final byte
+		if(l._bp > 0) {
+			arrs[0] += cCntLUT_4[(int)l._bp][0][side[i]];
+			arrs[1] += cCntLUT_4[(int)l._bp][1][side[i]];
+			arrs[2] += cCntLUT_4[(int)l._bp][2][side[i]];
+			arrs[3] += cCntLUT_4[(int)l._bp][3][side[i]];
+		}
+	}
+
+#ifndef NDEBUG
+	/**
+	 * Given top and bot loci, calculate counts of all four DNA chars up to
+	 * those loci.  Used for more advanced backtracking-search.
+	 *
+	 * Debug-build only: used by the sanity checks in mapLF/mapLF1.
+	 * 'arrs' must arrive zeroed, since countBt2SideEx accumulates.
+	 */
+	inline void mapLFEx(
+		const SideLocus<index_t>& l,
+		index_t *arrs
+		ASSERT_ONLY(, bool overrideSanity = false)
+		) const
+	{
+		assert_eq(0, arrs[0]);
+		assert_eq(0, arrs[1]);
+		assert_eq(0, arrs[2]);
+		assert_eq(0, arrs[3]);
+		countBt2SideEx(l, arrs);
+		if(_sanity && !overrideSanity) {
+			// Make sure results match up with individual calls to mapLF;
+			// be sure to override sanity-checking in the callee, or we'll
+			// have infinite recursion
+			assert_eq(mapLF(l, 0, true), arrs[0]);
+			assert_eq(mapLF(l, 1, true), arrs[1]);
+			assert_eq(mapLF(l, 2, true), arrs[2]);
+			assert_eq(mapLF(l, 3, true), arrs[3]);
+		}
+	}
+#endif
+
+	/**
+	 * Given top and bot rows, calculate counts of all four DNA chars up to
+	 * those loci.
+	 *
+	 * Convenience overload: builds the two SideLocus objects from the raw
+	 * BWT rows, then delegates to the locus-based overload below.
+	 */
+	inline void mapLFEx(
+		index_t top,
+		index_t bot,
+		index_t *tops,
+		index_t *bots
+		ASSERT_ONLY(, bool overrideSanity = false)
+		) const
+	{
+		SideLocus<index_t> ltop, lbot;
+		SideLocus<index_t>::initFromTopBot(top, bot, _eh, ebwt(), ltop, lbot);
+		mapLFEx(ltop, lbot, tops, bots ASSERT_ONLY(, overrideSanity));
+	}
+
+	/**
+	 * Given top and bot loci, calculate counts of all four DNA chars up to
+	 * those loci.  Used for more advanced backtracking-search.
+	 *
+	 * tops/bots must arrive zeroed; on return tops[c]/bots[c] are the
+	 * LF destinations of ltop/lbot on character c.
+	 */
+	inline void mapLFEx(
+		const SideLocus<index_t>& ltop,
+		const SideLocus<index_t>& lbot,
+		index_t *tops,
+		index_t *bots
+		ASSERT_ONLY(, bool overrideSanity = false)
+		) const
+	{
+		assert(ltop.repOk(this->eh()));
+		assert(lbot.repOk(this->eh()));
+		assert_eq(0, tops[0]); assert_eq(0, bots[0]);
+		assert_eq(0, tops[1]); assert_eq(0, bots[1]);
+		assert_eq(0, tops[2]); assert_eq(0, bots[2]);
+		assert_eq(0, tops[3]); assert_eq(0, bots[3]);
+		countBt2SideEx(ltop, tops);
+		countBt2SideEx(lbot, bots);
+#ifndef NDEBUG
+		if(_sanity && !overrideSanity) {
+			// Make sure results match up with individual calls to mapLF;
+			// be sure to override sanity-checking in the callee, or we'll
+			// have infinite recursion
+			assert_eq(mapLF(ltop, 0, true), tops[0]);
+			assert_eq(mapLF(ltop, 1, true), tops[1]);
+			assert_eq(mapLF(ltop, 2, true), tops[2]);
+			assert_eq(mapLF(ltop, 3, true), tops[3]);
+			assert_eq(mapLF(lbot, 0, true), bots[0]);
+			assert_eq(mapLF(lbot, 1, true), bots[1]);
+			assert_eq(mapLF(lbot, 2, true), bots[2]);
+			assert_eq(mapLF(lbot, 3, true), bots[3]);
+		}
+#endif
+	}
+
+	/**
+	 * Counts the number of occurrences of all four nucleotides in the
+	 * given side from the given byte/bitpair (l->_by/l->_bp) (or the
+	 * beginning of the side if l == 0).  Count for 'a' goes in arrs[0],
+	 * 'c' in arrs[1], etc.
+	 *
+	 * Note: must account for $.
+	 *
+	 * Must fill in masks: on return, masks[c][maskOff+k] is true iff the
+	 * k-th tallied BWT character equals c.  Returns the number of
+	 * characters tallied (may be < num if we fell off the end of the
+	 * side; the caller continues on the next side).
+	 */
+	inline index_t countBt2SideRange2(
+		const SideLocus<index_t>& l,
+		bool startAtLocus,
+		index_t num,
+		index_t* arrs,
+		EList<bool> *masks,
+		index_t maskOff) const
+	{
+		assert(!masks[0].empty());
+		assert_eq(masks[0].size(), masks[1].size());
+		assert_eq(masks[0].size(), masks[2].size());
+		assert_eq(masks[0].size(), masks[3].size());
+		ASSERT_ONLY(index_t myarrs[4] = {0, 0, 0, 0});
+		index_t nm = 0; // number of nucleotides tallied so far
+		int iby = 0;      // initial byte offset
+		int ibp = 0;      // initial base-pair offset
+		if(startAtLocus) {
+			iby = l._by;
+			ibp = l._bp;
+		} else {
+			// Start at beginning
+		}
+		int by = iby, bp = ibp;
+		assert_lt(bp, 4);
+		assert_lt(by, (int)this->_eh._sideBwtSz);
+		const uint8_t *side = l.side(this->ebwt());
+		while(nm < num) {
+			// Decode the bitpair at <by,bp>
+			int c = (side[by] >> (bp * 2)) & 3;
+			assert_lt(maskOff + nm, masks[c].size());
+			// Clear all four mask slots for this position, then set the
+			// one matching c below
+			masks[0][maskOff + nm] = masks[1][maskOff + nm] =
+			masks[2][maskOff + nm] = masks[3][maskOff + nm] = false;
+			assert_range(0, 3, c);
+			// Note: we tally $ just like an A
+			arrs[c]++; // tally it
+			ASSERT_ONLY(myarrs[c]++);
+			masks[c][maskOff + nm] = true; // not dead
+			nm++;
+			if(++bp == 4) {
+				bp = 0;
+				by++;
+				assert_leq(by, (int)this->_eh._sideBwtSz);
+				if(by == (int)this->_eh._sideBwtSz) {
+					// Fell off the end of the side
+					break;
+				}
+			}
+		}
+		WITHIN_FCHR_DOLLARA(arrs);
+#ifndef NDEBUG
+		if(_sanity) {
+			// Make sure results match up with a call to mapLFEx.
+			index_t tops[4] = {0, 0, 0, 0};
+			index_t bots[4] = {0, 0, 0, 0};
+			index_t top = l.toBWRow();
+			index_t bot = top + nm;
+			mapLFEx(top, bot, tops, bots, false);
+			// The A count may be off by one because $ is tallied as an A
+			assert(myarrs[0] == (bots[0] - tops[0]) || myarrs[0] == (bots[0] - tops[0])+1);
+			assert_eq(myarrs[1], bots[1] - tops[1]);
+			assert_eq(myarrs[2], bots[2] - tops[2]);
+			assert_eq(myarrs[3], bots[3] - tops[3]);
+		}
+#endif
+		return nm;
+	}
+
+	/**
+	 * Return the final character in row i (i.e. the i'th character in the
+	 * BWT transform).  Note that the 'L' in the name of the function
+	 * stands for 'last', as in the literature.
+	 */
+	inline int rowL(const SideLocus<index_t>& l) const {
+		// Extract and return appropriate bit-pair
+		return unpack_2b_from_8b(l.side(this->ebwt())[l._by], l._bp);
+	}
+
+	/**
+	 * Return the final character in row i (i.e. the i'th character in the
+	 * BWT transform).  Note that the 'L' in the name of the function
+	 * stands for 'last', as in the literature.
+	 *
+	 * Convenience overload: builds a SideLocus for the row first, so
+	 * prefer the locus-based overload when a locus is already at hand.
+	 */
+	inline int rowL(index_t i) const {
+		// Extract and return appropriate bit-pair
+		SideLocus<index_t> l;
+		l.initFromRow(i, _eh, ebwt());
+		return rowL(l);
+	}
+
+	/**
+	 * Given top and bot loci, calculate counts of all four DNA chars up to
+	 * those loci.  Used for more advanced backtracking-search.
+	 *
+	 * Both count arrays must arrive zeroed.  'masks' is filled in by
+	 * countBt2SideRange (one entry per element in [top, bot)).
+	 */
+	inline void mapLFRange(
+		SideLocus<index_t>& ltop,
+		SideLocus<index_t>& lbot,
+		index_t num,        // Number of elts
+		index_t* cntsUpto,  // A/C/G/T counts up to top
+		index_t* cntsIn,    // A/C/G/T counts within range
+		EList<bool> *masks
+		ASSERT_ONLY(, bool overrideSanity = false)
+		) const
+	{
+		assert(ltop.repOk(this->eh()));
+		assert(lbot.repOk(this->eh()));
+		assert_eq(num, lbot.toBWRow() - ltop.toBWRow());
+		assert_eq(0, cntsUpto[0]); assert_eq(0, cntsIn[0]);
+		assert_eq(0, cntsUpto[1]); assert_eq(0, cntsIn[1]);
+		assert_eq(0, cntsUpto[2]); assert_eq(0, cntsIn[2]);
+		assert_eq(0, cntsUpto[3]); assert_eq(0, cntsIn[3]);
+		countBt2SideRange(ltop, num, cntsUpto, cntsIn, masks);
+		assert_eq(num, cntsIn[0] + cntsIn[1] + cntsIn[2] + cntsIn[3]);
+#ifndef NDEBUG
+		if(_sanity && !overrideSanity) {
+			// Make sure results match up with individual calls to mapLF;
+			// be sure to override sanity-checking in the callee, or we'll
+			// have infinite recursion
+			index_t tops[4] = {0, 0, 0, 0};
+			index_t bots[4] = {0, 0, 0, 0};
+			assert(ltop.repOk(this->eh()));
+			assert(lbot.repOk(this->eh()));
+			mapLFEx(ltop, lbot, tops, bots, false);
+			for(int i = 0; i < 4; i++) {
+				assert(cntsUpto[i] == tops[i] || tops[i] == bots[i]);
+				if(i == 0) {
+					// A count may be one high because $ is tallied as an A
+					assert(cntsIn[i] == bots[i]-tops[i] ||
+						   cntsIn[i] == bots[i]-tops[i]+1);
+				} else {
+					assert_eq(cntsIn[i], bots[i]-tops[i]);
+				}
+			}
+		}
+#endif
+	}
+
+	/**
+	 * Given row i, return the row that the LF mapping maps i to.
+	 *
+	 * Reads the BWT character in row i itself (rowL), then counts its
+	 * occurrences up to i.
+	 */
+	inline index_t mapLF(
+		const SideLocus<index_t>& l
+		ASSERT_ONLY(, bool overrideSanity = false)
+		) const
+	{
+		ASSERT_ONLY(index_t srcrow = l.toBWRow());
+		index_t ret;
+		assert(l.side(this->ebwt()) != NULL);
+		int c = rowL(l);
+		assert_lt(c, 4);
+		assert_geq(c, 0);
+		ret = countBt2Side(l, c);
+		assert_lt(ret, this->_eh._bwtLen);
+		// LF never maps a row to itself
+		assert_neq(srcrow, ret);
+#ifndef NDEBUG
+		if(_sanity && !overrideSanity) {
+			// Make sure results match up with results from mapLFEx;
+			// be sure to override sanity-checking in the callee, or we'll
+			// have infinite recursion
+			index_t arrs[] = { 0, 0, 0, 0 };
+			mapLFEx(l, arrs, true);
+			assert_eq(arrs[c], ret);
+		}
+#endif
+		return ret;
+	}
+
+	/**
+	 * Given row i and character c, return the row that the LF mapping maps
+	 * i to on character c.
+	 */
+	inline index_t mapLF(
+		const SideLocus<index_t>& l, int c
+		ASSERT_ONLY(, bool overrideSanity = false)
+		) const
+	{
+		index_t ret;
+		assert_lt(c, 4);
+		assert_geq(c, 0);
+		ret = countBt2Side(l, c);
+		assert_lt(ret, this->_eh._bwtLen);
+#ifndef NDEBUG
+		if(_sanity && !overrideSanity) {
+			// Make sure results match up with results from mapLFEx;
+			// be sure to override sanity-checking in the callee, or we'll
+			// have infinite recursion
+			index_t arrs[] = { 0, 0, 0, 0 };
+			mapLFEx(l, arrs, true);
+			assert_eq(arrs[c], ret);
+		}
+#endif
+		return ret;
+	}
+
+	/**
+	 * Given top and bot loci, calculate counts of all four DNA chars up to
+	 * those loci.  Also, update a set of tops and bots for the reverse
+	 * index/direction using the idea from the bi-directional BWT paper.
+	 *
+	 * tops/bots must arrive zeroed; topsP[0] holds the reverse-index top
+	 * on entry, and topsP/botsP are filled with the four contiguous
+	 * reverse ranges on return.
+	 */
+	inline void mapBiLFEx(
+		const SideLocus<index_t>& ltop,
+		const SideLocus<index_t>& lbot,
+		index_t *tops,
+		index_t *bots,
+		index_t *topsP, // topsP[0] = top
+		index_t *botsP
+		ASSERT_ONLY(, bool overrideSanity = false)
+		) const
+	{
+#ifndef NDEBUG
+		// BUGFIX: the loop previously tested tops[0]/bots[0] on every
+		// iteration, leaving elements 1-3 unchecked; index with 'i'
+		for(int i = 0; i < 4; i++) {
+			assert_eq(0, tops[i]);  assert_eq(0, bots[i]);
+		}
+#endif
+		countBt2SideEx(ltop, tops);
+		countBt2SideEx(lbot, bots);
+#ifndef NDEBUG
+		if(_sanity && !overrideSanity) {
+			// Make sure results match up with individual calls to mapLF;
+			// be sure to override sanity-checking in the callee, or we'll
+			// have infinite recursion
+			assert_eq(mapLF(ltop, 0, true), tops[0]);
+			assert_eq(mapLF(ltop, 1, true), tops[1]);
+			assert_eq(mapLF(ltop, 2, true), tops[2]);
+			assert_eq(mapLF(ltop, 3, true), tops[3]);
+			assert_eq(mapLF(lbot, 0, true), bots[0]);
+			assert_eq(mapLF(lbot, 1, true), bots[1]);
+			assert_eq(mapLF(lbot, 2, true), bots[2]);
+			assert_eq(mapLF(lbot, 3, true), bots[3]);
+		}
+#endif
+		// bots[0..3] - tops[0..3] = # of ways to extend the suffix with an
+		// A, C, G, T; lay the reverse ranges out back-to-back
+		botsP[0] = topsP[0] + (bots[0] - tops[0]);
+		topsP[1] = botsP[0];
+		botsP[1] = topsP[1] + (bots[1] - tops[1]);
+		topsP[2] = botsP[1];
+		botsP[2] = topsP[2] + (bots[2] - tops[2]);
+		topsP[3] = botsP[2];
+		botsP[3] = topsP[3] + (bots[3] - tops[3]);
+	}
+
+	/**
+	 * Given row and its locus information, proceed on the given character
+	 * and return the next row, or all-fs if we can't proceed on that
+	 * character.  Returns 0xffffffff if this row ends in $.
+	 */
+	inline index_t mapLF1(
+		index_t row,       // starting row
+		const SideLocus<index_t>& l, // locus for starting row
+		int c               // character to proceed on
+		ASSERT_ONLY(, bool overrideSanity = false)
+		) const
+	{
+		// Can't proceed if the BWT char here isn't c, or if this is the
+		// row holding the $ marker
+		if(rowL(l) != c || row == _zOff) return (index_t)OFF_MASK;
+		index_t ret;
+		assert_lt(c, 4);
+		assert_geq(c, 0);
+		ret = countBt2Side(l, c);
+		assert_lt(ret, this->_eh._bwtLen);
+#ifndef NDEBUG
+		if(_sanity && !overrideSanity) {
+			// Make sure results match up with results from mapLFEx;
+			// be sure to override sanity-checking in the callee, or we'll
+			// have infinite recursion
+			index_t arrs[] = { 0, 0, 0, 0 };
+			mapLFEx(l, arrs, true);
+			assert_eq(arrs[c], ret);
+		}
+#endif
+		return ret;
+	}
+
+
+	/**
+	 * Given row and its locus information, set the row to LF(row) and
+	 * return the character that was in the final column.
+	 *
+	 * Returns -1 (and leaves 'row' untouched) when the row holds $.
+	 */
+	inline int mapLF1(
+		index_t& row,      // starting row
+		const SideLocus<index_t>& l  // locus for starting row
+		ASSERT_ONLY(, bool overrideSanity = false)
+		) const
+	{
+		if(row == _zOff) return -1;
+		int c = rowL(l);
+		assert_range(0, 3, c);
+		row = countBt2Side(l, c);
+		assert_lt(row, this->_eh._bwtLen);
+#ifndef NDEBUG
+		if(_sanity && !overrideSanity) {
+			// Make sure results match up with results from mapLFEx;
+			// be sure to override sanity-checking in the callee, or we'll
+			// have infinite recursion
+			index_t arrs[] = { 0, 0, 0, 0 };
+			mapLFEx(l, arrs, true);
+			assert_eq(arrs[c], row);
+		}
+#endif
+		return c;
+	}
+
+#ifndef NDEBUG
+	/// Check that in-memory Ebwt is internally consistent with respect
+	/// to given EbwtParams; assert if not
+	bool inMemoryRepOk(const EbwtParams<index_t>& eh) const {
+		assert_geq(_zEbwtBpOff, 0);
+		assert_lt(_zEbwtBpOff, 4);
+		assert_lt(_zEbwtByteOff, eh._ebwtTotSz);
+		assert_lt(_zOff, eh._bwtLen);
+		assert_geq(_nFrag, _nPat);
+		return true;
+	}
+
+	/// Check that in-memory Ebwt is internally consistent; assert if
+	/// not
+	/// NOTE(review): delegates to repOk(_eh), which only performs the
+	/// in-memory checks when isInMemory() is true; a direct call to
+	/// inMemoryRepOk(_eh) may have been intended — confirm.
+	bool inMemoryRepOk() const {
+		return repOk(_eh);
+	}
+
+	/// Check that Ebwt is internally consistent with respect to given
+	/// EbwtParams; assert if not
+	bool repOk(const EbwtParams<index_t>& eh) const {
+		assert(_eh.repOk());
+		if(isInMemory()) {
+			return inMemoryRepOk(eh);
+		}
+		return true;
+	}
+
+	/// Check that Ebwt is internally consistent; assert if not
+	bool repOk() const {
+		return repOk(_eh);
+	}
+#endif
+    
+    /**
+     * Extract the unique identifier from a FASTA-style header: the
+     * prefix up to (but not including) the first space or the second
+     * '|' delimiter, whichever comes first.
+     */
+    string get_uid(const string& header) {
+        size_t end = 0;
+        size_t bars = 0;
+        while(end < header.length()) {
+            const char ch = header[end];
+            if(ch == ' ') break;
+            if(ch == '|' && ++bars == 2) break;
+            end++;
+        }
+        return header.substr(0, end);
+    }
+    
+    /**
+     * Parse a taxonomy-ID string of the form "major[.minor]" into a
+     * single 64-bit value: the part before the '.' in the low 32 bits
+     * and the part after it in the high 32 bits.
+     */
+    uint64_t get_tid(const string& stid) {
+        uint64_t major_v = 0, minor_v = 0;
+        // Accumulate into 'major_v' until the first '.', then 'minor_v'
+        uint64_t* cur = &major_v;
+        for(size_t i = 0; i < stid.length(); i++) {
+            const char ch = stid[i];
+            if(ch == '.') {
+                cur = &minor_v;
+            } else {
+                uint32_t digit = ch - '0';
+                *cur = *cur * 10 + digit;
+            }
+        }
+        return major_v | (minor_v << 32);
+    }
+
+	bool       _toBigEndian;      // serialize index fields big-endian?
+	int32_t    _overrideOffRate;  // offRate override (if set on cmdline)
+	bool       _verbose;          // print verbose messages
+	bool       _passMemExc;       // rethrow memory-exhaustion exceptions
+	bool       _sanity;           // enable expensive sanity checks
+	bool       fw_;     // true iff this is a forward index
+	FILE    *_in1;    // input fd for primary index file
+	FILE    *_in2;    // input fd for secondary index file
+	string     _in1Str; // filename for primary index file
+	string     _in2Str; // filename for secondary index file
+    string     _inSaStr;  // filename for suffix-array file
+    string     _inBwtStr; // filename for BWT file
+	index_t    _zOff;         // BWT row holding the $ marker
+	index_t    _zEbwtByteOff; // byte offset of $ within the ebwt[] array
+	int        _zEbwtBpOff;   // bitpair offset of $ within that byte
+	index_t    _nPat;  /// number of reference texts
+	index_t    _nFrag; /// number of fragments
+	APtrWrap<index_t> _plen;    // per-reference lengths
+	APtrWrap<index_t> _rstarts; // starting offset of fragments / text indexes
+	// _fchr, _ftab and _eftab are expected to be relatively small
+	// (usually < 1MB, perhaps a few MB if _fchr is particularly large
+	// - like, say, 11).  For this reason, we don't bother with writing
+	// them to disk through separate output streams; we keep them
+	// resident in memory alongside the header fields.
+	APtrWrap<index_t> _fchr;
+	APtrWrap<index_t> _ftab;
+	APtrWrap<index_t> _eftab; // "extended" entries for _ftab
+	// _offs may be extremely large.  E.g. for DNA w/ offRate=4 (one
+	// offset every 16 rows), the total size of _offs is the same as
+	// the total size of the input sequence
+    bool _offw;                // true iff the wide (_offsw) array is in use
+	APtrWrap<uint16_t> _offs;  // offset when # of seq. is less than 2^16
+    APtrWrap<uint32_t> _offsw; // offset when # of seq. is more than 2^16
+	// _ebwt is the Extended Burrows-Wheeler Transform itself, and thus
+	// is at least as large as the input sequence.
+	APtrWrap<uint8_t> _ebwt;
+	bool       _useMm;        /// use memory-mapped files to hold the index
+	bool       useShmem_;     /// use shared memory to hold large parts of the index
+	EList<string> _refnames; /// names of the reference sequences
+	char *mmFile1_;          // memory-mapped view of primary index file
+	char *mmFile2_;          // memory-mapped view of secondary index file
+    
+    bool                             _compressed; // compressed index?
+    
+	EbwtParams<index_t> _eh;  // derived index layout parameters
+	bool packed_;             // whether reference strings are bitpacked
+    
+    EList<pair<string, uint64_t> >   _uid_to_tid; // table that converts uid to tid
+    TaxonomyTree _tree;                           // taxonomy tree
+    TaxonomyPathTable                _paths;      // root-to-leaf taxonomy paths
+    std::map<uint64_t, string>       _name;       // taxid -> scientific name
+    std::map<uint64_t, uint64_t>     _size;       // taxid -> genome size
+    
+
+	// Default index-building parameters
+	static const uint64_t default_bmax = OFF_MASK;
+	static const uint64_t default_bmaxMultSqrt = OFF_MASK;
+	static const uint64_t default_bmaxDivN = 4;
+	static const int      default_dcv = 1024;
+	static const bool     default_noDc = false;
+	static const bool     default_useBlockwise = true;
+	static const uint32_t default_seed = 0;
+#ifdef BOWTIE_64BIT_INDEX
+	static const int      default_lineRate = 7;
+#else
+	static const int      default_lineRate = 6;
+#endif
+	static const int      default_offRate = 5;
+	static const int      default_offRatePlus = 0;
+	static const int      default_ftabChars = 10;
+	static const bool     default_bigEndian = false;
+
+protected:
+
+	/// Stream that receives log/verbose output
+	ostream& log() const {
+		return cout; // TODO: turn this into a parameter
+	}
+
+	/// Print a verbose message and flush (flushing is helpful for
+	/// debugging).  this->verbose() (no-arg overload, defined elsewhere
+	/// in this class) gates whether anything is printed.
+	void verbose(const string& s) const {
+		if(this->verbose()) {
+			this->log() << s.c_str();
+			this->log().flush();
+		}
+	}
+};
+
+/**
+ * Read reference names from an input stream 'in' for an Ebwt primary
+ * file and store them in 'refnames'.
+ */
+template <typename index_t>
+void readEbwtRefnames(istream& in, EList<string>& refnames);
+
+/**
+ * Read reference names from the index with basename 'in' and store
+ * them in 'refnames'.
+ */
+template <typename index_t>
+void readEbwtRefnames(const string& instr, EList<string>& refnames);
+
+/**
+ * Read just enough of the Ebwt's header to determine whether it's
+ * colorspace.
+ */
+bool readEbwtColor(const string& instr);
+
+/**
+ * Read just enough of the Ebwt's header to determine whether it's
+ * entirely reversed.
+ */
+bool readEntireReverse(const string& instr);
+
+///////////////////////////////////////////////////////////////////////
+//
+// Functions for building Ebwts
+//
+///////////////////////////////////////////////////////////////////////
+
+/**
+ * Join several text strings together in a way that's compatible with
+ * the text-chunking scheme dictated by chunkRate parameter.
+ *
+ * The non-static member Ebwt::join additionally builds auxilliary
+ * arrays that maintain a mapping between chunks in the joined string
+ * and the original text strings.
+ */
+template <typename index_t>
+template <typename TStr>
+TStr Ebwt<index_t>::join(EList<TStr>& l, uint32_t seed) {
+	RandomSource rand; // reproducible given same seed
+	rand.init(seed);
+	// NOTE(review): 'rand' is initialized but never used below —
+	// confirm whether randomized placement was intentionally removed
+	TStr ret;
+	// Pre-size the result to the total length of all inputs
+	index_t guessLen = 0;
+	for(index_t i = 0; i < l.size(); i++) {
+		guessLen += length(l[i]);
+	}
+	ret.resize(guessLen);
+	// Copy each input string into the result, back to back
+	index_t off = 0;
+	for(size_t i = 0; i < l.size(); i++) {
+		TStr& s = l[i];
+		assert_gt(s.length(), 0);
+		for(size_t j = 0; j < s.size(); j++) {
+			ret.set(s[j], off++);
+		}
+	}
+	return ret;
+}
+
+/**
+ * Join several text strings together in a way that's compatible with
+ * the text-chunking scheme dictated by chunkRate parameter.
+ *
+ * The non-static member Ebwt::join additionally builds auxilliary
+ * arrays that maintain a mapping between chunks in the joined string
+ * and the original text strings.
+ *
+ * This overload reads the sequences from FASTA input (FileBuf), using
+ * 'szs'/'sztot' (from a prior sizing pass) to pre-size the result and
+ * sanity-check each record read.
+ */
+template <typename index_t>
+template <typename TStr>
+TStr Ebwt<index_t>::join(EList<FileBuf*>& l,
+                EList<RefRecord>& szs,
+                index_t sztot,
+                const RefReadInParams& refparams,
+                uint32_t seed)
+{
+	RandomSource rand; // reproducible given same seed
+	rand.init(seed);
+	RefReadInParams rpcp = refparams;
+	TStr ret;
+	index_t guessLen = sztot;
+	ret.resize(guessLen);
+	ASSERT_ONLY(index_t szsi = 0);
+	TIndexOffU dstoff = 0;
+	for(index_t i = 0; i < l.size(); i++) {
+		// For each sequence we can pull out of istream l[i]...
+		assert(!l[i]->eof());
+		bool first = true;
+		while(!l[i]->eof()) {
+			// Append the next fragment directly into 'ret' at 'dstoff'
+			RefRecord rec = fastaRefReadAppend(*l[i], first, ret, dstoff, rpcp);
+			first = false;
+			index_t bases = (index_t)rec.len;
+			// Each record must agree with the sizing pass
+			assert_eq(rec.off, szs[szsi].off);
+			assert_eq(rec.len, szs[szsi].len);
+			assert_eq(rec.first, szs[szsi].first);
+			ASSERT_ONLY(szsi++);
+			// NOTE(review): 'continue' at the end of the loop body is a
+			// no-op; 'bases' is otherwise unused in release builds
+			if(bases == 0) continue;
+		}
+	}
+	return ret;
+}
+
+/**
+ * Join several text strings together according to the text-chunking
+ * scheme specified in the EbwtParams.  Ebwt fields calculated in this
+ * function are written directly to disk.
+ *
+ * It is assumed, but not required, that the header values have already
+ * been written to 'out1' before this function is called.
+ *
+ * The static member Ebwt::join just returns a joined version of a
+ * list of strings without building any of the auxilliary arrays.
+ */
+template <typename index_t>
+template <typename TStr>
+void Ebwt<index_t>::joinToDisk(
+	EList<FileBuf*>& l,
+	EList<RefRecord>& szs,
+	index_t sztot,
+	const RefReadInParams& refparams,
+	TStr& ret,
+	ostream& out1,
+	ostream& out2)
+{
+	RefReadInParams rpcp = refparams;
+	assert_gt(szs.size(), 0);
+	assert_gt(l.size(), 0);
+	assert_gt(sztot, 0);
+	// Not every fragment represents a distinct sequence - many
+	// fragments may correspond to a single sequence.  Count the
+	// number of sequences here by counting the number of "first"
+	// fragments.
+	this->_nPat = 0;
+	this->_nFrag = 0;
+	for(index_t i = 0; i < szs.size(); i++) {
+		if(szs[i].len > 0) this->_nFrag++;
+		if(szs[i].first && szs[i].len > 0) this->_nPat++;
+	}
+	assert_gt(this->_nPat, 0);
+	assert_geq(this->_nFrag, this->_nPat);
+	_rstarts.reset();
+	// Write the number of reference sequences to the primary stream
+	writeIndex<index_t>(out1, this->_nPat, this->toBe());
+	// Allocate plen[]
+	try {
+		this->_plen.init(new index_t[this->_nPat], this->_nPat);
+	} catch(bad_alloc& e) {
+		cerr << "Out of memory allocating plen[] in Ebwt::join()"
+		     << " at " << __FILE__ << ":" << __LINE__ << endl;
+		throw e;
+	}
+	// For each pattern, set plen: a "first" fragment starts a new
+	// sequence; subsequent fragments extend the current one.  Each
+	// finished plen entry is written to out1 before moving on.
+	int npat = -1;
+	for(index_t i = 0; i < szs.size(); i++) {
+		if(szs[i].first && szs[i].len > 0) {
+			if(npat >= 0) {
+				writeIndex<index_t>(out1, this->plen()[npat], this->toBe());
+			}
+			npat++;
+			this->plen()[npat] = (szs[i].len + szs[i].off);
+		} else {
+			this->plen()[npat] += (szs[i].len + szs[i].off);
+		}
+	}
+	assert_eq((index_t)npat, this->_nPat-1);
+	// Write the final plen entry
+	writeIndex<index_t>(out1, this->plen()[npat], this->toBe());
+	// Write the number of fragments
+	writeIndex<index_t>(out1, this->_nFrag, this->toBe());
+	index_t seqsRead = 0;
+	ASSERT_ONLY(index_t szsi = 0);
+	ASSERT_ONLY(index_t entsWritten = 0);
+	index_t dstoff = 0;
+	// For each filebuf
+	for(unsigned int i = 0; i < l.size(); i++) {
+		assert(!l[i]->eof());
+		bool first = true;
+		index_t patoff = 0;
+		// For each *fragment* (not necessary an entire sequence) we
+		// can pull out of istream l[i]...
+		while(!l[i]->eof()) {
+			string name;
+			// Push a new name onto our vector
+			_refnames.push_back("");
+			RefRecord rec = fastaRefReadAppend(
+				*l[i], first, ret, dstoff, rpcp, &_refnames.back());
+			first = false;
+			index_t bases = rec.len;
+			if(rec.first && rec.len > 0) {
+				if(_refnames.back().length() == 0) {
+					// If name was empty, replace with an index
+					ostringstream stm;
+					stm << seqsRead;
+					_refnames.back() = stm.str();
+				}
+			} else {
+				// This record didn't actually start a new sequence so
+				// no need to add a name
+				//assert_eq(0, _refnames.back().length())
+				_refnames.pop_back();
+			}
+			// Each record must agree with the sizing pass in 'szs'
+			assert_lt(szsi, szs.size());
+			assert_eq(rec.off, szs[szsi].off);
+			assert_eq(rec.len, szs[szsi].len);
+			assert_eq(rec.first, szs[szsi].first);
+			assert(rec.first || rec.off > 0);
+			ASSERT_ONLY(szsi++);
+			// Increment seqsRead if this is the first fragment
+			if(rec.first && rec.len > 0) seqsRead++;
+			if(bases == 0) continue;
+			assert_leq(bases, this->plen()[seqsRead-1]);
+			// Reset the patoff if this is the first fragment
+			if(rec.first) patoff = 0;
+			patoff += rec.off; // add fragment's offset from end of last frag.
+			// Adjust rpcps
+			//index_t seq = seqsRead-1;
+			ASSERT_ONLY(entsWritten++);
+			// This is where rstarts elements are written to the output stream
+			//writeU32(out1, oldRetLen, this->toBe()); // offset from beginning of joined string
+			//writeU32(out1, seq,       this->toBe()); // sequence id
+			//writeU32(out1, patoff,    this->toBe()); // offset into sequence
+			patoff += (index_t)bases;
+		}
+		assert_gt(szsi, 0);
+		l[i]->reset();
+		assert(!l[i]->eof());
+#ifndef NDEBUG
+		// Rewind and confirm the stream still starts with a FASTA '>'
+		int c = l[i]->get();
+		assert_eq('>', c);
+		assert(!l[i]->eof());
+		l[i]->reset();
+		assert(!l[i]->eof());
+#endif
+	}
+	assert_eq(entsWritten, this->_nFrag);
+}
+
+/**
+ * Build an Ebwt from a string 's' and its suffix array 'sa' (which
+ * might actually be a suffix array *builder* that builds blocks of the
+ * array on demand).  The bulk of the Ebwt, i.e. the ebwt and offs
+ * arrays, is written directly to disk.  This is by design: keeping
+ * those arrays in memory needlessly increases the footprint of the
+ * building process.  Instead, we prefer to build the Ebwt directly
+ * "to disk" and then read it back into memory later as necessary.
+ *
+ * It is assumed that the header values and join-related values (nPat,
+ * plen) have already been written to 'out1' before this function
+ * is called.  When this function is finished, it will have
+ * additionally written ebwt, zOff, fchr, ftab and eftab to the primary
+ * file and offs to the secondary file.
+ *
+ * Assume DNA/RNA/any alphabet with 4 or fewer elements.
+ * Assume occ array entries are 32 bits each.
+ *
+ * @param sa            the suffix array to convert to an Ebwt
+ * @param s             the original string
+ * @param out1          primary output stream (ebwt, zOff, fchr, ftab, eftab)
+ * @param out2          secondary output stream (offs)
+ */
+template <typename index_t>
+template <typename TStr>
+void Ebwt<index_t>::buildToDisk(
+                                InorderBlockwiseSA<TStr>& sa,
+                                const TStr& s,
+                                ostream& out1,
+                                ostream& out2,
+                                ostream* saOut,
+                                ostream* bwtOut,
+                                const EList<RefRecord>& szs,
+                                int kmer_size)
+{
+	const EbwtParams<index_t>& eh = this->_eh;
+
+	assert(eh.repOk());
+	assert_eq(s.length()+1, sa.size());
+	assert_eq(s.length(), eh._len);
+	assert_gt(eh._lineRate, 3);
+	assert(sa.suffixItrIsReset());
+
+	index_t  len = eh._len;
+	index_t  ftabLen = eh._ftabLen;
+	index_t  sideSz = eh._sideSz;
+	index_t  ebwtTotSz = eh._ebwtTotSz;
+	index_t  fchr[] = {0, 0, 0, 0, 0};
+	EList<index_t> ftab(EBWT_CAT);
+	index_t  zOff = (index_t)OFF_MASK;
+
+	// Save # of occurrences of each character as we walk along the bwt
+	index_t occ[4] = {0, 0, 0, 0};
+	index_t occSave[4] = {0, 0, 0, 0};
+
+	// Record rows that should "absorb" adjacent rows in the ftab.
+	// The absorbed rows represent suffixes shorter than the ftabChars
+	// cutoff.
+	uint8_t absorbCnt = 0;
+	EList<uint8_t> absorbFtab(EBWT_CAT);
+	try {
+		VMSG_NL("Allocating ftab, absorbFtab");
+		ftab.resize(ftabLen);
+		ftab.fillZero();
+		absorbFtab.resize(ftabLen);
+		absorbFtab.fillZero();
+	} catch(bad_alloc &e) {
+		cerr << "Out of memory allocating ftab[] or absorbFtab[] "
+		     << "in Ebwt::buildToDisk() at " << __FILE__ << ":"
+		     << __LINE__ << endl;
+		throw e;
+	}
+
+	// Allocate the side buffer; holds a single side as its being
+	// constructed and then written to disk.  Reused across all sides.
+#ifdef SIXTY4_FORMAT
+	EList<uint64_t> ebwtSide(EBWT_CAT);
+#else
+	EList<uint8_t> ebwtSide(EBWT_CAT);
+#endif
+	try {
+#ifdef SIXTY4_FORMAT
+		ebwtSide.resize(sideSz >> 3);
+#else
+		ebwtSide.resize(sideSz);
+#endif
+	} catch(bad_alloc &e) {
+		cerr << "Out of memory allocating ebwtSide[] in "
+		     << "Ebwt::buildToDisk() at " << __FILE__ << ":"
+		     << __LINE__ << endl;
+		throw e;
+	}
+
+	// Points to the base offset within ebwt for the side currently
+	// being written
+	index_t side = 0;
+
+	// Whether we're assembling a forward or a reverse bucket
+	bool fw;
+	int sideCur = 0;
+	fw = true;
+
+	// Have we skipped the '$' in the last column yet?
+	ASSERT_ONLY(bool dollarSkipped = false);
+
+	index_t si = 0;   // string offset (chars)
+	ASSERT_ONLY(index_t lastSufInt = 0);
+	ASSERT_ONLY(bool inSA = true); // true iff saI still points inside suffix
+	                               // array (as opposed to the padding at the
+	                               // end)
+	// Iterate over packed bwt bytes
+	VMSG_NL("Entering Ebwt loop");
+	ASSERT_ONLY(index_t beforeEbwtOff = (index_t)out1.tellp());
+
+    // First integer in the suffix-array output file is the length of the
+    // array, including $
+    if(saOut != NULL) {
+        // Write length word
+        writeIndex<index_t>(*saOut, len+1, this->toBe());
+    }
+
+    // First integer in the BWT output file is the length of BWT(T), including $
+    if(bwtOut != NULL) {
+        // Write length word
+        writeIndex<index_t>(*bwtOut, len+1, this->toBe());
+    }
+
+    // Count the number of distinct k-mers if kmer_size is non-zero
+    EList<uint8_t> kmer;
+    EList<size_t> kmer_count;
+    EList<size_t> acc_szs;
+    if(kmer_size > 0) {
+        kmer.resize(kmer_size);
+        kmer.fillZero();
+        kmer_count.resize(kmer_size);
+        kmer_count.fillZero();
+        // Build cumulative end offsets of the sequences in the joined
+        // string so a suffix-array element can be mapped back (via
+        // binary search) to the sequence it falls in.
+        for(size_t i = 0; i < szs.size(); i++) {
+            if(szs[i].first) {
+                size_t size = 0;
+                if(acc_szs.size() > 0) {
+                    size = acc_szs.back();
+                }
+                acc_szs.expand();
+                acc_szs.back() = size;
+            }
+            acc_szs.back() += szs[i].len;
+        }
+    }
+	while(side < ebwtTotSz) {
+		// Sanity-check our cursor into the side buffer
+		assert_geq(sideCur, 0);
+		assert_lt(sideCur, (int)eh._sideBwtSz);
+		assert_eq(0, side % sideSz); // 'side' must be on side boundary
+		ebwtSide[sideCur] = 0; // clear
+		assert_lt(side + sideCur, ebwtTotSz);
+		// Iterate over bit-pairs in the si'th character of the BWT
+#ifdef SIXTY4_FORMAT
+		for(int bpi = 0; bpi < 32; bpi++, si++)
+#else
+		for(int bpi = 0; bpi < 4; bpi++, si++)
+#endif
+		{
+			int bwtChar;
+			bool count = true;
+			if(si <= len) {
+				// Still in the SA; extract the bwtChar
+				index_t saElt = sa.nextSuffix();
+                if(saOut != NULL) {
+                    writeIndex<index_t>(*saOut, saElt, this->toBe());
+                }
+				// (that might have triggered sa to calc next suf block)
+				if(saElt == 0) {
+					// Don't add the '$' in the last column to the BWT
+					// transform; we can't encode a $ (only A C T or G)
+					// and counting it as, say, an A, will mess up the
+					// LR mapping
+					bwtChar = 0; count = false;
+					ASSERT_ONLY(dollarSkipped = true);
+					zOff = si; // remember the SA row that
+					           // corresponds to the 0th suffix
+				} else {
+					bwtChar = (int)(s[saElt-1]);
+					assert_lt(bwtChar, 4);
+					// Update the fchr
+					fchr[bwtChar]++;
+				}
+				// Update ftab
+				if((len-saElt) >= (index_t)eh._ftabChars) {
+					// Turn the first ftabChars characters of the
+					// suffix into an integer index into ftab.  The
+					// leftmost (lowest index) character of the suffix
+					// goes in the most significant bit pair of the
+					// integer.
+					index_t sufInt = 0;
+					for(int i = 0; i < eh._ftabChars; i++) {
+						sufInt <<= 2;
+						assert_lt((index_t)i, len-saElt);
+						sufInt |= (unsigned char)(s[saElt+i]);
+					}
+					// Assert that this prefix-of-suffix is greater
+					// than or equal to the last one (true b/c the
+					// suffix array is sorted)
+					#ifndef NDEBUG
+					if(lastSufInt > 0) assert_geq(sufInt, lastSufInt);
+					lastSufInt = sufInt;
+					#endif
+					// Update ftab
+					assert_lt(sufInt+1, ftabLen);
+					ftab[sufInt+1]++;
+					if(absorbCnt > 0) {
+						// Absorb all short suffixes since the last
+						// transition into this transition
+						absorbFtab[sufInt] = absorbCnt;
+						absorbCnt = 0;
+					}
+				} else {
+					// Otherwise if suffix is fewer than ftabChars
+					// characters long, then add it to the 'absorbCnt';
+					// it will be absorbed into the next transition
+					assert_lt(absorbCnt, 255);
+					absorbCnt++;
+				}
+                // Update the number of distinct k-mers.  Because suffixes
+                // arrive in sorted order, a k-mer is new exactly when its
+                // first k characters differ from the previous suffix's.
+                if(kmer_size > 0) {
+                    size_t idx = acc_szs.bsearchLoBound(saElt);
+                    assert_lt(idx, acc_szs.size());
+                    bool different = false;
+                    for(size_t k = 0; k < (size_t)kmer_size; k++) {
+                        if((acc_szs[idx]-saElt) > k) {
+                            uint8_t bp = s[saElt+k];
+                            if(kmer[k] != bp || kmer_count[k] <= 0 || different) {
+                                kmer_count[k]++;
+                                different = true;
+                            }
+                            kmer[k] = bp;
+                        }
+                        else {
+                            // Suffix ends inside this sequence; no longer
+                            // k-mers start here
+                            break;
+                        }
+                    }
+                }
+				// Suffix array offset boundary? - update offset array
+				if((si & eh._offMask) == si) {
+					assert_lt((si >> eh._offRate), eh._offsLen);
+					// Write offsets directly to the secondary output
+					// stream, thereby avoiding keeping them in memory
+                    index_t tidx = 0, toff = 0, tlen = 0;
+                    bool straddled2 = false;
+                    if(saElt > 0) {
+                        joinedToTextOff(
+                                        0,
+                                        saElt - 1,
+                                        tidx,
+                                        toff,
+                                        tlen,
+                                        false,        // reject straddlers?
+                                        straddled2);  // straddled?
+                    }
+                    if(this->_offw) {
+                        writeIndex<uint32_t>(out2, (uint32_t)tidx, this->toBe());
+                    } else {
+                        assert_lt(tidx, std::numeric_limits<uint16_t>::max());
+                        writeIndex<uint16_t>(out2, (uint16_t)tidx, this->toBe());
+                    }
+				}
+			} else {
+				// Strayed off the end of the SA, now we're just
+				// padding out a bucket
+				#ifndef NDEBUG
+				if(inSA) {
+					// Assert that we wrote all the characters in the
+					// string before now
+					assert_eq(si, len+1);
+					inSA = false;
+				}
+				#endif
+				// 'A' used for padding; important that padding be
+				// counted in the occ[] array
+				bwtChar = 0;
+			}
+			if(count) occ[bwtChar]++;
+			// Append BWT char to bwt section of current side
+			if(fw) {
+				// Forward bucket: fill from least to most
+#ifdef SIXTY4_FORMAT
+				ebwtSide[sideCur] |= ((uint64_t)bwtChar << (bpi << 1));
+				if(bwtChar > 0) assert_gt(ebwtSide[sideCur], 0);
+#else
+				pack_2b_in_8b(bwtChar, ebwtSide[sideCur], bpi);
+				assert_eq((ebwtSide[sideCur] >> (bpi*2)) & 3, bwtChar);
+#endif
+			} else {
+				// Backward bucket: fill from most to least
+#ifdef SIXTY4_FORMAT
+				ebwtSide[sideCur] |= ((uint64_t)bwtChar << ((31 - bpi) << 1));
+				if(bwtChar > 0) assert_gt(ebwtSide[sideCur], 0);
+#else
+				pack_2b_in_8b(bwtChar, ebwtSide[sideCur], 3-bpi);
+				assert_eq((ebwtSide[sideCur] >> ((3-bpi)*2)) & 3, bwtChar);
+#endif
+			}
+		} // end loop over bit-pairs
+		assert_eq(dollarSkipped ? 3 : 0, (occ[0] + occ[1] + occ[2] + occ[3]) & 3);
+#ifdef SIXTY4_FORMAT
+		assert_eq(0, si & 31);
+#else
+		assert_eq(0, si & 3);
+#endif
+
+		sideCur++;
+		if(sideCur == (int)eh._sideBwtSz) {
+			sideCur = 0;
+			index_t *uside = reinterpret_cast<index_t*>(ebwtSide.ptr());
+			// Write 'A', 'C', 'G' and 'T' tallies
+			side += sideSz;
+			assert_leq(side, eh._ebwtTotSz);
+			uside[(sideSz / sizeof(index_t))-4] = endianizeIndex(occSave[0], this->toBe());
+			uside[(sideSz / sizeof(index_t))-3] = endianizeIndex(occSave[1], this->toBe());
+			uside[(sideSz / sizeof(index_t))-2] = endianizeIndex(occSave[2], this->toBe());
+			uside[(sideSz / sizeof(index_t))-1] = endianizeIndex(occSave[3], this->toBe());
+			occSave[0] = occ[0];
+			occSave[1] = occ[1];
+			occSave[2] = occ[2];
+			occSave[3] = occ[3];
+			// Write backward side to primary file
+			out1.write((const char *)ebwtSide.ptr(), sideSz);
+		}
+	}
+	VMSG_NL("Exited Ebwt loop");
+	assert_neq(zOff, (index_t)OFF_MASK);
+	if(absorbCnt > 0) {
+		// Absorb any trailing, as-yet-unabsorbed short suffixes into
+		// the last element of ftab
+		absorbFtab[ftabLen-1] = absorbCnt;
+	}
+	// Assert that our loop counter got incremented right to the end
+	assert_eq(side, eh._ebwtTotSz);
+	// Assert that we wrote the expected amount to out1
+	assert_eq(((index_t)out1.tellp() - beforeEbwtOff), eh._ebwtTotSz);
+	// assert that the last thing we did was write a forward bucket
+
+	//
+	// Write zOff to primary stream
+	//
+	writeIndex<index_t>(out1, zOff, this->toBe());
+
+	//
+	// Finish building fchr
+	//
+	// Exclusive prefix sum on fchr
+	for(int i = 1; i < 4; i++) {
+		fchr[i] += fchr[i-1];
+	}
+	assert_eq(fchr[3], len);
+	// Shift everybody up by one
+	for(int i = 4; i >= 1; i--) {
+		fchr[i] = fchr[i-1];
+	}
+	fchr[0] = 0;
+	if(_verbose) {
+		for(int i = 0; i < 5; i++)
+			cout << "fchr[" << "ACGT$"[i] << "]: " << fchr[i] << endl;
+	}
+	// Write fchr to primary file
+	for(int i = 0; i < 5; i++) {
+		writeIndex<index_t>(out1, fchr[i], this->toBe());
+	}
+
+	//
+	// Finish building ftab and build eftab
+	//
+	// Prefix sum on ftable
+	index_t eftabLen = 0;
+	assert_eq(0, absorbFtab[0]);
+	for(index_t i = 1; i < ftabLen; i++) {
+		if(absorbFtab[i] > 0) eftabLen += 2;
+	}
+	assert_leq(eftabLen, (index_t)eh._ftabChars*2);
+	eftabLen = eh._ftabChars*2;
+	EList<index_t> eftab(EBWT_CAT);
+	try {
+		eftab.resize(eftabLen);
+		eftab.fillZero();
+	} catch(bad_alloc &e) {
+		cerr << "Out of memory allocating eftab[] "
+		     << "in Ebwt::buildToDisk() at " << __FILE__ << ":"
+		     << __LINE__ << endl;
+		throw e;
+	}
+	index_t eftabCur = 0;
+	for(index_t i = 1; i < ftabLen; i++) {
+		index_t lo = ftab[i] + Ebwt<index_t>::ftabHi(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, i-1);
+		if(absorbFtab[i] > 0) {
+			// Skip a number of short pattern indicated by absorbFtab[i]
+			index_t hi = lo + absorbFtab[i];
+			assert_lt(eftabCur*2+1, eftabLen);
+			eftab[eftabCur*2] = lo;
+			eftab[eftabCur*2+1] = hi;
+			ftab[i] = (eftabCur++) ^ (index_t)OFF_MASK; // insert pointer into eftab
+			assert_eq(lo, Ebwt<index_t>::ftabLo(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, i));
+			assert_eq(hi, Ebwt<index_t>::ftabHi(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, i));
+		} else {
+			ftab[i] = lo;
+		}
+	}
+	assert_eq(Ebwt<index_t>::ftabHi(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, ftabLen-1), len+1);
+	// Write ftab to primary file
+	for(index_t i = 0; i < ftabLen; i++) {
+		writeIndex<index_t>(out1, ftab[i], this->toBe());
+	}
+	// Write eftab to primary file
+	for(index_t i = 0; i < eftabLen; i++) {
+		writeIndex<index_t>(out1, eftab[i], this->toBe());
+	}
+
+    // Report distinct k-mer tallies, if they were requested
+    if(kmer_size > 0) {
+      for(size_t k = 0; k < (size_t)kmer_size; k++) {
+        cerr << "Number of distinct " << k+1 << "-mers is " << kmer_count[k] << endl;
+      }
+    }
+
+	// Note: if you'd like to sanity-check the Ebwt, you'll have to
+	// read it back into memory first!
+	assert(!isInMemory());
+	VMSG_NL("Exiting Ebwt::buildToDisk()");
+}
+
+/**
+ * Try to find the Bowtie index specified by the user.  First try the
+ * exact path given by the user.  Then try the user-provided string
+ * appended onto the path of the "indexes" subdirectory below this
+ * executable, then try the provided string appended onto
+ * "$BOWTIE2_INDEXES/".
+ */
+string adjustEbwtBase(const string& cmdline,
+					  const string& ebwtFileBase,
+					  bool verbose);
+
+
+extern string gLastIOErrMsg;
+
+/**
+ * Checks whether a call to read() failed.  Returns true (and records a
+ * human-readable message in gLastIOErrMsg) iff 'ret' indicates failure.
+ * 'fdesc' and 'count' mirror the read() call's arguments so call sites
+ * can pass everything through; they are not needed for the check itself.
+ */
+inline bool is_read_err(int fdesc, ssize_t ret, size_t count) {
+    (void)fdesc; // unused; kept so the signature mirrors read()
+    (void)count; // unused; kept so the signature mirrors read()
+    if (ret < 0) {
+        std::stringstream sstm;
+        sstm << "ERRNO: " << errno << " ERR Msg:" << strerror(errno) << std::endl;
+        gLastIOErrMsg = sstm.str();
+        return true;
+    }
+    return false;
+}
+
+/**
+ * Checks whether a call to fread() failed.  Returns true (and records a
+ * message in gLastIOErrMsg) iff the stream's error indicator is set.
+ * 'ret' and 'count' mirror the fread() call's arguments so call sites
+ * can pass everything through; only the stream is inspected.
+ */
+inline bool is_fread_err(FILE* file_hd, size_t ret, size_t count) {
+    (void)ret;   // unused; kept so the signature mirrors fread()
+    (void)count; // unused; kept so the signature mirrors fread()
+    if (ferror(file_hd)) {
+        gLastIOErrMsg = "Error Reading File!";
+        return true;
+    }
+    return false;
+}
+
+
+///////////////////////////////////////////////////////////////////////
+//
+// Functions for searching Ebwts
+// (But most of them are defined in the header)
+//
+///////////////////////////////////////////////////////////////////////
+
+/**
+ * Take an offset into the joined text and translate it into the
+ * reference of the index it falls on, the offset into the reference,
+ * and the length of the reference.  Use a binary search through the
+ * sorted list of reference fragment ranges to find the fragment containing it.
+ */
+template <typename index_t>
+void Ebwt<index_t>::joinedToTextOff(
+									index_t qlen,
+									index_t off,
+									index_t& tidx,
+									index_t& textoff,
+									index_t& tlen,
+									bool rejectStraddle,
+									bool& straddled) const
+{
+	assert(rstarts() != NULL); // must have loaded rstarts
+	// rstarts() stores 3 words per fragment, in joined-string order:
+	//   [elt*3+0]: fragment's start offset in the joined string
+	//   [elt*3+1]: index of the reference text the fragment came from
+	//   [elt*3+2]: fragment's start offset within that reference text
+	index_t top = 0;
+	index_t bot = _nFrag; // 1 greater than largest addressable element
+	index_t elt = (index_t)OFF_MASK;
+	// Begin binary search
+	while(true) {
+		ASSERT_ONLY(index_t oldelt = elt);
+		elt = top + ((bot - top) >> 1);
+		assert_neq(oldelt, elt); // must have made progress
+		// [lower, upper) is the joined-string range covered by fragment 'elt'
+		index_t lower = rstarts()[elt*3];
+		index_t upper;
+		if(elt == _nFrag-1) {
+			// Last fragment extends to the end of the joined string
+			upper = _eh._len;
+		} else {
+			upper = rstarts()[((elt+1)*3)];
+		}
+		assert_gt(upper, lower);
+		index_t fraglen = upper - lower;
+		if(lower <= off) {
+			if(upper > off) { // not last element, but it's within
+				// off is in this range; check if it falls off
+				if(off + qlen > upper) {
+					straddled = true;
+					if(rejectStraddle) {
+						// it falls off; signal no-go and return
+						tidx = (index_t)OFF_MASK;
+						assert_lt(elt, _nFrag-1);
+						return;
+					}
+				}
+				// This is the correct text idx whether the index is
+				// forward or reverse
+				tidx = rstarts()[(elt*3)+1];
+				assert_lt(tidx, this->_nPat);
+				assert_leq(fraglen, this->plen()[tidx]);
+				// it doesn't fall off; now calculate textoff.
+				// Initially it's the number of characters that precede
+				// the alignment in the fragment
+				index_t fragoff = off - rstarts()[(elt*3)];
+				if(!this->fw_) {
+					// Reverse index: flip the in-fragment offset to
+					// forward-strand coordinates at the hit's left end
+					fragoff = fraglen - fragoff - 1;
+					fragoff -= (qlen-1);
+				}
+				// Add the alignment's offset into the fragment
+				// ('fragoff') to the fragment's offset within the text
+				textoff = fragoff + rstarts()[(elt*3)+2];
+				assert_lt(textoff, this->plen()[tidx]);
+				break; // done with binary search
+			} else {
+				// 'off' belongs somewhere in the region between elt
+				// and bot
+				top = elt;
+			}
+		} else {
+			// 'off' belongs somewhere in the region between top and
+			// elt
+			bot = elt;
+		}
+		// continue with binary search
+	}
+	// Also report the full length of the text the hit falls in
+	tlen = this->plen()[tidx];
+}
+
+/**
+ * Walk 'steps' steps to the left and return the row arrived at.  If we
+ * walk through the dollar sign, return 0xffffffff.
+ */
+template <typename index_t>
+index_t Ebwt<index_t>::walkLeft(index_t row, index_t steps) const {
+#ifndef NDEBUG
+    if(this->_offw) {
+        assert(offsw() != NULL);
+    } else {
+        assert(offs() != NULL);
+    }
+#endif
+	assert_neq((index_t)OFF_MASK, row);
+	// Take 'steps' LF-mapping hops leftward from 'row'.  Landing on the
+	// '$' row means we walked off the start of the text; report OFF_MASK.
+	SideLocus<index_t> loc;
+	for(; steps > 0; steps--) {
+		loc.initFromRow(row, _eh, ebwt());
+		if(row == _zOff) return (index_t)OFF_MASK;
+		index_t prev = this->mapLF(loc ASSERT_ONLY(, false));
+		assert_neq((index_t)OFF_MASK, prev);
+		assert_neq(prev, row);
+		row = prev;
+	}
+	return row;
+}
+
+/**
+ * Resolve the reference offset of the BW element 'elt'.
+ */
+template <typename index_t>
+index_t Ebwt<index_t>::getOffset(index_t row) const {
+#ifndef NDEBUG
+    if(this->_offw) {
+        assert(offsw() != NULL);
+    } else {
+        assert(offs() != NULL);
+    }
+#endif
+	assert_neq((index_t)OFF_MASK, row);
+	// The '$' row corresponds to text offset 0 by definition
+	if(row == _zOff) return 0;
+	// If this row is in the sampled suffix array, read it directly
+	if((row & _eh._offMask) == row) {
+		index_t sampled = row >> _eh._offRate;
+		return this->_offw ? (index_t)this->offsw()[sampled]
+		                   : (index_t)this->offs()[sampled];
+	}
+	// Otherwise, hop leftward via LF mapping until we reach either the
+	// '$' row or a sampled row, counting the hops taken
+	SideLocus<index_t> loc;
+	loc.initFromRow(row, _eh, ebwt());
+	for(index_t jumps = 1; ; jumps++) {
+		index_t prev = this->mapLF(loc ASSERT_ONLY(, false));
+		assert_neq((index_t)OFF_MASK, prev);
+		assert_neq(prev, row);
+		row = prev;
+		if(row == _zOff) {
+			return jumps;
+		}
+		if((row & _eh._offMask) == row) {
+			index_t sampled = row >> _eh._offRate;
+			return jumps + (this->_offw ? (index_t)this->offsw()[sampled]
+			                            : (index_t)this->offs()[sampled]);
+		}
+		loc.initFromRow(row, _eh, ebwt());
+	}
+}
+
+/**
+ * Resolve the reference offset of the BW element 'elt' such that
+ * the offset returned is at the right-hand side of the forward
+ * reference substring involved in the hit.
+ */
+template <typename index_t>
+index_t Ebwt<index_t>::getOffset(
+								 index_t elt,
+								 bool fw,
+								 index_t hitlen) const
+{
+	index_t result = getOffset(elt);
+	assert_neq((index_t)OFF_MASK, result);
+	if(fw) {
+		return result;
+	}
+	// Reverse-strand hit: mirror the offset onto the forward strand and
+	// shift so it names the right-hand end of the hit's reference span
+	assert_lt(result, _eh._len);
+	result = _eh._len - result - 1;
+	assert_geq(result, hitlen-1);
+	result -= (hitlen-1);
+	assert_lt(result, _eh._len);
+	return result;
+}
+
+/**
+ * Returns true iff the index contains the given string (exactly).  The given
+ * string must contain only unambiguous characters.  TODO: support ambiguous
+ * characters in 'str'.
+ */
+template <typename index_t>
+bool Ebwt<index_t>::contains(
+							 const BTDnaString& str,
+							 index_t *otop,
+							 index_t *obot) const
+{
+	assert(isInMemory());
+	SideLocus<index_t> tloc, bloc;
+	// The empty string trivially occurs everywhere
+	if(str.empty()) {
+		if(otop != NULL && obot != NULL) *otop = *obot = 0;
+		return true;
+	}
+	// Seed the BW range [top, bot) with the query's last character
+	int c = str[str.length()-1];
+	assert_range(0, 4, c);
+	index_t top = 0, bot = 0;
+	if(c < 4) {
+		top = fchr()[c];
+		bot = fchr()[c+1];
+	} else {
+		// Last char is ambiguous ('N').  It can be resolved only if
+		// exactly one nucleotide class is non-empty in the index;
+		// otherwise more than one substitution is possible, so give up.
+		// (Bug fix: this loop previously indexed fchr() with 'c' (== 4)
+		// instead of 'i', reading fchr()[5] out of bounds and never
+		// examining the per-nucleotide ranges.)
+		bool set = false;
+		for(int i = 0; i < 4; i++) {
+			if(fchr()[i] < fchr()[i+1]) {
+				if(set) {
+					return false;
+				} else {
+					set = true;
+					top = fchr()[i];
+					bot = fchr()[i+1];
+				}
+			}
+		}
+	}
+	assert_geq(bot, top);
+	tloc.initFromRow(top, eh(), ebwt());
+	bloc.initFromRow(bot, eh(), ebwt());
+	ASSERT_ONLY(index_t lastDiff = bot - top);
+	// Walk the query right-to-left, narrowing [top, bot) by LF mapping
+	for(int64_t i = (int64_t)str.length()-2; i >= 0; i--) {
+		c = str[i];
+		assert_range(0, 4, c);
+		if(c <= 3) {
+			top = mapLF(tloc, c);
+			bot = mapLF(bloc, c);
+		} else {
+			index_t sz = bot - top;
+			int c1 = mapLF1(top, tloc ASSERT_ONLY(, false));
+			bot = mapLF(bloc, c1);
+			assert_leq(bot - top, sz);
+			if(bot - top < sz) {
+				// Encountered an N and could not proceed through it because
+				// there was more than one possible nucleotide we could replace
+				// it with
+				return false;
+			}
+		}
+		assert_geq(bot, top);
+		assert_leq(bot-top, lastDiff);
+		ASSERT_ONLY(lastDiff = bot-top);
+		if(i > 0) {
+			tloc.initFromRow(top, eh(), ebwt());
+			bloc.initFromRow(bot, eh(), ebwt());
+		}
+	}
+	// A non-empty final range means the query occurs in the index
+	if(otop != NULL && obot != NULL) {
+		*otop = top; *obot = bot;
+	}
+	return bot > top;
+}
+
+#endif /*EBWT_H_*/
diff --git a/bt2_io.h b/bt2_io.h
new file mode 100644
index 0000000..3d7fe77
--- /dev/null
+++ b/bt2_io.h
@@ -0,0 +1,1030 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef EBWT_IO_H_
+#define EBWT_IO_H_
+
+#include <string>
+#include <stdexcept>
+#include <iostream>
+#include <fstream>
+#include <stdlib.h>
+#include "bt2_idx.h"
+
+using namespace std;
+
+///////////////////////////////////////////////////////////////////////
+//
+// Functions for reading and writing Ebwts
+//
+///////////////////////////////////////////////////////////////////////
+
+/**
+ * Read an Ebwt from file with given filename.
+ */
+template <typename index_t>
+void Ebwt<index_t>::readIntoMemory(
+	int color,
+	int needEntireRev,
+	bool loadSASamp,
+	bool loadFtab,
+	bool loadRstarts,
+	bool justHeader,
+	EbwtParams<index_t> *params,
+	bool mmSweep,
+	bool loadNames,
+	bool startVerbose)
+{
+	bool switchEndian; // dummy; caller doesn't care
+#ifdef BOWTIE_MM
+	char *mmFile[] = { NULL, NULL };
+#endif
+	if(_in1Str.length() > 0) {
+		if(_verbose || startVerbose) {
+			cerr << "  About to open input files: ";
+			logTime(cerr);
+		}
+		// Initialize our primary and secondary input-stream fields
+		if(_in1 != NULL) fclose(_in1);
+		if(_verbose || startVerbose) cerr << "Opening \"" << _in1Str.c_str() << "\"" << endl;
+		if((_in1 = fopen(_in1Str.c_str(), "rb")) == NULL) {
+			cerr << "Could not open index file " << _in1Str.c_str() << endl;
+		}
+		if(loadSASamp) {
+			if(_in2 != NULL) fclose(_in2);
+			if(_verbose || startVerbose) cerr << "Opening \"" << _in2Str.c_str() << "\"" << endl;
+			if((_in2 = fopen(_in2Str.c_str(), "rb")) == NULL) {
+				cerr << "Could not open index file " << _in2Str.c_str() << endl;
+			}
+		}
+		if(_verbose || startVerbose) {
+			cerr << "  Finished opening input files: ";
+			logTime(cerr);
+		}
+		
+#ifdef BOWTIE_MM
+		if(_useMm /*&& !justHeader*/) {
+			const char *names[] = {_in1Str.c_str(), _in2Str.c_str()};
+			int fds[] = { fileno(_in1), fileno(_in2) };
+			for(int i = 0; i < (loadSASamp ? 2 : 1); i++) {
+				if(_verbose || startVerbose) {
+					cerr << "  Memory-mapping input file " << (i+1) << ": ";
+					logTime(cerr);
+				}
+				struct stat sbuf;
+				if (stat(names[i], &sbuf) == -1) {
+					perror("stat");
+					cerr << "Error: Could not stat index file " << names[i] << " prior to memory-mapping" << endl;
+					throw 1;
+				}
+				mmFile[i] = (char*)mmap((void *)0, (size_t)sbuf.st_size,
+										PROT_READ, MAP_SHARED, fds[(size_t)i], 0);
+				if(mmFile[i] == (void *)(-1)) {
+					perror("mmap");
+					cerr << "Error: Could not memory-map the index file " << names[i] << endl;
+					throw 1;
+				}
+				if(mmSweep) {
+					int sum = 0;
+					for(off_t j = 0; j < sbuf.st_size; j += 1024) {
+						sum += (int) mmFile[i][j];
+					}
+					if(startVerbose) {
+						cerr << "  Swept the memory-mapped ebwt index file 1; checksum: " << sum << ": ";
+						logTime(cerr);
+					}
+				}
+			}
+			mmFile1_ = mmFile[0];
+			mmFile2_ = loadSASamp ? mmFile[1] : NULL;
+		}
+#endif
+	}
+#ifdef BOWTIE_MM
+	else if(_useMm && !justHeader) {
+		mmFile[0] = mmFile1_;
+		mmFile[1] = mmFile2_;
+	}
+	if(_useMm && !justHeader) {
+		assert(mmFile[0] == mmFile1_);
+		assert(mmFile[1] == mmFile2_);
+	}
+#endif
+	
+	if(_verbose || startVerbose) {
+		cerr << "  Reading header: ";
+		logTime(cerr);
+	}
+	
+	// Read endianness hints from both streams
+	size_t bytesRead = 0;
+	switchEndian = false;
+	uint32_t one = readU32(_in1, switchEndian); // 1st word of primary stream
+	bytesRead += 4;
+	if(loadSASamp) {
+#ifndef NDEBUG
+		assert_eq(one, readU32(_in2, switchEndian)); // should match!
+#else
+		readU32(_in2, switchEndian);
+#endif
+	}
+	if(one != 1) {
+		assert_eq((1u<<24), one);
+		assert_eq(1, endianSwapU32(one));
+		switchEndian = true;
+	}
+	
+	// Can't switch endianness and use memory-mapped files; in order to
+	// support this, someone has to modify the file to switch
+	// endiannesses appropriately, and we can't do this inside Bowtie
+	// or we might be setting up a race condition with other processes.
+	if(switchEndian && _useMm) {
+		cerr << "Error: Can't use memory-mapped files when the index is the opposite endianness" << endl;
+		throw 1;
+	}
+	
+	// Reads header entries one by one from primary stream
+	index_t len          = readIndex<index_t>(_in1, switchEndian);
+	bytesRead += sizeof(index_t);
+	int32_t  lineRate     = readI32(_in1, switchEndian);
+	bytesRead += 4;
+	/*int32_t  linesPerSide =*/ readI32(_in1, switchEndian);
+	bytesRead += 4;
+	int32_t  offRate      = readI32(_in1, switchEndian);
+	bytesRead += 4;
+	// TODO: add isaRate to the actual file format (right now, the
+	// user has to tell us whether there's an ISA sample and what the
+	// sampling rate is.
+	int32_t  ftabChars    = readI32(_in1, switchEndian);
+	bytesRead += 4;
+	// chunkRate was deprecated in an earlier version of Bowtie; now
+	// we use it to hold flags.
+	int32_t flags = readI32(_in1, switchEndian);
+	bool entireRev = false;
+	if(flags < 0 && (((-flags) & EBWT_COLOR) != 0)) {
+		if(color != -1 && !color) {
+			cerr << "Error: -C was not specified when running bowtie, but index is in colorspace.  If" << endl
+			     << "your reads are in colorspace, please use the -C option.  If your reads are not" << endl
+			     << "in colorspace, please use a normal index (one built without specifying -C to" << endl
+			     << "bowtie-build)." << endl;
+			throw 1;
+		}
+		color = 1;
+	} else if(flags < 0) {
+		if(color != -1 && color) {
+			cerr << "Error: -C was specified when running bowtie, but index is not in colorspace.  If" << endl
+			     << "your reads are in colorspace, please use a colorspace index (one built using" << endl
+			     << "bowtie-build -C).  If your reads are not in colorspace, don't specify -C when" << endl
+			     << "running bowtie." << endl;
+			throw 1;
+		}
+		color = 0;
+	}
+	if(flags < 0 && (((-flags) & EBWT_ENTIRE_REV) == 0)) {
+		if(needEntireRev != -1 && needEntireRev != 0) {
+			cerr << "Error: This index is compatible with 0.* versions of Bowtie, but not with 2.*" << endl
+			     << "versions.  Please build or download a version of the index that is compitble" << endl
+				 << "with Bowtie 2.* (i.e. built with bowtie-build 2.* or later)" << endl;
+			throw 1;
+		}
+	} else entireRev = true;
+	bytesRead += 4;
+	
+	// Create a new EbwtParams from the entries read from primary stream
+	EbwtParams<index_t> *eh;
+	bool deleteEh = false;
+	if(params != NULL) {
+		params->init(len, lineRate, offRate, ftabChars, color, entireRev);
+		if(_verbose || startVerbose) params->print(cerr);
+		eh = params;
+	} else {
+		eh = new EbwtParams<index_t>(len, lineRate, offRate, ftabChars, color, entireRev);
+		deleteEh = true;
+	}
+	
+	// Set up overridden suffix-array-sample parameters
+	index_t offsLen = eh->_offsLen;
+    // uint64_t offsSz = eh->_offsSz;
+	index_t offRateDiff = 0;
+	index_t offsLenSampled = offsLen;
+	if(_overrideOffRate > offRate) {
+		offRateDiff = _overrideOffRate - offRate;
+	}
+	if(offRateDiff > 0) {
+		offsLenSampled >>= offRateDiff;
+		if((offsLen & ~((index_t)OFF_MASK << offRateDiff)) != 0) {
+			offsLenSampled++;
+		}
+	}
+	
+	// Can't override the offrate or isarate and use memory-mapped
+	// files; ultimately, all processes need to copy the sparser sample
+	// into their own memory spaces.
+	if(_useMm && (offRateDiff)) {
+		cerr << "Error: Can't use memory-mapped files when the offrate is overridden" << endl;
+		throw 1;
+	}
+	
+	// Read nPat from primary stream
+	this->_nPat = readIndex<index_t>(_in1, switchEndian);
+	bytesRead += sizeof(index_t);
+	_plen.reset();
+	// Read plen from primary stream
+	if(_useMm) {
+#ifdef BOWTIE_MM
+		_plen.init((index_t*)(mmFile[0] + bytesRead), _nPat, false);
+		bytesRead += _nPat*sizeof(index_t);
+		fseek(_in1, _nPat*sizeof(index_t), SEEK_CUR);
+#endif
+	} else {
+		try {
+			if(_verbose || startVerbose) {
+				cerr << "Reading plen (" << this->_nPat << "): ";
+				logTime(cerr);
+			}
+			_plen.init(new index_t[_nPat], _nPat, true);
+			if(switchEndian) {
+				for(index_t i = 0; i < this->_nPat; i++) {
+					plen()[i] = readIndex<index_t>(_in1, switchEndian);
+				}
+			} else {
+				size_t r = MM_READ(_in1, (void*)(plen()), _nPat*sizeof(index_t));
+				if(r != (size_t)(_nPat*sizeof(index_t))) {
+					cerr << "Error reading _plen[] array: " << r << ", " << _nPat*sizeof(index_t) << endl;
+					throw 1;
+				}
+			}
+		} catch(bad_alloc& e) {
+			cerr << "Out of memory allocating plen[] in Ebwt::read()"
+			<< " at " << __FILE__ << ":" << __LINE__ << endl;
+			throw e;
+		}
+	}
+    
+    this->_offw = this->_nPat > std::numeric_limits<uint16_t>::max();
+	
+	bool shmemLeader;
+    size_t OFFSET_SIZE;
+	
+	// TODO: I'm not consistent on what "header" means.  Here I'm using
+	// "header" to mean everything that would exist in memory if we
+	// started to build the Ebwt but stopped short of the build*() step
+	// (i.e. everything up to and including join()).
+	if(justHeader) goto done;
+	
+	this->_nFrag = readIndex<index_t>(_in1, switchEndian);
+	bytesRead += sizeof(index_t);
+	if(_verbose || startVerbose) {
+		cerr << "Reading rstarts (" << this->_nFrag*3 << "): ";
+		logTime(cerr);
+	}
+	assert_geq(this->_nFrag, this->_nPat);
+	_rstarts.reset();
+	if(loadRstarts) {
+		if(_useMm) {
+#ifdef BOWTIE_MM
+			_rstarts.init((index_t*)(mmFile[0] + bytesRead), _nFrag*3, false);
+			bytesRead += this->_nFrag*sizeof(index_t)*3;
+			fseek(_in1, this->_nFrag*sizeof(index_t)*3, SEEK_CUR);
+#endif
+		} else {
+			_rstarts.init(new index_t[_nFrag*3], _nFrag*3, true);
+			if(switchEndian) {
+				for(size_t i = 0; i < (size_t)(this->_nFrag*3); i += 3) {
+					// fragment starting position in joined reference
+					// string, text id, and fragment offset within text
+					this->rstarts()[i]   = readIndex<index_t>(_in1, switchEndian);
+					this->rstarts()[i+1] = readIndex<index_t>(_in1, switchEndian);
+					this->rstarts()[i+2] = readIndex<index_t>(_in1, switchEndian);
+				}
+			} else {
+				size_t r = MM_READ(_in1, (void *)rstarts(), this->_nFrag*sizeof(index_t)*3);
+				if(r != (size_t)(this->_nFrag*sizeof(index_t)*3)) {
+					cerr << "Error reading _rstarts[] array: " << r << ", " << (this->_nFrag*sizeof(index_t)*3) << endl;
+					throw 1;
+				}
+			}
+		}
+	} else {
+		// Skip em
+		assert(rstarts() == NULL);
+		bytesRead += this->_nFrag*sizeof(index_t)*3;
+		fseek(_in1, this->_nFrag*sizeof(index_t)*3, SEEK_CUR);
+	}
+	
+	_ebwt.reset();
+	if(_useMm) {
+#ifdef BOWTIE_MM
+		_ebwt.init((uint8_t*)(mmFile[0] + bytesRead), eh->_ebwtTotLen, false);
+		bytesRead += eh->_ebwtTotLen;
+		fseek(_in1, eh->_ebwtTotLen, SEEK_CUR);
+#endif
+	} else {
+		// Allocate ebwt (big allocation)
+		if(_verbose || startVerbose) {
+			cerr << "Reading ebwt (" << eh->_ebwtTotLen << "): ";
+			logTime(cerr);
+		}
+		bool shmemLeader = true;
+		if(useShmem_) {
+			uint8_t *tmp = NULL;
+			shmemLeader = ALLOC_SHARED_U8(
+				(_in1Str + "[ebwt]"), eh->_ebwtTotLen, &tmp,
+				"ebwt[]", (_verbose || startVerbose));
+			assert(tmp != NULL);
+			_ebwt.init(tmp, eh->_ebwtTotLen, false);
+			if(_verbose || startVerbose) {
+				cerr << "  shared-mem " << (shmemLeader ? "leader" : "follower") << endl;
+			}
+		} else {
+			try {
+				_ebwt.init(new uint8_t[eh->_ebwtTotLen], eh->_ebwtTotLen, true);
+			} catch(bad_alloc& e) {
+				cerr << "Out of memory allocating the ebwt[] array for the Bowtie index.  Please try" << endl
+				<< "again on a computer with more memory." << endl;
+				throw 1;
+			}
+		}
+		if(shmemLeader) {
+			// Read ebwt from primary stream
+			uint64_t bytesLeft = eh->_ebwtTotLen;
+			char *pebwt = (char*)this->ebwt();
+            while (bytesLeft>0){
+				size_t r = MM_READ(this->_in1, (void *)pebwt, bytesLeft);
+				if(MM_IS_IO_ERR(this->_in1, r, bytesLeft)) {
+					cerr << "Error reading _ebwt[] array: " << r << ", "
+                    << bytesLeft << endl;
+					throw 1;
+				}
+				pebwt += r;
+				bytesLeft -= r;
+			}
+			if(switchEndian) {
+				uint8_t *side = this->ebwt();
+				for(size_t i = 0; i < eh->_numSides; i++) {
+					index_t *cums = reinterpret_cast<index_t*>(side + eh->_sideSz - sizeof(index_t)*2);
+					cums[0] = endianSwapIndex(cums[0]);
+					cums[1] = endianSwapIndex(cums[1]);
+					side += this->_eh._sideSz;
+				}
+			}
+#ifdef BOWTIE_SHARED_MEM
+			if(useShmem_) NOTIFY_SHARED(ebwt(), eh->_ebwtTotLen);
+#endif
+		} else {
+			// Seek past the data and wait until master is finished
+			fseek(_in1, eh->_ebwtTotLen, SEEK_CUR);
+#ifdef BOWTIE_SHARED_MEM
+			if(useShmem_) WAIT_SHARED(ebwt(), eh->_ebwtTotLen);
+#endif
+		}
+	}
+	
+	// Read zOff from primary stream
+	_zOff = readIndex<index_t>(_in1, switchEndian);
+	bytesRead += sizeof(index_t);
+	assert_lt(_zOff, len);
+	
+	try {
+		// Read fchr from primary stream
+		if(_verbose || startVerbose) cerr << "Reading fchr (5)" << endl;
+		_fchr.reset();
+		if(_useMm) {
+#ifdef BOWTIE_MM
+			_fchr.init((index_t*)(mmFile[0] + bytesRead), 5, false);
+			bytesRead += 5*sizeof(index_t);
+			fseek(_in1, 5*sizeof(index_t), SEEK_CUR);
+#endif
+		} else {
+			_fchr.init(new index_t[5], 5, true);
+			for(int i = 0; i < 5; i++) {
+				this->fchr()[i] = readIndex<index_t>(_in1, switchEndian);
+				assert_leq(this->fchr()[i], len);
+				assert(i <= 0 || this->fchr()[i] >= this->fchr()[i-1]);
+			}
+		}
+		assert_gt(this->fchr()[4], this->fchr()[0]);
+		// Read ftab from primary stream
+		if(_verbose || startVerbose) {
+			if(loadFtab) {
+				cerr << "Reading ftab (" << eh->_ftabLen << "): ";
+				logTime(cerr);
+			} else {
+				cerr << "Skipping ftab (" << eh->_ftabLen << "): ";
+			}
+		}
+		_ftab.reset();
+		if(loadFtab) {
+			if(_useMm) {
+#ifdef BOWTIE_MM
+				_ftab.init((index_t*)(mmFile[0] + bytesRead), eh->_ftabLen, false);
+				bytesRead += eh->_ftabLen*sizeof(index_t);
+				fseek(_in1, eh->_ftabLen*sizeof(index_t), SEEK_CUR);
+#endif
+			} else {
+				_ftab.init(new index_t[eh->_ftabLen], eh->_ftabLen, true);
+				if(switchEndian) {
+					for(size_t i = 0; i < eh->_ftabLen; i++)
+						this->ftab()[i] = readIndex<index_t>(_in1, switchEndian);
+				} else {
+					size_t r = MM_READ(_in1, (void *)ftab(), eh->_ftabLen*sizeof(index_t));
+					if(r != (size_t)(eh->_ftabLen*sizeof(index_t))) {
+						cerr << "Error reading _ftab[] array: " << r << ", " << (eh->_ftabLen*sizeof(index_t)) << endl;
+						throw 1;
+					}
+				}
+			}
+			// Read etab from primary stream
+			if(_verbose || startVerbose) {
+				if(loadFtab) {
+					cerr << "Reading eftab (" << eh->_eftabLen << "): ";
+					logTime(cerr);
+				} else {
+					cerr << "Skipping eftab (" << eh->_eftabLen << "): ";
+				}
+
+			}
+			_eftab.reset();
+			if(_useMm) {
+#ifdef BOWTIE_MM
+				_eftab.init((index_t*)(mmFile[0] + bytesRead), eh->_eftabLen, false);
+				bytesRead += eh->_eftabLen*sizeof(index_t);
+				fseek(_in1, eh->_eftabLen*sizeof(index_t), SEEK_CUR);
+#endif
+			} else {
+				_eftab.init(new index_t[eh->_eftabLen], eh->_eftabLen, true);
+				if(switchEndian) {
+					for(size_t i = 0; i < eh->_eftabLen; i++)
+						this->eftab()[i] = readIndex<index_t>(_in1, switchEndian);
+				} else {
+					size_t r = MM_READ(_in1, (void *)this->eftab(), eh->_eftabLen*sizeof(index_t));
+					if(r != (size_t)(eh->_eftabLen*sizeof(index_t))) {
+						cerr << "Error reading _eftab[] array: " << r << ", " << (eh->_eftabLen*sizeof(index_t)) << endl;
+						throw 1;
+					}
+				}
+			}
+			for(index_t i = 0; i < eh->_eftabLen; i++) {
+				if(i > 0 && this->eftab()[i] > 0) {
+					assert_geq(this->eftab()[i], this->eftab()[i-1]);
+				} else if(i > 0 && this->eftab()[i-1] == 0) {
+					assert_eq(0, this->eftab()[i]);
+				}
+			}
+		} else {
+			assert(ftab() == NULL);
+			assert(eftab() == NULL);
+			// Skip ftab
+			bytesRead += eh->_ftabLen*sizeof(index_t);
+			fseek(_in1, eh->_ftabLen*sizeof(index_t), SEEK_CUR);
+			// Skip eftab
+			bytesRead += eh->_eftabLen*sizeof(index_t);
+			fseek(_in1, eh->_eftabLen*sizeof(index_t), SEEK_CUR);
+		}
+	} catch(bad_alloc& e) {
+		cerr << "Out of memory allocating fchr[], ftab[] or eftab[] arrays for the Bowtie index." << endl
+		<< "Please try again on a computer with more memory." << endl;
+		throw 1;
+	}
+	
+	// Read reference sequence names from primary index file (or not,
+	// if --refidx is specified)
+	if(loadNames) {
+		while(true) {
+			char c = '\0';
+			if(MM_READ(_in1, (void *)(&c), (size_t)1) != (size_t)1) break;
+			bytesRead++;
+			if(c == '\0') break;
+			else if(c == '\n') {
+				this->_refnames.push_back("");
+			} else {
+				if(this->_refnames.size() == 0) {
+					this->_refnames.push_back("");
+				}
+				this->_refnames.back().push_back(c);
+			}
+		}
+        if(this->_refnames.back().empty()) {
+            this->_refnames.pop_back();
+        }
+	}
+	
+    OFFSET_SIZE = (this->_offw ? 4 : 2);
+	_offs.reset();
+    _offsw.reset();
+    if(loadSASamp) {
+        bytesRead = 4; // reset for secondary index file (already read 1-sentinel)
+        
+        shmemLeader = true;
+        if(_verbose || startVerbose) {
+            cerr << "Reading offs (" << offsLenSampled << " " << std::setw(2) << sizeof(index_t)*8 << "-bit words): ";
+            logTime(cerr);
+        }
+        
+        if(!_useMm) {
+            if(!useShmem_) {
+                // Allocate offs_
+                try {
+                    if(this->_offw) {
+                        _offsw.init(new uint32_t[offsLenSampled], offsLenSampled, true);
+                    } else {
+                        _offs.init(new uint16_t[offsLenSampled], offsLenSampled, true);
+                    }
+                } catch(bad_alloc& e) {
+                    cerr << "Out of memory allocating the offs[] array  for the Bowtie index." << endl
+					<< "Please try again on a computer with more memory." << endl;
+					throw 1;
+				}
+			} else {
+                if(this->_offw) {
+                    uint32_t *tmp = NULL;
+                    shmemLeader = ALLOC_SHARED_U32(
+                                                   (_in2Str + "[offs]"), offsLenSampled*OFFSET_SIZE, &tmp,
+                                                   "offs", (_verbose || startVerbose));
+                    _offsw.init((uint32_t*)tmp, offsLenSampled, false);
+                } else {
+                    uint16_t *tmp = NULL;
+                    shmemLeader = ALLOC_SHARED_U32(
+                                                   (_in2Str + "[offs]"), offsLenSampled*OFFSET_SIZE, &tmp,
+                                                   "offs", (_verbose || startVerbose));
+                    _offs.init((uint16_t*)tmp, offsLenSampled, false);
+                }
+			}
+		}
+        
+        if(_overrideOffRate < 32) {
+            if(shmemLeader) {
+                // Allocate offs (big allocation)
+                if(switchEndian || offRateDiff > 0) {
+                    assert(!_useMm);
+                    const index_t blockMaxSz = (index_t)(2 * 1024 * 1024); // 2 MB block size
+                    const index_t blockMaxSzU = (blockMaxSz / OFFSET_SIZE); // # U32s per block
+                    char *buf;
+                    try {
+                        buf = new char[blockMaxSz];
+                    } catch(std::bad_alloc& e) {
+                        cerr << "Error: Out of memory allocating part of _offs array: '" << e.what() << "'" << endl;
+                        throw e;
+                    }
+                    for(index_t i = 0; i < offsLen; i += blockMaxSzU) {
+                        index_t block = min<index_t>((index_t)blockMaxSzU, (index_t)(offsLen - i));
+                        size_t r = MM_READ(_in2, (void *)buf, block * OFFSET_SIZE);
+                        if(r != (size_t)(block * OFFSET_SIZE)) {
+                            cerr << "Error reading block of _offs[] array: " << r << ", " << (block * OFFSET_SIZE) << endl;
+                            throw 1;
+                        }
+                        index_t idx = i >> 1;
+                        for(index_t j = 0; j < block; j += OFFSET_SIZE) {
+                            assert_lt(idx, offsLenSampled);
+                            if(this->_offw) {
+                                this->offsw()[idx] = ((uint32_t*)buf)[j];
+                                if(switchEndian) {
+                                    this->offsw()[idx] = endianSwapIndex((uint32_t)this->offs()[idx]);
+                                }
+                            } else {
+                                this->offs()[idx] = ((uint16_t*)buf)[j];
+                                if(switchEndian) {
+                                    this->offs()[idx] = endianSwapIndex((uint16_t)this->offs()[idx]);
+                                }
+                            }
+                            idx++;
+                        }
+                    }
+                    delete[] buf;
+                } else {
+                    if(_useMm) {
+#ifdef BOWTIE_MM
+                        if(this->_offw) {
+                            _offsw.init((uint32_t*)(mmFile[1] + bytesRead), offsLen, false);
+                        } else {
+                            _offs.init((uint16_t*)(mmFile[1] + bytesRead), offsLen, false);
+                        }
+                        bytesRead += (offsLen * OFFSET_SIZE);
+                        fseek(_in2, (offsLen * OFFSET_SIZE), SEEK_CUR);
+#endif
+                    } else {
+                        // Workaround for small-index mode where MM_READ may
+                        // not be able to handle read amounts greater than 2^32
+                        // bytes.
+                        uint64_t bytesLeft = (offsLen * OFFSET_SIZE);
+                        char *offs = NULL;
+                        if(this->_offw) {
+                            offs = (char *)this->offsw();
+                        } else {
+                            offs = (char *)this->offs();
+                        }
+                        while(bytesLeft > 0) {
+                            size_t r = MM_READ(_in2, (void*)offs, bytesLeft);
+                            if(MM_IS_IO_ERR(_in2, r, bytesLeft)) {
+                                cerr << "Error reading block of _offs[] array: "
+                                << r << ", " << bytesLeft << gLastIOErrMsg << endl;
+                                throw 1;
+                            }
+                            offs += r;
+                            bytesLeft -= r;
+                        }
+                    }
+                }
+#ifdef BOWTIE_SHARED_MEM
+                if(useShmem_) {
+                    if(this->_offw) {
+                        NOTIFY_SHARED(offsw(), offsLenSampled*OFFSET_SIZE);
+                    } else {
+                        NOTIFY_SHARED(offs(), offsLenSampled*OFFSET_SIZE);
+                    }
+                }
+#endif
+            } else {
+                // Not the shmem leader
+				fseek(_in2, offsLenSampled*OFFSET_SIZE, SEEK_CUR);
+#ifdef BOWTIE_SHARED_MEM
+                if(this->_offw) {
+                    NOTIFY_SHARED(offsw(), offsLenSampled*OFFSET_SIZE);
+                } else {                    
+                    NOTIFY_SHARED(offs(), offsLenSampled*OFFSET_SIZE);
+                }
+#endif
+            }
+        }
+    }
+    
+    this->postReadInit(*eh); // Initialize fields of Ebwt not read from file
+    if(_verbose || startVerbose) print(cerr, *eh);
+    
+    // The fact that _ebwt and friends actually point to something
+    // (other than NULL) now signals to other member functions that the
+    // Ebwt is loaded into memory.
+    
+done: // Exit hatch for both justHeader and !justHeader
+	
+	// Be kind
+	if(deleteEh) delete eh;
+#ifdef BOWTIE_MM
+	if(_in1 != NULL) fseek(_in1, 0, SEEK_SET);
+	if(_in2 != NULL) fseek(_in2, 0, SEEK_SET);
+#else
+	if(_in1 != NULL) rewind(_in1);
+    if(_in2 != NULL) rewind(_in2);
+#endif
+}
+
+/**
+ * Read reference names from an input stream 'in' for an Ebwt primary
+ * file and store them in 'refnames'.
+ *
+ * Parses just enough of the primary-file header to compute the sizes
+ * of the sections that precede the name list, seeks past those
+ * sections, then reads the NUL-terminated, newline-separated list of
+ * names.  Rewinds 'in' to the beginning before returning.
+ */
+template <typename index_t>
+void readEbwtRefnames(istream& in, EList<string>& refnames) {
+	// _in1 must already be open with the get cursor at the
+	// beginning and no error flags set.
+	assert(in.good());
+	assert_eq((streamoff)in.tellg(), ios::beg);
+	
+	// Read endianness hints from both streams
+	bool switchEndian = false;
+	uint32_t one = readU32(in, switchEndian); // 1st word of primary stream
+	if(one != 1) {
+		// A byte-swapped 1 means the index was built on a machine of
+		// the opposite endianness; swap every multi-byte read below
+		assert_eq((1u<<24), one);
+		switchEndian = true;
+	}
+	
+	// Reads header entries one by one from primary stream
+	index_t len          = readIndex<index_t>(in, switchEndian);
+	int32_t  lineRate     = readI32(in, switchEndian);
+	/*int32_t  linesPerSide =*/ readI32(in, switchEndian);
+	int32_t  offRate      = readI32(in, switchEndian);
+	int32_t  ftabChars    = readI32(in, switchEndian);
+	// BTL: chunkRate is now deprecated
+	int32_t flags = readI32(in, switchEndian);
+	bool color = false;
+	bool entireReverse = false;
+	if(flags < 0) {
+		// Negative flags word: the magnitude carries option bits
+		color = (((-flags) & EBWT_COLOR) != 0);
+		entireReverse = (((-flags) & EBWT_ENTIRE_REV) != 0);
+	}
+	
+	// Create a new EbwtParams from the entries read from primary stream.
+	// Only its derived section lengths (_ebwtTotLen, _ftabLen,
+	// _eftabLen) are used here, to compute how far to seek.
+	EbwtParams<index_t> eh(len, lineRate, offRate, ftabChars, color, entireReverse);
+	
+	index_t nPat = readIndex<index_t>(in, switchEndian); // nPat
+	in.seekg(nPat*sizeof(index_t), ios_base::cur); // skip plen
+	
+	// Skip rstarts
+	index_t nFrag = readIndex<index_t>(in, switchEndian);
+	in.seekg(nFrag*sizeof(index_t)*3, ios_base::cur);
+	
+	// Skip ebwt
+	in.seekg(eh._ebwtTotLen, ios_base::cur);
+	
+	// Skip zOff from primary stream
+	readIndex<index_t>(in, switchEndian);
+	
+	// Skip fchr
+	in.seekg(5 * sizeof(index_t), ios_base::cur);
+	
+	// Skip ftab
+	in.seekg(eh._ftabLen*sizeof(index_t), ios_base::cur);
+	
+	// Skip eftab
+	in.seekg(eh._eftabLen*sizeof(index_t), ios_base::cur);
+	
+	// Read reference sequence names from primary index file.  Names
+	// are separated by '\n'; the list ends at '\0' or EOF.
+	while(true) {
+		char c = '\0';
+		in.read(&c, 1);
+		if(in.eof()) break;
+		if(c == '\0') break;
+		else if(c == '\n') {
+			refnames.push_back("");
+		} else {
+			if(refnames.size() == 0) {
+				refnames.push_back("");
+			}
+			refnames.back().push_back(c);
+		}
+	}
+	// Drop a trailing empty entry left by a final '\n'.
+	// NOTE(review): back() assumes at least one name was read; an
+	// index with an empty name list would misbehave here -- TODO confirm
+	if(refnames.back().empty()) {
+		refnames.pop_back();
+	}
+	
+	// Be kind
+	in.clear(); in.seekg(0, ios::beg);
+	assert(in.good());
+}
+
+/**
+ * Read reference names from the index with basename 'in' and store
+ * them in 'refnames'.
+ *
+ * Opens the primary index file and delegates to the stream-based
+ * overload of readEbwtRefnames().
+ */
+template <typename index_t>
+void readEbwtRefnames(const string& instr, EList<string>& refnames) {
+	// Primary index file is "<basename>.1.<extension>"
+	const string fname = instr + ".1." + gEbwt_ext;
+	ifstream in(fname.c_str(), ios_base::in | ios::binary);
+	if(!in.is_open()) {
+		throw EbwtFileOpenException("Cannot open file " + instr);
+	}
+	assert(in.is_open());
+	assert(in.good());
+	assert_eq((streamoff)in.tellg(), ios::beg);
+	readEbwtRefnames<index_t>(in, refnames);
+}
+
+/**
+ * Read just enough of the Ebwt's header to get its flags
+ *
+ * Opens the primary index file ("<instr>.1.<ext>"), reads past the
+ * fixed-size header fields that precede the flags word, and returns
+ * the raw (possibly negative) flags value.
+ *
+ * @throws EbwtFileOpenException if the primary file cannot be opened
+ */
+template <typename index_t>
+int32_t Ebwt<index_t>::readFlags(const string& instr) {
+	ifstream in;
+	// Initialize our primary and secondary input-stream fields
+	in.open((instr + ".1." + gEbwt_ext).c_str(), ios_base::in | ios::binary);
+	if(!in.is_open()) {
+		throw EbwtFileOpenException("Cannot open file " + instr);
+	}
+	assert(in.is_open());
+	assert(in.good());
+	bool switchEndian = false;
+	uint32_t one = readU32(in, switchEndian); // 1st word of primary stream
+	if(one != 1) {
+		// Index was written on a machine of opposite endianness
+		assert_eq((1u<<24), one);
+		assert_eq(1, endianSwapU32(one));
+		switchEndian = true;
+	}
+	readIndex<index_t>(in, switchEndian); // skip len
+	readI32(in, switchEndian); // skip lineRate
+	readI32(in, switchEndian); // skip linesPerSide (unused)
+	readI32(in, switchEndian); // skip offRate
+	readI32(in, switchEndian); // skip ftabChars
+	int32_t flags = readI32(in, switchEndian); // the flags word itself
+	return flags;
+}
+
+/**
+ * Read just enough of the Ebwt's header to determine whether it's
+ * colorspace.
+ */
+bool
+readEbwtColor(const string& instr) {
+	// The flags word is stored negated; a negative value indicates
+	// that option bits are present in its magnitude.
+	const int32_t fl = Ebwt<>::readFlags(instr);
+	return fl < 0 && ((-fl) & EBWT_COLOR) != 0;
+}
+
+/**
+ * Read just enough of the Ebwt's header to determine whether it's
+ * entirely reversed.
+ */
+bool
+readEntireReverse(const string& instr) {
+	// The flags word is stored negated; a negative value indicates
+	// that option bits are present in its magnitude.
+	const int32_t fl = Ebwt<>::readFlags(instr);
+	return fl < 0 && ((-fl) & EBWT_ENTIRE_REV) != 0;
+}
+
+/**
+ * Write an extended Burrows-Wheeler transform to a pair of output
+ * streams.
+ *
+ * The byte layout mirrors what the read path above expects: endian
+ * hint, fixed header fields, then (unless justHeader) plen, rstarts,
+ * ebwt, zOff, offs (secondary stream), fchr, ftab and eftab.
+ * Endianness is taken from this->toBe(), not a parameter.
+ *
+ * @param justHeader if true, write only the fixed-size header fields
+ * @param out1 output stream to primary file
+ * @param out2 output stream to secondary file
+ */
+template <typename index_t>
+void Ebwt<index_t>::writeFromMemory(bool justHeader,
+                           ostream& out1,
+                           ostream& out2) const
+{
+	const EbwtParams<index_t>& eh = this->_eh;
+	assert(eh.repOk());
+	uint32_t be = this->toBe(); // whether to write big-endian
+	assert(out1.good());
+	assert(out2.good());
+	
+	// When building an Ebwt, these header parameters are known
+	// "up-front", i.e., they can be written to disk immediately,
+	// before we join() or buildToDisk()
+	writeI32(out1, 1, be); // endian hint for primary stream
+	writeI32(out2, 1, be); // endian hint for secondary stream
+	writeIndex<index_t>(out1, eh._len,          be); // length of string (and bwt and suffix array)
+	writeI32(out1, eh._lineRate,     be); // 2^lineRate = size in bytes of 1 line
+	writeI32(out1, 2,                be); // not used (was linesPerSide)
+	writeI32(out1, eh._offRate,      be); // every 2^offRate chars is "marked"
+	writeI32(out1, eh._ftabChars,    be); // number of 2-bit chars used to address ftab
+	// Pack the boolean properties into a single negated flags word;
+	// readers recognize a negative value as a flags field
+	int32_t flags = 1;
+	if(eh._color) flags |= EBWT_COLOR;
+	if(eh._entireReverse) flags |= EBWT_ENTIRE_REV;
+	writeI32(out1, -flags, be); // BTL: chunkRate is now deprecated
+	
+	if(!justHeader) {
+		assert(rstarts() != NULL);
+		assert(offs() != NULL);
+		assert(ftab() != NULL);
+		assert(eftab() != NULL);
+		assert(isInMemory());
+		// These Ebwt parameters are known after the inputs strings have
+		// been joined() but before they have been built().  These can be
+		// written to the disk next and then discarded from memory.
+		writeIndex<index_t>(out1, this->_nPat,      be);
+		for(index_t i = 0; i < this->_nPat; i++)
+			writeIndex<index_t>(out1, this->plen()[i], be);
+		assert_geq(this->_nFrag, this->_nPat);
+		writeIndex<index_t>(out1, this->_nFrag, be);
+		for(size_t i = 0; i < this->_nFrag*3; i++)
+			writeIndex<index_t>(out1, this->rstarts()[i], be);
+		
+		// These Ebwt parameters are discovered only as the Ebwt is being
+		// built (in buildToDisk()).  Of these, only 'offs' and 'ebwt' are
+		// terribly large.  'ebwt' is written to the primary file and then
+		// discarded from memory as it is built; 'offs' is similarly
+		// written to the secondary file and discarded.
+		out1.write((const char *)this->ebwt(), eh._ebwtTotLen);
+		writeIndex<index_t>(out1, this->zOff(), be);
+		index_t offsLen = eh._offsLen;
+		for(index_t i = 0; i < offsLen; i++)
+			writeIndex<index_t>(out2, this->offs()[i], be);
+		
+		// 'fchr', 'ftab' and 'eftab' are not fully determined until the
+		// loop is finished, so they are written to the primary file after
+		// all of 'ebwt' has already been written and only then discarded
+		// from memory.
+		for(int i = 0; i < 5; i++)
+			writeIndex<index_t>(out1, this->fchr()[i], be);
+		for(index_t i = 0; i < eh._ftabLen; i++)
+			writeIndex<index_t>(out1, this->ftab()[i], be);
+		for(index_t i = 0; i < eh._eftabLen; i++)
+			writeIndex<index_t>(out1, this->eftab()[i], be);
+	}
+}
+
+/**
+ * Given a pair of strings representing output filenames, and assuming
+ * this Ebwt object is currently in memory, write out this Ebwt to the
+ * specified files.
+ *
+ * If sanity-checking is enabled, then once the streams have been
+ * fully written and closed, we reopen them and read them into a
+ * (hopefully) exact copy of this Ebwt.  We then assert that the
+ * current Ebwt and the copy match in all of their fields.
+ */
+template <typename index_t>
+void Ebwt<index_t>::writeFromMemory(bool justHeader,
+                           const string& out1,
+                           const string& out2) const
+{
+	ASSERT_ONLY(const EbwtParams<index_t>& eh = this->_eh);
+	assert(isInMemory());
+	assert(eh.repOk());
+	
+	// Open both files in binary mode and delegate to the stream-based
+	// overload above
+	ofstream fout1(out1.c_str(), ios::binary);
+	ofstream fout2(out2.c_str(), ios::binary);
+	writeFromMemory(justHeader, fout1, fout2);
+	fout1.close();
+	fout2.close();
+	
+	// Read the file back in and assert that all components match.
+	// NOTE: the entire check below is compiled out via '#if 0', so
+	// _sanity currently has no effect in this function.
+	if(_sanity) {
+#if 0
+		if(_verbose)
+			cout << "Re-reading \"" << out1 << "\"/\"" << out2 << "\" for sanity check" << endl;
+		Ebwt copy(out1, out2, _verbose, _sanity);
+		assert(!isInMemory());
+		copy.loadIntoMemory(eh._color ? 1 : 0, true, false, false);
+		assert(isInMemory());
+	    assert_eq(eh._lineRate,     copy.eh()._lineRate);
+	    assert_eq(eh._offRate,      copy.eh()._offRate);
+	    assert_eq(eh._ftabChars,    copy.eh()._ftabChars);
+	    assert_eq(eh._len,          copy.eh()._len);
+	    assert_eq(_zOff,             copy.zOff());
+	    assert_eq(_zEbwtBpOff,       copy.zEbwtBpOff());
+	    assert_eq(_zEbwtByteOff,     copy.zEbwtByteOff());
+		assert_eq(_nPat,             copy.nPat());
+		for(index_t i = 0; i < _nPat; i++)
+			assert_eq(this->_plen[i], copy.plen()[i]);
+		assert_eq(this->_nFrag, copy.nFrag());
+		for(size_t i = 0; i < this->nFrag*3; i++) {
+			assert_eq(this->_rstarts[i], copy.rstarts()[i]);
+		}
+		for(index_t i = 0; i < 5; i++)
+			assert_eq(this->_fchr[i], copy.fchr()[i]);
+		for(size_t i = 0; i < eh._ftabLen; i++)
+			assert_eq(this->ftab()[i], copy.ftab()[i]);
+		for(size_t i = 0; i < eh._eftabLen; i++)
+			assert_eq(this->eftab()[i], copy.eftab()[i]);
+		for(index_t i = 0; i < eh._offsLen; i++)
+			assert_eq(this->_offs[i], copy.offs()[i]);
+		for(index_t i = 0; i < eh._ebwtTotLen; i++)
+			assert_eq(this->ebwt()[i], copy.ebwt()[i]);
+		copy.sanityCheckAll();
+		if(_verbose)
+			cout << "Read-in check passed for \"" << out1 << "\"/\"" << out2 << "\"" << endl;
+#endif
+	}
+}
+
+/**
+ * Write the rstarts array given the szs array for the reference.
+ *
+ * For each non-empty reference fragment, writes a triple to 'os':
+ * (offset in the joined string, sequence id, offset within the
+ * forward-strand sequence).  When 'reverse' == REF_READ_REVERSE the
+ * sequence ids and offsets are inverted so they still refer to the
+ * forward strand.
+ */
+template <typename index_t>
+void Ebwt<index_t>::szsToDisk(const EList<RefRecord>& szs, ostream& os, int reverse) {
+#ifdef CENTRIFUGE
+    // Centrifuge also keeps the rstarts triples in memory; allocate
+    // the array lazily if it hasn't been set up yet
+    if(rstarts() == NULL) {
+        _rstarts.init(new index_t[this->_nFrag*3], this->_nFrag*3, true);
+    }
+#endif
+    
+	size_t seq = 0;       // count of sequences seen so far (1-based)
+	index_t off = 0;      // offset within the current sequence
+	index_t totlen = 0;   // offset within the joined string
+    index_t rstarts_idx = 0;
+	for(size_t i = 0; i < szs.size(); i++) {
+		if(szs[i].len == 0) continue; // skip empty fragments
+		if(szs[i].first) off = 0;     // new sequence: reset in-sequence offset
+		off += szs[i].off;            // advance past this fragment's leading offset
+		if(szs[i].first && szs[i].len > 0) seq++;
+		index_t seqm1 = seq-1;        // 0-based sequence id
+		assert_lt(seqm1, _nPat);
+		index_t fwoff = off;
+		if(reverse == REF_READ_REVERSE) {
+			// Invert pattern idxs
+			seqm1 = _nPat - seqm1 - 1;
+			// Invert pattern idxs
+			assert_leq(off + szs[i].len, plen()[seqm1]);
+			fwoff = plen()[seqm1] - (off + szs[i].len);
+		}
+		writeIndex<index_t>(os, totlen, this->toBe()); // offset from beginning of joined string
+		writeIndex<index_t>(os, (index_t)seqm1,  this->toBe()); // sequence id
+		writeIndex<index_t>(os, (index_t)fwoff,  this->toBe()); // offset into sequence
+#ifdef CENTRIFUGE
+        // Mirror the on-disk triple into the in-memory rstarts array
+        this->rstarts()[rstarts_idx*3]   = totlen;
+        this->rstarts()[rstarts_idx*3+1] = (index_t)seqm1;
+        this->rstarts()[rstarts_idx*3+2] = (index_t)fwoff;
+        rstarts_idx++;
+#endif
+		totlen += szs[i].len;
+		off += szs[i].len;
+	}
+}
+
+#endif /*EBWT_IO_H_*/
+
diff --git a/bt2_util.h b/bt2_util.h
new file mode 100644
index 0000000..99cb9cc
--- /dev/null
+++ b/bt2_util.h
@@ -0,0 +1,229 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef EBWT_UTIL_H_
+#define EBWT_UTIL_H_
+
+#include <string>
+#include <stdexcept>
+#include <iostream>
+#include <fstream>
+#include <stdlib.h>
+#include <string.h>
+#include "bt2_idx.h"
+
+///////////////////////////////////////////////////////////////////////
+//
+// Functions for printing and sanity-checking Ebwts
+//
+///////////////////////////////////////////////////////////////////////
+
+/**
+ * Check that the ebwt array is internally consistent up to (and not
+ * including) the given side index by re-counting the chars and
+ * comparing against the embedded occ[] arrays.
+ *
+ * Each "side" holds packed 2-bit BWT characters followed by four index_t
+ * checkpoint values (cumulative A/C/G/T counts).  This walk recounts the
+ * characters and compares against those checkpoints.  Debug builds only
+ * perform the comparisons (ASSERT_ONLY/assert_*).
+ */
+template <typename index_t>
+void Ebwt<index_t>::sanityCheckUpToSide(int upToSide) const {
+	assert(isInMemory());
+	index_t occ[] = {0, 0, 0, 0};            // running A/C/G/T counts
+	ASSERT_ONLY(index_t occ_save[] = {0, 0, 0, 0}); // counts as of previous side
+	index_t cur = 0; // byte pointer
+	const EbwtParams<index_t>& eh = this->_eh;
+	bool fw = false;
+	while(cur < (upToSide * eh._sideSz)) {
+		assert_leq(cur + eh._sideSz, eh._ebwtTotLen);
+		for(index_t i = 0; i < eh._sideBwtSz; i++) {
+			// Read bytes in forward or reverse order depending on side parity
+			uint8_t by = this->ebwt()[cur + (fw ? i : eh._sideBwtSz-i-1)];
+			for(int j = 0; j < 4; j++) {
+				// Unpack from lowest to highest bit pair
+				int twoBit = unpack_2b_from_8b(by, fw ? j : 3-j);
+				occ[twoBit]++;
+			}
+			assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3]) % 4);
+		}
+		assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3]) % eh._sideBwtLen);
+		// Finished forward bucket; check saved [A], [C], [G] and [T]
+		// against the index_ts encoded here
+		ASSERT_ONLY(const index_t *uebwt = reinterpret_cast<const index_t*>(&ebwt()[cur + eh._sideBwtSz]));
+		ASSERT_ONLY(index_t as = uebwt[0]);
+		ASSERT_ONLY(index_t cs = uebwt[1]);
+		ASSERT_ONLY(index_t gs = uebwt[2]);
+		ASSERT_ONLY(index_t ts = uebwt[3]);
+		// The A-count may be off by one — presumably to account for the
+		// '$' terminator being folded into the A slot; TODO confirm.
+		assert(as == occ_save[0] || as == occ_save[0]-1);
+		assert_eq(cs, occ_save[1]);
+		assert_eq(gs, occ_save[2]);
+		assert_eq(ts, occ_save[3]);
+#ifndef NDEBUG
+		occ_save[0] = occ[0];
+		occ_save[1] = occ[1];
+		occ_save[2] = occ[2];
+		occ_save[3] = occ[3];
+#endif
+		cur += eh._sideSz;
+	}
+}
+
+/**
+ * Sanity-check various pieces of the Ebwt: the ftab (cumulative ranges
+ * must be monotone and bounded), the offs array (every sampled offset in
+ * range and distinct), nPat/plen, rstarts ordering, and finally the BWT
+ * sides themselves via sanityCheckUpToSide().
+ */
+template <typename index_t>
+void Ebwt<index_t>::sanityCheckAll(int reverse) const {
+	const EbwtParams<index_t>& eh = this->_eh;
+	assert(isInMemory());
+	// Check ftab
+	for(index_t i = 1; i < eh._ftabLen; i++) {
+		assert_geq(this->ftabHi(i), this->ftabLo(i-1));
+		assert_geq(this->ftabLo(i), this->ftabHi(i-1));
+		assert_leq(this->ftabHi(i), eh._bwtLen+1);
+	}
+	assert_eq(this->ftabHi(eh._ftabLen-1), eh._bwtLen);
+	
+	// Check offs: use a bitvector (32 offsets per word) to verify that no
+	// sampled BWT offset appears twice
+	int seenLen = (eh._bwtLen + 31) >> ((index_t)5);
+	uint32_t *seen;
+	try {
+		seen = new uint32_t[seenLen]; // bitvector marking seen offsets
+	} catch(bad_alloc& e) {
+		cerr << "Out of memory allocating seen[] at " << __FILE__ << ":" << __LINE__ << endl;
+		throw e;
+	}
+	// NOTE(review): '4' hard-codes sizeof(uint32_t); true on supported
+	// platforms but sizeof(uint32_t) would be self-documenting.
+	memset(seen, 0, 4 * seenLen);
+	index_t offsLen = eh._offsLen;
+	for(index_t i = 0; i < offsLen; i++) {
+		assert_lt(this->offs()[i], eh._bwtLen);
+		int w = this->offs()[i] >> 5;  // word index
+		int r = this->offs()[i] & 31;  // bit within word
+		assert_eq(0, (seen[w] >> r) & 1); // shouldn't have been seen before
+		seen[w] |= (1 << r);
+	}
+	delete[] seen;
+	
+	// Check nPat
+	assert_gt(this->_nPat, 0);
+    
+    // Check plen, flen
+	for(index_t i = 0; i < this->_nPat; i++) {
+		assert_geq(this->plen()[i], 0);
+	}
+    
+	// Check rstarts: joined-string offsets strictly increase; sequence ids
+	// are monotone (direction depends on whether the index is reversed)
+	if(this->rstarts() != NULL) {
+		for(index_t i = 0; i < this->_nFrag-1; i++) {
+			assert_gt(this->rstarts()[(i+1)*3], this->rstarts()[i*3]);
+			if(reverse == REF_READ_REVERSE) {
+				assert(this->rstarts()[(i*3)+1] >= this->rstarts()[((i+1)*3)+1]);
+			} else {
+				assert(this->rstarts()[(i*3)+1] <= this->rstarts()[((i+1)*3)+1]);
+			}
+		}
+	}
+	
+	// Check ebwt
+	sanityCheckUpToSide(eh._numSides);
+	VMSG_NL("Ebwt::sanityCheck passed");
+}
+
+/**
+ * Transform this Ebwt into the original string in linear time by using
+ * the LF mapping to walk backwards starting at the row correpsonding
+ * to the end of the string.  The result is written to s.  The Ebwt
+ * must be in memory.
+ */
+template <typename index_t>
+void Ebwt<index_t>::restore(SString<char>& s) const {
+	assert(isInMemory());
+	s.resize(this->_eh._len);
+	index_t jumps = 0; // number of LF steps taken so far
+	index_t i = this->_eh._len; // should point to final SA elt (starting with '$')
+	SideLocus<index_t> l(i, this->_eh, this->ebwt());
+	// Walk until we reach the row of the '$' terminator (_zOff); each step
+	// recovers one character of the original text, from last to first.
+	while(i != _zOff) {
+		assert_lt(jumps, this->_eh._len);
+		//if(_verbose) cout << "restore: i: " << i << endl;
+		// Not a marked row; go back a char in the original string
+		index_t newi = mapLF(l ASSERT_ONLY(, false));
+		assert_neq(newi, i);
+		s[this->_eh._len - jumps - 1] = rowL(l); // fill s right-to-left
+		i = newi;
+		l.initFromRow(i, this->_eh, this->ebwt());
+		jumps++;
+	}
+	assert_eq(jumps, this->_eh._len);
+}
+
+/**
+ * Check that this Ebwt, when restored via restore(), matches up with
+ * the given array of reference sequences.  For sanity checking.
+ *
+ * Ns (value 4) in the originals are skipped, since they are not stored in
+ * the index.  When 'color' is set, consecutive original characters are
+ * compared as colors via dinuc2color (colorspace mode).  'mirror' mode is
+ * unimplemented (early return).
+ */
+template <typename index_t>
+void Ebwt<index_t>::checkOrigs(
+	const EList<SString<char> >& os,
+	bool color,
+	bool mirror) const
+{
+	SString<char> rest;
+	restore(rest);
+	index_t restOff = 0; // cursor into the restored string
+	size_t i = 0, j = 0; // i = sequence index, j = position within sequence
+	if(mirror) {
+		// TODO: FIXME
+		return;
+	}
+	while(i < os.size()) {
+		size_t olen = os[i].length();
+		int lastorig = -1; // previous non-N char; -1 right after an N run
+		for(; j < olen; j++) {
+			size_t joff = j;
+			if(mirror) joff = olen - j - 1;
+			if((int)os[i][joff] == 4) {
+				// Skip over Ns
+				lastorig = -1;
+				if(!mirror) {
+					while(j < olen && (int)os[i][j] == 4) j++;
+				} else {
+					while(j < olen && (int)os[i][olen-j-1] == 4) j++;
+				}
+				j--; // compensate for the loop's j++
+				continue;
+			}
+			if(lastorig == -1 && color) {
+				// First char after an N run: no color can be formed yet
+				lastorig = os[i][joff];
+				continue;
+			}
+			if(color) {
+				assert_neq(-1, lastorig);
+				assert_eq(dinuc2color[(int)os[i][joff]][lastorig], rest[restOff]);
+			} else {
+				assert_eq(os[i][joff], rest[restOff]);
+			}
+			lastorig = (int)os[i][joff];
+			restOff++;
+		}
+		if(j == os[i].length()) {
+			// Moved to next sequence
+			i++;
+			j = 0;
+		} else {
+			// Just jumped over a gap
+		}
+	}
+}
+
+#endif /*EBWT_UTIL_H_*/
+
diff --git a/btypes.h b/btypes.h
new file mode 100644
index 0000000..cc4225c
--- /dev/null
+++ b/btypes.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#ifndef BOWTIE_INDEX_TYPES_H
+#define	BOWTIE_INDEX_TYPES_H
+
+// Index offset types and related constants.  BOWTIE_64BIT_INDEX selects
+// 64-bit offsets (large indexes); otherwise 32-bit offsets are used.
+// NOTE(review): this header uses std::string below but does not include
+// <string>; it relies on the includer having done so — confirm intended.
+#ifdef BOWTIE_64BIT_INDEX
+#define OFF_MASK 0xffffffffffffffff
+#define OFF_LEN_MASK 0xc000000000000000
+#define LS_SIZE 0x100000000000000
+#define OFF_SIZE 8
+
+typedef uint64_t TIndexOffU;
+typedef int64_t TIndexOff;
+    
+#else
+#define OFF_MASK 0xffffffff
+#define OFF_LEN_MASK 0xc0000000
+#define LS_SIZE 0x10000000
+#define OFF_SIZE 4
+
+typedef uint32_t TIndexOffU;
+typedef int TIndexOff;
+
+#endif /* BOWTIE_64BIT_INDEX */
+
+// Index file extension (defined in one translation unit)
+extern const std::string gEbwt_ext;
+
+#endif	/* BOWTIE_INDEX_TYPES_H */
+
diff --git a/ccnt_lut.cpp b/ccnt_lut.cpp
new file mode 100644
index 0000000..10c726a
--- /dev/null
+++ b/ccnt_lut.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdint.h>
+
+/* Generated by gen_lookup_tables.pl */
+
+// Lookup tables: cCntLUT_4[by][c][str] = number of occurrences of 2-bit
+// character 'c' among the first 'by' characters packed in byte 'str'
+// (by==0 means all 4).  The _rev variant counts from the high-order end.
+uint8_t cCntLUT_4[4][4][256];
+uint8_t cCntLUT_4_rev[4][4][256];
+
+// Count occurrences of 2-bit char 'c' in the low-order 'by' characters of
+// 'str' (by==0 is treated as 4, i.e. the whole byte).
+int countCnt(int by, int c, uint8_t str) {
+    int count = 0;
+    if(by == 0) by = 4;
+    while(by-- > 0) {
+        int c2 = str & 3;   // take lowest 2-bit char
+        str >>= 2;
+        if(c == c2) count++;
+    }
+    
+    return count;
+}
+
+// Same as countCnt, but scans from the high-order end of the byte.
+int countCnt_rev(int by, int c, uint8_t str) {
+    int count = 0;
+    if(by == 0) by = 4;
+    while(by-- > 0) {
+        int c2 = (str >> 6) & 3; // take highest 2-bit char
+        str <<= 2;
+        if(c == c2) count++;
+    }
+    
+    return count;
+}
+
+// Populate both lookup tables for every (byte-count, char, byte) triple.
+// Must be called once before the tables are consulted.
+void initializeCntLut() {
+    for(int by = 0; by < 4; by++) {
+        for(int c = 0; c < 4; c++) {
+            for(int str = 0; str < 256; str++) {
+                cCntLUT_4[by][c][str] = countCnt(by, c, str);
+                cCntLUT_4_rev[by][c][str] = countCnt_rev(by, c, str);
+            }
+        }
+    }
+}
diff --git a/centrifuge b/centrifuge
new file mode 100755
index 0000000..54bdc57
--- /dev/null
+++ b/centrifuge
@@ -0,0 +1,559 @@
+#!/usr/bin/env perl
+
+#
+# Copyright 2014, Daehwan Kim <infphilo at gmail.com>
+#
+# This file is part of Centrifuge, which is copied and modified from bowtie2 in
+# the Bowtie2 package.
+#
+# Centrifuge is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Centrifuge is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Centrifuge.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+# centrifuge:
+#
+# A wrapper script for centrifuge.  Provides various advantages over running
+# centrifuge directly, including:
+#
+# 1. Handling compressed inputs
+# 2. Redirecting output to various files
+# 3. Output directly to bam (not currently supported)
+
+use strict;
+use warnings;
+use Getopt::Long qw(GetOptions);
+use File::Spec;
+use POSIX;
+
+
+my ($vol,$script_path,$prog);
+# Resolve this script's real location, following symlinks, so that the
+# centrifuge-class binary can be found alongside it.
+$prog = File::Spec->rel2abs( __FILE__ );
+
+while (-f $prog && -l $prog){
+    my (undef, $dir, undef) = File::Spec->splitpath($prog);
+    $prog = File::Spec->rel2abs(readlink($prog), $dir);
+}
+
+($vol,$script_path,$prog) 
+                = File::Spec->splitpath($prog);
+my $os_is_nix   = ($^O eq "linux") || ($^O eq "darwin");
+my $align_bin_s = $os_is_nix ? 'centrifuge-class' : 'centrifuge-class.exe'; 
+my $build_bin   = $os_is_nix ? 'centrifuge-build' : 'centrifuge-build.exe';               
+my $align_prog  = File::Spec->catpath($vol,$script_path,'centrifuge-class');
+my $idx_ext       = 'hc'; 
+my %signo       = ();
+my @signame     = ();
+
+{
+	# Get signal info: map signal names <-> numbers for reporting how the
+	# child died (see exit handling at the bottom of this script).
+	use Config;
+	my $i = 0;
+	for my $name (split(' ', $Config{sig_name})) {
+		$signo{$name} = $i;
+		$signame[$i] = $name;
+		$i++;
+	}
+}
+
+(-x "$align_prog") ||
+	Fail("Expected centrifuge to be in same directory with centrifuge-class:\n$script_path\n");
+
+# Get description of arguments from Centrifuge so that we can distinguish Centrifuge
+# args from wrapper args
+sub getBt2Desc($) {
+	my $d = shift;
+	my $cmd = "$align_prog --wrapper basic-0 --arg-desc";
+	open(my $fh, "$cmd |") || Fail("Failed to run command '$cmd'\n");
+	# Each output line is "<option>\t<description>"; store into %$d.
+	while(readline $fh) {
+		chomp;
+		next if /^\s*$/;
+		my @ts = split(/\t/);
+		$d->{$ts[0]} = $ts[1];
+	}
+	close($fh);
+	$? == 0 || Fail("Description of arguments failed!\n");
+}
+
+my %desc = ();
+my %wrapped = ("1" => 1, "2" => 1);
+getBt2Desc(\%desc);
+
+# Given an option like -1, determine whether it's wrapped (i.e. should be
+# handled by this script rather than being passed along to Centrifuge)
+sub isWrapped($) { return defined($wrapped{$_[0]}); }
+
+my @orig_argv = @ARGV;
+
+# Arguments before a bare "--" go to the wrapper; arguments after it go to
+# centrifuge-class.  Without "--", everything is treated as binary args.
+my @bt2w_args = (); # options for wrapper
+my @bt2_args  = (); # options for Centrifuge
+my $saw_dd = 0;
+for(0..$#ARGV) {
+	if($ARGV[$_] eq "--") {
+		$saw_dd = 1;
+		next;
+	}
+	push @bt2w_args, $ARGV[$_] if !$saw_dd;
+	push @bt2_args,  $ARGV[$_] if  $saw_dd;
+}
+if(!$saw_dd) {
+	@bt2_args = @bt2w_args;
+	@bt2w_args= ();
+}
+
+my $debug = 0;
+my %read_fns = ();      # --un/--al/--un-conc/--al-conc output filenames
+my %read_compress = (); # per-option compression ("", "gzip" or "bzip2")
+my $cap_out = undef;       # Filename for passthrough
+my $no_unal = 0;
+my $large_idx = 0;
+# Remove whitespace
+for my $i (0..$#bt2_args) {
+	$bt2_args[$i]=~ s/^\s+//; $bt2_args[$i] =~ s/\s+$//;
+}
+
+# We've handled arguments that the user has explicitly directed either to the
+# wrapper or to centrifuge, now we capture some of the centrifuge arguments that
+# ought to be handled in the wrapper.  Captured options are blanked out of
+# @bt2_args (set to undef) and re-pushed onto @bt2w_args.
+for(my $i = 0; $i < scalar(@bt2_args); $i++) {
+	next unless defined($bt2_args[$i]);
+	my $arg = $bt2_args[$i];
+	# Split "--opt=value"; rejoin values that themselves contain '='.
+	my @args = split(/=/, $arg);
+	if(scalar(@args) > 2) {
+		$args[1] = join("=", @args[1..$#args]);
+	}
+	$arg = $args[0];
+	if($arg eq "-U" || $arg eq "--unpaired") {
+		$bt2_args[$i] = undef;
+		$arg =~ s/^-U//; $arg =~ s/^--unpaired//;
+		if($arg ne "") {
+			# Argument was part of this token
+			my @args = split(/,/, $arg);
+			for my $a (@args) { push @bt2w_args, ("-U", $a); }
+		} else {
+			# Argument is in the next token
+			$i < scalar(@bt2_args)-1 || Fail("Argument expected in next token!\n");
+			$i++;
+			my @args = split(/,/, $bt2_args[$i]);
+			for my $a (@args) { push @bt2w_args, ("-U", $a); }
+			$bt2_args[$i] = undef;
+		}
+	}
+	# Capture -1/-2 mate options (but not the distinct --12 option)
+	if($arg =~ /^--?([12])/ && $arg !~ /^--?12/) {
+		my $mate = $1;
+		$bt2_args[$i] = undef;
+		$arg =~ s/^--?[12]//;
+		if($arg ne "") {
+			# Argument was part of this token
+			my @args = split(/,/, $arg);
+			for my $a (@args) { push @bt2w_args, ("-$mate", $a); }
+		} else {
+			# Argument is in the next token
+			$i < scalar(@bt2_args)-1 || Fail("Argument expected in next token!\n");
+			$i++;
+			my @args = split(/,/, $bt2_args[$i]);
+			for my $a (@args) { push @bt2w_args, ("-$mate", $a); }
+			$bt2_args[$i] = undef;
+		}
+	}
+	if($arg eq "--debug") {
+		$debug = 1;
+		$bt2_args[$i] = undef;
+	}
+	if($arg eq "--no-unal") {
+		$no_unal = 1;
+		$bt2_args[$i] = undef;
+	}
+	if($arg eq "--large-index") {
+		$large_idx = 1;
+		$bt2_args[$i] = undef;
+	}
+	# Capture read-output options, with optional -gz/-bz2 compression suffix
+	for my $rarg ("un-conc", "al-conc", "un", "al") {
+		if($arg =~ /^--${rarg}$/ || $arg =~ /^--${rarg}-gz$/ || $arg =~ /^--${rarg}-bz2$/) {
+			$bt2_args[$i] = undef;
+			if(scalar(@args) > 1 && $args[1] ne "") {
+				$read_fns{$rarg} = $args[1];
+			} else {
+				$i < scalar(@bt2_args)-1 || Fail("--${rarg}* option takes an argument.\n");
+				$read_fns{$rarg} = $bt2_args[$i+1];
+				$bt2_args[$i+1] = undef;
+			}
+			$read_compress{$rarg} = "";
+			$read_compress{$rarg} = "gzip"  if $arg eq "--${rarg}-gz";
+			$read_compress{$rarg} = "bzip2" if $arg eq "--${rarg}-bz2";
+			last;
+		}
+	}
+}
+# If the user asked us to redirect some reads to files, or to suppress
+# unaligned reads, then we need to capture the output from Centrifuge and pass it
+# through this wrapper.
+my $passthru = 0;
+if(scalar(keys %read_fns) > 0 || $no_unal) {
+	$passthru = 1;
+	push @bt2_args, "--passthrough";
+	$cap_out = "-";
+	for(my $i = 0; $i < scalar(@bt2_args); $i++) {
+		next unless defined($bt2_args[$i]);
+		my $arg = $bt2_args[$i];
+		if($arg eq "-S" || $arg eq "--output") {
+			$i < scalar(@bt2_args)-1 || Fail("-S/--output takes an argument.\n");
+			$cap_out = $bt2_args[$i+1];
+			$bt2_args[$i] = undef;
+			$bt2_args[$i+1] = undef;
+		}
+	}
+}
+my @tmp = ();
+for (@bt2_args) { push(@tmp, $_) if defined($_); }
+ at bt2_args = @tmp;
+
+my @unps = ();      # unpaired read files
+my @mate1s = ();    # mate-1 read files
+my @mate2s = ();    # mate-2 read files
+my @to_delete = (); # temp files/pipes removed at exit (unless --keep)
+my $temp_dir = "/tmp";
+my $bam_out = 0;
+my $ref_str = undef;
+my $no_pipes = 0;
+my $keep = 0;
+my $verbose = 0;
+my $readpipe = undef;
+my $log_fName = undef;
+my $help = 0;
+
+my @bt2w_args_cp = (@bt2w_args>0) ? @bt2w_args : @bt2_args;
+Getopt::Long::Configure("pass_through","no_ignore_case");
+
+# Temporarily swap @ARGV so GetOptions parses the wrapper args.
+# NOTE(review): the " at ARGV" lines below are mailing-list mangling of a
+# leading '@'; upstream reads "@ARGV = ...".
+my @old_ARGV = @ARGV;
+ at ARGV = @bt2w_args_cp;
+
+GetOptions(
+	"1=s"                           => \@mate1s,
+	"2=s"                           => \@mate2s,
+	"reads|U=s"                     => \@unps,
+	"temp-directory=s"              => \$temp_dir,
+	"bam"                           => \$bam_out,
+	"no-named-pipes"                => \$no_pipes,
+	"ref-string|reference-string=s" => \$ref_str,
+	"keep"                          => \$keep,
+	"verbose"                       => \$verbose,
+	"log-file=s"                    => \$log_fName,
+	"help|h"                        => \$help
+);
+
+ at ARGV = @old_ARGV;
+
+my $old_stderr;
+
+# With --log-file, send all diagnostics there instead of the terminal.
+if ($log_fName) {
+    open($old_stderr, ">&STDERR") or Fail("Cannot dup STDERR!\n");
+    open(STDERR, ">", $log_fName) or Fail("Cannot redirect to log file $log_fName.\n");
+}
+
+Info("Before arg handling:\n");
+Info("  Wrapper args:\n[ @bt2w_args ]\n");
+Info("  Binary args:\n[ @bt2_args ]\n");
+
+# Stream one read file (possibly gzip/bzip2 compressed) to output handle $ofh.
+sub cat_file($$) {
+	my ($ifn, $ofh) = @_;
+	my $ifh = undef;
+	if($ifn =~ /\.gz$/) {
+		open($ifh, "gzip -dc $ifn |") ||
+			 Fail("Could not open gzipped read file: $ifn \n");
+	} elsif($ifn =~ /\.bz2/) {
+		open($ifh, "bzip2 -dc $ifn |") ||
+			Fail("Could not open bzip2ed read file: $ifn \n");
+	} else {
+		open($ifh, $ifn) || Fail("Could not open read file: $ifn \n");
+	}
+	while(readline $ifh) { print {$ofh} $_; }
+	close($ifh);
+}
+
+# Return non-zero if and only if the input should be wrapped (i.e. because
+# it's compressed).
+sub wrapInput($$$) {
+	my ($unps, $mate1s, $mate2s) = @_;
+	for my $fn (@$unps, @$mate1s, @$mate2s) {
+		return 1 if $fn =~ /\.gz$/ || $fn =~ /\.bz2$/;
+	}
+	return 0;
+}
+
+# Print an informational message to STDERR, but only with --verbose.
+sub Info {
+    if ($verbose) {
+        print STDERR "(INFO): " , at _;
+    }
+}
+
+# printf-style error message to STDERR (does not exit).
+sub Error {
+    my @msg = @_;
+    $msg[0] = "(ERR): ".$msg[0];
+    printf STDERR @msg;
+}
+
+# Report an error and terminate the wrapper.
+sub Fail {
+    Error(@_);
+    die("Exiting now ...\n");    
+}
+
+# Scan an argument list for the index option and return its value.
+sub Extract_IndexName_From {
+    my $index_opt = $ref_str ? '--index' : '-x';
+    for (my $i=0; $i<@_; $i++) {
+        if ($_[$i] eq $index_opt){
+            return $_[$i+1];
+        }
+    }
+    Info("Cannot find any index option (--reference-string, --ref-string or -x) in the given command line.\n");    
+}
+
+# Compressed inputs are decompressed by this wrapper and fed to the binary
+# through named pipes (or temp files with --no-named-pipes); uncompressed
+# inputs are passed straight through on the command line.
+if(wrapInput(\@unps, \@mate1s, \@mate2s)) {
+	if(scalar(@mate2s) > 0) {
+		#
+		# Wrap paired-end inputs
+		#
+		# Put reads into temporary files or fork off processes to feed named pipes
+		scalar(@mate2s) == scalar(@mate1s) ||
+			Fail("Different number of files specified with --reads/-1 as with -2\n");
+		# Make a named pipe for delivering mate #1s
+		my $m1fn = "$temp_dir/$$.inpipe1";
+		push @to_delete, $m1fn;
+		push @bt2_args, "-1 $m1fn";
+		# Create named pipe 1 for writing
+		if(!$no_pipes) {
+			mkfifo($m1fn, 0700) || Fail("mkfifo($m1fn) failed.\n");
+		}
+		# Child feeds the pipe and exits; with --no-named-pipes the parent
+		# writes a regular file synchronously ($pid stays 0, no exit).
+		my $pid = 0;
+		$pid = fork() unless $no_pipes;
+		if($pid == 0) {
+			# Open named pipe 1 for writing
+			open(my $ofh, ">$m1fn") || Fail("Can't open '$m1fn' for writing\n");
+			for my $ifn (@mate1s) { cat_file($ifn, $ofh); }
+			close($ofh);
+			exit 0 unless $no_pipes;
+		}
+		# Make a named pipe for delivering mate #2s
+		my $m2fn = "$temp_dir/$$.inpipe2";
+		push @to_delete, $m2fn;
+		push @bt2_args, "-2 $m2fn";
+		# Create named pipe 2 for writing
+		if(!$no_pipes) {
+			mkfifo($m2fn, 0700) || Fail("mkfifo($m2fn) failed.\n");
+		}
+		$pid = 0;
+		$pid = fork() unless $no_pipes;
+		if($pid == 0) {
+			# Open named pipe 2 for writing
+			open(my $ofh, ">$m2fn") || Fail("Can't open '$m2fn' for writing.\n");
+			for my $ifn (@mate2s) { cat_file($ifn, $ofh); }
+			close($ofh);
+			exit 0 unless $no_pipes;
+		}
+	}
+	if(scalar(@unps) > 0) {
+		#
+		# Wrap unpaired inputs.
+		#
+		# Make a named pipe for delivering unpaired reads
+		my $ufn = "$temp_dir/$$.unp";
+		push @to_delete, $ufn;
+		push @bt2_args, "-U $ufn";
+		# Create the unpaired-read named pipe for writing
+		if(!$no_pipes) {
+			mkfifo($ufn, 0700) || Fail("mkfifo($ufn) failed.\n");
+		}
+		my $pid = 0;
+		$pid = fork() unless $no_pipes;
+		if($pid == 0) {
+			# Open the unpaired-read pipe for writing
+			open(my $ofh, ">$ufn") || Fail("Can't open '$ufn' for writing.\n");
+			for my $ifn (@unps) { cat_file($ifn, $ofh); }
+			close($ofh);
+			exit 0 unless $no_pipes;
+		}
+	}
+} else {
+	if(scalar(@mate2s) > 0) {
+		# Just pass all the mate arguments along to the binary
+		push @bt2_args, ("-1", join(",", @mate1s));
+		push @bt2_args, ("-2", join(",", @mate2s));
+	}
+	if(scalar(@unps) > 0) {
+		push @bt2_args, ("-U", join(",", @unps));
+	}
+}
+
+# --ref-string: write the given sequence to a temp FASTA, build a throwaway
+# index from it, and point the binary at that index.
+if(defined($ref_str)) {
+	my $ofn = "$temp_dir/$$.ref_str.fa";
+	open(my $ofh, ">$ofn") ||
+		Fail("could not open temporary fasta file '$ofn' for writing.\n");
+	print {$ofh} ">1\n$ref_str\n";
+	close($ofh);
+	push @to_delete, $ofn;
+	system("$build_bin $ofn $ofn") == 0 ||
+		Fail("centrifuge-build returned non-0 exit level.\n");
+	push @bt2_args, ("--index", "$ofn");
+	push @to_delete, ("$ofn.1.".$idx_ext, "$ofn.2.".$idx_ext, 
+	                  "$ofn.3.".$idx_ext, "$ofn.4.".$idx_ext,
+			  "$ofn.5.".$idx_ext, "$ofn.6.".$idx_ext,
+	                  "$ofn.rev.1.".$idx_ext, "$ofn.rev.2.".$idx_ext,
+			  "$ofn.rev.5.".$idx_ext, "$ofn.rev.6.".$idx_ext);
+}
+
+my $debug_str = ($debug ? "-debug" : "");
+
+# Construct command invoking centrifuge-class
+my $cmd = "$align_prog$debug_str --wrapper basic-0 ".join(" ", @bt2_args);
+
+# Possibly add read input on an anonymous pipe
+$cmd = "$readpipe $cmd" if defined($readpipe);
+
+Info("$cmd\n");
+my $ret;
+if(defined($cap_out)) {
+	# Open Centrifuge pipe
+	open(BT, "$cmd |") || Fail("Could not open Centrifuge pipe: '$cmd |'\n");
+	# Open output pipe
+	my $ofh = *STDOUT;
+	my @fhs_to_close = ();
+	if($cap_out ne "-") {
+		open($ofh, ">$cap_out") ||
+			Fail("Could not open output file '$cap_out' for writing.\n");
+	}
+	my %read_fhs = ();
+	for my $i ("al", "un", "al-conc", "un-conc") {
+		if(defined($read_fns{$i})) {
+            my ($vol, $base_spec_dir, $base_fname) = File::Spec->splitpath($read_fns{$i});
+            if (-d $read_fns{$i}) {
+                $base_spec_dir = $read_fns{$i};
+                $base_fname = undef;
+            }
+			if($i =~ /-conc$/) {
+				# Open 2 output files, one for mate 1, one for mate 2
+				my ($fn1, $fn2);
+                if ($base_fname) {
+                    ($fn1, $fn2) = ($base_fname,$base_fname);
+                }
+                else {
+                    ($fn1, $fn2) = ($i.'-mate',$i.'-mate');
+                }
+				if($fn1 =~ /%/) {
+					$fn1 =~ s/%/1/g; $fn2 =~ s/%/2/g;
+				} elsif($fn1 =~ /\.[^.]*$/) {
+					$fn1 =~ s/\.([^.]*)$/.1.$1/;
+					$fn2 =~ s/\.([^.]*)$/.2.$1/;
+				} else {
+					$fn1 .= ".1";
+					$fn2 .= ".2";
+				}
+                $fn1 = File::Spec->catpath($vol,$base_spec_dir,$fn1);
+                $fn2 = File::Spec->catpath($vol,$base_spec_dir,$fn2);
+				$fn1 ne $fn2 || Fail("$fn1\n$fn2\n");
+				my ($redir1, $redir2) = (">$fn1", ">$fn2");
+				$redir1 = "| gzip -c $redir1"  if $read_compress{$i} eq "gzip";
+				$redir1 = "| bzip2 -c $redir1" if $read_compress{$i} eq "bzip2";
+				$redir2 = "| gzip -c $redir2"  if $read_compress{$i} eq "gzip";
+				$redir2 = "| bzip2 -c $redir2" if $read_compress{$i} eq "bzip2";
+				open($read_fhs{$i}{1}, $redir1) || Fail("Could not open --$i mate-1 output file '$fn1'\n");
+				open($read_fhs{$i}{2}, $redir2) || Fail("Could not open --$i mate-2 output file '$fn2'\n");
+				push @fhs_to_close, $read_fhs{$i}{1};
+				push @fhs_to_close, $read_fhs{$i}{2};
+			} else {
+			    my $redir = ">".File::Spec->catpath($vol,$base_spec_dir,$i."-seqs");
+			    if ($base_fname) {
+				    $redir = ">$read_fns{$i}";
+			    }
+				$redir = "| gzip -c $redir"  if $read_compress{$i} eq "gzip";
+				$redir = "| bzip2 -c $redir" if $read_compress{$i} eq "bzip2";
+				open($read_fhs{$i}, $redir) || Fail("Could not open --$i output file '$read_fns{$i}'\n");
+				push @fhs_to_close, $read_fhs{$i};
+			}
+		}
+	}
+	while(<BT>) {
+		chomp;
+		my $filt = 0;
+		unless(substr($_, 0, 1) eq "@") {
+			# If we are supposed to output certain reads to files...
+			my $tab1_i = index($_, "\t") + 1;
+			my $tab2_i = index($_, "\t", $tab1_i);
+			my $fl = substr($_, $tab1_i, $tab2_i - $tab1_i);
+			my $unal = ($fl & 4) != 0;
+			$filt = 1 if $no_unal && $unal;
+			if($passthru) {
+				if(scalar(keys %read_fhs) == 0) {
+					# Next line is read with some whitespace escaped
+					my $l = <BT>;
+				} else {
+					my $mate1 = (($fl &  64) != 0);
+					my $mate2 = (($fl & 128) != 0);
+					my $unp = !$mate1 && !$mate2;
+					my $pair = !$unp;
+					# Next line is read with some whitespace escaped
+					my $l = <BT>;
+					chomp($l);
+					$l =~ s/%(..)/chr(hex($1))/eg;
+					if((defined($read_fhs{un}) || defined($read_fhs{al})) && $unp) {
+						if($unal) {
+							# Failed to align
+							print {$read_fhs{un}} $l if defined($read_fhs{un});
+						} else {
+							# Aligned
+							print {$read_fhs{al}} $l if defined($read_fhs{al});
+						}
+					}
+					if((defined($read_fhs{"un-conc"}) || defined($read_fhs{"al-conc"})) && $pair) {
+						my $conc  = (($fl &   2) != 0);
+						if     ($conc && $mate1) {
+							print {$read_fhs{"al-conc"}{1}} $l if defined($read_fhs{"al-conc"});
+						} elsif($conc && $mate2) {
+							print {$read_fhs{"al-conc"}{2}} $l if defined($read_fhs{"al-conc"});
+						} elsif(!$conc && $mate1) {
+							print {$read_fhs{"un-conc"}{1}} $l if defined($read_fhs{"un-conc"});
+						} elsif(!$conc && $mate2) {
+							print {$read_fhs{"un-conc"}{2}} $l if defined($read_fhs{"un-conc"});
+						}
+					}
+				}
+			}
+		}
+		print {$ofh} "$_\n" if !$filt;
+	}
+	for my $k (@fhs_to_close) { close($k); }
+	close($ofh);
+	close(BT);
+	$ret = $?;
+} else {
+	$ret = system($cmd);
+}
+if(!$keep) { for(@to_delete) { unlink($_); } }
+
+if ($ret == -1) {
+    Error("Failed to execute centrifuge-class: $!\n");
+	exit 1;
+} elsif ($ret & 127) {
+	my $signm = "(unknown)";
+	$signm = $signame[$ret & 127] if defined($signame[$ret & 127]);
+	my $ad = "";
+	$ad = "(core dumped)" if (($ret & 128) != 0);
+    Error("centrifuge-class died with signal %d (%s) $ad\n", ($ret & 127), $signm);
+	exit 1;
+} elsif($ret != 0) {
+	Error("centrifuge-class exited with value %d\n", ($ret >> 8));
+}
+exit ($ret >> 8);
diff --git a/centrifuge-BuildSharedSequence.pl b/centrifuge-BuildSharedSequence.pl
new file mode 100755
index 0000000..6456c50
--- /dev/null
+++ b/centrifuge-BuildSharedSequence.pl
@@ -0,0 +1,526 @@
+#!/bin/perl
+
+use strict ;
+use Getopt::Long;
+use File::Basename;
+
+my $usage = "perl ".basename($0)." file_list [-prefix tmp -kmerSize 53 -kmerPortion 0.01 -nucmerIdy 99 -overlap 250 [-fragment] ]" ;
+
+my @fileNames ; # Assume the genes for each subspecies are concatenated.
+my @used ;
+my $i ;
+my $j ;
+my $k ;
+my %globalKmer ;
+my $kmerSize = 53 ;
+my $useKmerPortion = 0.01 ;
+my %localKmer ;
+my $id = 0 ;
+my $kmer ;
+my %chroms ;
+my %shared ;
+my $id = "" ;
+my $seq = "" ;
+my $index ;
+my $prefix = "tmp";
+my %sharedKmerCnt ;
+my $nucmerIdy = 99 ;
+my $overlap = 250 ;
+my $fragment = 0 ;
+my $jellyfish = "jellyfish";
+
+#print `jellyfish --version`;
+
+GetOptions (
+	"prefix=s" => \$prefix,
+	"kmerSize=s" => \$kmerSize,
+	"kmerPortion=s" => \$useKmerPortion,
+	"nucmerIdy=s" => \$nucmerIdy,
+	"overlap=s" => \$overlap,
+	"fragment" => \$fragment,
+	"jellyfish=s" => \$jellyfish)
+or die("Error in command line arguments. \n\n$usage");
+
+die "$usage\n" if ( scalar( @ARGV ) == 0 ) ;
+open FP1, $ARGV[0] ; 
+
+# Create the temporary files, while making sure the header id is unique within each file
+my $listSize = 0 ;
+while ( <FP1> )
+{
+	chomp ;
+	open FP2, $_ ;
+	my $fileName = $prefix."_".$listSize.".fa" ;
+	open FPtmp, ">$fileName" ;
+	my $chromCnt = 0 ;
+	while ( <FP2> )
+	{
+		if ( /^>/ )
+		{
+			s/\s/\|$chromCnt / ;
+			print FPtmp ;
+			++$chromCnt ;
+		}
+		else
+		{
+			print FPtmp ;
+		}
+	}
+	++$listSize ;
+}
+
+# Read in the file names
+#while ( <FP1> )
+#{
+#	chomp ;
+#	push @fileNames, $_ ;
+#	push @used, 0 ;
+#	++$index ;
+#}
+#close( FP1 ) ;
+for ( my $i = 0 ; $i < $listSize ; ++$i )
+{
+	my $fileName = $prefix."_".$i.".fa" ;
+	push @fileNames, $fileName ;
+	push @used, 1 ;
+	++$index ;
+}
+
+
+
+# Count and select kmers
+print "Find the kmers for testing\n" ;
+system_call("$jellyfish count -o tmp_$prefix.jf -m $kmerSize -s 5000000 -C -t 8 @fileNames") ;
+system_call("$jellyfish dump tmp_$prefix.jf > tmp_$prefix.jf_dump") ;
+
+srand( 17 ) ;
+open FP1, "tmp_$prefix.jf_dump" ;
+open FP2, ">tmp_$prefix.testingKmer" ;
+while ( <FP1> )
+{
+	my $line = $_ ;
+	$kmer = <FP1> ;
+	#chomp $kmer ;
+	
+	#++$testingKmer{ $kmer} if ( rand() < $useKmerPortion ) ;
+	print FP2 "$line$kmer" if ( rand() < $useKmerPortion ) ;
+		
+}
+close FP1 ;
+close FP2 ;
+
+print "Get the kmer profile for each input file\n" ;
+for ( $i = 0 ; $i < scalar( @fileNames )  ; ++$i )
+{
+	my $fileName = $fileNames[ $i ] ;
+	system_call("$jellyfish count --if tmp_$prefix.testingKmer -o tmp_$prefix.jf -m $kmerSize -s 5000000 -C -t 8 $fileName") ;
+	system_call("$jellyfish dump tmp_$prefix.jf > tmp_$prefix.jf_dump") ;
+	open FP1, "tmp_$prefix.jf_dump";
+
+	while ( <FP1> )
+	{
+		chomp ;
+		my $cnt = substr( $_, 1 )  ;
+		if ( $cnt eq "0" ) 
+		{
+			$kmer = <FP1> ;
+			next ;
+		}
+		#print "$cnt\n" ;
+		$kmer = <FP1> ;
+		chomp $kmer ;
+		$localKmer{$i}->{$kmer} = 1 ; 
+	}
+	close FP1 ;
+}
+
+# Get the genome sizes
+# Sum the sequence-line lengths (minus 1 for the newline) of a FASTA file,
+# skipping '>' header lines.
+# NOTE(review): assumes every line ends in exactly one newline character; a
+# final line without a trailing newline would be undercounted — confirm.
+sub GetGenomeSize
+{
+	open FPfa, $_[0] ;
+	my $size = 0 ;
+	while ( <FPfa> )
+	{
+		next if ( /^>/ ) ;
+		$size += length( $_ ) - 1 ;
+	}
+	close FPfa ;
+	return $size ;
+}
+my $longestGenome = 0 ;
+for ( $i = 0 ;  $i < scalar( @fileNames ) ; ++$i )
+{
+	my $size = GetGenomeSize( $fileNames[$i] ) ;
+	if ( $size > $longestGenome )
+	{
+		$longestGenome = $size ;
+	}
+}
+
+#for ( $i = 0 ; $i < scalar( @fileNames ) ; ++$i )
+print "Begin merge files\n" ;
+my $maxSharedKmerCnt = -1 ;
+# Greedy agglomerative merging: repeatedly select the pair of genomes
+# (original inputs, or earlier merge products keyed >= scalar(@fileNames))
+# sharing the most sampled k-mers, align them with nucmer, and emit a
+# merged "$prefix_$index.fa" that keeps each shared region only once.
+while ( 1 ) 
+{
+	# Find the suitable files
+	my @maxPair ;
+	my $max = 0 ;
+	print "Selecting two genomes to merge.\n" ;
+	foreach $i (keys %localKmer )
+	{
+		foreach $j ( keys %localKmer )
+		{
+			next if ( $i <= $j ) ;
+
+			my $cnt = 0 ;
+
+			# Pairwise shared-kmer counts are cached in %sharedKmerCnt so
+			# only pairs involving a newly created genome get recomputed.
+			if ( defined $sharedKmerCnt{ "$i $j" } )
+			{
+				$cnt = $sharedKmerCnt{ "$i $j" }
+			}
+			else
+			{
+				foreach $kmer ( keys %{$localKmer{ $i } } )
+				{
+					#print $kmer, "\n" ;
+					if ( defined $localKmer{ $j }->{ $kmer} ) 
+					{
+						++$cnt ;				
+					}	
+				}
+				$sharedKmerCnt{ "$i $j" } = $cnt ;
+			}
+
+			if ( $cnt > $max )
+			{
+				$max = $cnt ;
+				$maxPair[0] = $i ;
+				$maxPair[1] = $j ;
+			}
+		}
+	}
+
+	# Stop once overlap is negligible: below 1% of the very first (best)
+	# shared-kmer count seen, or no overlap at all.
+	$maxSharedKmerCnt = $max if ( $maxSharedKmerCnt == -1 ) ;
+	last if ( $max == 0 || $max < $maxSharedKmerCnt * 0.01 ) ;
+
+	my @commonRegion ;
+	my $fileNameA ;
+	my $fileNameB ;
+	$i = $maxPair[0] ;
+	$j = $maxPair[1] ;
+	# Keys below scalar(@fileNames) are original inputs; larger keys are
+	# merge products named "$prefix_$key.fa".
+	if ( $i < scalar( @fileNames ) )
+	{
+		$fileNameA = $fileNames[ $i ] ;
+	}
+	else
+	{
+		$fileNameA = $prefix."_".$i.".fa" ;
+	}
+	if ( $j < scalar( @fileNames ) )
+	{
+		$fileNameB = $fileNames[ $j ] ;
+	}
+	else
+	{
+		$fileNameB = $prefix."_".$j.".fa" ;
+	}
+
+	# Do not merge genomes that have shrunk far below the largest input.
+	if ( GetGenomeSize( $fileNameA ) < 0.01 * $longestGenome || GetGenomeSize( $fileNameB ) < 0.01 * $longestGenome )
+	{
+		last ;
+	}
+
+	# nucmer parameters: tighter gap/break settings when both sides are
+	# already merge products.
+	my $nucmerC = 3 * $overlap ;
+	my $nucmerG = 10 ;
+	my $nucmerB = 10 ;
+	if ( $i >= scalar( @fileNames ) && $j >= scalar( @fileNames ) )
+	{
+		$nucmerG = 5 ;
+		$nucmerB = 5 ;
+	}
+	print "nucmer --maxmatch --coords -l $kmerSize -g $nucmerG -b $nucmerB -c $nucmerC -p nucmer_$prefix $fileNameA $fileNameB\n" ; 
+	my $nucRet = system("nucmer --maxmatch --coords -l $kmerSize -g $nucmerG -b $nucmerB -c $nucmerC -p nucmer_$prefix $fileNameA $fileNameB") ; # if the call to nucmer failed, we just not compress at all. 
+
+	open FPCoords, "nucmer_$prefix.coords" ;
+	my $line ;
+	# Skip the five header lines of the .coords report.
+	$line = <FPCoords> ;
+	$line = <FPCoords> ;
+	$line = <FPCoords> ;
+	$line = <FPCoords> ;
+	$line = <FPCoords> ;
+	
+	#1     5195  |        1     5195  |     5195     5195  |    99.98  | gi|385223048|ref|NC_017374.1|	gi|385230889|ref|NC_017381.1|
+	print "Merging $fileNameA $fileNameB\n" ;
+
+	# Collect sufficiently long, sufficiently similar alignments; each
+	# @commonRegion entry is (idA, startA, endA, idB, startB, endB), with
+	# $overlap trimmed from both ends so junctions keep unique context.
+	my $cnt = 0 ;	
+	while ( <FPCoords> )
+	{
+		chomp ;
+		$line = $_ ;
+		my @cols = split /\s+/, $line ;
+
+		shift @cols if ( $cols[0] eq "" ) ;
+
+		next if ( $cols[6] <= 3 * $overlap || $cols[9] < $nucmerIdy ) ;
+		
+		++$cnt ;
+		my $ind = scalar( @commonRegion ) ;
+		push @{ $commonRegion[$ind] }, ( $cols[11], $cols[0] + $overlap, $cols[1] - $overlap )  ;
+		# The B-side interval may be on the reverse strand; store it
+		# with start <= end either way.
+		if ( $cols[3] < $cols[4] )
+		{
+			push @{ $commonRegion[ $ind ] }, ( $cols[12], $cols[3] + $overlap, $cols[4] - $overlap ) ;
+		}
+		else
+		{
+			push @{ $commonRegion[ $ind ] }, ( $cols[12], $cols[4] + $overlap, $cols[3] - $overlap ) ;
+		}
+	}
+	# NOTE(review): when $cnt == 0 we break before close(FPCoords); the
+	# handle leaks, though the script ends shortly after — confirm.
+	last if ( $cnt == 0 ) ;
+	close FPCoords ;
+
+	my $outputSeq = "" ;
+	my $outputHeader = ">${prefix}_${index}" ;
+	# Use fileNameA to represent the shared sequences
+	if ( $fragment == 0 )
+	{
+		# Just a big chunk.
+		open FP1, $fileNameA ;
+		while ( <FP1> )
+		{
+			chomp ;
+			next if ( /^>/ ) ;
+			$outputSeq .= $_ ;
+			#print "$_\n$outputSeq\n" ;
+		}
+		close FP1 ;
+	}
+	else
+	{
+		open FP1, $fileNameA ;
+		$id = "" ;
+		$seq = "" ;
+		undef %chroms ;
+		undef %shared ;
+		while ( <FP1> )
+		{
+			chomp ;
+			if ( /^>/ )
+			{
+				$chroms{ $id } = $seq if ( $id ne "" ) ;
+				$id = ( split /\s+/, substr( $_, 1 ) )[0] ;
+				#print $id, "\n" ;
+				$seq = "" ;
+				# NOTE(review): '@#{ $shared{ $id } }' looks like a typo for
+				# '$#{ $shared{ $id } }' (pre-extending the per-position flag
+				# array), and $seq was just reset so the length here is always
+				# 0 — confirm intent.
+				@#{ $shared{ $id } } = length( $seq ) + 1 ; 
+			}
+			else
+			{
+				$seq .= $_ ;
+			}
+		}
+		$chroms{ $id } = $seq if ( $id ne "" ) ;
+		# NOTE(review): same suspect '@#{ ... }' construct as above.
+		@#{ $shared{ $id } } = length( $seq ) + 1 ; 
+		close FP1 ;
+
+		# Mark every position of A covered by a shared region (1-based
+		# nucmer coordinates shifted to 0-based offsets).
+		for ( $i = 0 ; $i < scalar( @commonRegion) ; ++$i )
+		{
+			#print "hi $i $commonRegion[$i]->[1] $commonRegion[$i]->[2]\n" ;	
+			my $tmpArray = \@{ $shared{ $commonRegion[$i]->[0] } } ;
+			# Restore the trimmed $overlap when the region touches a
+			# chromosome boundary.
+			$commonRegion[$i]->[1] -= $overlap if ( $commonRegion[$i]->[1] == $overlap + 1 ) ;
+			$commonRegion[$i]->[2] += $overlap if ( $commonRegion[$i]->[2] + $overlap == length( $chroms{ $commonRegion[$i]->[0] } ) ) ;
+			for ( $j = $commonRegion[$i]->[1] - 1 ; $j < $commonRegion[$i]->[2] ; ++$j ) # Shift the coordinates
+			{
+				$tmpArray->[$j] = 1 ;
+			}
+		}
+
+		# Print the information of genome A, including the shared part
+		my $fileName = $prefix."_".$index.".fa" ;
+		open fpOut, ">$fileName" ;
+		foreach $i (keys %chroms )
+		{
+			my $tmpArray = \@{ $shared{ $i } } ;
+			my $len = length( $chroms{ $i } ) ;
+			my $header = ( split /\|Range:/, $i )[0] ;
+			my $origStart = ( split /-/, ( ( split /\|Range:/, $i )[1] ) )[0] ;
+			# Walk the flag array and emit maximal runs of shared (1) and
+			# non-shared (0) positions as separate FASTA records.
+			for ( $j = 0 ; $j < $len ;  )
+			{
+				if ( $tmpArray->[$j] == 1 )
+				{
+					my $start = $j ;
+					my $end ;
+					for ( $end = $j + 1 ; $end < $len && $tmpArray->[$end] == 1 ; ++$end )
+					{
+						;
+					}
+					--$end ;
+					my $rangeStart = $origStart + $start ;
+					my $rangeEnd = $origStart + $end ;
+					print fpOut ">$header|Range:$rangeStart-$rangeEnd shared\n" ;
+					print fpOut substr( $chroms{$i}, $start, $end - $start + 1 ), "\n" ;
+
+					$j = $end + 1 ;
+				}
+				else
+				{
+					my $start = $j ;
+					my $end ;
+
+					for ( $end = $j + 1 ; $end < $len && $tmpArray->[$end] == 0 ; ++$end )
+					{
+						;
+					}
+					--$end ;
+					my $rangeStart = $origStart + $start ;
+					my $rangeEnd = $origStart + $end ;
+
+					print fpOut ">$header|Range:$rangeStart-$rangeEnd non-shared\n" ;
+					print fpOut substr( $chroms{$i}, $start, $end - $start + 1 ), "\n" ;
+
+					$j = $end + 1 ;
+				}
+			}
+		}
+	} # end if fragment. There might be bugs in the fragment mode
+
+	# Print the sequence from genome B, only including the non-shared part
+	open FP1, $fileNameB ;
+	$id = "" ;
+	$seq = "" ;
+	undef %chroms ;
+	undef %shared ;
+	while ( <FP1> )
+	{
+		chomp ;
+		if ( /^>/ )
+		{
+			$chroms{ $id } = $seq if ( $id ne "" ) ;
+			$id = ( split /\s+/, substr( $_, 1 ) )[0] ;
+			$seq = "" ;
+			# NOTE(review): same suspect '@#{ ... }' construct as in the
+			# genome-A branch above — likely meant '$#{ ... }'.
+			@#{ $shared{ $id } } = length( $seq ) + 1 ; 
+		}
+		else
+		{
+			$seq .= $_ ;
+		}
+	}
+	$chroms{ $id } = $seq if ( $id ne "" ) ;
+	@#{ $shared{ $id } } = length( $seq ) + 1 ; 
+	close FP1 ;
+
+	# Mark B's shared positions (columns 3..5 of each common region).
+	for ( $i = 0 ; $i < scalar( @commonRegion) ; ++$i )
+	{
+		my $tmpArray = \@{ $shared{ $commonRegion[$i]->[3] } } ;
+		$commonRegion[$i]->[4] -= $overlap if ( $commonRegion[$i]->[4] == $overlap + 1 ) ;
+		$commonRegion[$i]->[5] += $overlap if ( $commonRegion[$i]->[5] + $overlap == length( $chroms{ $commonRegion[$i]->[3] } ) ) ;
+		for ( $j = $commonRegion[$i]->[4] - 1 ; $j < $commonRegion[$i]->[5] ; ++$j )	
+		{
+			$tmpArray->[$j] = 1 ;
+		}
+	}
+
+	# Emit only B's non-shared runs (longer than $overlap); shared runs
+	# are already represented by genome A.
+	foreach $i (keys %chroms )
+	{
+		my $tmpArray = \@{ $shared{ $i } } ;
+		my $len = length( $chroms{ $i } ) ;
+		my $header = ( split /\|Range:/, $i )[0] ;
+		my $origStart = ( split /-/, ( ( split /\|Range:/, $i )[1] ) )[0] ;
+		for ( $j = 0 ; $j < $len ;  )
+		{
+			if ( $tmpArray->[$j] == 1 ) 
+			{
+				++$j ;
+				next ;
+			}
+
+			my $start = $j ;
+			my $end ;
+			for ( $end = $j + 1 ; $end < $len && $tmpArray->[$end] == 0 ; ++$end )
+			{
+				;
+			}
+			--$end ;
+			$j = $end + 1 ;
+			
+			next if ( $end - $start < $overlap ) ;
+
+			my $rangeStart = $origStart + $start ;
+			my $rangeEnd = $origStart + $end ;
+
+			if ( $fragment == 0 )
+			{
+				#print length( $outputSeq ), "\n" ;
+				$outputSeq .= substr( $chroms{$i}, $start, $end - $start + 1 ) ; 	
+				#print length( $outputSeq ), "\n" ;
+			}
+			else
+			{
+				my $fileName = $prefix."_".$index.".fa" ;
+				#open fpOut, ">>$fileName" ;
+				print fpOut ">$header|Range:$rangeStart-$rangeEnd non-shared\n" ;
+				print fpOut substr( $chroms{$i}, $start, $end - $start + 1 ), "\n" ;
+				#close fpOut ;
+			}
+		}
+	}
+
+	# The two merged genomes are retired; the product replaces them under
+	# the new key $index.
+	delete $localKmer{ $maxPair[0] } ;
+	delete $localKmer{ $maxPair[1] } ;
+
+	#print defined( $localKmer{ $maxPair[0] }) ;
+	unlink glob("$fileNameA") ; #if ( $maxPair[0] >= scalar( @fileNames ) ) ;
+	unlink glob("$fileNameB") ; #if ( $maxPair[1] >= scalar( @fileNames ) ) ;
+
+	# Count the kmer for the new genome
+	my $fileName = $prefix."_".$index.".fa" ;
+	if ( $fragment == 0 )
+	{
+		open fpOut, ">$fileName" ;
+		print fpOut "$outputHeader\n$outputSeq\n" ;
+	}
+	close fpOut ;
+
+	# Re-profile the sampled k-mers on the merged genome so the next
+	# iteration can compare it against the remaining candidates.
+	system_call("$jellyfish count --if tmp_$prefix.testingKmer -o tmp_$prefix.jf -m $kmerSize -s 5000000 -C -t 4 $fileName") ;
+	system_call("$jellyfish dump tmp_$prefix.jf > tmp_$prefix.jf_dump") ;
+	open FP1, "tmp_$prefix.jf_dump";
+
+	while ( <FP1> )
+	{
+		chomp ;
+		my $cnt = substr( $_, 1 )  ;
+		if ( $cnt eq "0" )
+		{
+			$kmer = <FP1> ;
+			next ;
+		}
+		$kmer = <FP1> ;
+		chomp $kmer ;
+		$localKmer{$index}->{$kmer} = 1 ;
+	}
+	close FP1 ;
+	
+	++$index ;
+} # while 1
+
+#foreach $i (keys %localKmer )
+#{
+#	if ( $i < scalar( @fileNames ) ) 
+#	{
+#		my $path = $fileNames[ $i ] ;
+#		my $fileName = $prefix."_".$i.".fa" ;
+#		system_call("cp $path $fileName") ;
+#	}
+#}
+
+# clean up
+# glob() splits on whitespace, so the first call removes all three
+# temporary jellyfish files in one pass.
+unlink glob("tmp_$prefix.jf tmp_$prefix.jf_dump tmp_$prefix.testingKmer") ;
+unlink glob("nucmer_$prefix*") ;
+print "Finish.\n" ;
+
# Run an external command, echoing it to STDERR first; dies when the
# command exits non-zero so pipeline failures are never silent.
# (Restores "@_", which the mail archiver mangled into ", at _".)
sub system_call {
	print STDERR "SYSTEM CALL: " . join( " ", @_ ) . "\n" ;
	system( @_ ) == 0
		or die "system @_ failed: $?" ;
	print STDERR " finished\n" ;
}
+
+
diff --git a/centrifuge-RemoveEmptySequence.pl b/centrifuge-RemoveEmptySequence.pl
new file mode 100644
index 0000000..5f55d4b
--- /dev/null
+++ b/centrifuge-RemoveEmptySequence.pl
@@ -0,0 +1,28 @@
+#!/bin/perl
+
+# remove the headers with empty sequences. possible introduced by dustmask
+
+use strict ;
+
+# die "usage: a.pl < in.fa >out.fa"
+
+# Two-slot look-behind buffer: $lines[$tag] always holds the previously
+# read line, $lines[1-$tag] the line just read.  A header immediately
+# followed by another header had an empty sequence and is dropped.
+my $tag ;
+my @lines ;
+
+$lines[0] = <> ;
+$tag = 0 ;
+while ( <> )
+{
+	$lines[1-$tag] = $_ ;
+	if ( /^>/ && $lines[$tag] =~ /^>/ ) 
+	{
+		# Previous header has no sequence: flip slots without printing it.
+		$tag = 1 - $tag ;
+		next ;
+	}
+	print $lines[$tag] ;
+	$tag = 1- $tag ;
+}
+# Flush the final buffered line unless it is a trailing empty header.
+if ( !( $lines[$tag] =~ /^>/ ) )
+{
+	print $lines[$tag] ;
+}
diff --git a/centrifuge-RemoveN.pl b/centrifuge-RemoveN.pl
new file mode 100644
index 0000000..323110d
--- /dev/null
+++ b/centrifuge-RemoveN.pl
@@ -0,0 +1,57 @@
#!/bin/perl

# Strip every N/n character from a FASTA file and re-wrap the remaining
# sequence into fixed-width lines; header lines pass through unchanged.
# (The original first line read "#/bin/perl" — a broken shebang.)
#
# Usage: perl centrifuge-RemoveN.pl xxx.fa > output.fa

use strict ;
use warnings ;

die "usage: a.pl xxx.fa > output.fa" if ( @ARGV == 0 ) ;

my $LINE_WIDTH = 80 ;      # width of emitted sequence lines
my $BUFFER_SIZE = 100000 ; # flush threshold, keeps memory bounded
open FP1, $ARGV[0] or die "Could not open $ARGV[0]: $!" ;
my $buffer = "" ;

# Emit the buffer as $LINE_WIDTH-wide lines.  With $keepPartial true the
# trailing partial line is returned (carried over) instead of printed;
# otherwise everything is printed and "" is returned.
sub PrintBuffer
{
	my ( $buf, $keepPartial ) = @_ ;
	my $len = length( $buf ) ;
	my $i = 0 ;
	for ( $i = 0 ; $i + $LINE_WIDTH - 1 < $len ; $i += $LINE_WIDTH )
	{
		print substr( $buf, $i, $LINE_WIDTH ), "\n" ;
	}
	return substr( $buf, $i ) if ( $keepPartial ) ;
	print substr( $buf, $i ), "\n" if ( $i < $len ) ;
	return "" ;
}

while ( <FP1> )
{
	if ( /^>/ )
	{
		my $line = $_ ;
		# Flush the previous record's sequence before the new header.
		$buffer = PrintBuffer( $buffer, 0 ) if ( $buffer ne "" ) ;
		print $line ;
	}
	else
	{
		chomp ;
		tr/nN//d ; # delete every N/n; the survivors are re-wrapped
		$buffer .= $_ ;
		# Emit complete lines once the buffer grows large, keep the rest.
		$buffer = PrintBuffer( $buffer, 1 ) if ( length( $buffer ) > $BUFFER_SIZE ) ;
	}
}
# Flush whatever sequence remains for the final record.
$buffer = PrintBuffer( $buffer, 0 ) if ( $buffer ne "" ) ;
close FP1 ;
diff --git a/centrifuge-build b/centrifuge-build
new file mode 100755
index 0000000..a36f40d
--- /dev/null
+++ b/centrifuge-build
@@ -0,0 +1,79 @@
+#!/usr/bin/env python
+
+"""
+ Copyright 2014, Daehwan Kim <infphilo@gmail.com>
+
+ This file is part of Centrifuge, which is copied and modified from bowtie2 in the Bowtie2 package.
+
+ Centrifuge is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ Centrifuge is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with Centrifuge.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+
+import os
+import sys
+import inspect
+import logging
+
+
def build_args():
    """Split wrapper-only flags from the arguments passed through.

    Returns an ``(options, argv)`` tuple: ``options`` maps each
    recognized wrapper flag (``--debug``, ``--verbose``) to ``""``, and
    ``argv`` is a copy of ``sys.argv`` with those flags removed.
    """
    wrapper_flags = ('--debug', '--verbose')
    parsed_args = {}
    argv = []
    for arg in sys.argv:
        if arg in wrapper_flags:
            parsed_args[arg] = ""
        else:
            argv.append(arg)
    return parsed_args, argv
+
+
def main():
    """Locate centrifuge-build-bin next to this wrapper and exec it.

    Wrapper flags --verbose/--debug are consumed here; everything else
    is forwarded, with '--wrapper basic-0' inserted after argv[0].
    Never returns on success (os.execv replaces the process).
    """
    logging.basicConfig(level=logging.ERROR,
                        format='%(levelname)s: %(message)s'
                        )
    build_bin_name      = "centrifuge-build-bin"
    build_bin_s         = "centrifuge-build-bin"
    curr_script         = os.path.realpath(inspect.getsourcefile(main))
    ex_path             = os.path.dirname(curr_script)
    build_bin_spec      = os.path.join(ex_path, build_bin_s)

    options, argv = build_args()

    if '--verbose' in options:
        logging.getLogger().setLevel(logging.INFO)

    if '--debug' in options:
        # Bug fix: the original updated the undefined name 'build_bin_l'
        # here, raising NameError whenever --debug was used.
        build_bin_spec += '-debug'
        build_bin_name += '-debug'

    argv[0] = build_bin_name
    argv.insert(1, 'basic-0')
    argv.insert(1, '--wrapper')
    logging.info('Command: %s %s' % (build_bin_spec, ' '.join(argv[1:])))
    os.execv(build_bin_spec, argv)

if __name__ == "__main__":
    main()
diff --git a/centrifuge-compress.pl b/centrifuge-compress.pl
new file mode 100755
index 0000000..6804df1
--- /dev/null
+++ b/centrifuge-compress.pl
@@ -0,0 +1,575 @@
+#!/usr/bin/perl
+
+# Read and merge the sequence for the chosen level
+
+use strict ;
+use warnings ;
+
+use threads ;
+use threads::shared ;
+use FindBin qw($Bin);
+use File::Basename;
+use File::Find;
+use Getopt::Long;
+use Cwd;
+use Cwd 'cwd' ;
+use Cwd 'abs_path' ;
+
+# Directory of this script (for locating helper scripts) vs. the
+# caller's working directory (File::Find changes cwd while scanning).
+my $CWD = dirname( abs_path( $0 ) ) ;
+my $PWD = abs_path( "./" ) ;
+
+my $usage = "USAGE: perl ".basename($0)." path_to_download_files path_to_taxnonomy [-map header_to_taxid_map -o compressed -noCompress -t 1 -maxG 50000000 -noDustmasker]\n" ;
+
+# Command-line option defaults (see $usage above).
+my $level = "species" ;
+my $output = "compressed" ;
+my $bssPath = $CWD ; # take path of binary as script directory
+my $numOfThreads = 1 ;
+my $noCompress = 0 ;
+my $noDustmasker = 0 ;
+my $verbose = 0;
+my $maxGenomeSizeForCompression = 50000000 ;
+my $mapFile = "" ;
+
+GetOptions ("level|l=s" => \$level,
+			"output|o=s" => \$output,
+			"bss=s" => \$bssPath,
+			"threads|t=i" => \$numOfThreads,
+			"maxG=i" => \$maxGenomeSizeForCompression, 
+			"map=s" => \$mapFile, 
+            "verbose|v" => \$verbose,
+			"noCompress" => \$noCompress,
+			"noDustmasker" => \$noDustmasker)
+or die("Error in command line arguments. \n\n$usage");
+
+die $usage unless @ARGV == 2;
+
+my $path = $ARGV[0] ;
+my $taxPath = $ARGV[1] ;
+
+my $i ;
+
+# Global lookup tables built while scanning the download directory.
+my %gidToFile ;
+my %gidUsed ;
+my %tidToGid ; # each tid can corresponds several gid
+my %gidToTid ;
+my %speciesList ; # hold the tid in the species
+my %species ; # hold the species tid
+my %genus ; # hold the genus tid
+my %speciesToGenus ;
+# ':shared' variables are visible to the worker threads; the *Lock
+# scalars exist only to be lock()ed around hash updates.
+my %fileUsed : shared ;
+my $fileUsedLock : shared ;
+my %taxTree ;
+
+my @speciesListKey ;
+my $speciesUsed : shared ;
+my $debug: shared ;
+my %speciesIdToName : shared ;
+
+my %idToTaxId : shared ; 
+my %newIdToTaxId : shared ;
+my %idToGenomeSize : shared ;
+my $idMapLock : shared ;
+
+my $step = 1 ;
+
+# Backtick output is empty when dustmasker is not on PATH.
+if ( `which dustmasker` eq "" ) 
+{
+	print STDERR "Could not find dustmasker. And will turn on -noDustmasker option.\n" ;
+	$noDustmasker = 1 ;
+}
+
+#Extract the gid we met in the file
+if ( $mapFile ne "" )
+{
+	print STDERR "Step $step: Reading in the provided mapping list of ids to taxionomy ids.\n";
+	++$step ;
+
+	# Each map line is "<header id> <taxid>".
+	open FP1, $mapFile ;
+	while ( <FP1> )
+	{
+		chomp ;
+		my @cols = split ;
+		$idToTaxId{ $cols[0] } = $cols[1] ;
+
+		if ( $noCompress == 1 )
+		{
+			$newIdToTaxId{ $cols[0] } = $cols[1] ;
+		}
+	}
+}
+
+print STDERR "Step $step: Collecting all .fna files in $path and getting gids\n";
+++$step ;
+if ( $noCompress == 1 )
+{
+	`rm tmp_output.fa` ;
+}
+
+# Walk the download tree; for every FASTA file record how its first
+# header maps to a gid or taxid, so later steps can group files by
+# species.  Note File::Find chdirs into each directory as it scans.
+find ( { wanted=>sub {
+    return unless -f  ;        # Must be a file
+    return unless -s;        # Must be non-zero
+    if ( !( /\.f[nf]?a$/ || /\.ffn$/ || /\.fasta$/ ) )
+    {
+    	return ;
+    }
+
+    my $fullfile = $File::Find::name; ## file with full path, but the CWD is actually the file's path
+    my $file = $_; ## file name
+	open FP2, $file or die "Error opening $file: $!";
+	my $head = <FP2> ;
+	close FP2 ;
+
+	chomp $head ;
+	# First whitespace-separated word of the header, without the '>'.
+	my $headId = substr( ( split /\s+/, $head )[0], 1 ) ;  
+
+	if ( $noCompress == 1 )
+	{
+		# it seems the find will change the working directory
+		system_call( "cat $PWD/$fullfile >> $PWD/tmp_output.fa" ) ;
+		if ( defined $idToTaxId{ $headId } )
+		{
+			$newIdToTaxId{ $headId } = $idToTaxId{ $headId } ;
+		}
+		else
+		{
+			# -1 marks ids whose taxid must be resolved from
+			# gi_taxid_nucl.dmp later (or default to 1).
+			$newIdToTaxId{ $headId } = -1 ;
+			my @cols = split /\|/, $headId ;
+			my $subHeader = $cols[0]."\|".$cols[1] ; 
+			if ( $headId =~ /gi\|([0-9]+)?\|/ )
+			{
+				$newIdToTaxId{ $subHeader } = -1 ;
+				#print STDERR "$headId $subHeader\n" if $verbose ;
+			}
+			elsif ( $headId =~ /taxid\|([0-9]+)?[\|\s]/ )
+			{
+				$newIdToTaxId{ $headId } = $1 ;
+				$newIdToTaxId{ $subHeader } = $1 ;
+			}
+			elsif ( scalar( @cols ) > 2 )
+			{
+				$newIdToTaxId{ $subHeader } = -1 ;
+			}
+		}
+	}
+
+
+	# Register the file under a (possibly synthetic) gid so that the
+	# species grouping below can find it.
+	if ( defined $idToTaxId{ $headId } )
+	{
+		my $tid = $idToTaxId{ $headId } ;
+		my $dummyGid = "centrifuge_gid_".$fullfile."_$tid" ;
+		$gidUsed{ $dummyGid } = 1 ;
+		$gidToFile{ $dummyGid } = $fullfile ;
+		$fileUsed{ $fullfile } = 0 ;
+		push @{ $tidToGid{ $tid } }, $dummyGid ;	
+
+		print STDERR "tid=$tid $file\n" if $verbose;
+	}
+	elsif ( $head =~ /^>gi\|([0-9]+)?\|/ ) {
+		my $gid = $1 ;
+		print STDERR "gid=$gid $file\n" if $verbose;
+		if ( defined $gidUsed{ $gid } )
+		{
+			print "Repeated gid $gid\n" if $verbose ;
+			$fileUsed{ $fullfile } = 1 ;
+		}
+		else
+		{
+			$fileUsed{ $fullfile } = 0 ;
+			$gidToFile{ $gid } = $fullfile ;
+		}
+
+		$gidUsed{ $gid } = 1 ;
+	} elsif ( $head =~ /taxid\|([0-9]+)?[\|\s]/ ) {
+		my $tid = $1 ;
+		my $dummyGid = "centrifuge_gid_".$fullfile."_$1" ;
+		$gidUsed{ $dummyGid } = 1 ;
+		$gidToFile{ $dummyGid } = $fullfile ;
+		$fileUsed{ $fullfile } = 0 ;
+		push @{ $tidToGid{ $tid } }, $dummyGid ;	
+		
+		print STDERR "tid=$tid $file\n" if $verbose;
+	} else {
+		print STDERR "Excluding $fullfile: Wrong header.\n";
+	}
+
+}, follow=>1 }, $path );
+
+if ( $noCompress == 1 ) 
+{
+# Remove the Ns from the file
+	# Optionally dust-mask, then strip non-AGCT characters and drop
+	# headers whose sequence became empty.
+	if ( $noDustmasker == 1 )
+	{
+		system_call("perl $bssPath/centrifuge-RemoveN.pl tmp_output.fa | perl $bssPath/centrifuge-RemoveEmptySequence.pl > $output.fa") ;
+	}
+	else
+	{
+		system_call("perl $bssPath/centrifuge-RemoveN.pl tmp_output.fa > tmp_output_fmt.fa") ;
+		system_call( "dustmasker -infmt fasta -in tmp_output_fmt.fa -level 20 -outfmt fasta | sed '/^>/! s/[^AGCT]//g' > tmp_output_dustmasker.fa" ) ;
+		system_call("perl $bssPath/centrifuge-RemoveN.pl tmp_output_dustmasker.fa | perl $bssPath/centrifuge-RemoveEmptySequence.pl > $output.fa") ;
+	}
+}
+
+# Extract the tid that are associated with the gids
+print STDERR "Step $step: Extract the taxonomy ids that are associated with the gids\n";
+++$step ;
+# gi_taxid_nucl.dmp lines are "<gi> <taxid>"; only gids seen during the
+# directory scan are kept.
+open FP1, "$taxPath/gi_taxid_nucl.dmp" ;
+while ( <FP1> )
+{
+	chomp ;
+	my @cols = split ;		
+#print $cols[0], "\n" ;	
+	next if ( @ARGV < 2 ) ;
+	if ( defined( $gidUsed{ $cols[0] } ) )
+	{
+		push @{ $tidToGid{ $cols[1] } }, $cols[0] ;
+		$gidToTid{ $cols[0] } = $cols[1] ;
+		print STDERR "gid=", $cols[0], " tid=", $cols[1], " ", $gidToFile{ $cols[0] }, "\n" if $verbose;
+	}
+}
+close FP1 ;
+
+# In -noCompress mode only the header-to-taxid map is produced; ids that
+# could not be resolved fall back to taxid 1 (the root).
+if ( $noCompress == 1 )
+{
+	open FP1, ">tmp_$output.map" ;
+	foreach my $key (keys %newIdToTaxId )
+	{
+		if ( $newIdToTaxId{$key} != -1 )
+		{
+			print FP1 "$key\t", $newIdToTaxId{$key}, "\n" ; 
+		}
+		elsif ( $key =~ /gi\|([0-9]+)?/ )
+		{
+			#if ( defined $gidToTid{ $1 } )
+			#{
+			#	$newIdToTaxId{ $key } = $gidToTid{ $1 } ;
+			#}
+			my $taxId = 1 ;
+			if (defined $gidToTid{ $1 } )
+			{
+				$taxId = $gidToTid{ $1 } ;
+			}
+			print FP1 "$key\t", $taxId, "\n" ; 
+		}
+		else
+		{
+			print FP1 "$key\t1\n" ;
+		}
+	}
+	close FP1 ;
+	system_call( "sort tmp_$output.map | uniq > $output.map" ) ;
+	exit ;
+}
+
+# Organize the tree
+print STDERR "Step $step: Organizing the taxonomical tree\n";
+++$step ;
+# nodes.dmp columns (whitespace split): 0 = taxid, 2 = parent taxid,
+# 4 = rank.  %taxTree maps child -> parent for upward walks.
+open FP1, "$taxPath/nodes.dmp" or die "Couldn't open $taxPath/nodes.dmp: $!";
+while ( <FP1> ) {
+	chomp ;
+
+	my @cols = split ;
+#next if ( !defined $tidToGid{ $cols[0] } ) ;
+
+	my $tid = $cols[0] ;
+	my $parentTid = $cols[2] ;
+	my $rank = $cols[4] ;
+#print "subspecies: $tid $parentTid\n" ;
+#push @{ $species{ $parentTid } }, $tid ;
+#$tidToSpecies{ $tid } = $parentTid ;
+
+	$taxTree{ $cols[0] } = $cols[2] ;
+#print $cols[0], "=>", $cols[2], "\n" ;
+	if ( $rank eq "species" )	
+	{
+		$species{ $cols[0] } = 1 ;
+	}
+	elsif ( $rank eq "genus" )
+	{
+		$genus{ $cols[0] } = 1 ;
+	}
+}
+close FP1 ;
+
+# Put the sub-species taxonomy id into the corresponding species.
+print STDERR "Step $step: Putting the sub-species taxonomy id into the corresponding species\n";
+++$step ;
+# Walk each observed taxid up the tree until a species-rank ancestor is
+# found; group the taxid under that species in %speciesList.
+for $i ( keys %tidToGid )
+{
+	my $p = $i ;
+	my $found = 0 ;
+	while ( 1 )
+	{
+		last if ( $p <= 1 ) ;
+		if ( defined $species{ $p } ) 
+		{
+			$found = 1 ;
+			last ;
+		}
+		if ( defined $taxTree{ $p } )
+		{
+			$p = $taxTree{ $p } ;
+		}
+		else
+		{
+			last ;
+		}
+	}
+
+	if ( $found == 1 )
+	{
+		push @{ $speciesList{ $p } }, $i ;
+	}
+}
+
+print STDERR "Step $step: Reading the name of the species\n";
+++$step ;
+# names.dmp is tab/pipe delimited; column 0 = taxid, column 2 = name.
+# Only "scientific name" entries for species-rank taxids are kept.
+open FP1, "$taxPath/names.dmp" or die "Could not open $taxPath/names.dmp: $!";
+while ( <FP1> )
+{
+	next if (!/scientific name/ ) ;
+	my @cols = split /\t/ ;
+
+	if ( defined $species{ $cols[0] } )
+	{
+		$speciesIdToName{ $cols[0] } = $cols[2] ;
+	}
+}
+close FP1 ;
+#exit ;
+
# Return the total number of sequence characters in the FASTA file $_[0]
# (header lines excluded).  Returns 0 when the file cannot be opened.
# Fixes the original "length($_) - 1" accounting, which under-counted a
# final line lacking a trailing newline and over-counted CRLF input.
sub GetGenomeSize
{
	my ( $faFile ) = @_ ;
	open my $fh, "<", $faFile or return 0 ;
	my $size = 0 ;
	while ( my $line = <$fh> )
	{
		next if ( $line =~ /^>/ ) ;
		chomp $line ;
		$line =~ s/\r$// ; # tolerate CRLF line endings
		$size += length( $line ) ;
	}
	close $fh ;
	return $size ;
}
+
+# Compress one species.
+# Runs inside a worker thread: collects all files for species $_[0],
+# optionally compresses their shared sequence away, and appends one
+# merged FASTA record to this thread's "${output}_${tid}" file.
+sub solve
+{
+# Extracts the files
+#print "find $pwd -name *.fna > tmp.list\n" ;
+	my $tid = threads->tid() - 1 ;
+#system_call("find $pwd -name *.fna > tmp_$tid.list") ;
+
+#system_call("find $pwd -name *.fa >> tmp.list") ; # Just in case
+# Build the header
+	my $genusId ;
+	my $speciesId = $_[0] ;
+	my $speciesName ;
+	my $i ;
+	my $file ;
+	my @subspeciesList = @{ $speciesList{ $speciesId } } ;
+	# Walk up the tree to the enclosing genus (0 if none found).
+	$genusId = $taxTree{ $speciesId } ;
+	while ( 1 )
+	{
+		if ( !defined $genusId || $genusId <= 1 )
+		{
+			$genusId = 0 ;
+			last ;
+		}
+
+		if ( defined $genus{ $genusId } )
+		{
+			last ;
+		}
+		else
+		{
+			$genusId = $taxTree{ $genusId } ;
+		}
+	}
+
+	# NOTE(review): $FP1 below is declared but never used; the code uses
+	# the bareword handle FP1 (safe across ithreads, each thread clones
+	# its own globals) — confirm.
+	my $FP1 ;
+	# Write the list of member files for centrifuge-BuildSharedSequence,
+	# and mark them used while tracking the largest / average genome size.
+	open FP1, ">tmp_$tid.list" ;
+	my $genomeSize = 0 ;
+	my $avgGenomeSize = 0 ;
+	foreach $i ( @subspeciesList )
+	{
+		foreach my $j ( @{$tidToGid{ $i } } )
+		{
+#{
+#	lock( $debug ) ;
+#	print "Merge ", $gidToFile{ $j }, "\n" ;
+#}
+			$file =  $gidToFile{ $j } ;
+			{
+				lock( $fileUsedLock ) ;
+				$fileUsed{ $file } = 1 ;
+			}
+			print FP1 $file, "\n" ;
+
+			my $tmp = GetGenomeSize( $file ) ;
+			if ( $tmp > $genomeSize )
+			{
+				$genomeSize = $tmp ;
+			}
+			$avgGenomeSize += $tmp ;
+		}
+	}
+	close FP1 ;
+
+#$genomeSize = int( $genomeSize / scalar( @subspeciesList ) ) ;
+	$avgGenomeSize = int( $avgGenomeSize / scalar( @subspeciesList ) ) ;
+
+#print $file, "\n" ;	
+#if ( $file =~ /\/(\w*?)uid/ )
+#{
+#	$speciesName = ( split /_/, $1 )[0]."_". ( split /_/, $1 )[1] ;
+#}
+	if ( defined $speciesIdToName{ $speciesId } )
+	{
+		$speciesName = $speciesIdToName{ $speciesId } ;
+		$speciesName =~ s/ /_/g ;
+	}
+	else
+	{
+		$speciesName = "Unknown_species_name" ;
+	}
+	my $id = $speciesId ;#( $speciesId << 32 ) | $genusId ;
+	my $header = ">cid|$id $speciesName $avgGenomeSize ".scalar( @subspeciesList ) ;
+	print STDERR "$header\n" ;
+	{
+		lock( $idMapLock ) ;
+		$newIdToTaxId{ "cid|$id" } = $speciesId ;
+		$idToGenomeSize{ "cid|$id" } = $avgGenomeSize ;
+	}
+
+#return ;
+# Build the sequence
+	# Either compress the member genomes' shared sequence away (small
+	# species) or simply concatenate all member sequences.
+	my $seq = "" ;
+	if ( $noCompress == 0 &&  ( $maxGenomeSizeForCompression < 0 || $genomeSize <= $maxGenomeSizeForCompression ) ) #$genomeSize < 50000000 )
+	{
+		system_call("perl $bssPath/centrifuge-BuildSharedSequence.pl tmp_$tid.list -prefix tmp_${tid}_$id" ) ;
+
+# Merge all the fragmented sequence into one big chunk.
+		system_call("cat tmp_${tid}_${id}_*.fa > tmp_${tid}_$id.fa");
+
+		open FP1, "tmp_${tid}_$id.fa" ;
+		while ( <FP1> )
+		{
+			chomp ;
+			next if ( /^>/ ) ;
+			$seq .= $_ ;
+		}
+		close FP1 ;
+	}
+	else
+	{
+		foreach $i ( @subspeciesList )
+		{
+			foreach my $j ( @{$tidToGid{ $i } } )
+			{
+				$file =  $gidToFile{ $j } ;
+				open FP1, $file ;
+				while ( <FP1> )
+				{
+					#chomp ;
+					next if ( /^>/ ) ;
+					$seq .= $_ ;
+				}
+				close FP1 ;
+			}
+		}
+	}
+	# Append this species' record to the per-thread output shard.
+	open fpOut, ">>${output}_${tid}" ;
+	print fpOut "$header\n$seq\n" ;
+	close fpOut ;
+
+	unlink glob("tmp_${tid}_*");	
+}
+
# Run an external command, echoing it to STDERR first; dies when the
# command exits non-zero.  Restores "@_" (archive-mangled to ", at _")
# and adds the trailing newline the sibling script's copy already prints.
sub system_call {
	print STDERR "SYSTEM CALL: " . join( " ", @_ ) . "\n" ;
	system( @_ ) == 0
		or die "system @_ failed: $?" ;
	print STDERR " finished\n" ;
}
+
+# Worker thread body: repeatedly claim the next unprocessed species index
+# from the shared $speciesUsed counter and compress it, until all species
+# in @speciesListKey have been handled.
+sub threadWrapper
+{
+	my $tid = threads->tid() - 1 ;
+	# Start this thread's output shard from scratch.
+	unlink("${output}_${tid}");
+
+	while ( 1 )
+	{
+		my $u ;
+		{
+			# Atomically take the next species index.
+			lock $speciesUsed ;
+			$u = $speciesUsed ;
+			++$speciesUsed ;
+		}
+		last if ( $u >= scalar( @speciesListKey ) ) ;
+		solve( $speciesListKey[ $u ] ) ;
+	}
+}
+
+
+print STDERR "Step $step: Merging sub-species\n";
+++$step ;
+my @threads ;
+ at speciesListKey = keys %speciesList ; 
+$speciesUsed = 0 ;
+for ( $i = 0 ; $i < $numOfThreads ; ++$i )
+{
+	push @threads, $i ;
+}
+
+foreach (@threads)
+{
+	$_ = threads->create( \&threadWrapper ) ;
+}
+
+foreach (@threads)
+{
+	$_->join() ;
+}
+
+# merge the files generated from each threads
+system_call("cat ${output}_* > tmp_output.fa");
+unlink glob("${output}_*");
+
+#print unused files
+foreach $i ( keys %fileUsed )
+{
+	if ( $fileUsed{ $i } == 0 )
+	{
+#print $i, "\n" ;
+#`cat $i >> tmp_output.fa` ;
+		print "Unused file: $i\n" ;
+	}
+}
+
+# Remove the Ns from the file
+if ( $noDustmasker == 1 )
+{
+	system_call("perl $bssPath/centrifuge-RemoveN.pl tmp_output.fa | perl $bssPath/centrifuge-RemoveEmptySequence.pl > $output.fa") ;
+}
+else
+{
+	system_call("perl $bssPath/centrifuge-RemoveN.pl tmp_output.fa > tmp_output_fmt.fa") ;
+	system_call( "dustmasker -infmt fasta -in tmp_output_fmt.fa -level 20 -outfmt fasta | sed '/^>/! s/[^AGCT]//g' > tmp_output_dustmasker.fa" ) ;
+	system_call("perl $bssPath/centrifuge-RemoveN.pl tmp_output_dustmasker.fa | perl $bssPath/centrifuge-RemoveEmptySequence.pl > $output.fa") ;
+}
+
+# Output the mapping of the ids to species
+open FP1, ">$output.map" ;
+foreach my $key (keys %newIdToTaxId )
+{
+	print FP1 "$key\t", $newIdToTaxId{ $key }, "\n" ;
+}
+close FP1 ;
+
+# Output the genome sizem map
+open FP1, ">$output.size" ;
+foreach my $key ( keys %newIdToTaxId )
+{
+	print FP1 $newIdToTaxId{ $key }, "\t", $idToGenomeSize{ $key }, "\n" ;
+}
+close FP1 ;
+unlink glob("tmp_*") ;
diff --git a/centrifuge-download b/centrifuge-download
new file mode 100755
index 0000000..c131003
--- /dev/null
+++ b/centrifuge-download
@@ -0,0 +1,298 @@
+#!/bin/bash
+
+# Abort on unset variables, command failures, and failures inside pipes.
+set -eu -o pipefail
+
+exists() {
+  command -v "$1" >/dev/null 2>&1
+}
+
+# Pick a downloader: prefer wget, fall back to curl.  Both strings end
+# with the option naming the output file, so callers append "<dest> <url>".
+# NOTE(review): wget documents -N (timestamping) as incompatible with -O;
+# the combined "-N ... -qO" here likely disables timestamping — confirm.
+if hash wget 2>/dev/null; then
+	DL_PROG="wget -N --reject=index.html -qO"
+else 
+	DL_PROG="curl -s -o"
+fi
+export DL_PROG
+
cut_after_first_space_or_second_pipe() {
	# Keep only FASTA header lines, drop everything after the first space,
	# then truncate to at most the first two pipe-separated fields.
	grep '^>' | sed -e 's/ .*//' -e 's/\([^|]*|[^|]*\).*/\1/'
}
export -f cut_after_first_space_or_second_pipe

map_headers_to_taxid() {
	# Emit "<trimmed sequence id><TAB><taxid $1>" for each header on stdin.
	local taxid="$1"
	grep '^>' | cut_after_first_space_or_second_pipe | sed -e 's/^>//' -e "s/\$/	${taxid}/"
}
export -f map_headers_to_taxid
+
+
+
+#########################################################
+## Functions
+
function download_n_process() {
    # Download one genome archive, decompress it, print its
    # "seqid<TAB>taxid" map on stdout, and optionally rewrite headers /
    # dust-mask the sequence.  $1 is a "TAXID<TAB>URL" record.
    IFS=$'\t' read -r TAXID FILEPATH <<< "$1"

    NAME=`basename $FILEPATH .gz`
    $DL_PROG "$LIBDIR/$DOMAIN/$NAME.gz" "$FILEPATH" || { printf "\nError downloading $FILEPATH!\n" >&2 && exit 1; }
    # Best-effort: silently skip when no file materialized.
    [[ -f "$LIBDIR/$DOMAIN/$NAME.gz" ]] || return;
    gunzip "$LIBDIR/$DOMAIN/$NAME.gz" || { printf "\nError gunzipping $LIBDIR/$DOMAIN/$NAME.gz [ downloaded from $FILEPATH ]!\n" >&2 &&  exit 255; }

    if [[ "$CHANGE_HEADER" == "1" ]]; then
        # Prepend the taxid in Kraken style so downstream tools can parse it.
        sed -i "s/^>/>kraken:taxid|$TAXID /" $LIBDIR/$DOMAIN/$NAME
    fi

    if [[ "$FILTER_UNPLACED" == "1" ]]; then
        # Bug fix: was "echo TODO 2>&1", which wrote into the stdout
        # stream that carries the seqid-to-taxid map.
        echo TODO >&2
        ##sed -n '1,/^>.*unplaced/p; /'
    fi

    ## Output sequenceID to taxonomy ID map to STDOUT
    cat $LIBDIR/$DOMAIN/$NAME | map_headers_to_taxid $TAXID

    if [[ "$DO_DUST" == "1" ]]; then
      ## TODO: Consider hard-masking only low-complexity stretches with 10 or more bps
      dustmasker -infmt fasta -in $LIBDIR/$DOMAIN/$NAME -level 20 -outfmt fasta | sed '/^>/! s/[^AGCT]/N/g' > $LIBDIR/$DOMAIN/${NAME%.fna}_dustmasked.fna
      rm $LIBDIR/$DOMAIN/$NAME
    fi
    # "done" lines are consumed by count() to drive the progress bar.
    echo done
}
export -f download_n_process
+
+ceol=`tput el` # terminfo clr_eol
+
+# Progress filter: reads worker output on stdin; every literal "done"
+# line advances a progress bar on stderr ($1 = expected total), all
+# other lines (the seqid-to-taxid map) pass through to stdout.
+function count {
+   typeset C=0
+   while read L; do
+      if [[ "$L" == "done" ]]; then
+        C=$(( C + 1 ))
+        # Integer percentage of $C out of $1, then a 40-char bar split
+        # into '#' (done) and '-' (left) segments.
+        _progress=$(( (${C}*100/${1}*100)/100 ))
+        _done=$(( (${_progress}*4)/10 ))
+        _left=$(( 40-$_done ))
+        # Build progressbar string lengths
+        _done=$(printf "%${_done}s")
+        _left=$(printf "%${_left}s")
+
+        printf "\rProgress : [${_done// /#}${_left// /-}] ${_progress}%% $C/$1"  1>&2
+      else
+        echo "$L"
+      fi
+   done
+}
+
function check_or_mkdir_no_fail {
    # Create directory $1 unless it already exists and is non-empty;
    # always succeeds (callers use it as an `if` guard).
    # Cleanups: quoted expansions, "-z" instead of "! -n", and a plain
    # "return 0" instead of the original "return `true`" subshell trick.
    #echo -n "Creating $1 ... " >&2
    if [[ -d "$1" && -z $(find "$1" -prune -empty -type d) ]]; then
        echo "Directory exists already! Continuing" >&2
    else
        #echo "Done" >&2
        mkdir -p "$1"
    fi
    return 0
}
+
function c_echo() {
        # Print all arguments in blue.  The arguments are passed as data
        # to %s — the original interpolated "$*" into the format string,
        # so any '%' in the message corrupted the output.
        printf '\033[34m%s\033[0m\n' "$*"
}
+
+
+
+## Check if GNU parallel exists
+command -v parallel >/dev/null 2>&1 && PARALLEL=1 || PARALLEL=0
+
+
+# Recognized values for -d and <database>, and valid -a assembly levels.
+ALL_GENOMES="bacteria viral archaea fungi protozoa invertebrate plant vertebrate_mammalian vertebrate_other"
+ALL_DATABASES="refseq genbank taxonomy contaminants"
+ALL_ASSEMBLY_LEVELS="Complete\ Genome Chromosome Scaffold Contig"
+
+## Option parsing
+# Defaults for all command-line options (see USAGE below).
+DATABASE="refseq"
+ASSEMBLY_LEVEL="Complete Genome"
+REFSEQ_CATEGORY=""
+TAXID=""
+DOWNLOAD_GI_MAP=0
+
+BASE_DIR="."
+N_PROC=1
+CHANGE_HEADER=0
+DOWNLOAD_RNA=0
+DO_DUST=0
+FILTER_UNPLACED=0
+
+# Help text printed to stderr when argument validation fails.
+USAGE="
+`basename $0` [<options>] <database>
+
+ARGUMENT
+ <database>        One of refseq, genbank, contaminants or taxonomy:
+                     - use refseq or genbank for genomic sequences,
+                     - contaminants gets contaminant sequences from UniVec and EmVec,
+                     - taxonomy for taxonomy mappings.
+
+COMMON OPTIONS
+ -o <directory>         Folder to which the files are downloaded. Default: '$BASE_DIR'.
+ -P <# of threads>      Number of processes when downloading (uses xargs). Default: '$N_PROC'
+
+WHEN USING database refseq OR genbank:
+ -d <domain>            What domain to download. One or more of ${ALL_GENOMES// /, } (comma separated).
+ -a <assembly level>    Only download genomes with the specified assembly level. Default: '$ASSEMBLY_LEVEL'.
+ -c <refseq category>   Only download genomes in the specified refseq category. Default: any.
+ -t <taxids>            Only download the specified taxonomy IDs, comma separated. Default: any.
+ -r                     Download RNA sequences, too.
+ -u                     Filter unplaced sequences.
+ -m                     Mask low-complexity regions using dustmasker. Default: off.
+ -l                     Modify header to include taxonomy ID. Default: off.
+ -g                     Download GI map.
+"
+
# arguments: $OPTIND (current index), $OPTARG (argument for option), $OPTERR (bash-specific)
# The leading ':' in the optstring enables getopts' silent error mode so
# the '\?' (invalid option) and ':' (missing argument) cases below
# actually fire; without it the ':' branch was dead code and bash
# printed its own duplicate diagnostics.
while getopts ":o:P:d:a:c:t:urlmg" OPT "$@"; do
    case $OPT in
        o) BASE_DIR="$OPTARG" ;;
        P) N_PROC="$OPTARG" ;;
        d) DOMAINS=${OPTARG//,/ } ;;   # comma list -> space list
        a) ASSEMBLY_LEVEL="$OPTARG" ;;
        c) REFSEQ_CATEGORY="$OPTARG" ;;
        t) TAXID="$OPTARG" ;;
        r) DOWNLOAD_RNA=1 ;;
        u) FILTER_UNPLACED=1 ;;
        m) DO_DUST=1 ;;
        l) CHANGE_HEADER=1 ;;
        g) DOWNLOAD_GI_MAP=1 ;;
        \?) echo "Invalid option: -$OPTARG" >&2 
            exit 1 
        ;;
        :) echo "Option -$OPTARG requires an argument." >&2
           exit 1
        ;;
    esac
done
shift $((OPTIND-1))

# Exactly one positional argument (the database) is required.
# USAGE is printed as data, not as a printf format string.
[[ $# -eq 1 ]] || { printf "%s" "$USAGE" >&2 && exit 1; };
DATABASE=$1
+
+#### TAXONOMY DOWNLOAD
+FTP="ftp://ftp.ncbi.nih.gov"
+if [[ "$DATABASE" == "taxonomy" ]]; then 
+  echo "Downloading NCBI taxonomy ... " >&2
+  if check_or_mkdir_no_fail "$BASE_DIR"; then
+    cd "$BASE_DIR" > /dev/null
+    if [[ "$DOWNLOAD_GI_MAP" == "1" ]]; then
+        # GI-to-taxid map; prefix "gi|" onto each GI number.
+        $DL_PROG gi_taxid_nucl.dmp.gz $FTP/pub/taxonomy/gi_taxid_nucl.dmp.gz
+        gunzip -c gi_taxid_nucl.dmp.gz | sed 's/^/gi|/' > gi_taxid_nucl.map
+	else
+        # Only nodes.dmp and names.dmp are needed from the taxdump bundle.
+        $DL_PROG taxdump.tar.gz $FTP/pub/taxonomy/taxdump.tar.gz
+        tar -zxvf taxdump.tar.gz nodes.dmp
+        tar -zxvf taxdump.tar.gz names.dmp
+        rm taxdump.tar.gz
+    fi
+    cd - > /dev/null
+  fi
+  exit 0
+fi
+
+# Convert an EMBL flat file (.dat) on stdin to FASTA on stdout:
+# 'DE' description lines become '>' headers (spaces and '|' -> '_'),
+# indented sequence lines are stripped of blanks and trailing coordinates.
+dat_to_fna() {
+	grep -E '^DE|^ ' | awk '/^DE/ { sub(/DE */,">"); gsub(/[ |]/,"_") }; { print }' | awk '/^ / { gsub(/ /,""); sub(/[0-9]*$/,"") }; { print }' 
+}
+
+#### CONTAMINANT SEQ DOWNLOAD
+if [[ "$DATABASE" == "contaminants" ]]; then 
+  echo "Downloading contaminant databases ... " >&2
+  CONTAMINANT_TAXID=32630
+  CONTAMINANT_DIR="$BASE_DIR/contaminants"
+  if check_or_mkdir_no_fail "$CONTAMINANT_DIR"; then
+    cd "$CONTAMINANT_DIR" > /dev/null
+
+    # download UniVec and EmVec database
+    $DL_PROG UniVec.fna ftp://ftp.ncbi.nlm.nih.gov/pub/UniVec/UniVec
+    $DL_PROG emvec.dat.gz ftp://ftp.ebi.ac.uk/pub/databases/emvec/emvec.dat.gz
+    gunzip -c emvec.dat.gz | dat_to_fna > EmVec.fna
+ 
+    if [[ "$CHANGE_HEADER" == "1" ]]; then
+        sed -i "s/^>/>taxid|$CONTAMINANT_TAXID /" UniVec.fna
+        sed -i "s/^>/>taxid|$CONTAMINANT_TAXID /" EmVec.fna
+    else 
+	cat UniVec.fna | map_headers_to_taxid $CONTAMINANT_TAXID
+	cat EmVec.fna | map_headers_to_taxid $CONTAMINANT_TAXID
+    fi
+
+   #sed ':a $!N; s/^>.*\n>/>/; P; D' Contaminants/emvec.fa  > Contaminants/emvec.fa
+    rm emvec.dat.gz
+
+    cd - > /dev/null
+	exit 0;
+  fi
+fi
+
+
+
+#### REFSEQ/GENBANK DOWNLOAD
+
+# Export settings for the download_n_process children spawned via xargs below.
+export LIBDIR="$BASE_DIR"
+export DO_DUST="$DO_DUST"
+export CHANGE_HEADER="$CHANGE_HEADER"
+
+## Fields in the assembly_summary.txt file
+REFSEQ_CAT_FIELD=5
+TAXID_FIELD=6
+SPECIES_TAXID_FIELD=7
+VERSION_STATUS_FIELD=11
+ASSEMBLY_LEVEL_FIELD=12
+FTP_PATH_FIELD=20
+
+# Build an awk filter over assembly_summary.txt: latest version at the
+# requested assembly level, optionally restricted by refseq category ...
+AWK_QUERY="\$$ASSEMBLY_LEVEL_FIELD==\"$ASSEMBLY_LEVEL\" && \$$VERSION_STATUS_FIELD==\"latest\""
+[[ "$REFSEQ_CATEGORY" != "" ]] && AWK_QUERY="$AWK_QUERY && \$$REFSEQ_CAT_FIELD==\"$REFSEQ_CATEGORY\""
+
+# ... and/or by an alternation of taxonomy IDs ("123|456" from "123,456").
+TAXID=${TAXID//,/|}
+[[ "$TAXID" != "" ]] && AWK_QUERY="$AWK_QUERY && match(\$$TAXID_FIELD,\"^($TAXID)\$\")"
+
+#echo "$AWK_QUERY" >&2
+
+#echo "Downloading genomes for $DOMAINS at assembly level $ASSEMBLY_LEVEL" >&2
+# Fetch the FTP directory listing into ./.listing (wget writes it as a side
+# effect of --no-remove-listing; the curl branch writes it explicitly) so the
+# requested domains can be validated below.
+if exists wget; then
+	wget -qO- --no-remove-listing ftp://ftp.ncbi.nlm.nih.gov/genomes/$DATABASE/ > /dev/null
+else
+	curl -s ftp://ftp.ncbi.nlm.nih.gov/genomes/$DATABASE/ > .listing
+fi
+
+if [[ "$CHANGE_HEADER" == "1" ]]; then
+    echo "Modifying header to include taxonomy ID" >&2
+fi
+
+
+# Download and process all genomes for each requested domain.
+for DOMAIN in $DOMAINS; do
+    # Validate the domain name against the FTP listing fetched above.
+    if [[ ! `grep " $DOMAIN" .listing` ]]; then
+        c_echo "$DOMAIN is not a valid domain - use one of the following:" >&2
+        grep '^d' .listing  | sed 's/.* //'
+        exit 1
+    fi
+
+    export DOMAIN=$DOMAIN
+    check_or_mkdir_no_fail $LIBDIR/$DOMAIN
+
+    FULL_ASSEMBLY_SUMMARY_FILE="$LIBDIR/$DOMAIN/assembly_summary.txt"
+    ASSEMBLY_SUMMARY_FILE="$LIBDIR/$DOMAIN/assembly_summary_filtered.txt"
+
+    # Download assembly_summary.txt and keep only rows matching $AWK_QUERY.
+    $DL_PROG "$FULL_ASSEMBLY_SUMMARY_FILE" ftp://ftp.ncbi.nlm.nih.gov/genomes/$DATABASE/$DOMAIN/assembly_summary.txt > "$FULL_ASSEMBLY_SUMMARY_FILE"
+    awk -F "\t" "BEGIN {OFS=\"\t\"} $AWK_QUERY" "$FULL_ASSEMBLY_SUMMARY_FILE" > "$ASSEMBLY_SUMMARY_FILE"
+
+    N_EXPECTED=`cat "$ASSEMBLY_SUMMARY_FILE" | wc -l`
+    [[ $N_EXPECTED -gt 0 ]] || { echo "Domain $DOMAIN has no genomes with specified filter." >&2; exit 1; }
+    echo "Downloading $N_EXPECTED $DOMAIN genomes at assembly level $ASSEMBLY_LEVEL ... (will take a while)" >&2
+    # taxid<TAB>ftp_path -> .../<asm>/<asm>_genomic.fna.gz, fetched in
+    # parallel; 'count' reports progress against $N_EXPECTED.
+    cut -f "$TAXID_FIELD,$FTP_PATH_FIELD" "$ASSEMBLY_SUMMARY_FILE" | sed 's#\([^/]*\)$#\1/\1_genomic.fna.gz#' |\
+       tr '\n' '\0' | xargs -0 -n1 -P $N_PROC bash -c 'download_n_process "$@"' _ | count $N_EXPECTED
+    echo >&2
+
+    # With -r, also fetch *_rna.fna.gz (skipped for bacteria/viral/archaea).
+    if [[ "$DOWNLOAD_RNA" == "1" && ! `echo $DOMAIN | egrep 'bacteria|viral|archaea'` ]]; then
+        echo "Downloading RNA sequence files" >&2
+        cut -f $TAXID_FIELD,$FTP_PATH_FIELD  "$ASSEMBLY_SUMMARY_FILE"| sed 's#\([^/]*\)$#\1/\1_rna.fna.gz#' |\
+            tr '\n' '\0' | xargs -0 -n1 -P $N_PROC bash -c 'download_n_process "$@"' _ | count $N_EXPECTED
+        echo >&2
+    fi
+done
diff --git a/centrifuge-inspect b/centrifuge-inspect
new file mode 100755
index 0000000..9babc1b
--- /dev/null
+++ b/centrifuge-inspect
@@ -0,0 +1,55 @@
+#!/usr/bin/env python
+
+"""
+ Copyright 2014, Daehwan Kim <infphilo at gmail.com>
+
+ This file is part of Centrifuge, which is copied and modified from bowtie2-inspect in the Bowtie2 package.
+
+ Centrifuge is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ Centrifuge is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with Centrifuge.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+
+import os
+import imp
+import inspect
+import logging
+
+
+
+def main():
+    """Wrapper entry point for the centrifuge-inspect binary.
+
+    Parses command-line arguments via the sibling 'centrifuge-build'
+    script's build_args(), then replaces this process with
+    'centrifuge-inspect-bin' (or its '-debug' variant when --debug is
+    given), injecting '--wrapper basic-0' after argv[0].
+    """
+    logging.basicConfig(level=logging.ERROR,
+                        format='%(levelname)s: %(message)s'
+                        )
+    inspect_bin_name      = "centrifuge-inspect"
+    # Resolve this script's real directory so the binary next to it is used
+    # even when invoked through a symlink.
+    curr_script           = os.path.realpath(inspect.getsourcefile(main))
+    ex_path               = os.path.dirname(curr_script)
+    inspect_bin_spec      = os.path.join(ex_path, "centrifuge-inspect-bin")
+    # Reuse the argument parser from the sibling centrifuge-build script.
+    # NOTE(review): 'imp' is deprecated since Python 3.4 -- consider importlib.
+    bld                   = imp.load_source('centrifuge-build', os.path.join(ex_path,'centrifuge-build'))
+    options,arguments     = bld.build_args()
+
+    if '--verbose' in options:
+        logging.getLogger().setLevel(logging.INFO)
+        
+    if '--debug' in options:
+        inspect_bin_spec += '-debug'
+    
+    # Re-exec the real binary; os.execv never returns on success.
+    arguments[0] = inspect_bin_name
+    arguments.insert(1, 'basic-0')
+    arguments.insert(1, '--wrapper')
+    logging.info('Command: %s %s' %  (inspect_bin_spec,' '.join(arguments[1:])))
+    os.execv(inspect_bin_spec, arguments)        
+        
+        
+# Run only when executed as a script (the file is also importable).
+if __name__ == "__main__":
+    main()
diff --git a/centrifuge-kreport b/centrifuge-kreport
new file mode 100755
index 0000000..b51afd4
--- /dev/null
+++ b/centrifuge-kreport
@@ -0,0 +1,161 @@
+#!/usr/bin/perl
+
+# Give a Kraken-style report from a Centrifuge output
+#
+# Based on kraken-report by Derrick Wood
+# Copyright 2013-2016, Derrick Wood <dwood at cs.jhu.edu>
+#
+
+use strict;
+use warnings;
+use Getopt::Long;
+
+# Command-line state: index prefix, score/length filters, output switches.
+my ($centrifuge_index, $min_score, $min_length);
+my $only_unique = 1;   # count only uniquely classified reads by default
+my $show_zeros = 0;    # include taxa with zero counts in the report
+my $is_cnts_table = 0; # input is a "taxid count" table, not raw centrifuge output
+my $PROG = "centrifuge-kreport";
+
+GetOptions(
+  "help" => \&display_help,
+  "x=s" => \$centrifuge_index,
+  "show-zeros" => \$show_zeros,
+  "is-report-file" => \$is_cnts_table,
+  "min-score=i" => \$min_score,
+  "min-length=i"=> \$min_length,
+  "only-unique" => \$only_unique
+) or usage();
+
+# The index (-x) and at least one input file are mandatory.
+usage() unless defined $centrifuge_index;
+usage() unless defined $ARGV[0];
+
+# Print the synopsis to STDERR and exit. Optional argument is the exit code
+# (default 64, EX_USAGE). The option list now matches GetOptions above:
+# the previous text advertised a non-existent --is-counts flag and omitted
+# --only-unique.
+sub usage {
+  my $exit_code = @_ ? shift : 64;
+  print STDERR "Usage: centrifuge-kreport -x <index name> [--show-zeros] [--min-score=SCORE] [--min-length=LENGTH] [--is-report-file] [--only-unique] <centrifuge output file(s)>\n";
+  exit $exit_code;
+}
+
+# --help handler: show usage and exit successfully.
+sub display_help {
+  usage(0);
+}
+
+my (%child_lists, %name_map, %rank_map);
+print STDERR "Loading taxonomy ...\n";
+load_taxonomy();
+
+# Per-taxon read counts. A read matching N taxa contributes 1/N to each,
+# unless --only-unique (the default) drops multi-matches entirely.
+# Taxid 0 represents unclassified reads.
+my %taxo_counts;
+my $seq_count = 0;
+$taxo_counts{0} = 0;
+if ($is_cnts_table) {
+  # Input is already a "taxid count" table.
+  while (<>) {
+    my ($taxid,$count) = split;
+    $taxo_counts{$taxid} = $count;
+    $seq_count += $count;
+  }
+} else {
+  <>;  # skip the centrifuge output header line
+  while (<>) {
+    my (undef,$seqID,$taxid,$score, undef,$hitLength,$numMatches) = split /\t/;
+    my ($l,$rl) = split(/ \/ /,$hitLength);
+    next if $only_unique && $numMatches > 1;
+    next if defined $min_length && $l < $min_length;
+    next if defined $min_score && $score < $min_score;
+    $taxo_counts{$taxid} += 1/$numMatches;
+    $seq_count++;
+  }
+}
+
+# Guard against empty input (or filters dropping every read), which would
+# otherwise cause an illegal division by zero in the percentage columns.
+die "$PROG: no sequence classifications to report\n" if $seq_count == 0;
+
+# Clade counts: each node's own count plus those of all its descendants.
+my %clade_counts = %taxo_counts;
+dfs_summation(1);
+
+for (keys %name_map) {
+  $clade_counts{$_} ||= 0;
+}
+
+# Report the unclassified pseudo-taxon first, then the tree from the root.
+printf "%6.2f\t%d\t%d\t%s\t%d\t%s%s\n",
+  $clade_counts{0} * 100 / $seq_count,
+  $clade_counts{0}, $taxo_counts{0}, "U",
+  0, "", "unclassified";
+dfs_report(1, 0);
+
+# Recursively print the report line for $node and its subtree, indented two
+# spaces per tree level. Children are visited in decreasing clade-count order.
+sub dfs_report {
+  my $node = shift;
+  my $depth = shift;
+  # Skip clades with no reads unless --show-zeros was given.
+  if (! $clade_counts{$node} && ! $show_zeros) {
+    return;
+  }
+  # Columns: percent of reads, clade count, direct count, rank code,
+  # taxid, indentation, scientific name.
+  printf "%6.2f\t%d\t%d\t%s\t%d\t%s%s\n",
+    ($clade_counts{$node} || 0) * 100 / $seq_count,
+    ($clade_counts{$node} || 0),
+    ($taxo_counts{$node} || 0),
+    rank_code($rank_map{$node}),
+    $node,
+    "  " x $depth,
+    $name_map{$node};
+  my $children = $child_lists{$node};
+  if ($children) {
+    my @sorted_children = sort { $clade_counts{$b} <=> $clade_counts{$a} } @$children;
+    for my $child (@sorted_children) {
+      dfs_report($child, $depth + 1);
+    }
+  }
+}
+
+# Map a taxonomy rank name to its one-letter Kraken report code;
+# ranks without a dedicated code collapse to "-".
+sub rank_code {
+  my ($rank) = @_;
+  my %code_for = (
+    species      => "S",
+    genus        => "G",
+    family       => "F",
+    order        => "O",
+    class        => "C",
+    phylum       => "P",
+    kingdom      => "K",
+    superkingdom => "D",
+  );
+  return exists $code_for{$rank} ? $code_for{$rank} : "-";
+}
+
+# Propagate counts up the tree: after this call, $clade_counts{$node}
+# includes the counts of every node in its subtree.
+sub dfs_summation {
+  my ($node) = @_;
+  for my $child (@{ $child_lists{$node} || [] }) {
+    dfs_summation($child);
+    $clade_counts{$node} += ($clade_counts{$child} || 0);
+  }
+}
+
+# Populate %name_map, %rank_map and %child_lists by streaming the name table
+# and taxonomy tree out of the index via centrifuge-inspect.
+sub load_taxonomy {
+
+  print STDERR "Loading names file ...\n";
+  open NAMES, "-|", "centrifuge-inspect --name-table $centrifuge_index"
+    or die "$PROG: can't open names file: $!\n";
+  while (<NAMES>) {
+    chomp;
+    s/\t\|$//;
+    # Fields: taxonomy ID, scientific name.
+    my @fields = split /\t/;
+    my ($node_id, $name) = @fields[0,1];
+    $name_map{$node_id} = $name;
+  }
+  close NAMES;
+
+  print STDERR "Loading nodes file ...\n";
+  open NODES, "-|", "centrifuge-inspect --taxonomy-tree $centrifuge_index"
+    or die "$PROG: can't open nodes file: $!\n";
+  while (<NODES>) {
+    chomp;
+    # nodes.dmp-style records: id | parent id | rank, separated by "\t|\t".
+    my @fields = split /\t\|\t/;
+    my ($node_id, $parent_id, $rank) = @fields[0,1,2];
+    # The root (taxid 1) lists itself as its own parent; re-parent it to 0
+    # so the recursive traversals terminate.
+    if ($node_id == 1) {
+      $parent_id = 0;
+    }
+    $child_lists{$parent_id} ||= [];
+    push @{ $child_lists{$parent_id} }, $node_id;
+    $rank_map{$node_id} = $rank;
+  }
+  close NODES;
+}
diff --git a/centrifuge-sort-nt.pl b/centrifuge-sort-nt.pl
new file mode 100755
index 0000000..d4124eb
--- /dev/null
+++ b/centrifuge-sort-nt.pl
@@ -0,0 +1,63 @@
+#! /usr/bin/env perl
+#
+# Re-emit the records of a FASTA file (e.g. nt) grouped by taxonomy ID,
+# using a GI-to-taxid mapping file.
+#
+# Author fbreitwieser <fbreitwieser at sherman>
+# Version 0.1
+# Copyright (C) 2016 fbreitwieser <fbreitwieser at sherman>
+# Modified On 2016-02-28 12:56
+# Created  2016-02-28 12:56
+#
+use strict;
+use warnings;
+use File::Basename;
+
+# Usage: two arguments -- the FASTA file and the GI-to-taxid mapping file.
+$#ARGV==1 or die "USAGE: ".basename($0)." <sequence file> <mapping file>\n";
+my ($nt_file,$gi_taxid_file) = @ARGV;
+
+my %gi_to_pos;   # "gi|NNN" -> [byte offset just past the header line, header text]
+my %taxid_to_gi; # taxid -> list of GIs seen in the sequence file
+my %gi_to_taxid; # reverse map (filled below but not read afterwards)
+
+# Pass 1: record the position and text of every "gi|..." header.
+print STDERR "Reading headers from $nt_file ...\n";
+open(my $NT, "<", $nt_file) or die $!;
+while (<$NT>) {
+    if (/(^>(gi\|[0-9]*).*)/) {
+        # $1 = whole header line, $2 = "gi|NNN"; tell() points at the first
+        # sequence line, which is where we seek back to when printing.
+        $gi_to_pos{$2} = [tell($NT),$1];
+    }
+}
+
+# Pass 2: read the GI -> taxid map, keeping only GIs present in the FASTA.
+print STDERR "Reading gi to taxid mapping $gi_taxid_file ...\n";
+my $FP1;
+# NOTE(review): the dot is unescaped, so names like "foozip" also match.
+if ($gi_taxid_file =~ /.zip$/) {
+    open($FP1, "-|", "unzip -c '$gi_taxid_file'") or die $!;
+} else {
+    open($FP1, "<", $gi_taxid_file) or die $!;
+}
+while ( <$FP1> ) {
+	chomp;
+	my ($gi,$taxid) = split;
+    next unless defined $taxid;
+	if ( defined( $gi_to_pos{ $gi } ) )
+	{
+		push @{ $taxid_to_gi{ $taxid } }, $gi;
+		$gi_to_taxid{ $gi } = $taxid;
+	}
+}
+close $FP1;
+
+# Output: for each taxid, print its records in original file order by
+# seeking back to each saved position and copying until the next header.
+print STDERR "Outputting sorted FASTA ...\n";
+foreach my $taxid (keys %taxid_to_gi) {
+    my @gis = @{$taxid_to_gi{$taxid}};
+    my @sorted_gis = sort { $gi_to_pos{$a}->[0] <=> $gi_to_pos{$b}->[0] } @gis;
+    foreach (@sorted_gis) {
+        print $gi_to_pos{$_}->[1],"\n";
+        seek($NT, $gi_to_pos{$_}->[0], 0);
+        while (<$NT>) {
+            last if (/^>/);
+            print $_;
+        }
+    }
+}
+close $NT;
diff --git a/centrifuge.cpp b/centrifuge.cpp
new file mode 100644
index 0000000..f1e0e1c
--- /dev/null
+++ b/centrifuge.cpp
@@ -0,0 +1,3201 @@
+/*
+ * Copyright 2014, Daehwan Kim <infphilo at gmail.com>
+ *
+ * This file is part of HISAT.
+ *
+ * HISAT is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * HISAT is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with HISAT.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdlib.h>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <cassert>
+#include <stdexcept>
+#include <getopt.h>
+#include <math.h>
+#include <utility>
+#include <limits>
+#include <map>
+#include "alphabet.h"
+#include "assert_helpers.h"
+#include "endian_swap.h"
+#include "bt2_idx.h"
+#include "bt2_io.h"
+#include "bt2_util.h"
+#include "hier_idx.h"
+#include "formats.h"
+#include "sequence_io.h"
+#include "tokenize.h"
+#include "aln_sink.h"
+#include "pat.h"
+#include "threading.h"
+#include "ds.h"
+#include "aligner_metrics.h"
+#include "aligner_seed_policy.h"
+#include "classifier.h"
+#include "util.h"
+#include "pe.h"
+#include "simple_func.h"
+#include "presets.h"
+#include "opts.h"
+#include "outq.h"
+
+using namespace std;
+
+// Global option state for the centrifuge driver. Every variable below is
+// (re)initialized by resetOptions() and then overridden by command-line
+// parsing; file-local options are 'static', the rest are shared with other
+// translation units.
+static EList<string> mates1;  // mated reads (first mate)
+static EList<string> mates2;  // mated reads (second mate)
+static EList<string> mates12; // mated reads (1st/2nd interleaved in 1 file)
+static string adjIdxBase;     // index basename (presumably after adjustment -- TODO confirm)
+bool gColor;              // colorspace (not supported)
+int gVerbose;             // be talkative
+static bool startVerbose; // be talkative at startup
+int gQuiet;               // print nothing but the alignments
+static int sanityCheck;   // enable expensive sanity checks
+static int format;        // default read format is FASTQ
+static string origString; // reference text, or filename(s)
+static int seed;          // srandom() seed
+static int timing;        // whether to report basic timing data
+static int metricsIval;   // interval between alignment metrics messages (0 = no messages)
+static string metricsFile;// output file to put alignment metrics in
+static bool metricsStderr;// output file to put alignment metrics in
+static bool metricsPerRead; // report a metrics tuple for every read
+static bool allHits;      // for multihits, report just one
+static bool showVersion;  // just print version and quit?
+static int ipause;        // pause before maching?
+static uint32_t qUpto;    // max # of queries to read
+int gTrim5;               // amount to trim from 5' end
+int gTrim3;               // amount to trim from 3' end
+static int offRate;       // keep default offRate
+static bool solexaQuals;  // quality strings are solexa quals, not phred, and subtract 64 (not 33)
+static bool phred64Quals; // quality chars are phred, but must subtract 64 (not 33)
+static bool integerQuals; // quality strings are space-separated strings of integers, not ASCII
+static int nthreads;      // number of pthreads operating concurrently
+static int outType;       // style of output
+static bool noRefNames;   // true -> print reference indexes; not names
+static uint32_t khits;    // number of hits per read; >1 is much slower
+static uint32_t mhits;    // don't report any hits if there are > mhits
+static int partitionSz;   // output a partitioning key in first field
+static bool useSpinlock;  // false -> don't use of spinlocks even if they're #defines
+static bool fileParallel; // separate threads read separate input files in parallel
+static bool useShmem;     // use shared memory to hold the index
+static bool useMm;        // use memory-mapped files to hold the index
+static bool mmSweep;      // sweep through memory-mapped files immediately after mapping
+int gMinInsert;           // minimum insert size
+int gMaxInsert;           // maximum insert size
+bool gMate1fw;            // -1 mate aligns in fw orientation on fw strand
+bool gMate2fw;            // -2 mate aligns in rc orientation on fw strand
+bool gFlippedMatesOK;     // allow mates to be in wrong order
+bool gDovetailMatesOK;    // allow one mate to extend off the end of the other
+bool gContainMatesOK;     // allow one mate to contain the other in PE alignment
+bool gOlapMatesOK;        // allow mates to overlap in PE alignment
+bool gExpandToFrag;       // incr max frag length to =larger mate len if necessary
+bool gReportDiscordant;   // find and report discordant paired-end alignments
+bool gReportMixed;        // find and report unpaired alignments for paired reads
+static uint32_t cacheLimit;      // ranges w/ size > limit will be cached
+static uint32_t cacheSize;       // # words per range cache
+static uint32_t skipReads;       // # reads/read pairs to skip
+bool gNofw; // don't align fw orientation of read
+bool gNorc; // don't align rc orientation of read
+static uint32_t fastaContLen;
+static uint32_t fastaContFreq;
+static bool hadoopOut; // print Hadoop status and summary messages
+static bool fuzzy;
+static bool fullRef;
+static bool samTruncQname; // whether to truncate QNAME to 255 chars
+static bool samOmitSecSeqQual; // omit SEQ/QUAL for 2ndary alignments?
+static bool samNoUnal; // don't print records for unaligned reads
+static bool samNoHead; // don't print any header lines in SAM output
+static bool samNoSQ;   // don't print @SQ header lines
+// Toggles for optional SAM extra fields; defaults are set in resetOptions().
+static bool sam_print_as;
+static bool sam_print_xs;  // XS:i
+static bool sam_print_xss; // Xs:i and Ys:i
+static bool sam_print_yn;  // YN:i and Yn:i
+static bool sam_print_xn;
+static bool sam_print_cs;
+static bool sam_print_cq;
+static bool sam_print_x0;
+static bool sam_print_x1;
+static bool sam_print_xm;
+static bool sam_print_xo;
+static bool sam_print_xg;
+static bool sam_print_nm;
+static bool sam_print_md;
+static bool sam_print_yf;
+static bool sam_print_yi;
+static bool sam_print_ym;
+static bool sam_print_yp;
+static bool sam_print_yt;
+static bool sam_print_ys;
+static bool sam_print_zs;
+static bool sam_print_xr;
+static bool sam_print_xt;
+static bool sam_print_xd;
+static bool sam_print_xu;
+static bool sam_print_yl;
+static bool sam_print_ye;
+static bool sam_print_yu;
+static bool sam_print_xp;
+static bool sam_print_yr;
+static bool sam_print_zb;
+static bool sam_print_zr;
+static bool sam_print_zf;
+static bool sam_print_zm;
+static bool sam_print_zi;
+static bool sam_print_zp;
+static bool sam_print_zu;
+static bool sam_print_xs_a;
+// BWA-SW-like scoring mode and its parameters.
+static bool bwaSwLike;
+static float bwaSwLikeC;
+static float bwaSwLikeT;
+static bool qcFilter;
+static bool sortByScore;      // prioritize alignments to report by score?
+bool gReportOverhangs;        // false -> filter out alignments that fall off the end of a reference sequence
+static string rgid;           // ID: setting for @RG header line
+static string rgs;            // SAM outputs for @RG header line
+static string rgs_optflag;    // SAM optional flag to add corresponding to @RG ID
+static bool msample;          // whether to report a random alignment when maxed-out via -m/-M
+int      gGapBarrier;         // # diags on top/bot only to be entered diagonally
+static EList<string> qualities;
+static EList<string> qualities1;
+static EList<string> qualities2;
+static string polstr;         // temporary holder for policy string
+static bool  msNoCache;       // true -> disable local cache
+static int   bonusMatchType;  // how to reward matches
+static int   bonusMatch;      // constant reward if bonusMatchType=constant
+static int   penMmcType;      // how to penalize mismatches
+static int   penMmcMax;       // max mm penalty
+static int   penMmcMin;       // min mm penalty
+static int   penNType;        // how to penalize Ns in the read
+static int   penN;            // constant if N pelanty is a constant
+static bool  penNCatPair;     // concatenate mates before N filtering?
+static bool  localAlign;      // do local alignment in DP steps
+static bool  noisyHpolymer;   // set to true if gap penalties should be reduced to be consistent with a sequencer that under- and overcalls homopolymers
+static int   penRdGapConst;   // constant cost of extending a gap in the read
+static int   penRfGapConst;   // constant cost of extending a gap in the reference
+static int   penRdGapLinear;  // coeff of linear term for cost of gap extension in read
+static int   penRfGapLinear;  // coeff of linear term for cost of gap extension in ref
+static SimpleFunc scoreMin;   // minimum valid score as function of read len
+static SimpleFunc nCeil;      // max # Ns allowed as function of read len
+static SimpleFunc msIval;     // interval between seeds as function of read len
+static double descConsExp;    // how to adjust score minimum as we descent further into index-assisted alignment
+static size_t descentLanding; // don't place a search root if it's within this many positions of end
+static SimpleFunc descentTotSz;    // maximum space a DescentDriver can use in bytes
+static SimpleFunc descentTotFmops; // maximum # FM ops a DescentDriver can perform
+static int    multiseedMms;   // mismatches permitted in a multiseed seed
+static int    multiseedLen;   // length of multiseed seeds
+static size_t multiseedOff;   // offset to begin extracting seeds
+static uint32_t seedCacheLocalMB;   // # MB to use for non-shared seed alignment cacheing
+static uint32_t seedCacheCurrentMB; // # MB to use for current-read seed hit cacheing
+static uint32_t exactCacheCurrentMB; // # MB to use for current-read seed hit cacheing
+static size_t maxhalf;        // max width on one side of DP table
+static bool seedSumm;         // print summary information about seed hits, not alignments
+static bool doUngapped;       // do ungapped alignment
+static size_t maxIters;       // stop after this many extend loop iterations
+static size_t maxUg;          // stop after this many ungap extends
+static size_t maxDp;          // stop after this many DPs
+static size_t maxItersIncr;   // amt to add to maxIters for each -k > 1
+static size_t maxEeStreak;    // stop after this many end-to-end fails in a row
+static size_t maxUgStreak;    // stop after this many ungap fails in a row
+static size_t maxDpStreak;    // stop after this many dp fails in a row
+static size_t maxStreakIncr;  // amt to add to streak for each -k > 1
+static size_t maxMateStreak;  // stop seed range after this many mate-find fails
+static bool doExtend;         // extend seed hits
+static bool enable8;          // use 8-bit SSE where possible?
+static size_t cminlen;        // longer reads use checkpointing
+static size_t cpow2;          // checkpoint interval log2
+static bool doTri;            // do triangular mini-fills?
+static string defaultPreset;  // default preset; applied immediately
+static bool ignoreQuals;      // all mms incur same penalty, regardless of qual
+static string wrapper;        // type of wrapper script, so we can print correct usage
+static EList<string> queries; // list of query files
+static string outfile;        // write SAM output to this file
+static int mapqv;             // MAPQ calculation version
+static int tighten;           // -M tighten mode (0=none, 1=best, 2=secbest+1)
+static bool doExactUpFront;   // do exact search up front if seeds seem good enough
+static bool do1mmUpFront;     // do 1mm search up front if seeds seem good enough
+static size_t do1mmMinLen;    // length below which we disable 1mm e2e search
+static int seedBoostThresh;   // if average non-zero position has more than this many elements
+static size_t nSeedRounds;    // # seed rounds
+static bool reorder;          // true -> reorder SAM recs in -p mode
+static float sampleFrac;      // only align random fraction of input reads
+static bool arbitraryRandom;  // pseudo-randoms no longer a function of read properties
+static bool bowtie2p5;        // emulate bowtie2 v2.5 behavior? -- TODO confirm semantics
+
+static string bt2index;      // read Bowtie 2 index from files with this prefix
+static EList<pair<int, string> > extra_opts;
+static size_t extra_opts_cur;
+
+// Per-thread read-ID bookkeeping, guarded by thread_rids_mutex.
+static EList<uint64_t> thread_rids;
+static MUTEX_T         thread_rids_mutex;
+
+// Classification-specific options (centrifuge additions over bowtie2/HISAT).
+static uint32_t minHitLen;   // minimum length of partial hits
+static string reportFile;    // file name of specices report file
+static uint32_t minTotalLen; // minimum summed length of partial hits per read
+static bool abundance_analysis;
+static bool tree_traverse;
+static string classification_rank;
+static EList<uint64_t> host_taxIDs;
+static EList<uint64_t> excluded_taxIDs;
+
+#ifdef USE_SRA
+static EList<string> sra_accs;
+#endif
+
+// Largest representable double; used below as an "unbounded" ceiling for
+// SimpleFunc initializers.
+#define DMAX std::numeric_limits<double>::max()
+
+static void resetOptions() {
+
+#ifndef NDEBUG
+	cerr << "Setting standard options" << endl;
+#endif
+
+	mates1.clear();
+	mates2.clear();
+	mates12.clear();
+	adjIdxBase	            = "";
+	gColor                  = false;
+	gVerbose                = 0;
+	startVerbose			= 0;
+	gQuiet					= false;
+	sanityCheck				= 0;  // enable expensive sanity checks
+	format					= FASTQ; // default read format is FASTQ
+	origString				= ""; // reference text, or filename(s)
+	seed					= 0; // srandom() seed
+	timing					= 0; // whether to report basic timing data
+	metricsIval				= 1; // interval between alignment metrics messages (0 = no messages)
+	metricsFile             = ""; // output file to put alignment metrics in
+	metricsStderr           = false; // print metrics to stderr (in addition to --metrics-file if it's specified
+	metricsPerRead          = false; // report a metrics tuple for every read?
+	allHits					= false; // for multihits, report just one
+	showVersion				= false; // just print version and quit?
+	ipause					= 0; // pause before maching?
+	qUpto					= 0xffffffff; // max # of queries to read
+	gTrim5					= 0; // amount to trim from 5' end
+	gTrim3					= 0; // amount to trim from 3' end
+	offRate					= -1; // keep default offRate
+	solexaQuals				= false; // quality strings are solexa quals, not phred, and subtract 64 (not 33)
+	phred64Quals			= false; // quality chars are phred, but must subtract 64 (not 33)
+	integerQuals			= false; // quality strings are space-separated strings of integers, not ASCII
+	nthreads				= 1;     // number of pthreads operating concurrently
+	outType					= OUTPUT_SAM;  // style of output
+	noRefNames				= false; // true -> print reference indexes; not names
+	khits					= 5;     // number of hits per read; >1 is much slower
+	mhits					= 0;     // stop after finding this many alignments+1
+	partitionSz				= 0;     // output a partitioning key in first field
+	useSpinlock				= true;  // false -> don't use of spinlocks even if they're #defines
+	fileParallel			= false; // separate threads read separate input files in parallel
+	useShmem				= false; // use shared memory to hold the index
+	useMm					= false; // use memory-mapped files to hold the index
+	mmSweep					= false; // sweep through memory-mapped files immediately after mapping
+	gMinInsert				= 0;     // minimum insert size
+	gMaxInsert				= 500;   // maximum insert size
+	gMate1fw				= true;  // -1 mate aligns in fw orientation on fw strand
+	gMate2fw				= false; // -2 mate aligns in rc orientation on fw strand
+	gFlippedMatesOK         = false; // allow mates to be in wrong order
+	gDovetailMatesOK        = false; // allow one mate to extend off the end of the other
+	gContainMatesOK         = true;  // allow one mate to contain the other in PE alignment
+	gOlapMatesOK            = true;  // allow mates to overlap in PE alignment
+	gExpandToFrag           = true;  // incr max frag length to =larger mate len if necessary
+	gReportDiscordant       = true;  // find and report discordant paired-end alignments
+	gReportMixed            = true;  // find and report unpaired alignments for paired reads
+
+	cacheLimit				= 5;     // ranges w/ size > limit will be cached
+	cacheSize				= 0;     // # words per range cache
+	skipReads				= 0;     // # reads/read pairs to skip
+	gNofw					= false; // don't align fw orientation of read
+	gNorc					= false; // don't align rc orientation of read
+	fastaContLen			= 0;
+	fastaContFreq			= 0;
+	hadoopOut				= false; // print Hadoop status and summary messages
+	fuzzy					= false; // reads will have alternate basecalls w/ qualities
+	fullRef					= false; // print entire reference name instead of just up to 1st space
+	samTruncQname           = true;  // whether to truncate QNAME to 255 chars
+	samOmitSecSeqQual       = false; // omit SEQ/QUAL for 2ndary alignments?
+	samNoUnal               = false; // omit SAM records for unaligned reads
+	samNoHead				= true;  // don't print any header lines in SAM output
+	samNoSQ					= false; // don't print @SQ header lines
+	sam_print_as            = true;
+	sam_print_xs            = true;
+	sam_print_xss           = false; // Xs:i and Ys:i
+	sam_print_yn            = false; // YN:i and Yn:i
+	sam_print_xn            = true;
+	sam_print_cs            = false;
+	sam_print_cq            = false;
+	sam_print_x0            = true;
+	sam_print_x1            = true;
+	sam_print_xm            = true;
+	sam_print_xo            = true;
+	sam_print_xg            = true;
+	sam_print_nm            = true;
+	sam_print_md            = true;
+	sam_print_yf            = true;
+	sam_print_yi            = false;
+	sam_print_ym            = false;
+	sam_print_yp            = false;
+	sam_print_yt            = true;
+	sam_print_ys            = true;
+	sam_print_zs            = false;
+	sam_print_xr            = false;
+	sam_print_xt            = false;
+	sam_print_xd            = false;
+	sam_print_xu            = false;
+	sam_print_yl            = false;
+	sam_print_ye            = false;
+	sam_print_yu            = false;
+	sam_print_xp            = false;
+	sam_print_yr            = false;
+	sam_print_zb            = false;
+	sam_print_zr            = false;
+	sam_print_zf            = false;
+	sam_print_zm            = false;
+	sam_print_zi            = false;
+	sam_print_zp            = false;
+	sam_print_zu            = false;
+    sam_print_xs_a          = true;
+	bwaSwLike               = false;
+	bwaSwLikeC              = 5.5f;
+	bwaSwLikeT              = 20.0f;
+	qcFilter                = false; // don't believe upstream qc by default
+	sortByScore             = true;  // prioritize alignments to report by score?
+	rgid					= "";    // SAM outputs for @RG header line
+	rgs						= "";    // SAM outputs for @RG header line
+	rgs_optflag				= "";    // SAM optional flag to add corresponding to @RG ID
+	msample				    = true;
+	gGapBarrier				= 4;     // disallow gaps within this many chars of either end of alignment
+	qualities.clear();
+	qualities1.clear();
+	qualities2.clear();
+	polstr.clear();
+	msNoCache       = true; // true -> disable local cache
+	bonusMatchType  = DEFAULT_MATCH_BONUS_TYPE;
+	bonusMatch      = DEFAULT_MATCH_BONUS;
+	penMmcType      = DEFAULT_MM_PENALTY_TYPE;
+	penMmcMax       = DEFAULT_MM_PENALTY_MAX;
+	penMmcMin       = DEFAULT_MM_PENALTY_MIN;
+	penNType        = DEFAULT_N_PENALTY_TYPE;
+	penN            = DEFAULT_N_PENALTY;
+	penNCatPair     = DEFAULT_N_CAT_PAIR; // concatenate mates before N filtering?
+	localAlign      = false;     // do local alignment in DP steps
+	noisyHpolymer   = false;
+	penRdGapConst   = DEFAULT_READ_GAP_CONST;
+	penRfGapConst   = DEFAULT_REF_GAP_CONST;
+	penRdGapLinear  = DEFAULT_READ_GAP_LINEAR;
+	penRfGapLinear  = DEFAULT_REF_GAP_LINEAR;
+	// scoreMin.init  (SIMPLE_FUNC_LINEAR, DEFAULT_MIN_CONST,   DEFAULT_MIN_LINEAR);
+    scoreMin.init  (SIMPLE_FUNC_CONST, -18, 0);
+	nCeil.init     (SIMPLE_FUNC_LINEAR, 0.0f, DMAX, 2.0f, 0.1f);
+	msIval.init    (SIMPLE_FUNC_LINEAR, 1.0f, DMAX, DEFAULT_IVAL_B, DEFAULT_IVAL_A);
+	descConsExp     = 2.0;
+	descentLanding  = 20;
+	descentTotSz.init(SIMPLE_FUNC_LINEAR, 1024.0, DMAX, 0.0, 1024.0);
+	descentTotFmops.init(SIMPLE_FUNC_LINEAR, 100.0, DMAX, 0.0, 10.0);
+	multiseedMms    = DEFAULT_SEEDMMS;
+	multiseedLen    = DEFAULT_SEEDLEN;
+	multiseedOff    = 0;
+	seedCacheLocalMB   = 32; // # MB to use for non-shared seed alignment cacheing
+	seedCacheCurrentMB = 20; // # MB to use for current-read seed hit cacheing
+	exactCacheCurrentMB = 20; // # MB to use for current-read seed hit cacheing
+	maxhalf            = 15; // max width on one side of DP table
+	seedSumm           = false; // print summary information about seed hits, not alignments
+	doUngapped         = true;  // do ungapped alignment
+	maxIters           = 400;   // max iterations of extend loop
+	maxUg              = 300;   // stop after this many ungap extends
+	maxDp              = 300;   // stop after this many dp extends
+	maxItersIncr       = 20;    // amt to add to maxIters for each -k > 1
+	maxEeStreak        = 15;    // stop after this many end-to-end fails in a row
+	maxUgStreak        = 15;    // stop after this many ungap fails in a row
+	maxDpStreak        = 15;    // stop after this many dp fails in a row
+	maxStreakIncr      = 10;    // amt to add to streak for each -k > 1
+	maxMateStreak      = 10;    // in PE: abort seed range after N mate-find fails
+	doExtend           = true;  // do seed extensions
+	enable8            = true;  // use 8-bit SSE where possible?
+	cminlen            = 2000;  // longer reads use checkpointing
+	cpow2              = 4;     // checkpoint interval log2
+	doTri              = false; // do triangular mini-fills?
+	defaultPreset      = "sensitive%LOCAL%"; // default preset; applied immediately
+	extra_opts.clear();
+	extra_opts_cur = 0;
+	bt2index.clear();        // read Bowtie 2 index from files with this prefix
+	ignoreQuals = false;     // all mms incur same penalty, regardless of qual
+	wrapper.clear();         // type of wrapper script, so we can print correct usage
+	queries.clear();         // list of query files
+	outfile.clear();         // write SAM output to this file
+	mapqv = 2;               // MAPQ calculation version
+	tighten = 3;             // -M tightening mode
+	doExactUpFront = true;   // do exact search up front if seeds seem good enough
+	do1mmUpFront = true;     // do 1mm search up front if seeds seem good enough
+	seedBoostThresh = 300;   // if average non-zero position has more than this many elements
+	nSeedRounds = 2;         // # rounds of seed searches to do for repetitive reads
+	do1mmMinLen = 60;        // length below which we disable 1mm search
+	reorder = false;         // reorder SAM records with -p > 1
+	sampleFrac = 1.1f;       // align all reads
+	arbitraryRandom = false; // let pseudo-random seeds be a function of read properties
+	bowtie2p5 = false;
+    minHitLen = 22;
+    minTotalLen = 0;
+    reportFile = "centrifuge_report.tsv";
+    abundance_analysis = true;
+    tree_traverse = true;
+    host_taxIDs.clear();
+    classification_rank = "strain";
+    excluded_taxIDs.clear();
+    
+#ifdef USE_SRA
+    sra_accs.clear();
+#endif
+}
+
// getopt() short-option string.  A character followed by ':' takes a required
// argument; bare characters are boolean flags.  Kept in sync with the
// long_options table below (printArgDesc() dumps both).
static const char *short_options = "fF:qbzhcu:rv:s:aP:t3:5:w:p:k:M:1:2:I:X:CQ:N:i:L:U:x:S:g:O:D:R:";

// getopt_long() option table.  Fields per entry: option name, whether it
// takes an argument, a flag pointer (when non-NULL, e.g. --pause, the int it
// points to is set instead of a value being returned), and the value
// getopt_long() returns (an ARG_* constant or the equivalent short-option
// character).  Several names alias the same value (e.g. --qupto/--upto).
static struct option long_options[] = {
	{(char*)"verbose",      no_argument,       0,            ARG_VERBOSE},
	{(char*)"startverbose", no_argument,       0,            ARG_STARTVERBOSE},
	{(char*)"quiet",        no_argument,       0,            ARG_QUIET},
	{(char*)"sanity",       no_argument,       0,            ARG_SANITY},
	{(char*)"pause",        no_argument,       &ipause,      1},
	{(char*)"orig",         required_argument, 0,            ARG_ORIG},
	{(char*)"all",          no_argument,       0,            'a'},
	{(char*)"solexa-quals", no_argument,       0,            ARG_SOLEXA_QUALS},
	{(char*)"integer-quals",no_argument,       0,            ARG_INTEGER_QUALS},
	{(char*)"int-quals",    no_argument,       0,            ARG_INTEGER_QUALS},
	{(char*)"metrics",      required_argument, 0,            ARG_METRIC_IVAL},
	{(char*)"metrics-file", required_argument, 0,            ARG_METRIC_FILE},
	{(char*)"metrics-stderr",no_argument,      0,            ARG_METRIC_STDERR},
	{(char*)"metrics-per-read", no_argument,   0,            ARG_METRIC_PER_READ},
	{(char*)"met-read",     no_argument,       0,            ARG_METRIC_PER_READ},
	{(char*)"met",          required_argument, 0,            ARG_METRIC_IVAL},
	{(char*)"met-file",     required_argument, 0,            ARG_METRIC_FILE},
	{(char*)"met-stderr",   no_argument,       0,            ARG_METRIC_STDERR},
	{(char*)"time",         no_argument,       0,            't'},
	{(char*)"trim3",        required_argument, 0,            '3'},
	{(char*)"trim5",        required_argument, 0,            '5'},
	{(char*)"seed",         required_argument, 0,            ARG_SEED},
	{(char*)"qupto",        required_argument, 0,            'u'},
	{(char*)"upto",         required_argument, 0,            'u'},
	{(char*)"version",      no_argument,       0,            ARG_VERSION},
	{(char*)"filepar",      no_argument,       0,            ARG_FILEPAR},
	{(char*)"help",         no_argument,       0,            'h'},
	{(char*)"threads",      required_argument, 0,            'p'},
	{(char*)"khits",        required_argument, 0,            'k'},
	{(char*)"minins",       required_argument, 0,            'I'},
	{(char*)"maxins",       required_argument, 0,            'X'},
	{(char*)"quals",        required_argument, 0,            'Q'},
	{(char*)"Q1",           required_argument, 0,            ARG_QUALS1},
	{(char*)"Q2",           required_argument, 0,            ARG_QUALS2},
	{(char*)"refidx",       no_argument,       0,            ARG_REFIDX},
	{(char*)"partition",    required_argument, 0,            ARG_PARTITION},
	{(char*)"ff",           no_argument,       0,            ARG_FF},
	{(char*)"fr",           no_argument,       0,            ARG_FR},
	{(char*)"rf",           no_argument,       0,            ARG_RF},
	{(char*)"cachelim",     required_argument, 0,            ARG_CACHE_LIM},
	{(char*)"cachesz",      required_argument, 0,            ARG_CACHE_SZ},
	{(char*)"nofw",         no_argument,       0,            ARG_NO_FW},
	{(char*)"norc",         no_argument,       0,            ARG_NO_RC},
	{(char*)"skip",         required_argument, 0,            's'},
	{(char*)"12",           required_argument, 0,            ARG_ONETWO},
	{(char*)"tab5",         required_argument, 0,            ARG_TAB5},
	{(char*)"tab6",         required_argument, 0,            ARG_TAB6},
	{(char*)"phred33-quals", no_argument,      0,            ARG_PHRED33},
	{(char*)"phred64-quals", no_argument,      0,            ARG_PHRED64},
	{(char*)"phred33",       no_argument,      0,            ARG_PHRED33},
	{(char*)"phred64",      no_argument,       0,            ARG_PHRED64},
	{(char*)"solexa1.3-quals", no_argument,    0,            ARG_PHRED64},
	{(char*)"mm",           no_argument,       0,            ARG_MM},
	{(char*)"shmem",        no_argument,       0,            ARG_SHMEM},
	{(char*)"mmsweep",      no_argument,       0,            ARG_MMSWEEP},
	{(char*)"hadoopout",    no_argument,       0,            ARG_HADOOPOUT},
	{(char*)"fuzzy",        no_argument,       0,            ARG_FUZZY},
	{(char*)"fullref",      no_argument,       0,            ARG_FULLREF},
	{(char*)"usage",        no_argument,       0,            ARG_USAGE},
	{(char*)"omit-sec-seq", no_argument,       0,            ARG_SAM_OMIT_SEC_SEQ},
	{(char*)"gbar",         required_argument, 0,            ARG_GAP_BAR},
	{(char*)"qseq",         no_argument,       0,            ARG_QSEQ},
	{(char*)"policy",       required_argument, 0,            ARG_ALIGN_POLICY},
	{(char*)"preset",       required_argument, 0,            'P'},
	{(char*)"seed-summ",    no_argument,       0,            ARG_SEED_SUMM},
	{(char*)"seed-summary", no_argument,       0,            ARG_SEED_SUMM},
	{(char*)"overhang",     no_argument,       0,            ARG_OVERHANG},
	{(char*)"no-cache",     no_argument,       0,            ARG_NO_CACHE},
	{(char*)"cache",        no_argument,       0,            ARG_USE_CACHE},
	{(char*)"454",          no_argument,       0,            ARG_NOISY_HPOLY},
	{(char*)"ion-torrent",  no_argument,       0,            ARG_NOISY_HPOLY},
	{(char*)"no-mixed",     no_argument,       0,            ARG_NO_MIXED},
	{(char*)"no-discordant",no_argument,       0,            ARG_NO_DISCORDANT},
	{(char*)"local",        no_argument,       0,            ARG_LOCAL},
	{(char*)"end-to-end",   no_argument,       0,            ARG_END_TO_END},
	{(char*)"ungapped",     no_argument,       0,            ARG_UNGAPPED},
	{(char*)"no-ungapped",  no_argument,       0,            ARG_UNGAPPED_NO},
	{(char*)"sse8",         no_argument,       0,            ARG_SSE8},
	{(char*)"no-sse8",      no_argument,       0,            ARG_SSE8_NO},
	{(char*)"scan-narrowed",no_argument,       0,            ARG_SCAN_NARROWED},
	{(char*)"qc-filter",    no_argument,       0,            ARG_QC_FILTER},
	{(char*)"bwa-sw-like",  no_argument,       0,            ARG_BWA_SW_LIKE},
	{(char*)"multiseed",        required_argument, 0,        ARG_MULTISEED_IVAL},
	{(char*)"ma",               required_argument, 0,        ARG_SCORE_MA},
	{(char*)"mp",               required_argument, 0,        ARG_SCORE_MMP},
	{(char*)"np",               required_argument, 0,        ARG_SCORE_NP},
	{(char*)"rdg",              required_argument, 0,        ARG_SCORE_RDG},
	{(char*)"rfg",              required_argument, 0,        ARG_SCORE_RFG},
	{(char*)"score-min",        required_argument, 0,        ARG_SCORE_MIN},
	{(char*)"min-score",        required_argument, 0,        ARG_SCORE_MIN},
	{(char*)"n-ceil",           required_argument, 0,        ARG_N_CEIL},
	{(char*)"dpad",             required_argument, 0,        ARG_DPAD},
	{(char*)"mapq-print-inputs",no_argument,       0,        ARG_SAM_PRINT_YI},
	{(char*)"no-score-priority",no_argument,       0,        ARG_NO_SCORE_PRIORITY},
	{(char*)"seedlen",          required_argument, 0,        'L'},
	{(char*)"seedmms",          required_argument, 0,        'N'},
	{(char*)"seedival",         required_argument, 0,        'i'},
	{(char*)"ignore-quals",     no_argument,       0,        ARG_IGNORE_QUALS},
	{(char*)"index",            required_argument, 0,        'x'},
	{(char*)"arg-desc",         no_argument,       0,        ARG_DESC},
	{(char*)"wrapper",          required_argument, 0,        ARG_WRAPPER},
	{(char*)"unpaired",         required_argument, 0,        'U'},
	{(char*)"output",           required_argument, 0,        'S'},
	{(char*)"mapq-v",           required_argument, 0,        ARG_MAPQ_V},
	{(char*)"dovetail",         no_argument,       0,        ARG_DOVETAIL},
	{(char*)"no-dovetail",      no_argument,       0,        ARG_NO_DOVETAIL},
	{(char*)"contain",          no_argument,       0,        ARG_CONTAIN},
	{(char*)"no-contain",       no_argument,       0,        ARG_NO_CONTAIN},
	{(char*)"overlap",          no_argument,       0,        ARG_OVERLAP},
	{(char*)"no-overlap",       no_argument,       0,        ARG_NO_OVERLAP},
	{(char*)"tighten",          required_argument, 0,        ARG_TIGHTEN},
	{(char*)"exact-upfront",    no_argument,       0,        ARG_EXACT_UPFRONT},
	{(char*)"1mm-upfront",      no_argument,       0,        ARG_1MM_UPFRONT},
	{(char*)"no-exact-upfront", no_argument,       0,        ARG_EXACT_UPFRONT_NO},
	{(char*)"no-1mm-upfront",   no_argument,       0,        ARG_1MM_UPFRONT_NO},
	{(char*)"1mm-minlen",       required_argument, 0,        ARG_1MM_MINLEN},
	{(char*)"seed-off",         required_argument, 0,        'O'},
	{(char*)"seed-boost",       required_argument, 0,        ARG_SEED_BOOST_THRESH},
	{(char*)"read-times",       no_argument,       0,        ARG_READ_TIMES},
	{(char*)"show-rand-seed",   no_argument,       0,        ARG_SHOW_RAND_SEED},
	{(char*)"dp-fail-streak",   required_argument, 0,        ARG_DP_FAIL_STREAK_THRESH},
	{(char*)"ee-fail-streak",   required_argument, 0,        ARG_EE_FAIL_STREAK_THRESH},
	{(char*)"ug-fail-streak",   required_argument, 0,        ARG_UG_FAIL_STREAK_THRESH},
	{(char*)"fail-streak",      required_argument, 0,        'D'},
	{(char*)"dp-fails",         required_argument, 0,        ARG_DP_FAIL_THRESH},
	{(char*)"ug-fails",         required_argument, 0,        ARG_UG_FAIL_THRESH},
	{(char*)"extends",          required_argument, 0,        ARG_EXTEND_ITERS},
	{(char*)"no-extend",        no_argument,       0,        ARG_NO_EXTEND},
	{(char*)"mapq-extra",       no_argument,       0,        ARG_MAPQ_EX},
	{(char*)"seed-rounds",      required_argument, 0,        'R'},
	{(char*)"reorder",          no_argument,       0,        ARG_REORDER},
	{(char*)"passthrough",      no_argument,       0,        ARG_READ_PASSTHRU},
	{(char*)"sample",           required_argument, 0,        ARG_SAMPLE},
	{(char*)"cp-min",           required_argument, 0,        ARG_CP_MIN},
	{(char*)"cp-ival",          required_argument, 0,        ARG_CP_IVAL},
	{(char*)"tri",              no_argument,       0,        ARG_TRI},
	{(char*)"nondeterministic", no_argument,       0,        ARG_NON_DETERMINISTIC},
	{(char*)"non-deterministic", no_argument,      0,        ARG_NON_DETERMINISTIC},
	{(char*)"local-seed-cache-sz", required_argument, 0,     ARG_LOCAL_SEED_CACHE_SZ},
	{(char*)"seed-cache-sz",       required_argument, 0,     ARG_CURRENT_SEED_CACHE_SZ},
	{(char*)"no-unal",          no_argument,       0,        ARG_SAM_NO_UNAL},
	{(char*)"test-25",          no_argument,       0,        ARG_TEST_25},
	// TODO: following should be a function of read length?
	{(char*)"desc-kb",          required_argument, 0,        ARG_DESC_KB},
	{(char*)"desc-landing",     required_argument, 0,        ARG_DESC_LANDING},
	{(char*)"desc-exp",         required_argument, 0,        ARG_DESC_EXP},
	{(char*)"desc-fmops",       required_argument, 0,        ARG_DESC_FMOPS},
    {(char*)"min-hitlen",       required_argument, 0,        ARG_MIN_HITLEN},
    {(char*)"min-totallen",     required_argument, 0,        ARG_MIN_TOTALLEN},
    {(char*)"host-taxids",      required_argument, 0,        ARG_HOST_TAXIDS},
	{(char*)"report-file",      required_argument, 0,        ARG_REPORT_FILE},
    {(char*)"no-abundance",     no_argument,       0,        ARG_NO_ABUNDANCE},
    {(char*)"no-traverse",      no_argument,       0,        ARG_NO_TRAVERSE},
    {(char*)"classification-rank", required_argument,    0,  ARG_CLASSIFICATION_RANK},
    {(char*)"exclude-taxids",      required_argument,    0,  ARG_EXCLUDE_TAXIDS},
#ifdef USE_SRA
    {(char*)"sra-acc",   required_argument, 0,        ARG_SRA_ACC},
#endif
	{(char*)0, 0, 0, 0} // terminator
};
+
+/**
+ * Print out a concise description of what options are taken and whether they
+ * take an argument.
+ */
+static void printArgDesc(ostream& out) {
+	// struct option {
+	//   const char *name;
+	//   int has_arg;
+	//   int *flag;
+	//   int val;
+	// };
+	size_t i = 0;
+	while(long_options[i].name != 0) {
+		out << long_options[i].name << "\t"
+		    << (long_options[i].has_arg == no_argument ? 0 : 1)
+		    << endl;
+		i++;
+	}
+	size_t solen = strlen(short_options);
+	for(i = 0; i < solen; i++) {
+		// Has an option?  Does if next char is :
+		if(i == solen-1) {
+			assert_neq(':', short_options[i]);
+			cout << (char)short_options[i] << "\t" << 0 << endl;
+		} else {
+			if(short_options[i+1] == ':') {
+				// Option with argument
+				cout << (char)short_options[i] << "\t" << 1 << endl;
+				i++; // skip the ':'
+			} else {
+				// Option with no argument
+				cout << (char)short_options[i] << "\t" << 0 << endl;
+			}
+		}
+	}
+}
+
+/**
+ * Print a summary usage message to the provided output stream.
+ */
+static void printUsage(ostream& out) {
+	out << "Centrifuge version " << string(CENTRIFUGE_VERSION).c_str() << " by Daehwan Kim (infphilo at gmail.com, www.ccb.jhu.edu/people/infphilo)" << endl;
+	string tool_name = "centrifuge-class";
+	if(wrapper == "basic-0") {
+		tool_name = "hisat";
+	}
+    out << "Usage: " << endl
+#ifdef USE_SRA
+    << "  " << tool_name.c_str() << " [options]* -x <bt2-idx> {-1 <m1> -2 <m2> | -U <r> | --sra-acc <SRA accession number>} [-S <filename>] [--report-file <report>]" << endl
+#else
+    << "  " << tool_name.c_str() << " [options]* -x <bt2-idx> {-1 <m1> -2 <m2> | -U <r>} [-S <filename>] [--report-file <report>]" << endl
+#endif
+	    << endl
+		<<     "  <cf-idx>   Index filename prefix (minus trailing .X." << gEbwt_ext << ")." << endl
+	    <<     "  <m1>       Files with #1 mates, paired with files in <m2>." << endl;
+	if(wrapper == "basic-0") {
+		out << "             Could be gzip'ed (extension: .gz) or bzip2'ed (extension: .bz2)." << endl;
+	}
+	out <<     "  <m2>       Files with #2 mates, paired with files in <m1>." << endl;
+	if(wrapper == "basic-0") {
+		out << "             Could be gzip'ed (extension: .gz) or bzip2'ed (extension: .bz2)." << endl;
+	}
+	out <<     "  <r>        Files with unpaired reads." << endl;
+	if(wrapper == "basic-0") {
+		out << "             Could be gzip'ed (extension: .gz) or bzip2'ed (extension: .bz2)." << endl;
+	}
+#ifdef USE_SRA
+    out <<     "  <SRA accession number>        Comma-separated list of SRA accession numbers, e.g. --sra-acc SRR353653,SRR353654." << endl;
+#endif
+	out <<     "  <filename>      File for classification output (default: stdout)" << endl
+	    <<     "  <report>   File for tabular report output (default: " << reportFile << ")" << endl
+	    << endl
+	    << "  <m1>, <m2>, <r> can be comma-separated lists (no whitespace) and can be" << endl
+		<< "  specified many times.  E.g. '-U file1.fq,file2.fq -U file3.fq'." << endl
+		// Wrapper script should write <bam> line next
+		<< endl
+	    << "Options (defaults in parentheses):" << endl
+		<< endl
+	    << " Input:" << endl
+	    << "  -q                 query input files are FASTQ .fq/.fastq (default)" << endl
+	    << "  --qseq             query input files are in Illumina's qseq format" << endl
+	    << "  -f                 query input files are (multi-)FASTA .fa/.mfa" << endl
+	    << "  -r                 query input files are raw one-sequence-per-line" << endl
+	    << "  -c                 <m1>, <m2>, <r> are sequences themselves, not files" << endl
+	    << "  -s/--skip <int>    skip the first <int> reads/pairs in the input (none)" << endl
+	    << "  -u/--upto <int>    stop after first <int> reads/pairs (no limit)" << endl
+	    << "  -5/--trim5 <int>   trim <int> bases from 5'/left end of reads (0)" << endl
+	    << "  -3/--trim3 <int>   trim <int> bases from 3'/right end of reads (0)" << endl
+	    << "  --phred33          qualities are Phred+33 (default)" << endl
+	    << "  --phred64          qualities are Phred+64" << endl
+	    << "  --int-quals        qualities encoded as space-delimited integers" << endl
+		<< "  --ignore-quals     treat all quality values as 30 on Phred scale (off)" << endl
+	    << "  --nofw             do not align forward (original) version of read (off)" << endl
+	    << "  --norc             do not align reverse-complement version of read (off)" << endl
+        << "  --min-hitlen       " << endl
+#ifdef USE_SRA
+        << "  --sra-acc          SRA accession ID" << endl
+#endif
+		<< endl
+		<< "Classification:" << endl
+		<< "  --min-hitlen <int>    minimum length of partial hits (default " << minHitLen << ", must be greater than 15)" << endl
+		<< "  --min-totallen <int>  minimum summed length of partial hits per read (default " << minTotalLen << ")" << endl
+        << "  --host-taxids <taxids> comma-separated list of taxonomic IDs that will be preferred in classification" << endl
+        << "  --exclude-taxids <taxids> comma-separated list of taxonomic IDs that will be excluded in classification" << endl
+		<< endl
+	    << " Output:" << endl;
+	//if(wrapper == "basic-0") {
+	//	out << "  --bam              output directly to BAM (by piping through 'samtools view')" << endl;
+	//}
+	out << "  -t/--time          print wall-clock time taken by search phases" << endl;
+	if(wrapper == "basic-0") {
+	out << "  --un <path>           write unpaired reads that didn't align to <path>" << endl
+	    << "  --al <path>           write unpaired reads that aligned at least once to <path>" << endl
+	    << "  --un-conc <path>      write pairs that didn't align concordantly to <path>" << endl
+	    << "  --al-conc <path>      write pairs that aligned concordantly at least once to <path>" << endl
+	    << "  (Note: for --un, --al, --un-conc, or --al-conc, add '-gz' to the option name, e.g." << endl
+		<< "  --un-gz <path>, to gzip compress output, or add '-bz2' to bzip2 compress output.)" << endl;
+	}
+	out << "  --quiet            print nothing to stderr except serious errors" << endl
+	//  << "  --refidx           refer to ref. seqs by 0-based index rather than name" << endl
+		<< "  --met-file <path>  send metrics to file at <path> (off)" << endl
+		<< "  --met-stderr       send metrics to stderr (off)" << endl
+		<< "  --met <int>        report internal counters & metrics every <int> secs (1)" << endl
+		<< endl
+	    << " Performance:" << endl
+	    << "  -o/--offrate <int> override offrate of index; must be >= index's offrate" << endl
+	    << "  -p/--threads <int> number of alignment threads to launch (1)" << endl
+#ifdef BOWTIE_MM
+	    << "  --mm               use memory-mapped I/O for index; many 'bowtie's can share" << endl
+#endif
+		<< endl
+	    << " Other:" << endl
+		<< "  --qc-filter        filter out reads that are bad according to QSEQ filter" << endl
+	    << "  --seed <int>       seed for random number generator (0)" << endl
+	    << "  --non-deterministic seed rand. gen. arbitrarily instead of using read attributes" << endl
+	//  << "  --verbose          verbose output for debugging" << endl
+	    << "  --version          print version information and quit" << endl
+	    << "  -h/--help          print this usage message" << endl
+	    ;
+	if(wrapper.empty()) {
+		cerr << endl
+		     << "*** Warning ***" << endl
+			 << "'centrifuge-class' was run directly.  It is recommended that you run the wrapper script 'centrifuge' instead." << endl
+			 << endl;
+	}
+}
+
+/**
+ * Parse an int out of optarg and enforce that it be at least 'lower';
+ * if it is less than 'lower', than output the given error message and
+ * exit with an error and a usage message.
+ */
+static int parseInt(int lower, int upper, const char *errmsg, const char *arg) {
+	long l;
+	char *endPtr= NULL;
+	l = strtol(arg, &endPtr, 10);
+	if (endPtr != NULL) {
+		if (l < lower || l > upper) {
+			cerr << errmsg << endl;
+			printUsage(cerr);
+			throw 1;
+		}
+		return (int32_t)l;
+	}
+	cerr << errmsg << endl;
+	printUsage(cerr);
+	throw 1;
+	return -1;
+}
+
+/**
+ * Upper is maximum int by default.
+ */
+static int parseInt(int lower, const char *errmsg, const char *arg) {
+	return parseInt(lower, std::numeric_limits<int>::max(), errmsg, arg);
+}
+
+/**
+ * Parse a T string 'str'.
+ */
+template<typename T>
+T parse(const char *s) {
+	T tmp;
+	stringstream ss(s);
+	ss >> tmp;
+	return tmp;
+}
+
+/**
+ * Parse a pair of Ts from a string, 'str', delimited with 'delim'.
+ */
+template<typename T>
+pair<T, T> parsePair(const char *str, char delim) {
+	string s(str);
+	EList<string> ss;
+	tokenize(s, delim, ss);
+	pair<T, T> ret;
+	ret.first = parse<T>(ss[0].c_str());
+	ret.second = parse<T>(ss[1].c_str());
+	return ret;
+}
+
+/**
+ * Parse a pair of Ts from a string, 'str', delimited with 'delim'.
+ */
+template<typename T>
+void parseTuple(const char *str, char delim, EList<T>& ret) {
+	string s(str);
+	EList<string> ss;
+	tokenize(s, delim, ss);
+	for(size_t i = 0; i < ss.size(); i++) {
+		ret.push_back(parse<T>(ss[i].c_str()));
+	}
+}
+
+static string applyPreset(const string& sorig, Presets& presets) {
+	string s = sorig;
+	size_t found = s.find("%LOCAL%");
+	if(found != string::npos) {
+		s.replace(found, strlen("%LOCAL%"), localAlign ? "-local" : "");
+	}
+	if(gVerbose) {
+		cerr << "Applying preset: '" << s.c_str() << "' using preset menu '"
+			 << presets.name() << "'" << endl;
+	}
+	string pol;
+	presets.apply(s, pol, extra_opts);
+	return pol;
+}
+
// Which of the mutually exclusive reporting options -M / -a / -k have been
// seen so far; parseOption() consults these to warn when one overrides
// another.
static bool saw_M;
static bool saw_a;
static bool saw_k;
// Preset names accumulated during option parsing — presumably from
// -P/--preset; where this list is consumed is not visible in this chunk.
static EList<string> presetList;
+
+/**
+ * TODO: Argument parsing is very, very flawed.  The biggest problem is that
+ * there are two separate worlds of arguments, the ones set via polstr, and
+ * the ones set directly in variables.  This makes for nasty interactions,
+ * e.g., with the -M option being resolved at an awkward time relative to
+ * the -k and -a options.
+ */
+static void parseOption(int next_option, const char *arg) {
+	switch (next_option) {
+		case ARG_TEST_25: bowtie2p5 = true; break;
+		case ARG_DESC_KB: descentTotSz = SimpleFunc::parse(arg, 0.0, 1024.0, 1024.0, DMAX); break;
+		case ARG_DESC_FMOPS: descentTotFmops = SimpleFunc::parse(arg, 0.0, 10.0, 100.0, DMAX); break;
+		case ARG_DESC_LANDING: descentLanding = parse<int>(arg); break;
+		case ARG_DESC_EXP: {
+			descConsExp = parse<double>(arg);
+			if(descConsExp < 0.0) {
+				cerr << "Error: --desc-exp must be greater than or equal to 0" << endl;
+				throw 1;
+			}
+			break;
+		}
+		case '1': tokenize(arg, ",", mates1); break;
+		case '2': tokenize(arg, ",", mates2); break;
+		case ARG_ONETWO: tokenize(arg, ",", mates12); format = TAB_MATE5; break;
+		case ARG_TAB5:   tokenize(arg, ",", mates12); format = TAB_MATE5; break;
+		case ARG_TAB6:   tokenize(arg, ",", mates12); format = TAB_MATE6; break;
+		case 'f': format = FASTA; break;
+		case 'F': {
+			format = FASTA_CONT;
+			pair<uint32_t, uint32_t> p = parsePair<uint32_t>(arg, ',');
+			fastaContLen = p.first;
+			fastaContFreq = p.second;
+			break;
+		}
+		case ARG_BWA_SW_LIKE: {
+			bwaSwLikeC = 5.5f;
+			bwaSwLikeT = 30;
+			bwaSwLike = true;
+			localAlign = true;
+			// -a INT   Score of a match [1]
+			// -b INT   Mismatch penalty [3]
+			// -q INT   Gap open penalty [5]
+			// -r INT   Gap extension penalty. The penalty for a contiguous
+			//          gap of size k is q+k*r. [2] 
+			polstr += ";MA=1;MMP=C3;RDG=5,2;RFG=5,2";
+			break;
+		}
+		case 'q': format = FASTQ; break;
+		case 'r': format = RAW; break;
+		case 'c': format = CMDLINE; break;
+		case ARG_QSEQ: format = QSEQ; break;
+		case 'C': {
+			cerr << "Error: -C specified but Bowtie 2 does not support colorspace input." << endl;
+			throw 1;
+			break;
+		}
+		case 'I':
+			gMinInsert = parseInt(0, "-I arg must be positive", arg);
+			break;
+		case 'X':
+			gMaxInsert = parseInt(1, "-X arg must be at least 1", arg);
+			break;
+		case ARG_NO_DISCORDANT: gReportDiscordant = false; break;
+		case ARG_NO_MIXED: gReportMixed = false; break;
+		case 's':
+			skipReads = (uint32_t)parseInt(0, "-s arg must be positive", arg);
+			break;
+		case ARG_FF: gMate1fw = true;  gMate2fw = true;  break;
+		case ARG_RF: gMate1fw = false; gMate2fw = true;  break;
+		case ARG_FR: gMate1fw = true;  gMate2fw = false; break;
+		case ARG_SHMEM: useShmem = true; break;
+		case ARG_SEED_SUMM: seedSumm = true; break;
+		case ARG_MM: {
+#ifdef BOWTIE_MM
+			useMm = true;
+			break;
+#else
+			cerr << "Memory-mapped I/O mode is disabled because bowtie was not compiled with" << endl
+				 << "BOWTIE_MM defined.  Memory-mapped I/O is not supported under Windows.  If you" << endl
+				 << "would like to use memory-mapped I/O on a platform that supports it, please" << endl
+				 << "refrain from specifying BOWTIE_MM=0 when compiling Bowtie." << endl;
+			throw 1;
+#endif
+		}
+		case ARG_MMSWEEP: mmSweep = true; break;
+		case ARG_HADOOPOUT: hadoopOut = true; break;
+		case ARG_SOLEXA_QUALS: solexaQuals = true; break;
+		case ARG_INTEGER_QUALS: integerQuals = true; break;
+		case ARG_PHRED64: phred64Quals = true; break;
+		case ARG_PHRED33: solexaQuals = false; phred64Quals = false; break;
+		case ARG_OVERHANG: gReportOverhangs = true; break;
+		case ARG_NO_CACHE: msNoCache = true; break;
+		case ARG_USE_CACHE: msNoCache = false; break;
+		case ARG_LOCAL_SEED_CACHE_SZ:
+			seedCacheLocalMB = (uint32_t)parseInt(1, "--local-seed-cache-sz arg must be at least 1", arg);
+			break;
+		case ARG_CURRENT_SEED_CACHE_SZ:
+			seedCacheCurrentMB = (uint32_t)parseInt(1, "--seed-cache-sz arg must be at least 1", arg);
+			break;
+		case ARG_REFIDX: noRefNames = true; break;
+		case ARG_FUZZY: fuzzy = true; break;
+		case ARG_FULLREF: fullRef = true; break;
+		case ARG_GAP_BAR:
+			gGapBarrier = parseInt(1, "--gbar must be no less than 1", arg);
+			break;
+		case ARG_SEED:
+			seed = parseInt(0, "--seed arg must be at least 0", arg);
+			break;
+		case ARG_NON_DETERMINISTIC:
+			arbitraryRandom = true;
+			break;
+		case 'u':
+			qUpto = (uint32_t)parseInt(1, "-u/--qupto arg must be at least 1", arg);
+			break;
+		case 'Q':
+			tokenize(arg, ",", qualities);
+			integerQuals = true;
+			break;
+		case ARG_QUALS1:
+			tokenize(arg, ",", qualities1);
+			integerQuals = true;
+			break;
+		case ARG_QUALS2:
+			tokenize(arg, ",", qualities2);
+			integerQuals = true;
+			break;
+		case ARG_CACHE_LIM:
+			cacheLimit = (uint32_t)parseInt(1, "--cachelim arg must be at least 1", arg);
+			break;
+		case ARG_CACHE_SZ:
+			cacheSize = (uint32_t)parseInt(1, "--cachesz arg must be at least 1", arg);
+			cacheSize *= (1024 * 1024); // convert from MB to B
+			break;
+		case ARG_WRAPPER: wrapper = arg; break;
+        case 'x': bt2index = arg; break;
+		case 'p':
+			nthreads = parseInt(1, "-p/--threads arg must be at least 1", arg);
+			break;
+		case ARG_FILEPAR:
+			fileParallel = true;
+			break;
+		case '3': gTrim3 = parseInt(0, "-3/--trim3 arg must be at least 0", arg); break;
+		case '5': gTrim5 = parseInt(0, "-5/--trim5 arg must be at least 0", arg); break;
+		case 'h': printUsage(cout); throw 0; break;
+		case ARG_USAGE: printUsage(cout); throw 0; break;
+		//
+		// NOTE that unlike in Bowtie 1, -M, -a and -k are mutually
+		// exclusive here.
+		//
+		case 'M': {
+			msample = true;
+			mhits = parse<uint32_t>(arg);
+			if(saw_a || saw_k) {
+				cerr << "Warning: -M, -k and -a are mutually exclusive. "
+					 << "-M will override" << endl;
+				khits = 1;
+			}
+			assert_eq(1, khits);
+			saw_M = true;
+			cerr << "Warning: -M is deprecated.  Use -D and -R to adjust " <<
+			        "effort instead." << endl;
+			break;
+		}
+		case ARG_EXTEND_ITERS: {
+			maxIters = parse<size_t>(arg);
+			break;
+		}
+		case ARG_NO_EXTEND: {
+			doExtend = false;
+			break;
+		}
+		case 'R': { polstr += ";ROUNDS="; polstr += arg; break; }
+		case 'D': { polstr += ";DPS=";    polstr += arg; break; }
+		case ARG_DP_MATE_STREAK_THRESH: {
+			maxMateStreak = parse<size_t>(arg);
+			break;
+		}
+		case ARG_DP_FAIL_STREAK_THRESH: {
+			maxDpStreak = parse<size_t>(arg);
+			break;
+		}
+		case ARG_EE_FAIL_STREAK_THRESH: {
+			maxEeStreak = parse<size_t>(arg);
+			break;
+		}
+		case ARG_UG_FAIL_STREAK_THRESH: {
+			maxUgStreak = parse<size_t>(arg);
+			break;
+		}
+		case ARG_DP_FAIL_THRESH: {
+			maxDp = parse<size_t>(arg);
+			break;
+		}
+		case ARG_UG_FAIL_THRESH: {
+			maxUg = parse<size_t>(arg);
+			break;
+		}
+		case ARG_SEED_BOOST_THRESH: {
+			seedBoostThresh = parse<int>(arg);
+			break;
+		}
+		case 'a': {
+			msample = false;
+			allHits = true;
+			mhits = 0; // disable -M
+			if(saw_M || saw_k) {
+				cerr << "Warning: -M, -k and -a are mutually exclusive. "
+					 << "-a will override" << endl;
+			}
+			saw_a = true;
+			break;
+		}
+		case 'k': {
+			msample = false;
+			khits = (uint32_t)parseInt(1, "-k arg must be at least 1", arg);
+			mhits = 0; // disable -M
+			if(saw_M || saw_a) {
+				cerr << "Warning: -M, -k and -a are mutually exclusive. "
+					 << "-k will override" << endl;
+			}
+			saw_k = true;
+			break;
+		}
+		case ARG_VERBOSE: gVerbose = 1; break;
+		case ARG_STARTVERBOSE: startVerbose = true; break;
+		case ARG_QUIET: gQuiet = true; break;
+		case ARG_SANITY: sanityCheck = true; break;
+		case 't': timing = true; break;
+		case ARG_METRIC_IVAL: {
+			metricsIval = parseInt(1, "--metrics arg must be at least 1", arg);
+			break;
+		}
+		case ARG_METRIC_FILE: metricsFile = arg; break;
+		case ARG_METRIC_STDERR: metricsStderr = true; break;
+		case ARG_METRIC_PER_READ: metricsPerRead = true; break;
+		case ARG_NO_FW: gNofw = true; break;
+		case ARG_NO_RC: gNorc = true; break;
+		case ARG_SAM_NO_QNAME_TRUNC: samTruncQname = false; break;
+		case ARG_SAM_OMIT_SEC_SEQ: samOmitSecSeqQual = true; break;
+		case ARG_SAM_NO_UNAL: samNoUnal = true; break;
+		case ARG_SAM_NOHEAD: samNoHead = true; break;
+		case ARG_SAM_NOSQ: samNoSQ = true; break;
+		case ARG_SAM_PRINT_YI: sam_print_yi = true; break;
+		case ARG_REORDER: reorder = true; break;
+		case ARG_MAPQ_EX: {
+			sam_print_zp = true;
+			sam_print_zu = true;
+			sam_print_xp = true;
+			sam_print_xss = true;
+			sam_print_yn = true;
+			break;
+		}
+		case ARG_SHOW_RAND_SEED: {
+			sam_print_zs = true;
+			break;
+		}
+		case ARG_SAMPLE:
+			sampleFrac = parse<float>(arg);
+			break;
+		case ARG_CP_MIN:
+			cminlen = parse<size_t>(arg);
+			break;
+		case ARG_CP_IVAL:
+			cpow2 = parse<size_t>(arg);
+			break;
+		case ARG_TRI:
+			doTri = true;
+			break;
+		case ARG_READ_PASSTHRU: {
+			sam_print_xr = true;
+			break;
+		}
+		case ARG_READ_TIMES: {
+			sam_print_xt = true;
+			sam_print_xd = true;
+			sam_print_xu = true;
+			sam_print_yl = true;
+			sam_print_ye = true;
+			sam_print_yu = true;
+			sam_print_yr = true;
+			sam_print_zb = true;
+			sam_print_zr = true;
+			sam_print_zf = true;
+			sam_print_zm = true;
+			sam_print_zi = true;
+			break;
+		}
+		case ARG_PARTITION: partitionSz = parse<int>(arg); break;
+		case ARG_DPAD:
+			maxhalf = parseInt(0, "--dpad must be no less than 0", arg);
+			break;
+		case ARG_ORIG:
+			if(arg == NULL || strlen(arg) == 0) {
+				cerr << "--orig arg must be followed by a string" << endl;
+				printUsage(cerr);
+				throw 1;
+			}
+			origString = arg;
+			break;
+		case ARG_NO_DOVETAIL: gDovetailMatesOK = false; break;
+		case ARG_NO_CONTAIN:  gContainMatesOK  = false; break;
+		case ARG_NO_OVERLAP:  gOlapMatesOK     = false; break;
+		case ARG_DOVETAIL:    gDovetailMatesOK = true;  break;
+		case ARG_CONTAIN:     gContainMatesOK  = true;  break;
+		case ARG_OVERLAP:     gOlapMatesOK     = true;  break;
+		case ARG_QC_FILTER: qcFilter = true; break;
+		case ARG_NO_SCORE_PRIORITY: sortByScore = false; break;
+		case ARG_IGNORE_QUALS: ignoreQuals = true; break;
+		case ARG_MAPQ_V: mapqv = parse<int>(arg); break;
+		case 'P': { presetList.push_back(arg); break; }
+		case ARG_ALIGN_POLICY: {
+			if(strlen(arg) > 0) {
+				polstr += ";"; polstr += arg;
+			}
+			break;
+		}
+		case 'N': { polstr += ";SEED="; polstr += arg; break; }
+		case 'L': {
+			int64_t len = parse<size_t>(arg);
+			if(len < 0) {
+				cerr << "Error: -L argument must be >= 0; was " << arg << endl;
+				throw 1;
+			}
+			if(len > 32) {
+				cerr << "Error: -L argument must be <= 32; was" << arg << endl;
+				throw 1;
+			}
+			polstr += ";SEEDLEN="; polstr += arg; break;
+		}
+		case 'O':
+			multiseedOff = parse<size_t>(arg);
+			break;
+		case 'i': {
+			EList<string> args;
+			tokenize(arg, ",", args);
+			if(args.size() > 3 || args.size() == 0) {
+				cerr << "Error: expected 3 or fewer comma-separated "
+					 << "arguments to -i option, got "
+					 << args.size() << endl;
+				throw 1;
+			}
+			// Interval-settings arguments
+			polstr += (";IVAL=" + args[0]); // Function type
+			if(args.size() > 1) {
+				polstr += ("," + args[1]);  // Constant term
+			}
+			if(args.size() > 2) {
+				polstr += ("," + args[2]);  // Coefficient
+			}
+			break;
+		}
+		case ARG_MULTISEED_IVAL: {
+			polstr += ";";
+			// Split argument by comma
+			EList<string> args;
+			tokenize(arg, ",", args);
+			if(args.size() > 5 || args.size() == 0) {
+				cerr << "Error: expected 5 or fewer comma-separated "
+					 << "arguments to --multiseed option, got "
+					 << args.size() << endl;
+				throw 1;
+			}
+			// Seed mm and length arguments
+			polstr += "SEED=";
+			polstr += (args[0]); // # mismatches
+			if(args.size() >  1) polstr += ("," + args[ 1]); // length
+			if(args.size() >  2) polstr += (";IVAL=" + args[2]); // Func type
+			if(args.size() >  3) polstr += ("," + args[ 3]); // Constant term
+			if(args.size() >  4) polstr += ("," + args[ 4]); // Coefficient
+			break;
+		}
+		case ARG_N_CEIL: {
+			// Split argument by comma
+			EList<string> args;
+			tokenize(arg, ",", args);
+			if(args.size() > 3) {
+				cerr << "Error: expected 3 or fewer comma-separated "
+					 << "arguments to --n-ceil option, got "
+					 << args.size() << endl;
+				throw 1;
+			}
+			if(args.size() == 0) {
+				cerr << "Error: expected at least one argument to --n-ceil option" << endl;
+				throw 1;
+			}
+			polstr += ";NCEIL=";
+			if(args.size() == 3) {
+				polstr += (args[0] + "," + args[1] + "," + args[2]);
+			} else {
+                if(args.size() == 1) {
+                    polstr += ("C," + args[0]);
+                } else {
+					polstr += (args[0] + "," + args[1]);
+				}
+			}
+			break;
+		}
+		case ARG_SCORE_MA:  polstr += ";MA=";    polstr += arg; break;
+		case ARG_SCORE_MMP: {
+			EList<string> args;
+			tokenize(arg, ",", args);
+			if(args.size() > 2 || args.size() == 0) {
+				cerr << "Error: expected 1 or 2 comma-separated "
+					 << "arguments to --mmp option, got " << args.size() << endl;
+				throw 1;
+			}
+			if(args.size() >= 1) {
+				polstr += ";MMP=Q,";
+				polstr += args[0];
+				if(args.size() >= 2) {
+					polstr += ",";
+					polstr += args[1];
+				}
+			}
+			break;
+		}
+		case ARG_SCORE_NP:  polstr += ";NP=C";   polstr += arg; break;
+		case ARG_SCORE_RDG: polstr += ";RDG=";   polstr += arg; break;
+		case ARG_SCORE_RFG: polstr += ";RFG=";   polstr += arg; break;
+		case ARG_SCORE_MIN: {
+			polstr += ";";
+			EList<string> args;
+			tokenize(arg, ",", args);
+			if(args.size() > 3 && args.size() == 0) {
+				cerr << "Error: expected 3 or fewer comma-separated "
+					 << "arguments to --n-ceil option, got "
+					 << args.size() << endl;
+				throw 1;
+			}
+			polstr += ("MIN=" + args[0]);
+			if(args.size() > 1) {
+				polstr += ("," + args[1]);
+			}
+			if(args.size() > 2) {
+				polstr += ("," + args[2]);
+			}
+			break;
+		}
+		case ARG_DESC: printArgDesc(cout); throw 0;
+		case 'S': outfile = arg; break;
+		case 'U': {
+			EList<string> args;
+			tokenize(arg, ",", args);
+			for(size_t i = 0; i < args.size(); i++) {
+				queries.push_back(args[i]);
+			}
+			break;
+		}
+		case ARG_VERSION: showVersion = 1; break;
+        case ARG_MIN_HITLEN: {
+            minHitLen = parseInt(15, "--min-hitlen arg must be at least 15", arg);
+            break;
+        }
+        case ARG_MIN_TOTALLEN: {
+        	minTotalLen = parseInt(50, "--min-totallen arg must be at least 50", arg);
+        	break;
+        }
+        case ARG_HOST_TAXIDS: {
+            EList<string> args;
+            tokenize(arg, ",", args);
+            for(size_t i = 0; i < args.size(); i++) {
+                istringstream ss(args[i]);
+                uint64_t tid;
+                ss >> tid;
+                host_taxIDs.push_back(tid);
+            }
+            break;
+        }
+        case ARG_REPORT_FILE: {
+        	reportFile = arg;
+        	break;
+        }
+        case ARG_NO_ABUNDANCE: {
+            abundance_analysis = false;
+            break;
+        }
+        case ARG_NO_TRAVERSE: {
+            tree_traverse = false;
+            break;
+        }
+        case ARG_CLASSIFICATION_RANK: {
+            classification_rank = arg;
+            if(classification_rank != "strain" &&
+               classification_rank != "species" &&
+               classification_rank != "genus" &&
+               classification_rank != "family" &&
+               classification_rank != "order" &&
+               classification_rank != "class" &&
+               classification_rank != "phylum") {
+                cerr << "Error: " << classification_rank << " (--classification-rank) should be one of strain, species, genus, family, order, class, and phylum" << endl;
+                exit(1);
+            }
+            break;
+        }
+        case ARG_EXCLUDE_TAXIDS: {
+            EList<string> args;
+            tokenize(arg, ",", args);
+            for(size_t i = 0; i < args.size(); i++) {
+                istringstream ss(args[i]);
+                uint64_t tid;
+                ss >> tid;
+                excluded_taxIDs.push_back(tid);
+            }
+            break;
+        }
+#ifdef USE_SRA
+        case ARG_SRA_ACC: {
+            tokenize(arg, ",", sra_accs); format = SRA_FASTA;
+            break;
+        }
+#endif
+		default:
+			printUsage(cerr);
+			throw 1;
+	}
+}
+
+/**
+ * Read command-line arguments
+ */
+static void parseOptions(int argc, const char **argv) {
+
+	int option_index = 0;
+	int next_option;
+	saw_M = false;
+	saw_a = false;
+	saw_k = true;
+	presetList.clear();
+	if(startVerbose) { cerr << "Parsing options: "; logTime(cerr, true); }
+	while(true) {
+		next_option = getopt_long(
+			argc, const_cast<char**>(argv),
+			short_options, long_options, &option_index);
+		const char * arg = optarg;
+		if(next_option == EOF) {
+			if(extra_opts_cur < extra_opts.size()) {
+				next_option = extra_opts[extra_opts_cur].first;
+				arg = extra_opts[extra_opts_cur].second.c_str();
+				extra_opts_cur++;
+			} else {
+				break;
+			}
+		}
+		parseOption(next_option, arg);
+	}
+	// Now parse all the presets.  Might want to pick which presets version to
+	// use according to other parameters.
+	auto_ptr<Presets> presets(new PresetsV0());
+	// Apply default preset
+	if(!defaultPreset.empty()) {
+		polstr = applyPreset(defaultPreset, *presets.get()) + polstr;
+	}
+	// Apply specified presets
+	for(size_t i = 0; i < presetList.size(); i++) {
+		polstr += applyPreset(presetList[i], *presets.get());
+	}
+	for(size_t i = 0; i < extra_opts.size(); i++) {
+		next_option = extra_opts[extra_opts_cur].first;
+		const char *arg = extra_opts[extra_opts_cur].second.c_str();
+		parseOption(next_option, arg);
+	}
+	// Remove initial semicolons
+	while(!polstr.empty() && polstr[0] == ';') {
+		polstr = polstr.substr(1);
+	}
+	if(gVerbose) {
+		cerr << "Final policy string: '" << polstr.c_str() << "'" << endl;
+	}
+	size_t failStreakTmp = 0;
+	SeedAlignmentPolicy::parseString(
+		polstr,
+		localAlign,
+		noisyHpolymer,
+		ignoreQuals,
+		bonusMatchType,
+		bonusMatch,
+		penMmcType,
+		penMmcMax,
+		penMmcMin,
+		penNType,
+		penN,
+		penRdGapConst,
+		penRfGapConst,
+		penRdGapLinear,
+		penRfGapLinear,
+		scoreMin,
+		nCeil,
+		penNCatPair,
+		multiseedMms,
+		multiseedLen,
+		msIval,
+		failStreakTmp,
+		nSeedRounds);
+	if(failStreakTmp > 0) {
+		maxEeStreak = failStreakTmp;
+		maxUgStreak = failStreakTmp;
+		maxDpStreak = failStreakTmp;
+	}
+	if(saw_a || saw_k) {
+		msample = false;
+		mhits = 0;
+	} else {
+		assert_gt(mhits, 0);
+		msample = true;
+	}
+	if(mates1.size() != mates2.size()) {
+		cerr << "Error: " << mates1.size() << " mate files/sequences were specified with -1, but " << mates2.size() << endl
+		     << "mate files/sequences were specified with -2.  The same number of mate files/" << endl
+		     << "sequences must be specified with -1 and -2." << endl;
+		throw 1;
+	}
+	if(qualities.size() && format != FASTA) {
+		cerr << "Error: one or more quality files were specified with -Q but -f was not" << endl
+		     << "enabled.  -Q works only in combination with -f and -C." << endl;
+		throw 1;
+	}
+	if(qualities1.size() && format != FASTA) {
+		cerr << "Error: one or more quality files were specified with --Q1 but -f was not" << endl
+		     << "enabled.  --Q1 works only in combination with -f and -C." << endl;
+		throw 1;
+	}
+	if(qualities2.size() && format != FASTA) {
+		cerr << "Error: one or more quality files were specified with --Q2 but -f was not" << endl
+		     << "enabled.  --Q2 works only in combination with -f and -C." << endl;
+		throw 1;
+	}
+	if(qualities1.size() > 0 && mates1.size() != qualities1.size()) {
+		cerr << "Error: " << mates1.size() << " mate files/sequences were specified with -1, but " << qualities1.size() << endl
+		     << "quality files were specified with --Q1.  The same number of mate and quality" << endl
+		     << "files must sequences must be specified with -1 and --Q1." << endl;
+		throw 1;
+	}
+	if(qualities2.size() > 0 && mates2.size() != qualities2.size()) {
+		cerr << "Error: " << mates2.size() << " mate files/sequences were specified with -2, but " << qualities2.size() << endl
+		     << "quality files were specified with --Q2.  The same number of mate and quality" << endl
+		     << "files must sequences must be specified with -2 and --Q2." << endl;
+		throw 1;
+	}
+	if(!rgs.empty() && rgid.empty()) {
+		cerr << "Warning: --rg was specified without --rg-id also "
+		     << "being specified.  @RG line is not printed unless --rg-id "
+			 << "is specified." << endl;
+	}
+	// Check for duplicate mate input files
+	if(format != CMDLINE) {
+		for(size_t i = 0; i < mates1.size(); i++) {
+			for(size_t j = 0; j < mates2.size(); j++) {
+				if(mates1[i] == mates2[j] && !gQuiet) {
+					cerr << "Warning: Same mate file \"" << mates1[i].c_str() << "\" appears as argument to both -1 and -2" << endl;
+				}
+			}
+		}
+	}
+	// If both -s and -u are used, we need to adjust qUpto accordingly
+	// since it uses rdid to know if we've reached the -u limit (and
+	// rdids are all shifted up by skipReads characters)
+	if(qUpto + skipReads > qUpto) {
+		qUpto += skipReads;
+	}
+	if(useShmem && useMm && !gQuiet) {
+		cerr << "Warning: --shmem overrides --mm..." << endl;
+		useMm = false;
+	}
+	if(gGapBarrier < 1) {
+		cerr << "Warning: --gbar was set less than 1 (=" << gGapBarrier
+		     << "); setting to 1 instead" << endl;
+		gGapBarrier = 1;
+	}
+	if(multiseedMms >= multiseedLen) {
+		assert_gt(multiseedLen, 0);
+		cerr << "Warning: seed mismatches (" << multiseedMms
+		     << ") is less than seed length (" << multiseedLen
+			 << "); setting mismatches to " << (multiseedMms-1)
+			 << " instead" << endl;
+		multiseedMms = multiseedLen-1;
+	}
+	sam_print_zm = sam_print_zm && bowtie2p5;
+#ifndef NDEBUG
+	if(!gQuiet) {
+		cerr << "Warning: Running in debug mode.  Please use debug mode only "
+			 << "for diagnosing errors, and not for typical use of Centrifuge."
+			 << endl;
+	}
+#endif
+}
+
// Name by which the program was invoked; presumably assigned from argv[0] by
// the driver/main entry point -- not set anywhere in this chunk (TODO confirm).
static const char *argv0 = NULL;
+
+/// Create a PatternSourcePerThread for the current thread according
+/// to the global params and return a pointer to it
+static PatternSourcePerThreadFactory*
+createPatsrcFactory(PairedPatternSource& _patsrc, int tid) {
+	PatternSourcePerThreadFactory *patsrcFact;
+	patsrcFact = new WrappedPatternSourcePerThreadFactory(_patsrc);
+	assert(patsrcFact != NULL);
+	return patsrcFact;
+}
+
// NOTE(review): ORing PTHREAD_CREATE_JOINABLE with PTHREAD_CREATE_DETACHED is
// suspect -- these are mutually exclusive pthread detach states -- but the
// macro is preserved as-is; confirm against its use sites before changing.
#define PTHREAD_ATTRS (PTHREAD_CREATE_JOINABLE | PTHREAD_CREATE_DETACHED)

// Index-offset types used throughout the multiseed search machinery.
typedef TIndexOffU index_t;
typedef uint16_t local_index_t;
// Shared state consulted by per-thread multiseed workers.  Presumably
// initialized by the driver before threads start -- not assigned in this
// chunk (TODO confirm).
static PairedPatternSource*              multiseed_patsrc;
static Ebwt<index_t>*                    multiseed_ebwtFw;
static Ebwt<index_t>*                    multiseed_ebwtBw;
static Scoring*                          multiseed_sc;
static BitPairReference*                 multiseed_refs;
static AlnSink<index_t>*                 multiseed_msink;
static OutFileBuf*                       multiseed_metricsOfb;
static EList<string>                     multiseed_refnames;
+
+/**
+ * Metrics for measuring the work done by the outer read alignment
+ * loop.
+ */
+struct OuterLoopMetrics {
+
+	OuterLoopMetrics() {
+	    reset();
+	}
+
+	/**
+	 * Set all counters to 0.
+	 */
+	void reset() {
+		reads = bases = srreads = srbases =
+		freads = fbases = ureads = ubases = 0;
+	}
+
+	/**
+	 * Sum the counters in m in with the conters in this object.  This
+	 * is the only safe way to update an OuterLoopMetrics that's shared
+	 * by multiple threads.
+	 */
+	void merge(
+		const OuterLoopMetrics& m,
+		bool getLock = false)
+	{
+		ThreadSafe ts(&mutex_m, getLock);
+		reads += m.reads;
+		bases += m.bases;
+		srreads += m.srreads;
+		srbases += m.srbases;
+		freads += m.freads;
+		fbases += m.fbases;
+		ureads += m.ureads;
+		ubases += m.ubases;
+	}
+
+	uint64_t reads;   // total reads
+	uint64_t bases;   // total bases
+	uint64_t srreads; // same-read reads
+	uint64_t srbases; // same-read bases
+	uint64_t freads;  // filtered reads
+	uint64_t fbases;  // filtered bases
+	uint64_t ureads;  // unfiltered reads
+	uint64_t ubases;  // unfiltered bases
+	MUTEX_T mutex_m;
+};
+
+/**
+ * Collection of all relevant performance metrics when aligning in
+ * multiseed mode.
+ */
+struct PerfMetrics {
+
+	PerfMetrics() : first(true) { reset(); }
+
+	/**
+	 * Set all counters to 0.
+	 */
+	void reset() {
+		olm.reset();
+		wlm.reset();
+		rpm.reset();
+		spm.reset();
+		nbtfiltst = 0;
+		nbtfiltsc = 0;
+		nbtfiltdo = 0;
+		
+		olmu.reset();
+		wlmu.reset();
+		rpmu.reset();
+		spmu.reset();
+		nbtfiltst_u = 0;
+		nbtfiltsc_u = 0;
+		nbtfiltdo_u = 0;
+        
+        him.reset();
+	}
+
+	/**
+	 * Merge a set of specific metrics into this object.
+	 */
+	void merge(
+		const OuterLoopMetrics *ol,
+		const WalkMetrics *wl,
+		const ReportingMetrics *rm,
+		const SpeciesMetrics *sm,
+		uint64_t nbtfiltst_,
+		uint64_t nbtfiltsc_,
+		uint64_t nbtfiltdo_,
+        const HIMetrics *hi,
+		bool getLock)
+	{
+
+		ThreadSafe ts(&mutex_m, getLock);
+		if(ol != NULL) {
+			olmu.merge(*ol, false);
+		}
+		if(wl != NULL) {
+			wlmu.merge(*wl, false);
+		}
+		if(rm != NULL) {
+			rpmu.merge(*rm, false);
+		}
+		if (sm != NULL) {
+			spmu.merge(*sm, false);
+		}
+		nbtfiltst_u += nbtfiltst_;
+		nbtfiltsc_u += nbtfiltsc_;
+		nbtfiltdo_u += nbtfiltdo_;
+        if(hi != NULL) {
+            him.merge(*hi, false);
+        }
+	}
+
+	/**
+	 * Reports a matrix of results, incl. column labels, to an OutFileBuf.
+	 * Optionally also sends results to stderr (unbuffered).  Can optionally
+	 * print a per-read record with the read name at the beginning.
+	 */
+	void reportInterval(
+		OutFileBuf* o,        // file to send output to
+		bool metricsStderr,   // additionally output to stderr?
+		bool total,           // true -> report total, otherwise incremental
+		bool sync,            //  synchronize output
+		const BTString *name) // non-NULL name pointer if is per-read record
+	{
+		ThreadSafe ts(&mutex_m, sync);
+		ostringstream stderrSs;
+		time_t curtime = time(0);
+		char buf[1024];
+		if(first) {
+			const char *str =
+				/*  1 */ "Time"           "\t"
+				/*  2 */ "Read"           "\t"
+				/*  3 */ "Base"           "\t"
+				/*  4 */ "SameRead"       "\t"
+				/*  5 */ "SameReadBase"   "\t"
+				/*  6 */ "UnfilteredRead" "\t"
+				/*  7 */ "UnfilteredBase" "\t"
+				
+				/*  8 */ "Paired"         "\t"
+				/*  9 */ "Unpaired"       "\t"
+				/* 10 */ "AlConUni"       "\t"
+				/* 11 */ "AlConRep"       "\t"
+				/* 12 */ "AlConFail"      "\t"
+				/* 13 */ "AlDis"          "\t"
+				/* 14 */ "AlConFailUni"   "\t"
+				/* 15 */ "AlConFailRep"   "\t"
+				/* 16 */ "AlConFailFail"  "\t"
+				/* 17 */ "AlConRepUni"    "\t"
+				/* 18 */ "AlConRepRep"    "\t"
+				/* 19 */ "AlConRepFail"   "\t"
+				/* 20 */ "AlUnpUni"       "\t"
+				/* 21 */ "AlUnpRep"       "\t"
+				/* 22 */ "AlUnpFail"      "\t"
+				
+				/* 23 */ "SeedSearch"     "\t"
+				/* 24 */ "IntraSCacheHit" "\t"
+				/* 25 */ "InterSCacheHit" "\t"
+				/* 26 */ "OutOfMemory"    "\t"
+				/* 27 */ "AlBWOp"         "\t"
+				/* 28 */ "AlBWBranch"     "\t"
+				/* 29 */ "ResBWOp"        "\t"
+				/* 30 */ "ResBWBranch"    "\t"
+				/* 31 */ "ResResolve"     "\t"
+				/* 34 */ "ResReport"      "\t"
+				/* 35 */ "RedundantSHit"  "\t"
+
+				/* 36 */ "BestMinEdit0"   "\t"
+				/* 37 */ "BestMinEdit1"   "\t"
+				/* 38 */ "BestMinEdit2"   "\t"
+
+				/* 39 */ "ExactAttempts"  "\t"
+				/* 40 */ "ExactSucc"      "\t"
+				/* 41 */ "ExactRanges"    "\t"
+				/* 42 */ "ExactRows"      "\t"
+				/* 43 */ "ExactOOMs"      "\t"
+
+				/* 44 */ "1mmAttempts"    "\t"
+				/* 45 */ "1mmSucc"        "\t"
+				/* 46 */ "1mmRanges"      "\t"
+				/* 47 */ "1mmRows"        "\t"
+				/* 48 */ "1mmOOMs"        "\t"
+
+				/* 49 */ "UngappedSucc"   "\t"
+				/* 50 */ "UngappedFail"   "\t"
+				/* 51 */ "UngappedNoDec"  "\t"
+
+				/* 52 */ "DPExLt10Gaps"   "\t"
+				/* 53 */ "DPExLt5Gaps"    "\t"
+				/* 54 */ "DPExLt3Gaps"    "\t"
+
+				/* 55 */ "DPMateLt10Gaps" "\t"
+				/* 56 */ "DPMateLt5Gaps"  "\t"
+				/* 57 */ "DPMateLt3Gaps"  "\t"
+
+				/* 58 */ "DP16ExDps"      "\t"
+				/* 59 */ "DP16ExDpSat"    "\t"
+				/* 60 */ "DP16ExDpFail"   "\t"
+				/* 61 */ "DP16ExDpSucc"   "\t"
+				/* 62 */ "DP16ExCol"      "\t"
+				/* 63 */ "DP16ExCell"     "\t"
+				/* 64 */ "DP16ExInner"    "\t"
+				/* 65 */ "DP16ExFixup"    "\t"
+				/* 66 */ "DP16ExGathSol"  "\t"
+				/* 67 */ "DP16ExBt"       "\t"
+				/* 68 */ "DP16ExBtFail"   "\t"
+				/* 69 */ "DP16ExBtSucc"   "\t"
+				/* 70 */ "DP16ExBtCell"   "\t"
+				/* 71 */ "DP16ExCoreRej"  "\t"
+				/* 72 */ "DP16ExNRej"     "\t"
+
+				/* 73 */ "DP8ExDps"       "\t"
+				/* 74 */ "DP8ExDpSat"     "\t"
+				/* 75 */ "DP8ExDpFail"    "\t"
+				/* 76 */ "DP8ExDpSucc"    "\t"
+				/* 77 */ "DP8ExCol"       "\t"
+				/* 78 */ "DP8ExCell"      "\t"
+				/* 79 */ "DP8ExInner"     "\t"
+				/* 80 */ "DP8ExFixup"     "\t"
+				/* 81 */ "DP8ExGathSol"   "\t"
+				/* 82 */ "DP8ExBt"        "\t"
+				/* 83 */ "DP8ExBtFail"    "\t"
+				/* 84 */ "DP8ExBtSucc"    "\t"
+				/* 85 */ "DP8ExBtCell"    "\t"
+				/* 86 */ "DP8ExCoreRej"   "\t"
+				/* 87 */ "DP8ExNRej"      "\t"
+
+				/* 88 */ "DP16MateDps"     "\t"
+				/* 89 */ "DP16MateDpSat"   "\t"
+				/* 90 */ "DP16MateDpFail"  "\t"
+				/* 91 */ "DP16MateDpSucc"  "\t"
+				/* 92 */ "DP16MateCol"     "\t"
+				/* 93 */ "DP16MateCell"    "\t"
+				/* 94 */ "DP16MateInner"   "\t"
+				/* 95 */ "DP16MateFixup"   "\t"
+				/* 96 */ "DP16MateGathSol" "\t"
+				/* 97 */ "DP16MateBt"      "\t"
+				/* 98 */ "DP16MateBtFail"  "\t"
+				/* 99 */ "DP16MateBtSucc"  "\t"
+				/* 100 */ "DP16MateBtCell"  "\t"
+				/* 101 */ "DP16MateCoreRej" "\t"
+				/* 102 */ "DP16MateNRej"    "\t"
+
+				/* 103 */ "DP8MateDps"     "\t"
+				/* 104 */ "DP8MateDpSat"   "\t"
+				/* 105 */ "DP8MateDpFail"  "\t"
+				/* 106 */ "DP8MateDpSucc"  "\t"
+				/* 107 */ "DP8MateCol"     "\t"
+				/* 108 */ "DP8MateCell"    "\t"
+				/* 109 */ "DP8MateInner"   "\t"
+				/* 110 */ "DP8MateFixup"   "\t"
+				/* 111 */ "DP8MateGathSol" "\t"
+				/* 112 */ "DP8MateBt"      "\t"
+				/* 113 */ "DP8MateBtFail"  "\t"
+				/* 114 */ "DP8MateBtSucc"  "\t"
+				/* 115 */ "DP8MateBtCell"  "\t"
+				/* 116 */ "DP8MateCoreRej" "\t"
+				/* 117 */ "DP8MateNRej"    "\t"
+
+				/* 118 */ "DPBtFiltStart"  "\t"
+				/* 119 */ "DPBtFiltScore"  "\t"
+				/* 120 */ "DpBtFiltDom"    "\t"
+
+				/* 121 */ "MemPeak"        "\t"
+				/* 122 */ "UncatMemPeak"   "\t" // 0
+				/* 123 */ "EbwtMemPeak"    "\t" // EBWT_CAT
+				/* 124 */ "CacheMemPeak"   "\t" // CA_CAT
+				/* 125 */ "ResolveMemPeak" "\t" // GW_CAT
+				/* 126 */ "AlignMemPeak"   "\t" // AL_CAT
+				/* 127 */ "DPMemPeak"      "\t" // DP_CAT
+				/* 128 */ "MiscMemPeak"    "\t" // MISC_CAT
+				/* 129 */ "DebugMemPeak"   "\t" // DEBUG_CAT
+            
+                /* 130 */ "LocalSearch"         "\t"
+                /* 131 */ "AnchorSearch"        "\t"
+                /* 132 */ "LocalIndexSearch"    "\t"
+                /* 133 */ "LocalExtSearch"      "\t"
+                /* 134 */ "LocalSearchRecur"    "\t"
+                /* 135 */ "GlobalGenomeCoords"  "\t"
+                /* 136 */ "LocalGenomeCoords"   "\t"
+            
+            
+				"\n";
+			
+			if(name != NULL) {
+				if(o != NULL) o->writeChars("Name\t");
+				if(metricsStderr) stderrSs << "Name\t";
+			}
+			
+			if(o != NULL) o->writeChars(str);
+			if(metricsStderr) stderrSs << str;
+			first = false;
+		}
+		
+		if(total) mergeIncrementals();
+		
+		// 0. Read name, if needed
+		if(name != NULL) {
+			if(o != NULL) {
+				o->writeChars(name->toZBuf());
+				o->write('\t');
+			}
+			if(metricsStderr) {
+				stderrSs << (*name) << '\t';
+			}
+		}
+			
+		// 1. Current time in secs
+		itoa10<time_t>(curtime, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+		
+		const OuterLoopMetrics& ol = total ? olm : olmu;
+		
+		// 2. Reads
+		itoa10<uint64_t>(ol.reads, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+		// 3. Bases
+		itoa10<uint64_t>(ol.bases, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+		// 4. Same-read reads
+		itoa10<uint64_t>(ol.srreads, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+		// 5. Same-read bases
+		itoa10<uint64_t>(ol.srbases, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+		// 6. Unfiltered reads
+		itoa10<uint64_t>(ol.ureads, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+		// 7. Unfiltered bases
+		itoa10<uint64_t>(ol.ubases, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+
+		const ReportingMetrics& rp = total ? rpm : rpmu;
+		//const SpeciesMetrics& sp = total ? spm : spmu; // TODO: do something with sp
+
+		// 8. Paired reads
+		itoa10<uint64_t>(rp.npaired, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+		// 9. Unpaired reads
+		itoa10<uint64_t>(rp.nunpaired, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+		// 10. Pairs with unique concordant alignments
+		itoa10<uint64_t>(rp.nconcord_uni, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+#if 0
+		// 11. Pairs with repetitive concordant alignments
+		itoa10<uint64_t>(rp.nconcord_rep, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+		// 12. Pairs with 0 concordant alignments
+		itoa10<uint64_t>(rp.nconcord_0, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+		// 13. Pairs with 1 discordant alignment
+		itoa10<uint64_t>(rp.ndiscord, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+		// 14. Mates from unaligned pairs that align uniquely
+		itoa10<uint64_t>(rp.nunp_0_uni, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+		// 15. Mates from unaligned pairs that align repetitively
+		itoa10<uint64_t>(rp.nunp_0_rep, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+		// 16. Mates from unaligned pairs that fail to align
+		itoa10<uint64_t>(rp.nunp_0_0, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+		// 17. Mates from repetitive pairs that align uniquely
+		itoa10<uint64_t>(rp.nunp_rep_uni, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+		// 18. Mates from repetitive pairs that align repetitively
+		itoa10<uint64_t>(rp.nunp_rep_rep, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+		// 19. Mates from repetitive pairs that fail to align
+		itoa10<uint64_t>(rp.nunp_rep_0, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+		// 20. Unpaired reads that align uniquely
+		itoa10<uint64_t>(rp.nunp_uni, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+		// 21. Unpaired reads that align repetitively
+		itoa10<uint64_t>(rp.nunp_rep, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+		// 22. Unpaired reads that fail to align
+		itoa10<uint64_t>(rp.nunp_0, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+#endif
+        
+		const WalkMetrics& wl = total ? wlm : wlmu;
+		
+		// 29. Burrows-Wheeler ops in resolver
+		itoa10<uint64_t>(wl.bwops, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+		// 30. Burrows-Wheeler branches in resolver
+		itoa10<uint64_t>(wl.branches, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+		// 31. Burrows-Wheeler offset resolutions
+		itoa10<uint64_t>(wl.resolves, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+		// 34. Offset reports
+		itoa10<uint64_t>(wl.reports, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+		
+		// 121. Overall memory peak
+		itoa10<size_t>(gMemTally.peak() >> 20, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+		// 122. Uncategorized memory peak
+		itoa10<size_t>(gMemTally.peak(0) >> 20, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+		// 123. Ebwt memory peak
+		itoa10<size_t>(gMemTally.peak(EBWT_CAT) >> 20, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+		// 124. Cache memory peak
+		itoa10<size_t>(gMemTally.peak(CA_CAT) >> 20, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+		// 125. Resolver memory peak
+		itoa10<size_t>(gMemTally.peak(GW_CAT) >> 20, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+		// 126. Seed aligner memory peak
+		itoa10<size_t>(gMemTally.peak(AL_CAT) >> 20, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+		// 127. Dynamic programming aligner memory peak
+		itoa10<size_t>(gMemTally.peak(DP_CAT) >> 20, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+		// 128. Miscellaneous memory peak
+		itoa10<size_t>(gMemTally.peak(MISC_CAT) >> 20, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+		// 129. Debug memory peak
+		itoa10<size_t>(gMemTally.peak(DEBUG_CAT) >> 20, buf);
+        if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+        
+        // 130
+        itoa10<size_t>(him.localatts, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+        // 131
+        itoa10<size_t>(him.anchoratts, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+        // 132
+        itoa10<size_t>(him.localindexatts, buf);
+		if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+        // 133
+        itoa10<size_t>(him.localextatts, buf);
+        if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+        // 134
+        itoa10<size_t>(him.localsearchrecur, buf);
+        if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+        // 135
+        itoa10<size_t>(him.globalgenomecoords, buf);
+        if(metricsStderr) stderrSs << buf << '\t';
+		if(o != NULL) { o->writeChars(buf); o->write('\t'); }
+        // 136
+        itoa10<size_t>(him.localgenomecoords, buf);
+        if(metricsStderr) stderrSs << buf;
+		if(o != NULL) { o->writeChars(buf); }
+
+		if(o != NULL) { o->write('\n'); }
+		if(metricsStderr) cerr << stderrSs.str().c_str() << endl;
+		if(!total) mergeIncrementals();
+	}
+	
+	// Fold the since-last-interval ("incremental", *_u / *u) metrics into the
+	// whole-job totals, then zero the incrementals for the next interval.
+	void mergeIncrementals() {
+		olm.merge(olmu, false);
+		wlm.merge(wlmu, false);
+		// Accumulate the incremental backtrack-filter counts into the job-wide
+		// totals.  (Bug fix: this was previously inverted -- the totals were
+		// added into the incrementals, which were then immediately zeroed
+		// below, so the totals never grew.)
+		nbtfiltst += nbtfiltst_u;
+		nbtfiltsc += nbtfiltsc_u;
+		nbtfiltdo += nbtfiltdo_u;
+
+		// NOTE(review): rpmu/spmu are reset without being merged into
+		// rpm/spm -- presumably merged elsewhere; confirm before relying on
+		// rpm/spm as job-wide totals here.
+		olmu.reset();
+		wlmu.reset();
+		rpmu.reset();
+		spmu.reset();
+		nbtfiltst_u = 0;
+		nbtfiltsc_u = 0;
+		nbtfiltdo_u = 0;
+	}
+
+	// Totals accumulated over the whole job
+	OuterLoopMetrics  olm;   // overall metrics
+	WalkMetrics       wlm;   // metrics related to walking left (i.e. resolving reference offsets)
+	ReportingMetrics  rpm;   // metrics related to reporting
+	SpeciesMetrics    spm;   // metrics related to species
+	// Backtrack-filter counters (st/sc/do variants); incremented elsewhere --
+	// exact semantics not visible in this chunk.  Merged via merge()/MERGE_METRICS.
+	uint64_t          nbtfiltst;
+	uint64_t          nbtfiltsc;
+	uint64_t          nbtfiltdo;
+
+	// Just since the last update (folded into the totals by mergeIncrementals())
+	OuterLoopMetrics  olmu;  // overall metrics
+	WalkMetrics       wlmu;  // metrics related to walking left (i.e. resolving reference offsets)
+	ReportingMetrics  rpmu;  // metrics related to reporting
+	SpeciesMetrics    spmu;  // metrics related to species counting
+	uint64_t          nbtfiltst_u;
+	uint64_t          nbtfiltsc_u;
+	uint64_t          nbtfiltdo_u;
+    
+    // Hierarchical-index metrics (him.* fields are reported as columns 130-136)
+    HIMetrics         him;
+
+	MUTEX_T           mutex_m;  // lock for exclusive access -- presumably guards concurrent merge/report (original comment was truncated; confirm)
+	bool              first; // yet to print first line?
+	time_t            lastElapsed; // used in reportInterval to measure time since last call
+};
+
+// Global metrics object shared by all threads; merged into via MERGE_METRICS.
+static PerfMetrics metrics;
+
+// Cyclic rotations of a 32-bit value: ROTL/ROTR rotate `x` left/right by `n`
+// bits.  Arguments are fully parenthesized so that expression arguments
+// (e.g. ROTL(a + b, x)) expand correctly; the original `32-n` would bind
+// wrongly for a compound shift expression.  Callers must keep 0 < n < 32:
+// shifting a 32-bit value by 0 or >= 32 bits is undefined behavior.
+#define ROTL(n, x) (((x) << (n)) | ((x) >> (32-(n))))
+#define ROTR(n, x) (((x) >> (n)) | ((x) << (32-(n))))
+
+/**
+ * Emit a warning on stderr that a read (or one mate of a pair) is being
+ * skipped because its length does not exceed the permitted number of seed
+ * mismatches (seedmms).  The message is assembled in a local stream first
+ * so it reaches cerr as a single write.
+ */
+static inline void printMmsSkipMsg(
+	const PatternSourcePerThread& ps,
+	bool paired,
+	bool mate1,
+	int seedmms)
+{
+	ostringstream msg;
+	if(!paired) {
+		// Unpaired read
+		msg << "Warning: skipping read '" << (mate1 ? ps.bufa().name : ps.bufb().name)
+		    << "' because length (" << (mate1 ? ps.bufa().patFw.length() : ps.bufb().patFw.length())
+		    << ") <= # seed mismatches (" << seedmms << ")" << endl;
+	} else {
+		// One mate of a pair
+		msg << "Warning: skipping mate #" << (mate1 ? '1' : '2')
+		    << " of read '" << (mate1 ? ps.bufa().name : ps.bufb().name)
+		    << "' because length (" << (mate1 ? ps.bufa().patFw.length() : ps.bufb().patFw.length())
+		    << ") <= # seed mismatches (" << seedmms << ")" << endl;
+	}
+	cerr << msg.str().c_str();
+}
+
+/**
+ * Emit a warning on stderr that a read (or one mate of a pair) is being
+ * skipped because it is shorter than 2 characters.  Built in a local
+ * stream so the whole message is written to cerr at once.
+ */
+static inline void printLenSkipMsg(
+	const PatternSourcePerThread& ps,
+	bool paired,
+	bool mate1)
+{
+	ostringstream msg;
+	if(!paired) {
+		// Unpaired read
+		msg << "Warning: skipping read '" << (mate1 ? ps.bufa().name : ps.bufb().name)
+		    << "' because it was < 2 characters long" << endl;
+	} else {
+		// One mate of a pair
+		msg << "Warning: skipping mate #" << (mate1 ? '1' : '2')
+		    << " of read '" << (mate1 ? ps.bufa().name : ps.bufb().name)
+		    << "' because it was < 2 characters long" << endl;
+	}
+	cerr << msg.str().c_str();
+}
+
+/**
+ * Warn on stderr that the minimum-score function produced a negative value
+ * in --local mode (local alignment scores are non-negative); the caller
+ * clamps the threshold to 0.
+ */
+static inline void printLocalScoreMsg(
+	const PatternSourcePerThread& ps,
+	bool paired,
+	bool mate1)
+{
+	ostringstream os;
+	if(paired) {
+		os << "Warning: minimum score function gave negative number in "
+		   << "--local mode for mate #" << (mate1 ? '1' : '2')
+		   << " of read '" << (mate1 ? ps.bufa().name : ps.bufb().name)
+		   << "'; setting to 0 instead" << endl;  // fixed: closing quote after read name was missing
+	} else {
+		os << "Warning: minimum score function gave negative number in "
+		   << "--local mode for read '" << (mate1 ? ps.bufa().name : ps.bufb().name)
+		   << "'; setting to 0 instead" << endl;  // fixed: closing quote after read name was missing
+	}
+	cerr << os.str().c_str();
+}
+
+/**
+ * Warn on stderr that the minimum-score function produced a positive value
+ * in --end-to-end mode (end-to-end scores are non-positive); the caller
+ * clamps the threshold to 0.
+ */
+static inline void printEEScoreMsg(
+	const PatternSourcePerThread& ps,
+	bool paired,
+	bool mate1)
+{
+	ostringstream os;
+	if(paired) {
+		os << "Warning: minimum score function gave positive number in "
+		   << "--end-to-end mode for mate #" << (mate1 ? '1' : '2')
+		   << " of read '" << (mate1 ? ps.bufa().name : ps.bufb().name)
+		   << "'; setting to 0 instead" << endl;  // fixed: closing quote after read name was missing
+	} else {
+		os << "Warning: minimum score function gave positive number in "
+		   << "--end-to-end mode for read '" << (mate1 ? ps.bufa().name : ps.bufb().name)
+		   << "'; setting to 0 instead" << endl;  // fixed: closing quote after read name was missing
+	}
+	cerr << os.str().c_str();
+}
+
+
+/**
+ * Merge this thread's locally-accumulated metrics into the PerfMetrics
+ * object `met`, then reset the local copies so the next interval starts
+ * from zero.  Expects these names to be in scope at the expansion site:
+ * msink, olm, wlm, rpm, spm, nbtfiltst, nbtfiltsc, nbtfiltdo, him.
+ * `sync` is forwarded to met.merge(); call sites pass `nthreads > 1` so
+ * the merge is done "in a synchronized manner if needed" (per the call
+ * site's comment).  Note the nbtfilt* counters are merged but not reset.
+ */
+#define MERGE_METRICS(met, sync) { \
+	msink.mergeMetrics(rpm); \
+	met.merge( \
+		&olm, \
+		&wlm, \
+		&rpm, \
+		&spm, \
+		nbtfiltst, \
+		nbtfiltsc, \
+		nbtfiltdo, \
+        &him, \
+		sync); \
+	olm.reset(); \
+	wlm.reset(); \
+	rpm.reset(); \
+	spm.reset(); \
+    him.reset(); \
+}
+
+/**
+ * Called once per thread.  Sets up per-thread pointers to the shared global
+ * data structures, creates per-thread structures, then enters the alignment
+ * loop.  The general flow of the alignment loop is:
+ *
+ * - If it's been a while and we're the master thread, report some alignment
+ *   metrics
+ * - Get the next read/pair
+ * - Check if this read/pair is identical to the previous
+ *   + If identical, check whether we can skip any or all alignment stages.  If
+ *     we can skip all stages, report the result immediately and move to next
+ *     read/pair
+ *   + If not identical, continue
+ * - Filter the read/pair, classify/align it, then commit and report the
+ *   result through the sink wrapper
+ */
+static void multiseedSearchWorker(void *vp) {
+
+	// Thread id handed in by multiseedSearch (ids start at 1 there).
+	int tid = *((int*)vp);
+	assert(multiseed_ebwtFw != NULL);
+	assert(multiseedMms == 0 || multiseed_ebwtBw != NULL);
+	// Local aliases for the shared global search structures.
+	PairedPatternSource&             patsrc   = *multiseed_patsrc;
+	const Ebwt<index_t>&             ebwtFw   = *multiseed_ebwtFw;
+	const Ebwt<index_t>&             ebwtBw   = *multiseed_ebwtBw;
+	const Scoring&                   sc       = *multiseed_sc;
+	const BitPairReference&          ref      = *multiseed_refs;
+	AlnSink<index_t>&                msink    = *multiseed_msink;
+	OutFileBuf*                      metricsOfb = multiseed_metricsOfb;
+    
+	// Sinks: these are so that we can print tables encoding counts for
+	// events of interest on a per-read, per-seed, per-join, or per-SW
+	// level.  These in turn can be used to diagnose performance
+	// problems, or generally characterize performance.
+	
+	//const BitPairReference& refs   = *multiseed_refs;
+	auto_ptr<PatternSourcePerThreadFactory> patsrcFact(createPatsrcFactory(patsrc, tid));
+	auto_ptr<PatternSourcePerThread> ps(patsrcFact->create());
+	
+	// Instantiate an object for holding reporting-related parameters.
+    ReportingParams rp((allHits ? std::numeric_limits<THitInt>::max() : khits),
+                       ebwtFw.compressed()); // -k
+
+	// Make a per-thread wrapper for the global MHitSink object.
+	AlnSinkWrap<index_t> msinkwrap(
+                                   msink,         // global sink
+                                   rp,            // reporting parameters
+                                   (size_t)tid);  // thread id
+    
+    // Per-thread classifier that performs the actual taxonomic search.
+    Classifier<index_t, local_index_t> classifier(
+                                                  ebwtFw,
+                                                  multiseed_refnames,
+                                                  gMate1fw,
+                                                  gMate2fw,
+                                                  minHitLen,
+                                                  tree_traverse,
+                                                  classification_rank,
+                                                  host_taxIDs,
+                                                  excluded_taxIDs);
+	// Per-thread metrics, merged into the global objects via MERGE_METRICS.
+	OuterLoopMetrics olm;
+	WalkMetrics wlm;
+	ReportingMetrics rpm;
+	PerReadMetrics prm;
+	SpeciesMetrics spm;
+
+	RandomSource rnd, rndArb;
+	uint64_t nbtfiltst = 0; // TODO: find a new home for these
+	uint64_t nbtfiltsc = 0; // TODO: find a new home for these
+	uint64_t nbtfiltdo = 0; // TODO: find a new home for these
+    HIMetrics him;
+    
+	ASSERT_ONLY(BTDnaString tmp);
+    
+	// Derive the paired-end orientation policy from the mate orientations.
+	int pepolFlag;
+	if(gMate1fw && gMate2fw) {
+		pepolFlag = PE_POLICY_FF;
+	} else if(gMate1fw && !gMate2fw) {
+		pepolFlag = PE_POLICY_FR;
+	} else if(!gMate1fw && gMate2fw) {
+		pepolFlag = PE_POLICY_RF;
+	} else {
+		pepolFlag = PE_POLICY_RR;
+	}
+	assert_geq(gMaxInsert, gMinInsert);
+	assert_geq(gMinInsert, 0);
+	PairedEndPolicy pepol(
+                          pepolFlag,
+                          gMaxInsert,
+                          gMinInsert,
+                          localAlign,
+                          gFlippedMatesOK,
+                          gDovetailMatesOK,
+                          gContainMatesOK,
+                          gOlapMatesOK,
+                          gExpandToFrag);
+    
+  	PerfMetrics metricsPt; // per-thread metrics object; for read-level metrics
+	BTString nametmp;
+	
+	// Used by thread with threadid == 1 to measure time elapsed
+	time_t iTime = time(0);
+    
+	// Keep track of whether last search was exhaustive for mates 1 and 2
+	bool exhaustive[2] = { false, false };
+	// Keep track of whether mates 1/2 were filtered out last time through
+	bool filt[2]    = { true, true };
+	// Keep track of whether mates 1/2 were filtered out due to Ns last time
+	bool nfilt[2]   = { true, true };
+	// Keep track of whether mates 1/2 were filtered out due to not having
+	// enough characters to rise above the score threshold.
+	bool scfilt[2]  = { true, true };
+	// Keep track of whether mates 1/2 were filtered out due to not having
+	// more characters than the number of mismatches permitted in a seed.
+	bool lenfilt[2] = { true, true };
+	// Keep track of whether mates 1/2 were filtered out by upstream qc
+	bool qcfilt[2]  = { true, true };
+    
+	rndArb.init((uint32_t)time(0));
+	int mergei = 0;
+	int mergeival = 16;
+	// Main per-read loop: fetch, filter, classify, report.
+	while(true) {
+		bool success = false, done = false, paired = false;
+		ps->nextReadPair(success, done, paired, outType != OUTPUT_SAM);
+		if(!success && done) {
+			break;
+		} else if(!success) {
+			continue;
+		}
+		TReadId rdid = ps->rdid();
+		bool sample = true;
+		if(arbitraryRandom) {
+			ps->bufa().seed = rndArb.nextU32();
+			ps->bufb().seed = rndArb.nextU32();
+		}
+		if(sampleFrac < 1.0f) {
+			// Down-sample reads deterministically from the read's seed.
+			rnd.init(ROTL(ps->bufa().seed, 2));
+			sample = rnd.nextFloat() < sampleFrac;
+		}
+		if(rdid >= skipReads && rdid < qUpto && sample) {
+			// Align this read/pair
+			bool retry = true;
+			//
+			// Check if there is metrics reporting for us to do.
+			//
+			if(metricsIval > 0 &&
+			   (metricsOfb != NULL || metricsStderr) &&
+			   !metricsPerRead &&
+			   ++mergei == mergeival)
+			{
+				// Do a periodic merge.  Update global metrics, in a
+				// synchronized manner if needed.
+				MERGE_METRICS(metrics, nthreads > 1);
+				mergei = 0;
+				// Check if a progress message should be printed
+				// NOTE(review): multiseedSearch assigns thread ids starting
+				// at 1 ("Thread IDs start at 1"), so tid == 0 appears
+				// unreachable and this periodic report may never fire --
+				// confirm whether the check should be tid == 1.
+				if(tid == 0) {
+					// Only thread 1 prints progress messages
+					time_t curTime = time(0);
+					if(curTime - iTime >= metricsIval) {
+						metrics.reportInterval(metricsOfb, metricsStderr, false, true, NULL);
+						iTime = curTime;
+					}
+				}
+			}
+
+			prm.reset(); // per-read metrics
+			prm.doFmString = false;
+			if(sam_print_xt) {
+				gettimeofday(&prm.tv_beg, &prm.tz_beg);
+			}
+			// Try to align this read
+			while(retry) {
+				retry = false;
+				assert_eq(ps->bufa().color, false);
+				olm.reads++;
+				bool pair = paired;
+				const size_t rdlen1 = ps->bufa().length();
+				const size_t rdlen2 = pair ? ps->bufb().length() : 0;
+				olm.bases += (rdlen1 + rdlen2);
+				msinkwrap.nextRead(
+                                   &ps->bufa(),
+                                   pair ? &ps->bufb() : NULL,
+                                   rdid,
+                                   sc.qualitiesMatter());
+				assert(msinkwrap.inited());
+				size_t rdlens[2] = { rdlen1, rdlen2 };
+				// Calculate the minimum valid score threshold for the read
+				TAlScore minsc[2], maxpen[2];
+				maxpen[0] = maxpen[1] = 0;
+				minsc[0] = minsc[1] = std::numeric_limits<TAlScore>::max();
+				if(bwaSwLike) {
+					// From BWA-SW manual: "Given an l-long query, the
+					// threshold for a hit to be retained is
+					// a*max{T,c*log(l)}."  We try to recreate that here.
+					float a = (float)sc.match(30);
+					float T = bwaSwLikeT, c = bwaSwLikeC;
+					minsc[0] = (TAlScore)max<float>(a*T, a*c*log(rdlens[0]));
+					if(paired) {
+						minsc[1] = (TAlScore)max<float>(a*T, a*c*log(rdlens[1]));
+					}
+				} else {
+					minsc[0] = scoreMin.f<TAlScore>(rdlens[0]);
+					if(paired) minsc[1] = scoreMin.f<TAlScore>(rdlens[1]);
+					if(localAlign) {
+						// Local scores must be non-negative; clamp and warn.
+						if(minsc[0] < 0) {
+							if(!gQuiet) printLocalScoreMsg(*ps, paired, true);
+							minsc[0] = 0;
+						}
+						if(paired && minsc[1] < 0) {
+							if(!gQuiet) printLocalScoreMsg(*ps, paired, false);
+							minsc[1] = 0;
+						}
+					} else {
+						// End-to-end scores must be non-positive; clamp and warn.
+						if(minsc[0] > 0) {
+							if(!gQuiet) printEEScoreMsg(*ps, paired, true);
+							minsc[0] = 0;
+						}
+						if(paired && minsc[1] > 0) {
+							if(!gQuiet) printEEScoreMsg(*ps, paired, false);
+							minsc[1] = 0;
+						}
+					}
+				}
+                
+				// N filter; does the read have too many Ns?
+				size_t readns[2] = {0, 0};
+				sc.nFilterPair(
+                               &ps->bufa().patFw,
+                               pair ? &ps->bufb().patFw : NULL,
+                               readns[0],
+                               readns[1],
+                               nfilt[0],
+                               nfilt[1]);
+				// Score filter; does the read have enough characters to rise
+				// above the score threshold?
+				scfilt[0] = sc.scoreFilter(minsc[0], rdlens[0]);
+				scfilt[1] = sc.scoreFilter(minsc[1], rdlens[1]);
+				lenfilt[0] = lenfilt[1] = true;
+				if(rdlens[0] <= (size_t)multiseedMms || rdlens[0] < 2) {
+					if(!gQuiet) printMmsSkipMsg(*ps, paired, true, multiseedMms);
+					lenfilt[0] = false;
+				}
+				if((rdlens[1] <= (size_t)multiseedMms || rdlens[1] < 2) && paired) {
+					if(!gQuiet) printMmsSkipMsg(*ps, paired, false, multiseedMms);
+					lenfilt[1] = false;
+				}
+				if(rdlens[0] < 2) {
+					if(!gQuiet) printLenSkipMsg(*ps, paired, true);
+					lenfilt[0] = false;
+				}
+				if(rdlens[1] < 2 && paired) {
+					if(!gQuiet) printLenSkipMsg(*ps, paired, false);
+					lenfilt[1] = false;
+				}
+				qcfilt[0] = qcfilt[1] = true;
+				if(qcFilter) {
+					qcfilt[0] = (ps->bufa().filter != '0');
+					qcfilt[1] = (ps->bufb().filter != '0');
+				}
+				// A mate survives only if it passes every filter.
+				filt[0] = (nfilt[0] && scfilt[0] && lenfilt[0] && qcfilt[0]);
+				filt[1] = (nfilt[1] && scfilt[1] && lenfilt[1] && qcfilt[1]);
+				prm.nFilt += (filt[0] ? 0 : 1) + (filt[1] ? 0 : 1);
+				Read* rds[2] = { &ps->bufa(), &ps->bufb() };
+				// For each mate...
+				assert(msinkwrap.empty());
+				//size_t minedfw[2] = { 0, 0 };
+				//size_t minedrc[2] = { 0, 0 };
+				// Calculate nofw / norc
+				bool nofw[2] = { false, false };
+				bool norc[2] = { false, false };
+				nofw[0] = paired ? (gMate1fw ? gNofw : gNorc) : gNofw;
+				norc[0] = paired ? (gMate1fw ? gNorc : gNofw) : gNorc;
+				nofw[1] = paired ? (gMate2fw ? gNofw : gNorc) : gNofw;
+				norc[1] = paired ? (gMate2fw ? gNorc : gNofw) : gNorc;
+				// Calculate nceil
+				int nceil[2] = { 0, 0 };
+				nceil[0] = nCeil.f<int>((double)rdlens[0]);
+				nceil[0] = min(nceil[0], (int)rdlens[0]);
+				if(paired) {
+					nceil[1] = nCeil.f<int>((double)rdlens[1]);
+					nceil[1] = min(nceil[1], (int)rdlens[1]);
+				}
+				exhaustive[0] = exhaustive[1] = false;
+				//size_t matemap[2] = { 0, 1 };
+				bool pairPostFilt = filt[0] && filt[1];
+				if(pairPostFilt) {
+					// Seed RNG from both mates so pair decisions are reproducible.
+					rnd.init(ps->bufa().seed ^ ps->bufb().seed);
+				} else {
+					rnd.init(ps->bufa().seed);
+				}
+				// Calculate interval length for both mates
+				int interval[2] = { 0, 0 };
+				for(size_t mate = 0; mate < (pair ? 2:1); mate++) {
+					interval[mate] = msIval.f<int>((double)rdlens[mate]);
+					if(filt[0] && filt[1]) {
+						// Boost interval length by 20% for paired-end reads
+						interval[mate] = (int)(interval[mate] * 1.2 + 0.5);
+					}
+					interval[mate] = max(interval[mate], 1);
+				}
+				// Calculate streak length
+				size_t streak[2]    = { maxDpStreak,   maxDpStreak };
+				size_t mtStreak[2]  = { maxMateStreak, maxMateStreak };
+				size_t mxDp[2]      = { maxDp,         maxDp       };
+				size_t mxUg[2]      = { maxUg,         maxUg       };
+				size_t mxIter[2]    = { maxIters,      maxIters    };
+				if(allHits) {
+					// -a: effectively no limits on effort
+					streak[0]   = streak[1]   = std::numeric_limits<size_t>::max();
+					mtStreak[0] = mtStreak[1] = std::numeric_limits<size_t>::max();
+					mxDp[0]     = mxDp[1]     = std::numeric_limits<size_t>::max();
+					mxUg[0]     = mxUg[1]     = std::numeric_limits<size_t>::max();
+					mxIter[0]   = mxIter[1]   = std::numeric_limits<size_t>::max();
+				} else if(khits > 1) {
+					// -k > 1: scale effort limits with the number of hits sought
+					for(size_t mate = 0; mate < 2; mate++) {
+						streak[mate]   += (khits-1) * maxStreakIncr;
+						mtStreak[mate] += (khits-1) * maxStreakIncr;
+						mxDp[mate]     += (khits-1) * maxItersIncr;
+						mxUg[mate]     += (khits-1) * maxItersIncr;
+						mxIter[mate]   += (khits-1) * maxItersIncr;
+					}
+				}
+				if(filt[0] && filt[1]) {
+					// Both mates in play: split the streak budget between them.
+					streak[0] = (size_t)ceil((double)streak[0] / 2.0);
+					streak[1] = (size_t)ceil((double)streak[1] / 2.0);
+					assert_gt(streak[1], 0);
+				}
+				assert_gt(streak[0], 0);
+				// Calculate # seed rounds for each mate
+
+				size_t nrounds[2] = { nSeedRounds, nSeedRounds };
+				if(filt[0] && filt[1]) {
+					nrounds[0] = (size_t)ceil((double)nrounds[0] / 2.0);
+					nrounds[1] = (size_t)ceil((double)nrounds[1] / 2.0);
+					assert_gt(nrounds[1], 0);
+				}
+				assert_gt(nrounds[0], 0);
+				// Increment counters according to what got filtered
+				for(size_t mate = 0; mate < (pair ? 2:1); mate++) {
+					if(!filt[mate]) {
+						// Mate was rejected by N filter
+						olm.freads++;               // reads filtered out
+						olm.fbases += rdlens[mate]; // bases filtered out
+					} else {
+						//shs[mate].clear();
+						//shs[mate].nextRead(mate == 0 ? ps->bufa() : ps->bufb());
+						//assert(shs[mate].empty());
+						olm.ureads++;               // reads passing filter
+						olm.ubases += rdlens[mate]; // bases passing filter
+					}
+				}
+				//size_t eePeEeltLimit = std::numeric_limits<size_t>::max();
+				// Whether we're done with mate1 / mate2
+				// (note: this array shadows the loop-control `done` declared
+				// at the top of the while(true) loop -- intentional?)
+                bool done[2] = { !filt[0], !filt[1] };
+				// size_t nelt[2] = {0, 0};
+                if(filt[0] && filt[1]) {
+                    classifier.initReads(rds, nofw, norc, minsc, maxpen);
+                } 
+                else if(filt[0]) {
+                    classifier.initRead(rds[0], nofw[0], norc[0], minsc[0], maxpen[0], filt[1]);
+                }
+                else if(filt[1]) {
+                    classifier.initRead(rds[1], nofw[1], norc[1], minsc[1], maxpen[1], filt[1]);
+                }
+                if(filt[0] || filt[1]) {
+                    // Run the classification/alignment for whatever survived filtering.
+                    classifier.go(sc, ebwtFw, ebwtBw, ref, wlm, prm, him, spm, rnd, msinkwrap);
+                    size_t mate = 0;
+                    if(!done[mate]) {
+                        TAlScore perfectScore = sc.perfectScore(rdlens[mate]);
+                        if(!done[mate] && minsc[mate] == perfectScore) {
+                            done[mate] = true;
+                        }
+                    }
+                }
+
+                for(size_t i = 0; i < 2; i++) {
+                    assert_leq(prm.nExIters, mxIter[i]);
+                    assert_leq(prm.nExDps,   mxDp[i]);
+                    assert_leq(prm.nMateDps, mxDp[i]);
+                    assert_leq(prm.nExUgs,   mxUg[i]);
+                    assert_leq(prm.nMateUgs, mxUg[i]);
+                    assert_leq(prm.nDpFail,  streak[i]);
+                    assert_leq(prm.nUgFail,  streak[i]);
+                    assert_leq(prm.nEeFail,  streak[i]);
+                }
+                
+                // Commit and report paired-end/unpaired alignments
+				msinkwrap.finishRead(
+                                     NULL,
+                                     NULL,
+                                     exhaustive[0],        // exhausted seed hits for mate 1?
+                                     exhaustive[1],        // exhausted seed hits for mate 2?
+                                     nfilt[0],
+                                     nfilt[1],
+                                     scfilt[0],
+                                     scfilt[1],
+                                     lenfilt[0],
+                                     lenfilt[1],
+                                     qcfilt[0],
+                                     qcfilt[1],
+                                     sortByScore,          // prioritize by alignment score
+                                     rnd,                  // pseudo-random generator
+                                     rpm,                  // reporting metrics
+									 spm,                  // species metrics
+                                     prm,                  // per-read metrics
+                                     !seedSumm,            // suppress seed summaries?
+                                     seedSumm);            // suppress alignments?
+				assert(!retry || msinkwrap.empty());
+            } // while(retry)
+		} // if(rdid >= skipReads && rdid < qUpto)
+		else if(rdid >= qUpto) {
+			break;
+		}
+
+		if(metricsPerRead) {
+			// Report metrics for this single read, labeled with its name.
+			MERGE_METRICS(metricsPt, nthreads > 1);
+			nametmp = ps->bufa().name;
+			metricsPt.reportInterval(
+                                     metricsOfb, metricsStderr, true, true, &nametmp);
+			metricsPt.reset();
+		}
+	} // while(true)
+	
+	// One last metrics merge
+	MERGE_METRICS(metrics, nthreads > 1);
+    
+	return;
+}
+
+/**
+ * Called once per alignment job.  Sets up global pointers to the
+ * shared global data structures, creates per-thread structures, then
+ * enters the search loop.
+ *
+ * Loads the forward index into memory, spawns nthreads worker threads
+ * running multiseedSearchWorker, joins them, and emits a final metrics
+ * interval if metrics output is enabled.
+ */
+static void multiseedSearch(
+	Scoring& sc,
+	PairedPatternSource& patsrc,  // pattern source
+	AlnSink<index_t>& msink,      // hit sink
+	Ebwt<index_t>& ebwtFw,        // index of original text
+	Ebwt<index_t>& ebwtBw,        // index of mirror text
+    BitPairReference* refs,
+    const EList<string>& refnames,
+	OutFileBuf *metricsOfb)
+{
+
+    // Publish the shared structures through the multiseed_* globals that
+    // the worker threads read.
+    multiseed_patsrc = &patsrc;
+	multiseed_msink  = &msink;
+	multiseed_ebwtFw = &ebwtFw;
+	multiseed_ebwtBw = &ebwtBw;
+	multiseed_sc     = &sc;  // fixed: "&sc;" had been mangled to the HTML entity glyph U+227B in transit
+	multiseed_metricsOfb      = metricsOfb;
+	multiseed_refs = refs;
+    multiseed_refnames = refnames;
+	AutoArray<tthread::thread*> threads(nthreads);
+	AutoArray<int> tids(nthreads);
+	{
+		// Load the other half of the index into memory
+		assert(!ebwtFw.isInMemory());
+		Timer _t(cerr, "Time loading forward index: ", timing);
+		ebwtFw.loadIntoMemory(
+			0,  // colorspace?
+			-1, // not the reverse index
+			true,         // load SA samp? (yes, need forward index's SA samp)
+			true,         // load ftab (in forward index)
+			true,         // load rstarts (in forward index)
+			!noRefNames,  // load names?
+			startVerbose);
+	}
+#if 0
+	if(multiseedMms > 0 || do1mmUpFront) {
+		// Load the other half of the index into memory
+		assert(!ebwtBw.isInMemory());
+		Timer _t(cerr, "Time loading mirror index: ", timing);
+		ebwtBw.loadIntoMemory(
+			0, // colorspace?
+			// It's bidirectional search, so we need the reverse to be
+			// constructed as the reverse of the concatenated strings.
+			1,
+			true,        // load SA samp in reverse index
+			true,         // yes, need ftab in reverse index
+			true,        // load rstarts in reverse index
+			!noRefNames,  // load names?
+			startVerbose);
+	}
+#endif
+	// Start the metrics thread
+	{
+		Timer _t(cerr, "Multiseed full-index search: ", timing);
+        
+        thread_rids.resize(nthreads);
+        thread_rids.fill(0);
+		for(int i = 0; i < nthreads; i++) {
+			// Thread IDs start at 1
+			// NOTE(review): multiseedSearchWorker gates periodic progress
+			// reporting on `tid == 0`, which never matches ids starting at
+			// 1 -- confirm whether ids should start at 0 or the worker
+			// should test tid == 1.
+			tids[i] = i+1;
+            threads[i] = new tthread::thread(multiseedSearchWorker, (void*)&tids[i]);
+		}
+
+        for (int i = 0; i < nthreads; i++)
+            threads[i]->join();
+
+	}
+	// Final (cumulative) metrics interval after all workers finish.
+	if(!metricsPerRead && (metricsOfb != NULL || metricsStderr)) {
+		metrics.reportInterval(metricsOfb, metricsStderr, true, false, NULL);
+	}
+}
+
+// Command line collected into one string (populated elsewhere; its use is
+// outside this chunk).
+static string argstr;
+
+// Defined in another translation unit; called at the top of driver() to
+// initialize a count lookup table before searching.
+extern void initializeCntLut();
+
+template<typename TStr>
+static void driver(
+	const char * type,
+	const string& bt2indexBase,
+	const string& outfile)
+{
+	if(gVerbose || startVerbose)  {
+		cerr << "Entered driver(): "; logTime(cerr, true);
+	}
+    
+    initializeCntLut();
+    
+	// Vector of the reference sequences; used for sanity-checking
+	EList<SString<char> > names, os;
+	EList<size_t> nameLens, seqLens;
+	// Read reference sequences from the command-line or from a FASTA file
+	if(!origString.empty()) {
+		// Read fasta file(s)
+		EList<string> origFiles;
+		tokenize(origString, ",", origFiles);
+		parseFastas(origFiles, names, nameLens, os, seqLens);
+	}
+	PatternParams pp(
+		format,        // file format
+		fileParallel,  // true -> wrap files with separate PairedPatternSources
+		seed,          // pseudo-random seed
+		useSpinlock,   // use spin locks instead of pthreads
+		solexaQuals,   // true -> qualities are on solexa64 scale
+		phred64Quals,  // true -> qualities are on phred64 scale
+		integerQuals,  // true -> qualities are space-separated numbers
+		fuzzy,         // true -> try to parse fuzzy fastq
+		fastaContLen,  // length of sampled reads for FastaContinuous...
+		fastaContFreq, // frequency of sampled reads for FastaContinuous...
+		skipReads      // skip the first 'skip' patterns
+	);
+	if(gVerbose || startVerbose) {
+		cerr << "Creating PatternSource: "; logTime(cerr, true);
+	}
+	PairedPatternSource *patsrc = PairedPatternSource::setupPatternSources(
+		queries,     // singles, from argv
+		mates1,      // mate1's, from -1 arg
+		mates2,      // mate2's, from -2 arg
+		mates12,     // both mates on each line, from --12 arg
+#ifdef USE_SRA
+                                                                           sra_accs,    // SRA accessions
+#endif
+		qualities,   // qualities associated with singles
+		qualities1,  // qualities associated with m1
+		qualities2,  // qualities associated with m2
+		pp,          // read read-in parameters
+                                                                           nthreads,
+		gVerbose || startVerbose); // be talkative
+	// Open hit output file
+	if(gVerbose || startVerbose) {
+		cerr << "Opening hit output file: "; logTime(cerr, true);
+	}
+	OutFileBuf *fout;
+	if(!outfile.empty()) {
+		fout = new OutFileBuf(outfile.c_str(), false);
+	} else {
+		fout = new OutFileBuf();
+	}
+	// Initialize Ebwt object and read in header
+	if(gVerbose || startVerbose) {
+		cerr << "About to initialize fw Ebwt: "; logTime(cerr, true);
+	}
+	adjIdxBase = adjustEbwtBase(argv0, bt2indexBase, gVerbose);
+	Ebwt<index_t> ebwt(
+		adjIdxBase,
+	    0,        // index is colorspace
+		-1,       // fw index
+	    true,     // index is for the forward direction
+	    /* overriding: */ offRate,
+		0, // amount to add to index offrate or <= 0 to do nothing
+	    useMm,    // whether to use memory-mapped files
+	    useShmem, // whether to use shared memory
+	    mmSweep,  // sweep memory-mapped files
+	    !noRefNames, // load names?
+		true,        // load SA sample?
+		true,        // load ftab?
+		true,        // load rstarts?
+	    gVerbose, // whether to be talkative
+	    startVerbose, // talkative during initialization
+	    false /*passMemExc*/,
+	    sanityCheck);
+	Ebwt<index_t>* ebwtBw = NULL;
+#if 0
+	// We need the mirror index if mismatches are allowed
+	if(multiseedMms > 0 || do1mmUpFront) {
+		if(gVerbose || startVerbose) {
+			cerr << "About to initialize rev Ebwt: "; logTime(cerr, true);
+		}
+		ebwtBw = new HierEbwt<index_t, local_index_t>(
+			adjIdxBase + ".rev",
+			0,       // index is colorspace
+			1,       // TODO: maybe not
+		    false, // index is for the reverse direction
+		    /* overriding: */ offRate,
+			0, // amount to add to index offrate or <= 0 to do nothing
+		    useMm,    // whether to use memory-mapped files
+		    useShmem, // whether to use shared memory
+		    mmSweep,  // sweep memory-mapped files
+		    !noRefNames, // load names?
+			true,        // load SA sample?
+			true,        // load ftab?
+			true,        // load rstarts?
+		    gVerbose,    // whether to be talkative
+		    startVerbose, // talkative during initialization
+		    false /*passMemExc*/,
+		    sanityCheck);
+	}
+#endif
+	if(sanityCheck && !os.empty()) {
+		// Sanity check number of patterns and pattern lengths in Ebwt
+		// against original strings
+		assert_eq(os.size(), ebwt.nPat());
+		for(size_t i = 0; i < os.size(); i++) {
+			assert_eq(os[i].length(), ebwt.plen()[i]);
+		}
+	}
+	// Sanity-check the restored version of the Ebwt
+	if(sanityCheck && !os.empty()) {
+		ebwt.loadIntoMemory(
+			0,
+			-1, // fw index
+			true, // load SA sample
+			true, // load ftab
+			true, // load rstarts
+			!noRefNames,
+			startVerbose);
+		ebwt.checkOrigs(os, false, false);
+		ebwt.evictFromMemory();
+	}
+	OutputQueue oq(
+		*fout,                   // out file buffer
+		reorder && nthreads > 1, // whether to reorder when there's >1 thread
+		nthreads,                // # threads
+		nthreads > 1,            // whether to be thread-safe
+		skipReads);              // first read will have this rdid
+	{
+		Timer _t(cerr, "Time searching: ", timing);
+		// Set up penalities
+		if(bonusMatch > 0 && !localAlign) {
+			cerr << "Warning: Match bonus always = 0 in --end-to-end mode; ignoring user setting" << endl;
+			bonusMatch = 0;
+		}
+		Scoring sc(
+                   bonusMatch,     // constant reward for match
+                   penMmcType,     // how to penalize mismatches
+                   penMmcMax,      // max mm pelanty
+                   penMmcMin,      // min mm pelanty
+                   scoreMin,       // min score as function of read len
+                   nCeil,          // max # Ns as function of read len
+                   penNType,       // how to penalize Ns in the read
+                   penN,           // constant if N pelanty is a constant
+                   penNCatPair,    // whether to concat mates before N filtering
+                   penRdGapConst,  // constant coeff for read gap cost
+                   penRfGapConst,  // constant coeff for ref gap cost
+                   penRdGapLinear, // linear coeff for read gap cost
+                   penRfGapLinear, // linear coeff for ref gap cost
+                   gGapBarrier);    // # rows at top/bot only entered diagonally
+
+		EList<string> refnames;
+		readEbwtRefnames<index_t>(adjIdxBase, refnames);
+
+		EList<size_t> reflens;
+		// Set up hit sink; if sanityCheck && !os.empty() is true,
+		// then instruct the sink to "retain" hits in a vector in
+		// memory so that we can easily sanity check them later on
+		AlnSink<index_t> *mssink = NULL;
+        Timer *_tRef = new Timer(cerr, "Time loading reference: ", timing);
+        auto_ptr<BitPairReference> refs;
+        delete _tRef;
+        switch(outType) {
+			case OUTPUT_SAM: {
+				mssink = new AlnSinkSam<index_t>(
+                                                 &ebwt,
+                                                 oq,           // output queue
+                                                 refnames,     // reference names
+                                                 gQuiet);      // don't print alignment summary at end
+                if(!samNoHead) {
+					BTString buf;
+					fout->writeString(buf);
+				}
+				// Write header for read-results file
+				fout->writeChars("readID\tseqID\ttaxID\tscore\t2ndBestScore\thitLength\tqueryLength\tnumMatches\n");
+				break;
+			}
+			default:
+				cerr << "Invalid output type: " << outType << endl;
+				throw 1;
+		}
+
+
+		if(gVerbose || startVerbose) {
+			cerr << "Dispatching to search driver: "; logTime(cerr, true);
+		}
+		// Set up global constraint
+		OutFileBuf *metricsOfb = NULL;
+		if(!metricsFile.empty() && metricsIval > 0) {
+			metricsOfb = new OutFileBuf(metricsFile);
+		}
+
+
+		// Do the search for all input reads
+		assert(patsrc != NULL);
+		assert(mssink != NULL);
+		multiseedSearch(
+			sc,      // scoring scheme
+			*patsrc, // pattern source
+			*mssink, // hit sink
+			ebwt,    // BWT
+			*ebwtBw, // BWT'
+            refs.get(),
+            refnames,
+			metricsOfb);
+		// Evict any loaded indexes from memory
+		if(ebwt.isInMemory()) {
+			ebwt.evictFromMemory();
+		}
+		if(ebwtBw != NULL) {
+			delete ebwtBw;
+		}
+		if(!gQuiet && !seedSumm) {
+			size_t repThresh = mhits;
+			if(repThresh == 0) {
+				repThresh = std::numeric_limits<size_t>::max();
+			}
+			mssink->finish(
+				repThresh,
+				gReportDiscordant,
+				gReportMixed,
+				hadoopOut);
+		}
+		
+		if (!reportFile.empty()) {
+            // write the species report into the corresponding file
+            cerr << "report file " << reportFile << endl;
+			ofstream reportOfb;
+			reportOfb.open(reportFile.c_str());
+			SpeciesMetrics& spm = metrics.spmu;
+            if(abundance_analysis) {
+                uint8_t rank = get_tax_rank_id(classification_rank.c_str());
+                Timer timer(cerr, "Calculating abundance: ");
+                spm.calculateAbundance(ebwt, rank);
+            }
+            const std::map<uint64_t, TaxonomyNode>& tree = ebwt.tree();
+            const std::map<uint64_t, string>& name_map = ebwt.name();
+            const std::map<uint64_t, uint64_t>& size_map = ebwt.size();
+            const map<uint64_t, double>& abundance = spm.abundance;
+            const map<uint64_t, double>& abundance_len = spm.abundance_len;
+			reportOfb << "name" << '\t' << "taxID" << '\t' << "taxRank" << '\t'
+					  << "genomeSize" << '\t' << "numReads" << '\t' << "numUniqueReads" << '\t';
+            if(false) {
+                reportOfb << "summedHitLen" << '\t' << "numWeightedReads" << '\t' << "numUniqueKmers" << '\t' << "sumScore" << '\t';
+            }
+            reportOfb << "abundance";
+            if(false) {
+                reportOfb << '\t' << "abundance_normalized_by_genome_size";
+            }
+            reportOfb << endl;
+			for(map<uint64_t, ReadCounts>::const_iterator it = spm.species_counts.begin(); it != spm.species_counts.end(); ++it) {
+                uint64_t taxid = it->first;
+                if(taxid == 0) continue;
+                std::map<uint64_t, string>::const_iterator name_itr = name_map.find(taxid);
+                if(name_itr != name_map.end()) {
+                    reportOfb << name_itr->second;
+                } else {
+                    reportOfb << taxid;
+                }
+                reportOfb << '\t' << taxid << '\t';
+                uint8_t rank = 0;
+                bool leaf = false;
+                std::map<uint64_t, TaxonomyNode>::const_iterator tree_itr = tree.find(taxid);
+                
+                if(tree_itr != tree.end()) {
+                    rank = tree_itr->second.rank;
+                    leaf = tree_itr->second.leaf;
+                }
+                if(rank == RANK_UNKNOWN && leaf) {
+                    reportOfb << "leaf";
+                } else {
+                    string rank_str = get_tax_rank_string(rank);
+                    reportOfb << rank_str;
+                }
+                reportOfb << '\t';
+                
+                std::map<uint64_t, uint64_t>::const_iterator size_itr = size_map.find(taxid);
+                uint64_t genome_size = 0;
+                if(size_itr != size_map.end()) {
+                    genome_size = size_itr->second;
+                }
+                
+                reportOfb << genome_size << '\t'
+						  << it->second.n_reads << '\t' << it->second.n_unique_reads << '\t';
+                if(false) {
+                    reportOfb << it->second.summed_hit_len << '\t' << it->second.weighted_reads << '\t'
+                              << spm.nDistinctKmers(taxid) << '\t' << it->second.sum_score << '\t';
+                }
+                map<uint64_t, double>::const_iterator ab_len_itr = abundance_len.find(taxid);
+                if(ab_len_itr != abundance_len.end()) {
+                    reportOfb << ab_len_itr->second;
+                } else {
+                    reportOfb << "0.0";
+                }
+                map<uint64_t, double>::const_iterator ab_itr = abundance.find(taxid);
+                if(false) {
+                    if(ab_itr != abundance.end() && ab_len_itr != abundance_len.end()) {
+                        reportOfb << '\t' << ab_itr->second;
+                    } else {
+                        reportOfb << "\t0.0";
+                    }
+                }
+                reportOfb << endl;
+
+			}
+			reportOfb.close();
+		}
+
+
+		oq.flush(true);
+		assert_eq(oq.numStarted(), oq.numFinished());
+		assert_eq(oq.numStarted(), oq.numFlushed());
+		delete patsrc;
+		delete mssink;
+		delete metricsOfb;
+		if(fout != NULL) {
+			delete fout;
+		}
+	}
+}
+
+// C++ name mangling is disabled for the centrifuge() function to make it
+// easier to use Centrifuge as a library.
+extern "C" {
+
+/**
+ * Main Centrifuge entry function.  Parses argc/argv style command-line
+ * options, sets global configuration variables, and calls the driver()
+ * function.
+ */
+int centrifuge(int argc, const char **argv) {
+	try {
+		// Reset all global state, including getopt state
+		opterr = optind = 1;
+		resetOptions();
+		// Rebuild the full command line into the global argstr
+		// (presumably echoed in output headers/logs -- TODO confirm use site)
+		for(int i = 0; i < argc; i++) {
+			argstr += argv[i];
+			if(i < argc-1) argstr += " ";
+		}
+		if(startVerbose) { cerr << "Entered main(): "; logTime(cerr, true); }
+
+		// Parse command-line options into the file-scope globals; on a
+		// fatal option error this presumably throws an int exit code,
+		// caught by the catch(int) handler below -- TODO confirm
+		parseOptions(argc, argv);
+
+		argv0 = argv[0];
+		// --version: print version and build information, then exit 0
+		if(showVersion) {
+			cout << argv0 << " version " << CENTRIFUGE_VERSION << endl;
+			if(sizeof(void*) == 4) {
+				cout << "32-bit" << endl;
+			} else if(sizeof(void*) == 8) {
+				cout << "64-bit" << endl;
+			} else {
+				cout << "Neither 32- nor 64-bit: sizeof(void*) = " << sizeof(void*) << endl;
+			}
+			cout << "Built on " << BUILD_HOST << endl;
+			cout << BUILD_TIME << endl;
+			cout << "Compiler: " << COMPILER_VERSION << endl;
+			cout << "Options: " << COMPILER_OPTIONS << endl;
+			cout << "Sizeof {int, long, long long, void*, size_t, off_t}: {"
+				 << sizeof(int)
+				 << ", " << sizeof(long) << ", " << sizeof(long long)
+				 << ", " << sizeof(void *) << ", " << sizeof(size_t)
+				 << ", " << sizeof(off_t) << "}" << endl;
+			return 0;
+		}
+		{
+			// Scope block so the Timer destructor reports overall wall time
+			Timer _t(cerr, "Overall time: ", timing);
+			if(startVerbose) {
+				cerr << "Parsing index and read arguments: "; logTime(cerr, true);
+			}
+
+			// Get index basename (but only if it wasn't specified via --index)
+			if(bt2index.empty()) {
+				if(optind >= argc) {
+					cerr << "No index, query, or output file specified!" << endl;
+					printUsage(cerr);
+					return 1;
+				}
+				bt2index = argv[optind++];
+			}
+
+			// Get query filename.  Reads may already have been supplied via
+			// -U (queries), -1 (mates1) or --12 (mates12).
+			bool got_reads = !queries.empty() || !mates1.empty() || !mates12.empty();
+#ifdef USE_SRA
+            got_reads = got_reads || !sra_accs.empty();
+#endif
+			if(optind >= argc) {
+				// No positional arguments left and no reads from options:
+				// that's a usage error
+				if(!got_reads) {
+					printUsage(cerr);
+					cerr << "***" << endl
+#ifdef USE_SRA
+                    << "Error: Must specify at least one read input with -U/-1/-2/--sra-acc" << endl;
+#else
+                    << "Error: Must specify at least one read input with -U/-1/-2" << endl;
+#endif
+					return 1;
+				}
+			} else if(!got_reads) {
+				// Tokenize the (comma-separated) list of query files
+				tokenize(argv[optind++], ",", queries);
+				if(queries.empty()) {
+					cerr << "Tokenized query file list was empty!" << endl;
+					printUsage(cerr);
+					return 1;
+				}
+			}
+
+			// Get output filename (legacy positional form; -S is preferred)
+			if(optind < argc && outfile.empty()) {
+				outfile = argv[optind++];
+				cerr << "Warning: Output file '" << outfile.c_str()
+				     << "' was specified without -S.  This will not work in "
+					 << "future Centrifuge versions.  Please use -S instead."
+					 << endl;
+			}
+
+			// Extra parameters?  Anything still unconsumed is an error.
+			if(optind < argc) {
+				cerr << "Extra parameter(s) specified: ";
+				for(int i = optind; i < argc; i++) {
+					cerr << "\"" << argv[i] << "\"";
+					if(i < argc-1) cerr << ", ";
+				}
+				cerr << endl;
+				if(mates1.size() > 0) {
+					cerr << "Note that if <mates> files are specified using -1/-2, a <singles> file cannot" << endl
+						 << "also be specified.  Please run bowtie separately for mates and singles." << endl;
+				}
+				// Throwing an int is the internal convention for a fatal
+				// error; it becomes the process exit code in catch(int) below
+				throw 1;
+			}
+
+			// Optionally summarize the parsed configuration (-v / --verbose)
+			if(gVerbose) {
+				cout << "Input bt2 file: \"" << bt2index.c_str() << "\"" << endl;
+				cout << "Query inputs (DNA, " << file_format_names[format].c_str() << "):" << endl;
+				for(size_t i = 0; i < queries.size(); i++) {
+					cout << "  " << queries[i].c_str() << endl;
+				}
+				cout << "Quality inputs:" << endl;
+				for(size_t i = 0; i < qualities.size(); i++) {
+					cout << "  " << qualities[i].c_str() << endl;
+				}
+				cout << "Output file: \"" << outfile.c_str() << "\"" << endl;
+				cout << "Local endianness: " << (currentlyBigEndian()? "big":"little") << endl;
+				cout << "Sanity checking: " << (sanityCheck? "enabled":"disabled") << endl;
+			#ifdef NDEBUG
+				cout << "Assertions: disabled" << endl;
+			#else
+				cout << "Assertions: enabled" << endl;
+			#endif
+			}
+			// --pause debugging aid: wait for a keypress before searching
+			if(ipause) {
+				cout << "Press key to continue..." << endl;
+				getchar();
+			}
+			// Hand off to the search driver; all configuration is global
+			driver<SString<char> >("DNA", bt2index, outfile);
+		}
+		return 0;
+	} catch(std::exception& e) {
+		// Unexpected failure: report the exception and the command line
+		cerr << "Error: Encountered exception: '" << e.what() << "'" << endl;
+		cerr << "Command: ";
+		for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
+		cerr << endl;
+		return 1;
+	} catch(int e) {
+		// Deliberate fatal error: the thrown int is the exit code
+		// (0 is treated as a silent, successful exit)
+		if(e != 0) {
+			cerr << "Error: Encountered internal Centrifuge exception (#" << e << ")" << endl;
+			cerr << "Command: ";
+			for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
+			cerr << endl;
+		}
+		return e;
+	}
+} // centrifuge()
+} // extern "C"
diff --git a/centrifuge.xcodeproj/project.pbxproj b/centrifuge.xcodeproj/project.pbxproj
new file mode 100644
index 0000000..d32bbe4
--- /dev/null
+++ b/centrifuge.xcodeproj/project.pbxproj
@@ -0,0 +1,825 @@
+// !$*UTF8*$!
+{
+	archiveVersion = 1;
+	classes = {
+	};
+	objectVersion = 46;
+	objects = {
+
+/* Begin PBXBuildFile section */
+		E86143B81C20833200D5C240 /* alphabet.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E861438B1C20833200D5C240 /* alphabet.cpp */; };
+		E86143B91C20833200D5C240 /* bt2_idx.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E861438C1C20833200D5C240 /* bt2_idx.cpp */; };
+		E86143BA1C20833200D5C240 /* ccnt_lut.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E861438D1C20833200D5C240 /* ccnt_lut.cpp */; };
+		E86143BB1C20833200D5C240 /* centrifuge_build_main.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E861438E1C20833200D5C240 /* centrifuge_build_main.cpp */; };
+		E86143BC1C20833200D5C240 /* centrifuge_build.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E861438F1C20833200D5C240 /* centrifuge_build.cpp */; };
+		E86143C41C20833200D5C240 /* ds.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E86143971C20833200D5C240 /* ds.cpp */; };
+		E86143C71C20833200D5C240 /* limit.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E861439A1C20833200D5C240 /* limit.cpp */; };
+		E86143CF1C20833200D5C240 /* random_source.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E86143A21C20833200D5C240 /* random_source.cpp */; };
+		E86143D31C20833200D5C240 /* ref_read.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E86143A61C20833200D5C240 /* ref_read.cpp */; };
+		E86143D41C20833200D5C240 /* reference.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E86143A71C20833200D5C240 /* reference.cpp */; };
+		E86143D61C20833200D5C240 /* shmem.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E86143A91C20833200D5C240 /* shmem.cpp */; };
+		E86143DA1C20833200D5C240 /* tinythread.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E86143AD1C20833200D5C240 /* tinythread.cpp */; };
+		E869A0671C209BCC007600C2 /* aligner_seed_policy.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E86143831C20833200D5C240 /* aligner_seed_policy.cpp */; };
+		E869A0681C209BCC007600C2 /* mask.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E861439C1C20833200D5C240 /* mask.cpp */; };
+		E869A0691C209BCC007600C2 /* outq.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E861439D1C20833200D5C240 /* outq.cpp */; };
+		E869A06A1C209BCC007600C2 /* pat.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E861439E1C20833200D5C240 /* pat.cpp */; };
+		E869A06B1C209BCC007600C2 /* pe.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E861439F1C20833200D5C240 /* pe.cpp */; };
+		E869A06C1C209BCC007600C2 /* presets.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E86143A01C20833200D5C240 /* presets.cpp */; };
+		E869A06D1C209BCC007600C2 /* qual.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E86143A11C20833200D5C240 /* qual.cpp */; };
+		E869A06E1C209BCC007600C2 /* random_util.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E86143A31C20833200D5C240 /* random_util.cpp */; };
+		E869A06F1C209BCC007600C2 /* read_qseq.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E86143A41C20833200D5C240 /* read_qseq.cpp */; };
+		E869A0701C209BCC007600C2 /* ref_coord.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E86143A51C20833200D5C240 /* ref_coord.cpp */; };
+		E869A0711C209BCC007600C2 /* scoring.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E86143A81C20833200D5C240 /* scoring.cpp */; };
+		E869A0721C209BCC007600C2 /* simple_func.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E86143AA1C20833200D5C240 /* simple_func.cpp */; };
+		E869A0731C209BE6007600C2 /* centrifuge_main.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E86143921C20833200D5C240 /* centrifuge_main.cpp */; };
+		E869A0741C209BE6007600C2 /* centrifuge.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E86143941C20833200D5C240 /* centrifuge.cpp */; };
+		E869A0751C20A308007600C2 /* limit.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E861439A1C20833200D5C240 /* limit.cpp */; };
+		E869A0761C20A425007600C2 /* alphabet.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E861438B1C20833200D5C240 /* alphabet.cpp */; };
+		E869A0771C20A425007600C2 /* bt2_idx.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E861438C1C20833200D5C240 /* bt2_idx.cpp */; };
+		E869A0781C20A425007600C2 /* ccnt_lut.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E861438D1C20833200D5C240 /* ccnt_lut.cpp */; };
+		E869A0791C20A425007600C2 /* ds.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E86143971C20833200D5C240 /* ds.cpp */; };
+		E869A07A1C20A425007600C2 /* edit.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E86143981C20833200D5C240 /* edit.cpp */; };
+		E869A07B1C20A425007600C2 /* random_source.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E86143A21C20833200D5C240 /* random_source.cpp */; };
+		E869A07C1C20A425007600C2 /* ref_read.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E86143A61C20833200D5C240 /* ref_read.cpp */; };
+		E869A07D1C20A425007600C2 /* reference.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E86143A71C20833200D5C240 /* reference.cpp */; };
+		E869A07E1C20A425007600C2 /* shmem.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E86143A91C20833200D5C240 /* shmem.cpp */; };
+		E869A07F1C20A425007600C2 /* tinythread.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E86143AD1C20833200D5C240 /* tinythread.cpp */; };
+		E869A0801C20A50B007600C2 /* alphabet.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E861438B1C20833200D5C240 /* alphabet.cpp */; };
+		E869A0811C20A50B007600C2 /* bt2_idx.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E861438C1C20833200D5C240 /* bt2_idx.cpp */; };
+		E869A0821C20A50B007600C2 /* ccnt_lut.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E861438D1C20833200D5C240 /* ccnt_lut.cpp */; };
+		E869A0831C20A50B007600C2 /* centrifuge_inspect.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E86143911C20833200D5C240 /* centrifuge_inspect.cpp */; };
+		E869A0841C20A50B007600C2 /* ds.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E86143971C20833200D5C240 /* ds.cpp */; };
+		E869A0851C20A50B007600C2 /* edit.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E86143981C20833200D5C240 /* edit.cpp */; };
+		E869A0861C20A50B007600C2 /* random_source.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E86143A21C20833200D5C240 /* random_source.cpp */; };
+		E869A0871C20A50B007600C2 /* ref_read.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E86143A61C20833200D5C240 /* ref_read.cpp */; };
+		E869A0881C20A50B007600C2 /* reference.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E86143A71C20833200D5C240 /* reference.cpp */; };
+		E869A0891C20A50B007600C2 /* shmem.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E86143A91C20833200D5C240 /* shmem.cpp */; };
+		E869A08A1C20A50B007600C2 /* tinythread.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E86143AD1C20833200D5C240 /* tinythread.cpp */; };
+		E8AB5A231C209232009138A6 /* diff_sample.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E86143951C20833200D5C240 /* diff_sample.cpp */; };
+/* End PBXBuildFile section */
+
+/* Begin PBXCopyFilesBuildPhase section */
+		E8485E111C207EF000F225FA /* CopyFiles */ = {
+			isa = PBXCopyFilesBuildPhase;
+			buildActionMask = 2147483647;
+			dstPath = /usr/share/man/man1/;
+			dstSubfolderSpec = 0;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 1;
+		};
+		E869A0481C2095A8007600C2 /* CopyFiles */ = {
+			isa = PBXCopyFilesBuildPhase;
+			buildActionMask = 2147483647;
+			dstPath = /usr/share/man/man1/;
+			dstSubfolderSpec = 0;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 1;
+		};
+		E869A0531C2095B5007600C2 /* CopyFiles */ = {
+			isa = PBXCopyFilesBuildPhase;
+			buildActionMask = 2147483647;
+			dstPath = /usr/share/man/man1/;
+			dstSubfolderSpec = 0;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 1;
+		};
+		E869A05E1C2095CA007600C2 /* CopyFiles */ = {
+			isa = PBXCopyFilesBuildPhase;
+			buildActionMask = 2147483647;
+			dstPath = /usr/share/man/man1/;
+			dstSubfolderSpec = 0;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 1;
+		};
+/* End PBXCopyFilesBuildPhase section */
+
+/* Begin PBXFileReference section */
+		E8485E131C207EF000F225FA /* centrifuge-buildx */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = "centrifuge-buildx"; sourceTree = BUILT_PRODUCTS_DIR; };
+		E861433C1C20833200D5C240 /* aligner_bt.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = aligner_bt.h; sourceTree = "<group>"; };
+		E861433D1C20833200D5C240 /* aligner_cache.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = aligner_cache.h; sourceTree = "<group>"; };
+		E861433E1C20833200D5C240 /* aligner_metrics.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = aligner_metrics.h; sourceTree = "<group>"; };
+		E861433F1C20833200D5C240 /* aligner_result.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = aligner_result.h; sourceTree = "<group>"; };
+		E86143401C20833200D5C240 /* aligner_seed_policy.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = aligner_seed_policy.h; sourceTree = "<group>"; };
+		E86143411C20833200D5C240 /* aligner_seed.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = aligner_seed.h; sourceTree = "<group>"; };
+		E86143421C20833200D5C240 /* aligner_sw_common.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = aligner_sw_common.h; sourceTree = "<group>"; };
+		E86143431C20833200D5C240 /* aligner_sw_nuc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = aligner_sw_nuc.h; sourceTree = "<group>"; };
+		E86143441C20833200D5C240 /* aligner_sw.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = aligner_sw.h; sourceTree = "<group>"; };
+		E86143451C20833200D5C240 /* aligner_swsse.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = aligner_swsse.h; sourceTree = "<group>"; };
+		E86143461C20833200D5C240 /* aln_sink.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = aln_sink.h; sourceTree = "<group>"; };
+		E86143471C20833200D5C240 /* alphabet.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = alphabet.h; sourceTree = "<group>"; };
+		E86143481C20833200D5C240 /* assert_helpers.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = assert_helpers.h; sourceTree = "<group>"; };
+		E86143491C20833200D5C240 /* binary_sa_search.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = binary_sa_search.h; sourceTree = "<group>"; };
+		E861434A1C20833200D5C240 /* bitpack.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = bitpack.h; sourceTree = "<group>"; };
+		E861434B1C20833200D5C240 /* blockwise_sa.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = blockwise_sa.h; sourceTree = "<group>"; };
+		E861434C1C20833200D5C240 /* bt2_idx.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = bt2_idx.h; sourceTree = "<group>"; };
+		E861434D1C20833200D5C240 /* bt2_io.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = bt2_io.h; sourceTree = "<group>"; };
+		E861434E1C20833200D5C240 /* bt2_util.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = bt2_util.h; sourceTree = "<group>"; };
+		E861434F1C20833200D5C240 /* btypes.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = btypes.h; sourceTree = "<group>"; };
+		E86143501C20833200D5C240 /* classifier.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = classifier.h; sourceTree = "<group>"; };
+		E86143511C20833200D5C240 /* diff_sample.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = diff_sample.h; sourceTree = "<group>"; };
+		E86143521C20833200D5C240 /* dp_framer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = dp_framer.h; sourceTree = "<group>"; };
+		E86143531C20833200D5C240 /* ds.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ds.h; sourceTree = "<group>"; };
+		E86143541C20833200D5C240 /* edit.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = edit.h; sourceTree = "<group>"; };
+		E86143551C20833200D5C240 /* endian_swap.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = endian_swap.h; sourceTree = "<group>"; };
+		E86143561C20833200D5C240 /* fast_mutex.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fast_mutex.h; sourceTree = "<group>"; };
+		E86143571C20833200D5C240 /* filebuf.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = filebuf.h; sourceTree = "<group>"; };
+		E86143581C20833200D5C240 /* formats.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = formats.h; sourceTree = "<group>"; };
+		E86143591C20833200D5C240 /* group_walk.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = group_walk.h; sourceTree = "<group>"; };
+		E861435A1C20833200D5C240 /* hi_aligner.h */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 4; lastKnownFileType = sourcecode.c.h; path = hi_aligner.h; sourceTree = "<group>"; };
+		E861435B1C20833200D5C240 /* hier_idx_common.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = hier_idx_common.h; sourceTree = "<group>"; };
+		E861435C1C20833200D5C240 /* hier_idx.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = hier_idx.h; sourceTree = "<group>"; };
+		E861435D1C20833200D5C240 /* hyperloglogbias.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = hyperloglogbias.h; sourceTree = "<group>"; };
+		E861435E1C20833200D5C240 /* hyperloglogplus.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = hyperloglogplus.h; sourceTree = "<group>"; };
+		E861435F1C20833200D5C240 /* limit.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = limit.h; sourceTree = "<group>"; };
+		E86143601C20833200D5C240 /* ls.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ls.h; sourceTree = "<group>"; };
+		E86143611C20833200D5C240 /* mask.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mask.h; sourceTree = "<group>"; };
+		E86143621C20833200D5C240 /* mem_ids.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mem_ids.h; sourceTree = "<group>"; };
+		E86143631C20833200D5C240 /* mm.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mm.h; sourceTree = "<group>"; };
+		E86143641C20833200D5C240 /* multikey_qsort.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = multikey_qsort.h; sourceTree = "<group>"; };
+		E86143651C20833200D5C240 /* opts.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = opts.h; sourceTree = "<group>"; };
+		E86143661C20833200D5C240 /* outq.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = outq.h; sourceTree = "<group>"; };
+		E86143671C20833200D5C240 /* pat.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pat.h; sourceTree = "<group>"; };
+		E86143681C20833200D5C240 /* pe.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pe.h; sourceTree = "<group>"; };
+		E86143691C20833200D5C240 /* presets.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = presets.h; sourceTree = "<group>"; };
+		E861436A1C20833200D5C240 /* processor_support.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = processor_support.h; sourceTree = "<group>"; };
+		E861436B1C20833200D5C240 /* qual.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = qual.h; sourceTree = "<group>"; };
+		E861436C1C20833200D5C240 /* random_source.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = random_source.h; sourceTree = "<group>"; };
+		E861436D1C20833200D5C240 /* random_util.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = random_util.h; sourceTree = "<group>"; };
+		E861436E1C20833200D5C240 /* read.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = read.h; sourceTree = "<group>"; };
+		E861436F1C20833200D5C240 /* ref_coord.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ref_coord.h; sourceTree = "<group>"; };
+		E86143701C20833200D5C240 /* ref_read.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ref_read.h; sourceTree = "<group>"; };
+		E86143711C20833200D5C240 /* reference.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = reference.h; sourceTree = "<group>"; };
+		E86143721C20833200D5C240 /* scoring.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = scoring.h; sourceTree = "<group>"; };
+		E86143731C20833200D5C240 /* search_globals.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = search_globals.h; sourceTree = "<group>"; };
+		E86143741C20833200D5C240 /* sequence_io.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sequence_io.h; sourceTree = "<group>"; };
+		E86143751C20833200D5C240 /* shmem.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = shmem.h; sourceTree = "<group>"; };
+		E86143761C20833200D5C240 /* simple_func.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = simple_func.h; sourceTree = "<group>"; };
+		E86143771C20833200D5C240 /* sse_util.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sse_util.h; sourceTree = "<group>"; };
+		E86143781C20833200D5C240 /* sstring.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sstring.h; sourceTree = "<group>"; };
+		E86143791C20833200D5C240 /* str_util.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = str_util.h; sourceTree = "<group>"; };
+		E861437A1C20833200D5C240 /* threading.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = threading.h; sourceTree = "<group>"; };
+		E861437B1C20833200D5C240 /* timer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = timer.h; sourceTree = "<group>"; };
+		E861437C1C20833200D5C240 /* tinythread.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = tinythread.h; sourceTree = "<group>"; };
+		E861437D1C20833200D5C240 /* tokenize.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = tokenize.h; sourceTree = "<group>"; };
+		E861437E1C20833200D5C240 /* util.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = util.h; sourceTree = "<group>"; };
+		E861437F1C20833200D5C240 /* word_io.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = word_io.h; sourceTree = "<group>"; };
+		E86143801C20833200D5C240 /* zbox.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = zbox.h; sourceTree = "<group>"; };
+		E86143811C20833200D5C240 /* aligner_bt.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = aligner_bt.cpp; sourceTree = "<group>"; };
+		E86143821C20833200D5C240 /* aligner_cache.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = aligner_cache.cpp; sourceTree = "<group>"; };
+		E86143831C20833200D5C240 /* aligner_seed_policy.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = aligner_seed_policy.cpp; sourceTree = "<group>"; };
+		E86143841C20833200D5C240 /* aligner_seed.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = aligner_seed.cpp; sourceTree = "<group>"; };
+		E86143851C20833200D5C240 /* aligner_sw.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = aligner_sw.cpp; sourceTree = "<group>"; };
+		E86143861C20833200D5C240 /* aligner_swsse_ee_i16.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = aligner_swsse_ee_i16.cpp; sourceTree = "<group>"; };
+		E86143871C20833200D5C240 /* aligner_swsse_ee_u8.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = aligner_swsse_ee_u8.cpp; sourceTree = "<group>"; };
+		E86143881C20833200D5C240 /* aligner_swsse_loc_i16.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = aligner_swsse_loc_i16.cpp; sourceTree = "<group>"; };
+		E86143891C20833200D5C240 /* aligner_swsse_loc_u8.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = aligner_swsse_loc_u8.cpp; sourceTree = "<group>"; };
+		E861438A1C20833200D5C240 /* aligner_swsse.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = aligner_swsse.cpp; sourceTree = "<group>"; };
+		E861438B1C20833200D5C240 /* alphabet.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alphabet.cpp; sourceTree = "<group>"; };
+		E861438C1C20833200D5C240 /* bt2_idx.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bt2_idx.cpp; sourceTree = "<group>"; };
+		E861438D1C20833200D5C240 /* ccnt_lut.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ccnt_lut.cpp; sourceTree = "<group>"; };
+		E861438E1C20833200D5C240 /* centrifuge_build_main.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = centrifuge_build_main.cpp; sourceTree = "<group>"; };
+		E861438F1C20833200D5C240 /* centrifuge_build.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = centrifuge_build.cpp; sourceTree = "<group>"; };
+		E86143901C20833200D5C240 /* centrifuge_compress.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = centrifuge_compress.cpp; sourceTree = "<group>"; };
+		E86143911C20833200D5C240 /* centrifuge_inspect.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = centrifuge_inspect.cpp; sourceTree = "<group>"; };
+		E86143921C20833200D5C240 /* centrifuge_main.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = centrifuge_main.cpp; sourceTree = "<group>"; };
+		E86143931C20833200D5C240 /* centrifuge_report.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = centrifuge_report.cpp; sourceTree = "<group>"; };
+		E86143941C20833200D5C240 /* centrifuge.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = centrifuge.cpp; sourceTree = "<group>"; };
+		E86143951C20833200D5C240 /* diff_sample.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = diff_sample.cpp; sourceTree = "<group>"; };
+		E86143961C20833200D5C240 /* dp_framer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = dp_framer.cpp; sourceTree = "<group>"; };
+		E86143971C20833200D5C240 /* ds.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ds.cpp; sourceTree = "<group>"; };
+		E86143981C20833200D5C240 /* edit.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = edit.cpp; sourceTree = "<group>"; };
+		E86143991C20833200D5C240 /* group_walk.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = group_walk.cpp; sourceTree = "<group>"; };
+		E861439A1C20833200D5C240 /* limit.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = limit.cpp; sourceTree = "<group>"; };
+		E861439B1C20833200D5C240 /* ls.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ls.cpp; sourceTree = "<group>"; };
+		E861439C1C20833200D5C240 /* mask.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mask.cpp; sourceTree = "<group>"; };
+		E861439D1C20833200D5C240 /* outq.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = outq.cpp; sourceTree = "<group>"; };
+		E861439E1C20833200D5C240 /* pat.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = pat.cpp; sourceTree = "<group>"; };
+		E861439F1C20833200D5C240 /* pe.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = pe.cpp; sourceTree = "<group>"; };
+		E86143A01C20833200D5C240 /* presets.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = presets.cpp; sourceTree = "<group>"; };
+		E86143A11C20833200D5C240 /* qual.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = qual.cpp; sourceTree = "<group>"; };
+		E86143A21C20833200D5C240 /* random_source.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = random_source.cpp; sourceTree = "<group>"; };
+		E86143A31C20833200D5C240 /* random_util.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = random_util.cpp; sourceTree = "<group>"; };
+		E86143A41C20833200D5C240 /* read_qseq.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = read_qseq.cpp; sourceTree = "<group>"; };
+		E86143A51C20833200D5C240 /* ref_coord.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ref_coord.cpp; sourceTree = "<group>"; };
+		E86143A61C20833200D5C240 /* ref_read.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ref_read.cpp; sourceTree = "<group>"; };
+		E86143A71C20833200D5C240 /* reference.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = reference.cpp; sourceTree = "<group>"; };
+		E86143A81C20833200D5C240 /* scoring.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = scoring.cpp; sourceTree = "<group>"; };
+		E86143A91C20833200D5C240 /* shmem.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = shmem.cpp; sourceTree = "<group>"; };
+		E86143AA1C20833200D5C240 /* simple_func.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = simple_func.cpp; sourceTree = "<group>"; };
+		E86143AB1C20833200D5C240 /* sse_util.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sse_util.cpp; sourceTree = "<group>"; };
+		E86143AC1C20833200D5C240 /* sstring.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sstring.cpp; sourceTree = "<group>"; };
+		E86143AD1C20833200D5C240 /* tinythread.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = tinythread.cpp; sourceTree = "<group>"; };
+		E869A04A1C2095A8007600C2 /* centrifugex */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = centrifugex; sourceTree = BUILT_PRODUCTS_DIR; };
+		E869A0551C2095B5007600C2 /* centrifuge-inspectx */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = "centrifuge-inspectx"; sourceTree = BUILT_PRODUCTS_DIR; };
+		E869A0601C2095CA007600C2 /* centrifuge-compressx */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = "centrifuge-compressx"; sourceTree = BUILT_PRODUCTS_DIR; };
+/* End PBXFileReference section */
+
+/* Begin PBXFrameworksBuildPhase section */
+		E8485E101C207EF000F225FA /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+		E869A0471C2095A8007600C2 /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+		E869A0521C2095B5007600C2 /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+		E869A05D1C2095CA007600C2 /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXFrameworksBuildPhase section */
+
+/* Begin PBXGroup section */
+		E8485E0A1C207EF000F225FA = {
+			isa = PBXGroup;
+			children = (
+				E8485E1E1C207FCB00F225FA /* Document */,
+				E8485E1D1C207FC400F225FA /* Source */,
+				E8485E141C207EF000F225FA /* Products */,
+			);
+			sourceTree = "<group>";
+		};
+		E8485E141C207EF000F225FA /* Products */ = {
+			isa = PBXGroup;
+			children = (
+				E8485E131C207EF000F225FA /* centrifuge-buildx */,
+				E869A04A1C2095A8007600C2 /* centrifugex */,
+				E869A0551C2095B5007600C2 /* centrifuge-inspectx */,
+				E869A0601C2095CA007600C2 /* centrifuge-compressx */,
+			);
+			name = Products;
+			sourceTree = "<group>";
+		};
+		E8485E1D1C207FC400F225FA /* Source */ = {
+			isa = PBXGroup;
+			children = (
+				E861433C1C20833200D5C240 /* aligner_bt.h */,
+				E861433D1C20833200D5C240 /* aligner_cache.h */,
+				E861433E1C20833200D5C240 /* aligner_metrics.h */,
+				E861433F1C20833200D5C240 /* aligner_result.h */,
+				E86143401C20833200D5C240 /* aligner_seed_policy.h */,
+				E86143411C20833200D5C240 /* aligner_seed.h */,
+				E86143421C20833200D5C240 /* aligner_sw_common.h */,
+				E86143431C20833200D5C240 /* aligner_sw_nuc.h */,
+				E86143441C20833200D5C240 /* aligner_sw.h */,
+				E86143451C20833200D5C240 /* aligner_swsse.h */,
+				E86143461C20833200D5C240 /* aln_sink.h */,
+				E86143471C20833200D5C240 /* alphabet.h */,
+				E86143481C20833200D5C240 /* assert_helpers.h */,
+				E86143491C20833200D5C240 /* binary_sa_search.h */,
+				E861434A1C20833200D5C240 /* bitpack.h */,
+				E861434B1C20833200D5C240 /* blockwise_sa.h */,
+				E861434C1C20833200D5C240 /* bt2_idx.h */,
+				E861434D1C20833200D5C240 /* bt2_io.h */,
+				E861434E1C20833200D5C240 /* bt2_util.h */,
+				E861434F1C20833200D5C240 /* btypes.h */,
+				E86143501C20833200D5C240 /* classifier.h */,
+				E86143511C20833200D5C240 /* diff_sample.h */,
+				E86143521C20833200D5C240 /* dp_framer.h */,
+				E86143531C20833200D5C240 /* ds.h */,
+				E86143541C20833200D5C240 /* edit.h */,
+				E86143551C20833200D5C240 /* endian_swap.h */,
+				E86143561C20833200D5C240 /* fast_mutex.h */,
+				E86143571C20833200D5C240 /* filebuf.h */,
+				E86143581C20833200D5C240 /* formats.h */,
+				E86143591C20833200D5C240 /* group_walk.h */,
+				E861435A1C20833200D5C240 /* hi_aligner.h */,
+				E861435B1C20833200D5C240 /* hier_idx_common.h */,
+				E861435C1C20833200D5C240 /* hier_idx.h */,
+				E861435D1C20833200D5C240 /* hyperloglogbias.h */,
+				E861435E1C20833200D5C240 /* hyperloglogplus.h */,
+				E861435F1C20833200D5C240 /* limit.h */,
+				E86143601C20833200D5C240 /* ls.h */,
+				E86143611C20833200D5C240 /* mask.h */,
+				E86143621C20833200D5C240 /* mem_ids.h */,
+				E86143631C20833200D5C240 /* mm.h */,
+				E86143641C20833200D5C240 /* multikey_qsort.h */,
+				E86143651C20833200D5C240 /* opts.h */,
+				E86143661C20833200D5C240 /* outq.h */,
+				E86143671C20833200D5C240 /* pat.h */,
+				E86143681C20833200D5C240 /* pe.h */,
+				E86143691C20833200D5C240 /* presets.h */,
+				E861436A1C20833200D5C240 /* processor_support.h */,
+				E861436B1C20833200D5C240 /* qual.h */,
+				E861436C1C20833200D5C240 /* random_source.h */,
+				E861436D1C20833200D5C240 /* random_util.h */,
+				E861436E1C20833200D5C240 /* read.h */,
+				E861436F1C20833200D5C240 /* ref_coord.h */,
+				E86143701C20833200D5C240 /* ref_read.h */,
+				E86143711C20833200D5C240 /* reference.h */,
+				E86143721C20833200D5C240 /* scoring.h */,
+				E86143731C20833200D5C240 /* search_globals.h */,
+				E86143741C20833200D5C240 /* sequence_io.h */,
+				E86143751C20833200D5C240 /* shmem.h */,
+				E86143761C20833200D5C240 /* simple_func.h */,
+				E86143771C20833200D5C240 /* sse_util.h */,
+				E86143781C20833200D5C240 /* sstring.h */,
+				E86143791C20833200D5C240 /* str_util.h */,
+				E861437A1C20833200D5C240 /* threading.h */,
+				E861437B1C20833200D5C240 /* timer.h */,
+				E861437C1C20833200D5C240 /* tinythread.h */,
+				E861437D1C20833200D5C240 /* tokenize.h */,
+				E861437E1C20833200D5C240 /* util.h */,
+				E861437F1C20833200D5C240 /* word_io.h */,
+				E86143801C20833200D5C240 /* zbox.h */,
+				E86143811C20833200D5C240 /* aligner_bt.cpp */,
+				E86143821C20833200D5C240 /* aligner_cache.cpp */,
+				E86143831C20833200D5C240 /* aligner_seed_policy.cpp */,
+				E86143841C20833200D5C240 /* aligner_seed.cpp */,
+				E86143851C20833200D5C240 /* aligner_sw.cpp */,
+				E86143861C20833200D5C240 /* aligner_swsse_ee_i16.cpp */,
+				E86143871C20833200D5C240 /* aligner_swsse_ee_u8.cpp */,
+				E86143881C20833200D5C240 /* aligner_swsse_loc_i16.cpp */,
+				E86143891C20833200D5C240 /* aligner_swsse_loc_u8.cpp */,
+				E861438A1C20833200D5C240 /* aligner_swsse.cpp */,
+				E861438B1C20833200D5C240 /* alphabet.cpp */,
+				E861438C1C20833200D5C240 /* bt2_idx.cpp */,
+				E861438D1C20833200D5C240 /* ccnt_lut.cpp */,
+				E861438E1C20833200D5C240 /* centrifuge_build_main.cpp */,
+				E861438F1C20833200D5C240 /* centrifuge_build.cpp */,
+				E86143901C20833200D5C240 /* centrifuge_compress.cpp */,
+				E86143911C20833200D5C240 /* centrifuge_inspect.cpp */,
+				E86143921C20833200D5C240 /* centrifuge_main.cpp */,
+				E86143931C20833200D5C240 /* centrifuge_report.cpp */,
+				E86143941C20833200D5C240 /* centrifuge.cpp */,
+				E86143951C20833200D5C240 /* diff_sample.cpp */,
+				E86143961C20833200D5C240 /* dp_framer.cpp */,
+				E86143971C20833200D5C240 /* ds.cpp */,
+				E86143981C20833200D5C240 /* edit.cpp */,
+				E86143991C20833200D5C240 /* group_walk.cpp */,
+				E861439A1C20833200D5C240 /* limit.cpp */,
+				E861439B1C20833200D5C240 /* ls.cpp */,
+				E861439C1C20833200D5C240 /* mask.cpp */,
+				E861439D1C20833200D5C240 /* outq.cpp */,
+				E861439E1C20833200D5C240 /* pat.cpp */,
+				E861439F1C20833200D5C240 /* pe.cpp */,
+				E86143A01C20833200D5C240 /* presets.cpp */,
+				E86143A11C20833200D5C240 /* qual.cpp */,
+				E86143A21C20833200D5C240 /* random_source.cpp */,
+				E86143A31C20833200D5C240 /* random_util.cpp */,
+				E86143A41C20833200D5C240 /* read_qseq.cpp */,
+				E86143A51C20833200D5C240 /* ref_coord.cpp */,
+				E86143A61C20833200D5C240 /* ref_read.cpp */,
+				E86143A71C20833200D5C240 /* reference.cpp */,
+				E86143A81C20833200D5C240 /* scoring.cpp */,
+				E86143A91C20833200D5C240 /* shmem.cpp */,
+				E86143AA1C20833200D5C240 /* simple_func.cpp */,
+				E86143AB1C20833200D5C240 /* sse_util.cpp */,
+				E86143AC1C20833200D5C240 /* sstring.cpp */,
+				E86143AD1C20833200D5C240 /* tinythread.cpp */,
+			);
+			name = Source;
+			sourceTree = "<group>";
+		};
+		E8485E1E1C207FCB00F225FA /* Document */ = {
+			isa = PBXGroup;
+			children = (
+			);
+			name = Document;
+			sourceTree = "<group>";
+		};
+/* End PBXGroup section */
+
+/* Begin PBXNativeTarget section */
+		E8485E121C207EF000F225FA /* centrifuge-buildx */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = E8485E1A1C207EF000F225FA /* Build configuration list for PBXNativeTarget "centrifuge-buildx" */;
+			buildPhases = (
+				E8485E0F1C207EF000F225FA /* Sources */,
+				E8485E101C207EF000F225FA /* Frameworks */,
+				E8485E111C207EF000F225FA /* CopyFiles */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+			);
+			name = "centrifuge-buildx";
+			productName = centrifuge;
+			productReference = E8485E131C207EF000F225FA /* centrifuge-buildx */;
+			productType = "com.apple.product-type.tool";
+		};
+		E869A0491C2095A8007600C2 /* centrifugex */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = E869A0501C2095A8007600C2 /* Build configuration list for PBXNativeTarget "centrifugex" */;
+			buildPhases = (
+				E869A0461C2095A8007600C2 /* Sources */,
+				E869A0471C2095A8007600C2 /* Frameworks */,
+				E869A0481C2095A8007600C2 /* CopyFiles */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+			);
+			name = centrifugex;
+			productName = centrifugex;
+			productReference = E869A04A1C2095A8007600C2 /* centrifugex */;
+			productType = "com.apple.product-type.tool";
+		};
+		E869A0541C2095B5007600C2 /* centrifuge-inspectx */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = E869A0591C2095B5007600C2 /* Build configuration list for PBXNativeTarget "centrifuge-inspectx" */;
+			buildPhases = (
+				E869A0511C2095B5007600C2 /* Sources */,
+				E869A0521C2095B5007600C2 /* Frameworks */,
+				E869A0531C2095B5007600C2 /* CopyFiles */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+			);
+			name = "centrifuge-inspectx";
+			productName = "centrifuge-inspectx";
+			productReference = E869A0551C2095B5007600C2 /* centrifuge-inspectx */;
+			productType = "com.apple.product-type.tool";
+		};
+		E869A05F1C2095CA007600C2 /* centrifuge-compressx */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = E869A0641C2095CA007600C2 /* Build configuration list for PBXNativeTarget "centrifuge-compressx" */;
+			buildPhases = (
+				E869A05C1C2095CA007600C2 /* Sources */,
+				E869A05D1C2095CA007600C2 /* Frameworks */,
+				E869A05E1C2095CA007600C2 /* CopyFiles */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+			);
+			name = "centrifuge-compressx";
+			productName = "centrifuge-compressx";
+			productReference = E869A0601C2095CA007600C2 /* centrifuge-compressx */;
+			productType = "com.apple.product-type.tool";
+		};
+/* End PBXNativeTarget section */
+
+/* Begin PBXProject section */
+		E8485E0B1C207EF000F225FA /* Project object */ = {
+			isa = PBXProject;
+			attributes = {
+				LastUpgradeCheck = 0720;
+				ORGANIZATIONNAME = "Daehwan Kim";
+				TargetAttributes = {
+					E8485E121C207EF000F225FA = {
+						CreatedOnToolsVersion = 7.2;
+					};
+					E869A0491C2095A8007600C2 = {
+						CreatedOnToolsVersion = 7.2;
+					};
+					E869A0541C2095B5007600C2 = {
+						CreatedOnToolsVersion = 7.2;
+					};
+					E869A05F1C2095CA007600C2 = {
+						CreatedOnToolsVersion = 7.2;
+					};
+				};
+			};
+			buildConfigurationList = E8485E0E1C207EF000F225FA /* Build configuration list for PBXProject "centrifuge" */;
+			compatibilityVersion = "Xcode 3.2";
+			developmentRegion = English;
+			hasScannedForEncodings = 0;
+			knownRegions = (
+				en,
+			);
+			mainGroup = E8485E0A1C207EF000F225FA;
+			productRefGroup = E8485E141C207EF000F225FA /* Products */;
+			projectDirPath = "";
+			projectRoot = "";
+			targets = (
+				E8485E121C207EF000F225FA /* centrifuge-buildx */,
+				E869A0491C2095A8007600C2 /* centrifugex */,
+				E869A0541C2095B5007600C2 /* centrifuge-inspectx */,
+				E869A05F1C2095CA007600C2 /* centrifuge-compressx */,
+			);
+		};
+/* End PBXProject section */
+
+/* Begin PBXSourcesBuildPhase section */
+		E8485E0F1C207EF000F225FA /* Sources */ = {
+			isa = PBXSourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				E8AB5A231C209232009138A6 /* diff_sample.cpp in Sources */,
+				E86143BA1C20833200D5C240 /* ccnt_lut.cpp in Sources */,
+				E86143D41C20833200D5C240 /* reference.cpp in Sources */,
+				E86143CF1C20833200D5C240 /* random_source.cpp in Sources */,
+				E86143DA1C20833200D5C240 /* tinythread.cpp in Sources */,
+				E86143BB1C20833200D5C240 /* centrifuge_build_main.cpp in Sources */,
+				E86143C71C20833200D5C240 /* limit.cpp in Sources */,
+				E86143B91C20833200D5C240 /* bt2_idx.cpp in Sources */,
+				E86143B81C20833200D5C240 /* alphabet.cpp in Sources */,
+				E86143D31C20833200D5C240 /* ref_read.cpp in Sources */,
+				E86143C41C20833200D5C240 /* ds.cpp in Sources */,
+				E86143BC1C20833200D5C240 /* centrifuge_build.cpp in Sources */,
+				E86143D61C20833200D5C240 /* shmem.cpp in Sources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+		E869A0461C2095A8007600C2 /* Sources */ = {
+			isa = PBXSourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				E869A0761C20A425007600C2 /* alphabet.cpp in Sources */,
+				E869A0771C20A425007600C2 /* bt2_idx.cpp in Sources */,
+				E869A0781C20A425007600C2 /* ccnt_lut.cpp in Sources */,
+				E869A0791C20A425007600C2 /* ds.cpp in Sources */,
+				E869A07A1C20A425007600C2 /* edit.cpp in Sources */,
+				E869A07B1C20A425007600C2 /* random_source.cpp in Sources */,
+				E869A07C1C20A425007600C2 /* ref_read.cpp in Sources */,
+				E869A07D1C20A425007600C2 /* reference.cpp in Sources */,
+				E869A07E1C20A425007600C2 /* shmem.cpp in Sources */,
+				E869A07F1C20A425007600C2 /* tinythread.cpp in Sources */,
+				E869A0751C20A308007600C2 /* limit.cpp in Sources */,
+				E869A0731C209BE6007600C2 /* centrifuge_main.cpp in Sources */,
+				E869A0741C209BE6007600C2 /* centrifuge.cpp in Sources */,
+				E869A0671C209BCC007600C2 /* aligner_seed_policy.cpp in Sources */,
+				E869A0681C209BCC007600C2 /* mask.cpp in Sources */,
+				E869A0691C209BCC007600C2 /* outq.cpp in Sources */,
+				E869A06A1C209BCC007600C2 /* pat.cpp in Sources */,
+				E869A06B1C209BCC007600C2 /* pe.cpp in Sources */,
+				E869A06C1C209BCC007600C2 /* presets.cpp in Sources */,
+				E869A06D1C209BCC007600C2 /* qual.cpp in Sources */,
+				E869A06E1C209BCC007600C2 /* random_util.cpp in Sources */,
+				E869A06F1C209BCC007600C2 /* read_qseq.cpp in Sources */,
+				E869A0701C209BCC007600C2 /* ref_coord.cpp in Sources */,
+				E869A0711C209BCC007600C2 /* scoring.cpp in Sources */,
+				E869A0721C209BCC007600C2 /* simple_func.cpp in Sources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+		E869A0511C2095B5007600C2 /* Sources */ = {
+			isa = PBXSourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				E869A0801C20A50B007600C2 /* alphabet.cpp in Sources */,
+				E869A0811C20A50B007600C2 /* bt2_idx.cpp in Sources */,
+				E869A0821C20A50B007600C2 /* ccnt_lut.cpp in Sources */,
+				E869A0831C20A50B007600C2 /* centrifuge_inspect.cpp in Sources */,
+				E869A0841C20A50B007600C2 /* ds.cpp in Sources */,
+				E869A0851C20A50B007600C2 /* edit.cpp in Sources */,
+				E869A0861C20A50B007600C2 /* random_source.cpp in Sources */,
+				E869A0871C20A50B007600C2 /* ref_read.cpp in Sources */,
+				E869A0881C20A50B007600C2 /* reference.cpp in Sources */,
+				E869A0891C20A50B007600C2 /* shmem.cpp in Sources */,
+				E869A08A1C20A50B007600C2 /* tinythread.cpp in Sources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+		E869A05C1C2095CA007600C2 /* Sources */ = {
+			isa = PBXSourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXSourcesBuildPhase section */
+
+/* Begin XCBuildConfiguration section */
+		E8485E181C207EF000F225FA /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ALWAYS_SEARCH_USER_PATHS = NO;
+				CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x";
+				CLANG_CXX_LIBRARY = "libc++";
+				CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_OBJC_ARC = YES;
+				CLANG_WARN_BOOL_CONVERSION = YES;
+				CLANG_WARN_CONSTANT_CONVERSION = YES;
+				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+				CLANG_WARN_EMPTY_BODY = YES;
+				CLANG_WARN_ENUM_CONVERSION = YES;
+				CLANG_WARN_INT_CONVERSION = YES;
+				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+				CLANG_WARN_UNREACHABLE_CODE = YES;
+				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+				CODE_SIGN_IDENTITY = "-";
+				COPY_PHASE_STRIP = NO;
+				DEBUG_INFORMATION_FORMAT = dwarf;
+				ENABLE_STRICT_OBJC_MSGSEND = YES;
+				ENABLE_TESTABILITY = YES;
+				GCC_C_LANGUAGE_STANDARD = gnu99;
+				GCC_DYNAMIC_NO_PIC = NO;
+				GCC_NO_COMMON_BLOCKS = YES;
+				GCC_OPTIMIZATION_LEVEL = 0;
+				GCC_PREPROCESSOR_DEFINITIONS = (
+					"$(inherited)",
+					"DEBUG=1",
+					BOWTIE_MM,
+					"MACOS=1",
+					POPCNT_CAPABILITY,
+					BOWTIE2,
+					BOWTIE_64BIT_INDEX,
+					CENTRIFUGE,
+					"CENTRIFUGE_VERSION=\"\\\"`cat VERSION`\\\"\"",
+					"BUILD_HOST=\"\\\"`hostname`\\\"\"",
+					"BUILD_TIME=\"\\\"`date`\\\"\"",
+					"COMPILER_VERSION=\"\\\"`$(CXX) -v 2>&1 | tail -1`\\\"\"",
+					"COMPILER_OPTIONS=\"\\\"test\\\"\"",
+				);
+				GCC_WARN_64_TO_32_BIT_CONVERSION = NO;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+				GCC_WARN_UNDECLARED_SELECTOR = YES;
+				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+				GCC_WARN_UNUSED_FUNCTION = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				MACOSX_DEPLOYMENT_TARGET = 10.11;
+				MTL_ENABLE_DEBUG_INFO = YES;
+				ONLY_ACTIVE_ARCH = YES;
+				SDKROOT = macosx;
+			};
+			name = Debug;
+		};
+		E8485E191C207EF000F225FA /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ALWAYS_SEARCH_USER_PATHS = NO;
+				CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x";
+				CLANG_CXX_LIBRARY = "libc++";
+				CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_OBJC_ARC = YES;
+				CLANG_WARN_BOOL_CONVERSION = YES;
+				CLANG_WARN_CONSTANT_CONVERSION = YES;
+				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+				CLANG_WARN_EMPTY_BODY = YES;
+				CLANG_WARN_ENUM_CONVERSION = YES;
+				CLANG_WARN_INT_CONVERSION = YES;
+				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+				CLANG_WARN_UNREACHABLE_CODE = YES;
+				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+				CODE_SIGN_IDENTITY = "-";
+				COPY_PHASE_STRIP = NO;
+				DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
+				ENABLE_NS_ASSERTIONS = NO;
+				ENABLE_STRICT_OBJC_MSGSEND = YES;
+				GCC_C_LANGUAGE_STANDARD = gnu99;
+				GCC_NO_COMMON_BLOCKS = YES;
+				GCC_PREPROCESSOR_DEFINITIONS = (
+					"$(inherited)",
+					"DEBUG=0",
+					BOWTIE_MM,
+					"MACOS=1",
+					POPCNT_CAPABILITY,
+					BOWTIE2,
+					BOWTIE_64BIT_INDEX,
+					CENTRIFUGE,
+					"CENTRIFUGE_VERSION=\"\\\"`cat VERSION`\\\"\"",
+					"BUILD_HOST=\"\\\"`hostname`\\\"\"",
+					"BUILD_TIME=\"\\\"`date`\\\"\"",
+					"COMPILER_VERSION=\"\\\"`$(CXX) -v 2>&1 | tail -1`\\\"\"",
+					"COMPILER_OPTIONS=\"\\\"test\\\"\"",
+				);
+				GCC_WARN_64_TO_32_BIT_CONVERSION = NO;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+				GCC_WARN_UNDECLARED_SELECTOR = YES;
+				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+				GCC_WARN_UNUSED_FUNCTION = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				MACOSX_DEPLOYMENT_TARGET = 10.11;
+				MTL_ENABLE_DEBUG_INFO = NO;
+				SDKROOT = macosx;
+			};
+			name = Release;
+		};
+		E8485E1B1C207EF000F225FA /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				GCC_PREPROCESSOR_DEFINITIONS = "$(inherited)";
+				HEADER_SEARCH_PATHS = /usr/local/include;
+				LIBRARY_SEARCH_PATHS = /usr/local/lib;
+				PRODUCT_NAME = "$(TARGET_NAME)";
+			};
+			name = Debug;
+		};
+		E8485E1C1C207EF000F225FA /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				HEADER_SEARCH_PATHS = /usr/local/include;
+				LIBRARY_SEARCH_PATHS = /usr/local/lib;
+				PRODUCT_NAME = "$(TARGET_NAME)";
+			};
+			name = Release;
+		};
+		E869A04E1C2095A8007600C2 /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				GCC_PREPROCESSOR_DEFINITIONS = (
+					"DEBUG=1",
+					"$(inherited)",
+				);
+				PRODUCT_NAME = "$(TARGET_NAME)";
+			};
+			name = Debug;
+		};
+		E869A04F1C2095A8007600C2 /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				PRODUCT_NAME = "$(TARGET_NAME)";
+			};
+			name = Release;
+		};
+		E869A05A1C2095B5007600C2 /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				GCC_PREPROCESSOR_DEFINITIONS = (
+					"DEBUG=1",
+					"$(inherited)",
+				);
+				PRODUCT_NAME = "$(TARGET_NAME)";
+			};
+			name = Debug;
+		};
+		E869A05B1C2095B5007600C2 /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				PRODUCT_NAME = "$(TARGET_NAME)";
+			};
+			name = Release;
+		};
+		E869A0651C2095CA007600C2 /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				GCC_PREPROCESSOR_DEFINITIONS = (
+					"DEBUG=1",
+					"$(inherited)",
+				);
+				PRODUCT_NAME = "$(TARGET_NAME)";
+			};
+			name = Debug;
+		};
+		E869A0661C2095CA007600C2 /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				PRODUCT_NAME = "$(TARGET_NAME)";
+			};
+			name = Release;
+		};
+/* End XCBuildConfiguration section */
+
+/* Begin XCConfigurationList section */
+		E8485E0E1C207EF000F225FA /* Build configuration list for PBXProject "centrifuge" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				E8485E181C207EF000F225FA /* Debug */,
+				E8485E191C207EF000F225FA /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+		E8485E1A1C207EF000F225FA /* Build configuration list for PBXNativeTarget "centrifuge-buildx" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				E8485E1B1C207EF000F225FA /* Debug */,
+				E8485E1C1C207EF000F225FA /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+		E869A0501C2095A8007600C2 /* Build configuration list for PBXNativeTarget "centrifugex" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				E869A04E1C2095A8007600C2 /* Debug */,
+				E869A04F1C2095A8007600C2 /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+		E869A0591C2095B5007600C2 /* Build configuration list for PBXNativeTarget "centrifuge-inspectx" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				E869A05A1C2095B5007600C2 /* Debug */,
+				E869A05B1C2095B5007600C2 /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+		E869A0641C2095CA007600C2 /* Build configuration list for PBXNativeTarget "centrifuge-compressx" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				E869A0651C2095CA007600C2 /* Debug */,
+				E869A0661C2095CA007600C2 /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+/* End XCConfigurationList section */
+	};
+	rootObject = E8485E0B1C207EF000F225FA /* Project object */;
+}
diff --git a/centrifuge_build.cpp b/centrifuge_build.cpp
new file mode 100644
index 0000000..df333f5
--- /dev/null
+++ b/centrifuge_build.cpp
@@ -0,0 +1,748 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <cassert>
+#include <getopt.h>
+#include "assert_helpers.h"
+#include "endian_swap.h"
+#include "bt2_idx.h"
+#include "bt2_io.h"
+#include "bt2_util.h"
+#include "formats.h"
+#include "sequence_io.h"
+#include "tokenize.h"
+#include "timer.h"
+#include "ref_read.h"
+#include "filebuf.h"
+#include "reference.h"
+#include "ds.h"
+
/**
 * \file Driver for the centrifuge-build indexing tool.
 */
+
// Build parameters (given defaults in resetOptions(), overridden by
// command-line flags in parseOptions()).
int verbose;              // be talkative; on by default
static int sanityCheck;   // enable slow sanity checks
static int format;        // input sequence format (FASTA or CMDLINE)
static TIndexOffU bmax;   // max blockwise-SA bucket size
static TIndexOffU bmaxMultSqrt; // same, as a multiplier of sqrt(n)
static uint32_t bmaxDivN; // same, as a divisor of the reference length
static int dcv;           // blockwise-SA difference-cover sample period
static int noDc;          // 1 = disable the difference-cover sample
static int entireSA;      // 1 = disable blockwise SA construction
static int seed;          // seed for random number generator
static int showVersion;   // 1 = just print version and quit
static int nthreads;      // number of pthreads operating concurrently
//   Ebwt parameters
static int32_t lineRate;       // Ebwt line rate (Ebwt<>::default_lineRate)
static int32_t linesPerSide;   // lines per side of the Ebwt
static int32_t offRate;        // SA sampled every 2^offRate BWT chars
static int32_t ftabChars;      // # chars consumed in initial ftab lookup
static int32_t localOffRate;   // offRate for the local indexes
static int32_t localFtabChars; // ftabChars for the local indexes
static string conversion_table_fname; // conversion table file name
static string taxonomy_fname; // taxonomy tree file name
static string name_table_fname; // name table file name
static string size_table_fname; // contig size table file name
static int  bigEndian;    // endianness of index output (--little/--big)
static bool nsToAs;       // convert reference Ns to As prior to indexing
static bool doSaFile;  // make a file with just the suffix array in it
static bool doBwtFile; // make a file with just the BWT string in it
static bool autoMem;   // automatically adjust memory usage parameters
static bool packed;    // use a packed representation for the reference
static bool writeRef;  // write compact reference to .3.bt2/.4.bt2
static bool justRef;   // *just* write compact reference, don't index
static bool reverseEach; // set by --reverse-each
static string wrapper;   // name of wrapper script that invoked us (--wrapper)
static int kmer_count;   // k: count distinct k-mers of this size (--kmer-count)
+
+static void resetOptions() {
+	verbose        = true;  // be talkative (default)
+	sanityCheck    = 0;     // do slow sanity checks
+	format         = FASTA; // input sequence format
+	bmax           = OFF_MASK; // max blockwise SA bucket size
+	bmaxMultSqrt   = OFF_MASK; // same, as multplier of sqrt(n)
+	bmaxDivN       = 4;          // same, as divisor of n
+	dcv            = 1024;  // bwise SA difference-cover sample sz
+	noDc           = 0;     // disable difference-cover sample
+	entireSA       = 0;     // 1 = disable blockwise SA
+	seed           = 0;     // srandom seed
+	showVersion    = 0;     // just print version and quit?
+    nthreads       = 1;
+	//   Ebwt parameters
+	lineRate       = Ebwt<TIndexOffU>::default_lineRate;
+	linesPerSide   = 1;  // 1 64-byte line on a side
+	offRate        = 4;  // sample 1 out of 16 SA elts
+	ftabChars      = 10; // 10 chars in initial lookup table
+    localOffRate   = 3;
+    localFtabChars = 6;
+    conversion_table_fname    = "";
+    taxonomy_fname = "";
+    name_table_fname = "";
+    size_table_fname = "";
+	bigEndian      = 0;  // little endian
+	nsToAs         = false; // convert reference Ns to As prior to indexing
+    doSaFile       = false; // make a file with just the suffix array in it
+    doBwtFile      = false; // make a file with just the BWT string in it
+	autoMem        = true;  // automatically adjust memory usage parameters
+	packed         = false; //
+	writeRef       = true;  // write compact reference to .3.bt2/.4.bt2
+	justRef        = false; // *just* write compact reference, don't index
+	reverseEach    = false;
+    wrapper.clear();
+    kmer_count     = 0; // k : k-mer to be counted
+}
+
// Argument constants for getopts.  Long-only options are numbered
// starting at 256 so they can never collide with the single-character
// (ASCII) short-option codes returned by getopt_long().
enum {
	ARG_BMAX = 256,
	ARG_BMAX_MULT,
	ARG_BMAX_DIV,
	ARG_DCV,
	ARG_SEED,
	ARG_CUTOFF,
	ARG_PMAP,
	ARG_NTOA,
	ARG_USAGE,
	ARG_REVERSE_EACH,
    ARG_SA,
	ARG_WRAPPER,
    ARG_THREADS,
    ARG_LOCAL_OFFRATE,
    ARG_LOCAL_FTABCHARS,
    ARG_CONVERSION_TABLE,
    ARG_TAXONOMY_TREE,
    ARG_NAME_TABLE,
    ARG_SIZE_TABLE,
    ARG_KMER_COUNT,
};
+
+/**
+ * Print a detailed usage message to the provided output stream.
+ */
+static void printUsage(ostream& out) {
+	out << "Centrifuge version " << string(CENTRIFUGE_VERSION).c_str() << " by Daehwan Kim (infphilo at gmail.com, http://www.ccb.jhu.edu/people/infphilo)" << endl;
+	string tool_name = "centrifuge-build-bin";
+	if(wrapper == "basic-0") {
+		tool_name = "centrifuge-build";
+	}
+    
+	out << "Usage: centrifuge-build [options]* --conversion-table <table file> --taxonomy-tree <taxonomy tree file> <reference_in> <cf_index_base>" << endl
+	    << "    reference_in            comma-separated list of files with ref sequences" << endl
+	    << "    centrifuge_index_base          write " << gEbwt_ext << " data to files with this dir/basename" << endl
+        << "Options:" << endl
+        << "    -c                      reference sequences given on cmd line (as" << endl
+        << "                            <reference_in>)" << endl;
+    if(wrapper == "basic-0") {
+        out << "    --large-index           force generated index to be 'large', even if ref" << endl
+		<< "                            has fewer than 4 billion nucleotides" << endl;
+	}
+    out << "    -a/--noauto             disable automatic -p/--bmax/--dcv memory-fitting" << endl
+	    << "    --bmax <int>            max bucket sz for blockwise suffix-array builder" << endl
+	    << "    --bmaxdivn <int>        max bucket sz as divisor of ref len (default: 4)" << endl
+	    << "    --dcv <int>             diff-cover period for blockwise (default: 1024)" << endl
+	    << "    --nodc                  disable diff-cover (algorithm becomes quadratic)" << endl
+	    << "    -r/--noref              don't build .3/.4.bt2 (packed reference) portion" << endl
+	    << "    -3/--justref            just build .3/.4.bt2 (packed reference) portion" << endl
+	    << "    -o/--offrate <int>      SA is sampled every 2^offRate BWT chars (default: 5)" << endl
+	    << "    -t/--ftabchars <int>    # of chars consumed in initial lookup (default: 10)" << endl
+        << "    --conversion-table <file name>  a table that converts any id to a taxonomy id" << endl
+        << "    --taxonomy-tree    <file name>  taxonomy tree" << endl
+        << "    --name-table       <file name>  names corresponding to taxonomic IDs" << endl
+        << "    --size-table       <file name>  table of contig (or genome) sizes" << endl
+	    << "    --seed <int>            seed for random number generator" << endl
+	    << "    -q/--quiet              verbose output (for debugging)" << endl
+        << "    -p/--threads <int>      number of alignment threads to launch (1)" << endl
+        << "    --kmer-count <int>      k size for counting the number of distinct k-mer" << endl
+	    << "    -h/--help               print detailed description of tool and its options" << endl
+	    << "    --usage                 print this usage message" << endl
+	    << "    --version               print version information and quit" << endl
+	    ;
+    
+    if(wrapper.empty()) {
+		cerr << endl
+        << "*** Warning ***" << endl
+        << "'" << tool_name << "' was run directly.  It is recommended "
+        << "that you run the wrapper script 'bowtie2-build' instead."
+        << endl << endl;
+	}
+}
+
+static const char *short_options = "qrap:h?nscfl:i:o:t:h:3C";
+
// Long-option table for getopt_long().  Fields per entry: option name,
// argument requirement, an optional int* that getopt writes the 4th
// field into when the option is seen (used by --little/--big, --nodc,
// --entiresa and --version), and the code returned to parseOptions()
// (either a short-option character or an ARG_* constant).
static struct option long_options[] = {
	{(char*)"quiet",          no_argument,       0,            'q'},
	{(char*)"sanity",         no_argument,       0,            's'},
	{(char*)"packed",         no_argument,       0,            'p'},
	{(char*)"little",         no_argument,       &bigEndian,   0},
	{(char*)"big",            no_argument,       &bigEndian,   1},
	{(char*)"bmax",           required_argument, 0,            ARG_BMAX},
	{(char*)"bmaxmultsqrt",   required_argument, 0,            ARG_BMAX_MULT},
	{(char*)"bmaxdivn",       required_argument, 0,            ARG_BMAX_DIV},
	{(char*)"dcv",            required_argument, 0,            ARG_DCV},
	{(char*)"nodc",           no_argument,       &noDc,        1},
	{(char*)"seed",           required_argument, 0,            ARG_SEED},
	{(char*)"entiresa",       no_argument,       &entireSA,    1},
	{(char*)"version",        no_argument,       &showVersion, 1},
	{(char*)"noauto",         no_argument,       0,            'a'},
	{(char*)"noblocks",       required_argument, 0,            'n'},
    {(char*)"threads",        required_argument, 0,            ARG_THREADS},
	{(char*)"linerate",       required_argument, 0,            'l'},
	{(char*)"linesperside",   required_argument, 0,            'i'},
	{(char*)"offrate",        required_argument, 0,            'o'},
	{(char*)"ftabchars",      required_argument, 0,            't'},
    {(char*)"localoffrate",   required_argument, 0,            ARG_LOCAL_OFFRATE},
	{(char*)"localftabchars", required_argument, 0,            ARG_LOCAL_FTABCHARS},
    {(char*)"conversion-table", required_argument, 0,          ARG_CONVERSION_TABLE},
    {(char*)"taxonomy-tree",    required_argument, 0,          ARG_TAXONOMY_TREE},
    {(char*)"name-table",       required_argument, 0,          ARG_NAME_TABLE},
    {(char*)"size-table",       required_argument, 0,          ARG_SIZE_TABLE},
	{(char*)"help",           no_argument,       0,            'h'},
	{(char*)"ntoa",           no_argument,       0,            ARG_NTOA},
	{(char*)"justref",        no_argument,       0,            '3'},
	{(char*)"noref",          no_argument,       0,            'r'},
	{(char*)"kmer-count",     required_argument, 0,            ARG_KMER_COUNT},
    {(char*)"sa",             no_argument,       0,            ARG_SA},
	{(char*)"reverse-each",   no_argument,       0,            ARG_REVERSE_EACH},
	{(char*)"usage",          no_argument,       0,            ARG_USAGE},
    {(char*)"wrapper",        required_argument, 0,            ARG_WRAPPER},
	{(char*)0, 0, 0, 0} // terminator
};
+
+/**
+ * Parse an int out of optarg and enforce that it be at least 'lower';
+ * if it is less than 'lower', then output the given error message and
+ * exit with an error and a usage message.
+ */
+template<typename T>
+static T parseNumber(T lower, const char *errmsg) {
+	char *endPtr= NULL;
+	T t = (T)strtoll(optarg, &endPtr, 10);
+	if (endPtr != NULL) {
+		if (t < lower) {
+			cerr << errmsg << endl;
+			printUsage(cerr);
+			throw 1;
+		}
+		return t;
+	}
+	cerr << errmsg << endl;
+	printUsage(cerr);
+	throw 1;
+	return -1;
+}
+
/**
 * Read command-line arguments.
 *
 * Loops over getopt_long() results and translates each recognized option
 * into an assignment to the corresponding file-scope configuration
 * variable.  Throws int 0 for a clean exit (-h/--usage) and int 1 on any
 * invalid option or out-of-range argument; both are caught by the
 * toplevel handler in centrifuge_build().
 */
static void parseOptions(int argc, const char **argv) {
	int option_index = 0;
	int next_option;
	do {
		next_option = getopt_long(
			argc, const_cast<char**>(argv),
			short_options, long_options, &option_index);
		switch (next_option) {
            case ARG_WRAPPER:
				wrapper = optarg;
				break;
			case 'f': format = FASTA; break;
			case 'c': format = CMDLINE; break;
            case ARG_THREADS: // --threads is an alias for -p (falls through)
			case 'p': nthreads = parseNumber<int>(1, "-p arg must be at least 1");
                break;
			case 'C':
				// Colorspace was a Bowtie 1 feature; reject it explicitly.
				cerr << "Error: -C specified but Bowtie 2 does not support colorspace input." << endl;
				throw 1;
				break;
			case 'l':
				lineRate = parseNumber<int>(3, "-l/--lineRate arg must be at least 3");
				break;
			case 'i':
				linesPerSide = parseNumber<int>(1, "-i/--linesPerSide arg must be at least 1");
				break;
			case 'o':
				offRate = parseNumber<int>(0, "-o/--offRate arg must be at least 0");
				break;
            case ARG_LOCAL_OFFRATE:
                localOffRate = parseNumber<int>(0, "-o/--localoffrate arg must be at least 0");
                break;
			case '3':
				justRef = true;
				break;
			case 't':
				ftabChars = parseNumber<int>(1, "-t/--ftabChars arg must be at least 1");
				break;
            case ARG_LOCAL_FTABCHARS:
				localFtabChars = parseNumber<int>(1, "-t/--localftabchars arg must be at least 1");
				break;
			case 'n':
				// all f-s is used to mean "not set", so put 'e' on end
				bmax = 0xfffffffe;
				break;
			case 'h':
			case ARG_USAGE:
				printUsage(cout);
				throw 0; // clean exit after printing usage
				break;
			// The three --bmax* options are mutually exclusive; setting one
			// disables the other two by pushing them to their "not set"
			// sentinel values.
			case ARG_BMAX:
				bmax = parseNumber<TIndexOffU>(1, "--bmax arg must be at least 1");
				bmaxMultSqrt = OFF_MASK; // don't use multSqrt
				bmaxDivN = 0xffffffff;     // don't use bmaxDivN
				break;
			case ARG_BMAX_MULT:
				bmaxMultSqrt = parseNumber<TIndexOffU>(1, "--bmaxmultsqrt arg must be at least 1");
				bmax = OFF_MASK;     // don't use bmax
				bmaxDivN = 0xffffffff; // don't use bmaxDivN
				break;
			case ARG_BMAX_DIV:
				bmaxDivN = parseNumber<uint32_t>(1, "--bmaxdivn arg must be at least 1");
				bmax = OFF_MASK;         // don't use bmax
				bmaxMultSqrt = OFF_MASK; // don't use multSqrt
				break;
			case ARG_DCV:
				dcv = parseNumber<int>(3, "--dcv arg must be at least 3");
				break;
			case ARG_SEED:
				seed = parseNumber<int>(0, "--seed arg must be at least 0");
				break;
			case ARG_REVERSE_EACH:
				reverseEach = true;
				break;
            case ARG_SA:
                doSaFile = true;
                break;
			case ARG_NTOA: nsToAs = true; break;
            case ARG_CONVERSION_TABLE:
                conversion_table_fname = optarg;
                break;
            case ARG_TAXONOMY_TREE:
                taxonomy_fname = optarg;
                break;
            case ARG_NAME_TABLE:
                name_table_fname = optarg;
                break;
            case ARG_SIZE_TABLE:
                size_table_fname = optarg;
                break;
            case ARG_KMER_COUNT:
                kmer_count = parseNumber<int>(1, "--kmer-count arg must be at least 1");
                break;
			case 'a': autoMem = false; break;
			case 'q': verbose = false; break;
			case 's': sanityCheck = true; break;
			case 'r': writeRef = false; break;

			case -1: /* Done with options. */
				break;
			case 0:
				// A long option that stores directly into a flag variable
				// (e.g. --nodc, --big); nothing more to do.
				if (long_options[option_index].flag != 0)
					break;
				// flag == 0 here would be unexpected; fall through to error.
			default:
				printUsage(cerr);
				throw 1;
		}
	} while(next_option != -1);
	// A tiny bmax makes the blockwise SA builder pathological; warn but allow.
	if(bmax < 40) {
		cerr << "Warning: specified bmax is very small (" << bmax << ").  This can lead to" << endl
		     << "extremely slow performance and memory exhaustion.  Perhaps you meant to specify" << endl
		     << "a small --bmaxdivn?" << endl;
	}
}
+
+EList<string> filesWritten;
+
+/**
+ * Delete all the index files that we tried to create.  For when we had to
+ * abort the index-building process due to an error.
+ */
+static void deleteIdxFiles(
+                           const string& outfile,
+                           bool doRef,
+                           bool justRef)
+{
+	
+	for(size_t i = 0; i < filesWritten.size(); i++) {
+		cerr << "Deleting \"" << filesWritten[i].c_str()
+		     << "\" file written during aborted indexing attempt." << endl;
+		remove(filesWritten[i].c_str());
+	}
+}
+
+extern void initializeCntLut();
+
+/**
+ * Drive the index construction process and optionally sanity-check the
+ * result.
+ */
+template<typename TStr>
+static void driver(
+                   const string& infile,
+                   EList<string>& infiles,
+                   const string& conversion_table_fname,
+                   const string& taxonomy_fname,
+                   const string& name_table_fname,
+                   const string& size_table_fname,
+                   const string& outfile,
+                   bool packed,
+                   int reverse)
+{
+    initializeCntLut();
+	EList<FileBuf*> is(MISC_CAT);
+	bool bisulfite = false;
+	RefReadInParams refparams(false, reverse, nsToAs, bisulfite);
+	assert_gt(infiles.size(), 0);
+	if(format == CMDLINE) {
+		// Adapt sequence strings to stringstreams open for input
+		stringstream *ss = new stringstream();
+		for(size_t i = 0; i < infiles.size(); i++) {
+			(*ss) << ">" << i << endl << infiles[i].c_str() << endl;
+		}
+		FileBuf *fb = new FileBuf(ss);
+		assert(fb != NULL);
+		assert(!fb->eof());
+		assert(fb->get() == '>');
+		ASSERT_ONLY(fb->reset());
+		assert(!fb->eof());
+		is.push_back(fb);
+	} else {
+		// Adapt sequence files to ifstreams
+		for(size_t i = 0; i < infiles.size(); i++) {
+			FILE *f = fopen(infiles[i].c_str(), "r");
+			if (f == NULL) {
+				cerr << "Error: could not open "<< infiles[i].c_str() << endl;
+				throw 1;
+			}
+			FileBuf *fb = new FileBuf(f);
+			assert(fb != NULL);
+			if(fb->peek() == -1 || fb->eof()) {
+				cerr << "Warning: Empty fasta file: '" << infile.c_str() << "'" << endl;
+				continue;
+			}
+			assert(!fb->eof());
+			assert(fb->get() == '>');
+			ASSERT_ONLY(fb->reset());
+			assert(!fb->eof());
+			is.push_back(fb);
+		}
+	}
+	if(is.empty()) {
+		cerr << "Warning: All fasta inputs were empty" << endl;
+		throw 1;
+	}
+	// Vector for the ordered list of "records" comprising the input
+	// sequences.  A record represents a stretch of unambiguous
+	// characters in one of the input sequences.
+	EList<RefRecord> szs(MISC_CAT);
+	std::pair<size_t, size_t> sztot;
+	{
+		if(verbose) cout << "Reading reference sizes" << endl;
+		Timer _t(cout, "  Time reading reference sizes: ", verbose);
+        sztot = BitPairReference::szsFromFasta(is, string(), bigEndian, refparams, szs, sanityCheck);
+	}
+	if(justRef) return;
+	assert_gt(sztot.first, 0);
+	assert_gt(sztot.second, 0);
+	assert_gt(szs.size(), 0);
+	// Construct index from input strings and parameters
+	filesWritten.push_back(outfile + ".1." + gEbwt_ext);
+	filesWritten.push_back(outfile + ".2." + gEbwt_ext);
+    filesWritten.push_back(outfile + ".3." + gEbwt_ext);
+	TStr s;
+	Ebwt<TIndexOffU> ebwt(
+                          s,
+                          packed,
+                          0,
+                          1,  // TODO: maybe not?
+                          lineRate,
+                          offRate,      // suffix-array sampling rate
+                          ftabChars,    // number of chars in initial arrow-pair calc
+                          outfile,      // basename for .?.ebwt files
+                          reverse == 0, // fw
+                          !entireSA,    // useBlockwise
+                          bmax,         // block size for blockwise SA builder
+                          bmaxMultSqrt, // block size as multiplier of sqrt(len)
+                          bmaxDivN,     // block size as divisor of len
+                          noDc? 0 : dcv,// difference-cover period
+                          nthreads,
+                          is,           // list of input streams
+                          szs,          // list of reference sizes
+                          (TIndexOffU)sztot.first,  // total size of all unambiguous ref chars
+                          conversion_table_fname,
+                          taxonomy_fname,
+                          name_table_fname,
+                          size_table_fname,
+                          refparams,    // reference read-in parameters
+                          seed,         // pseudo-random number generator seed
+                          -1,           // override offRate
+                          doSaFile,     // make a file with just the suffix array in it
+                          doBwtFile,    // make a file with just the BWT string in it
+                          kmer_count,   // Count the number of distinct k-mers if non-zero
+                          verbose,      // be talkative
+                          autoMem,      // pass exceptions up to the toplevel so that we can adjust memory settings automatically
+                          sanityCheck); // verify results and internal consistency
+	// Note that the Ebwt is *not* resident in memory at this time.  To
+	// load it into memory, call ebwt.loadIntoMemory()
+	if(verbose) {
+		// Print Ebwt's vital stats
+		ebwt.eh().print(cout);
+	}
+	if(sanityCheck) {
+		// Try restoring the original string (if there were
+		// multiple texts, what we'll get back is the joined,
+		// padded string, not a list)
+		ebwt.loadIntoMemory(
+                            0,
+                            reverse ? (refparams.reverse == REF_READ_REVERSE) : 0,
+                            true,  // load SA sample?
+                            true,  // load ftab?
+                            true,  // load rstarts?
+                            false,
+                            false);
+		SString<char> s2;
+		ebwt.restore(s2);
+		ebwt.evictFromMemory();
+		{
+			SString<char> joinedss = Ebwt<TIndexOffU>::join<SString<char> >(
+                                                                            is,          // list of input streams
+                                                                            szs,         // list of reference sizes
+                                                                            (TIndexOffU)sztot.first, // total size of all unambiguous ref chars
+                                                                            refparams,   // reference read-in parameters
+                                                                            seed);       // pseudo-random number generator seed
+			if(refparams.reverse == REF_READ_REVERSE) {
+				joinedss.reverse();
+			}
+			assert_eq(joinedss.length(), s2.length());
+			assert(sstr_eq(joinedss, s2));
+		}
+		if(verbose) {
+			if(s2.length() < 1000) {
+				cout << "Passed restore check: " << s2.toZBuf() << endl;
+			} else {
+				cout << "Passed restore check: (" << s2.length() << " chars)" << endl;
+			}
+		}
+	}
+}
+
+static const char *argv0 = NULL;
+
+extern "C" {
+/**
+ * main function.  Parses command-line arguments.
+ */
+int centrifuge_build(int argc, const char **argv) {
+	string outfile;
+	try {
+		// Reset all global state, including getopt state
+		opterr = optind = 1;
+		resetOptions();
+
+		string infile;
+		EList<string> infiles(MISC_CAT);
+
+		parseOptions(argc, argv);
+		argv0 = argv[0];
+		if(showVersion) {
+			cout << argv0 << " version " << string(CENTRIFUGE_VERSION).c_str() << endl;
+			if(sizeof(void*) == 4) {
+				cout << "32-bit" << endl;
+			} else if(sizeof(void*) == 8) {
+				cout << "64-bit" << endl;
+			} else {
+				cout << "Neither 32- nor 64-bit: sizeof(void*) = " << sizeof(void*) << endl;
+			}
+			cout << "Built on " << BUILD_HOST << endl;
+			cout << BUILD_TIME << endl;
+			cout << "Compiler: " << COMPILER_VERSION << endl;
+			cout << "Options: " << COMPILER_OPTIONS << endl;
+			cout << "Sizeof {int, long, long long, void*, size_t, off_t}: {"
+				 << sizeof(int)
+				 << ", " << sizeof(long) << ", " << sizeof(long long)
+				 << ", " << sizeof(void *) << ", " << sizeof(size_t)
+				 << ", " << sizeof(off_t) << "}" << endl;
+			return 0;
+		}
+
+		// Get input filename
+		if(optind >= argc) {
+			cerr << "No input sequence or sequence file specified!" << endl;
+			printUsage(cerr);
+			return 1;
+		}
+		infile = argv[optind++];
+
+		// Get output filename
+		if(optind >= argc) {
+			cerr << "No output file specified!" << endl;
+			printUsage(cerr);
+			return 1;
+		}
+		outfile = argv[optind++];
+
+		tokenize(infile, ",", infiles);
+		if(infiles.size() < 1) {
+			cerr << "Tokenized input file list was empty!" << endl;
+			printUsage(cerr);
+			return 1;
+		}
+        
+        if(conversion_table_fname == "") {
+            cerr << "Please specify --conversion-table!" << endl;
+            printUsage(cerr);
+            return 1;
+        }
+        
+        if(taxonomy_fname == "") {
+            cerr << "Please specify --taxonomy-tree!" << endl;
+            printUsage(cerr);
+            return 1;
+        }
+        
+        if(name_table_fname == "") {
+            cerr << "Please specify --name-table!" << endl;
+            printUsage(cerr);
+            return 1;
+        }
+
+		// Optionally summarize
+		if(verbose) {
+			cout << "Settings:" << endl
+				 << "  Output files: \"" << outfile.c_str() << ".*." << gEbwt_ext << "\"" << endl
+				 << "  Line rate: " << lineRate << " (line is " << (1<<lineRate) << " bytes)" << endl
+				 << "  Lines per side: " << linesPerSide << " (side is " << ((1<<lineRate)*linesPerSide) << " bytes)" << endl
+				 << "  Offset rate: " << offRate << " (one in " << (1<<offRate) << ")" << endl
+				 << "  FTable chars: " << ftabChars << endl
+				 << "  Strings: " << (packed? "packed" : "unpacked") << endl
+                 << "  Local offset rate: " << localOffRate << " (one in " << (1<<localOffRate) << ")" << endl
+                 << "  Local fTable chars: " << localFtabChars << endl
+				 ;
+			if(bmax == OFF_MASK) {
+				cout << "  Max bucket size: default" << endl;
+			} else {
+				cout << "  Max bucket size: " << bmax << endl;
+			}
+			if(bmaxMultSqrt == OFF_MASK) {
+				cout << "  Max bucket size, sqrt multiplier: default" << endl;
+			} else {
+				cout << "  Max bucket size, sqrt multiplier: " << bmaxMultSqrt << endl;
+			}
+			if(bmaxDivN == 0xffffffff) {
+				cout << "  Max bucket size, len divisor: default" << endl;
+			} else {
+				cout << "  Max bucket size, len divisor: " << bmaxDivN << endl;
+			}
+			cout << "  Difference-cover sample period: " << dcv << endl;
+			cout << "  Endianness: " << (bigEndian? "big":"little") << endl
+				 << "  Actual local endianness: " << (currentlyBigEndian()? "big":"little") << endl
+				 << "  Sanity checking: " << (sanityCheck? "enabled":"disabled") << endl;
+	#ifdef NDEBUG
+			cout << "  Assertions: disabled" << endl;
+	#else
+			cout << "  Assertions: enabled" << endl;
+	#endif
+			cout << "  Random seed: " << seed << endl;
+			cout << "  Sizeofs: void*:" << sizeof(void*) << ", int:" << sizeof(int) << ", long:" << sizeof(long) << ", size_t:" << sizeof(size_t) << endl;
+			cout << "Input files DNA, " << file_format_names[format].c_str() << ":" << endl;
+			for(size_t i = 0; i < infiles.size(); i++) {
+				cout << "  " << infiles[i].c_str() << endl;
+			}
+		}
+		// Seed random number generator
+		srand(seed);
+		{
+			Timer timer(cout, "Total time for call to driver() for forward index: ", verbose);
+			if(!packed) {
+				try {
+					driver<SString<char> >(
+                                           infile,
+                                           infiles,
+                                           conversion_table_fname,
+                                           taxonomy_fname,
+                                           size_table_fname,
+                                           name_table_fname,
+                                           outfile,
+                                           false,
+                                           REF_READ_FORWARD);
+				} catch(bad_alloc& e) {
+					if(autoMem) {
+						cerr << "Switching to a packed string representation." << endl;
+						packed = true;
+					} else {
+						throw e;
+					}
+				}
+			}
+			if(packed) {
+				driver<S2bDnaString>(
+                                     infile,
+                                     infiles,
+                                     conversion_table_fname,
+                                     taxonomy_fname,
+                                     name_table_fname,
+                                     size_table_fname,
+                                     outfile,
+                                     true,
+                                     REF_READ_FORWARD);
+			}
+		}
+#if 0
+		int reverseType = reverseEach ? REF_READ_REVERSE_EACH : REF_READ_REVERSE;
+		srand(seed);
+		Timer timer(cout, "Total time for backward call to driver() for mirror index: ", verbose);
+		if(!packed) {
+			try {
+				driver<SString<char> >(infile, infiles, outfile + ".rev", false, reverseType);
+			} catch(bad_alloc& e) {
+				if(autoMem) {
+					cerr << "Switching to a packed string representation." << endl;
+					packed = true;
+				} else {
+					throw e;
+				}
+			}
+		}
+		if(packed) {
+			driver<S2bDnaString>(infile, infiles, outfile + ".rev", true, reverseType);
+		}
+#endif
+		return 0;
+	} catch(std::exception& e) {
+		cerr << "Error: Encountered exception: '" << e.what() << "'" << endl;
+		cerr << "Command: ";
+		for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
+		cerr << endl;
+		deleteIdxFiles(outfile, writeRef || justRef, justRef);
+		return 1;
+	} catch(int e) {
+		if(e != 0) {
+			cerr << "Error: Encountered internal Centrifuge exception (#" << e << ")" << endl;
+			cerr << "Command: ";
+			for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
+			cerr << endl;
+		}
+		deleteIdxFiles(outfile, writeRef || justRef, justRef);
+		return e;
+	}
+}
+}
diff --git a/centrifuge_build_main.cpp b/centrifuge_build_main.cpp
new file mode 100644
index 0000000..97eb09c
--- /dev/null
+++ b/centrifuge_build_main.cpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <iostream>
+#include <fstream>
+#include <string.h>
+#include <stdlib.h>
+#include "tokenize.h"
+#include "ds.h"
+#include "mem_ids.h"
+
+using namespace std;
+
+extern "C" {
+	int centrifuge_build(int argc, const char **argv);
+}
+
/**
 * centrifuge-build main function.  It is placed in a separate source file
 * to make it slightly easier to compile as a library.
 *
 * If the user specifies -A <file> as the first two arguments, main
 * will interpret that file as having one set of command-line arguments
 * per line, and will dispatch each batch of arguments one at a time to
 * centrifuge-build.
 */
+int main(int argc, const char **argv) {
+	if(argc > 2 && strcmp(argv[1], "-A") == 0) {
+		const char *file = argv[2];
+		ifstream in;
+		in.open(file);
+		char buf[4096];
+		int lastret = -1;
+		while(in.getline(buf, 4095)) {
+			EList<string> args(MISC_CAT);
+			args.push_back(string(argv[0]));
+			tokenize(buf, " \t", args);
+			const char **myargs = (const char**)malloc(sizeof(char*)*args.size());
+			for(size_t i = 0; i < args.size(); i++) {
+				myargs[i] = args[i].c_str();
+			}
+			if(args.size() == 1) continue;
+			lastret = centrifuge_build((int)args.size(), myargs);
+			free(myargs);
+		}
+		if(lastret == -1) {
+			cerr << "Warning: No arg strings parsed from " << file << endl;
+			return 0;
+		}
+		return lastret;
+	} else {
+		return centrifuge_build(argc, argv);
+	}
+}
diff --git a/centrifuge_compress.cpp b/centrifuge_compress.cpp
new file mode 100644
index 0000000..1ca1698
--- /dev/null
+++ b/centrifuge_compress.cpp
@@ -0,0 +1,1433 @@
+/*
+ * Copyright 2015, Daehwan Kim <infphilo at gmail.com>
+ *
+ * This file is part of Centrifuge.
+ *
+ * Centrifuge is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Centrifuge is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Centrifuge.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <cassert>
+#include <getopt.h>
+#include "assert_helpers.h"
+#include "endian_swap.h"
+#include "bt2_idx.h"
+#include "bt2_io.h"
+#include "bt2_util.h"
+#include "formats.h"
+#include "sequence_io.h"
+#include "tokenize.h"
+#include "timer.h"
+#include "ref_read.h"
+#include "filebuf.h"
+#include "reference.h"
+#include "ds.h"
+#include "aligner_sw.h"
+
/**
 * \file Driver for the centrifuge-compress tool (reference compression).
 */
+
// Build parameters
int verbose;                   // be talkative (non-static: shared with other modules)
static int sanityCheck;        // enable slow internal consistency checks
static int format;             // input format: FASTA files or CMDLINE sequences
static TIndexOffU bmax;        // max blockwise SA bucket size
static TIndexOffU bmaxMultSqrt;// max bucket size, as multiplier of sqrt(len)
static uint32_t bmaxDivN;      // max bucket size, as divisor of len
static int dcv;                // difference-cover period for blockwise SA
static int noDc;               // 1 = disable difference-cover sample
static int entireSA;           // 1 = build entire SA at once (no blockwise)
static int seed;               // pseudo-random seed (srand)
static int showVersion;        // just print version info and quit
//   Ebwt parameters
static int32_t lineRate;       // log2 of Ebwt line size in bytes
static int32_t linesPerSide;   // lines per Ebwt side
static int32_t offRate;        // SA sampled every 2^offRate BWT chars
static int32_t ftabChars;      // chars consumed in initial ftab lookup
static int32_t localOffRate;   // offRate for local indexes
static int32_t localFtabChars; // ftabChars for local indexes
static int  bigEndian;         // 1 = write index in big-endian byte order
static bool nsToAs;            // convert reference Ns to As prior to indexing
static bool doSaFile;  // make a file with just the suffix array in it
static bool doBwtFile; // make a file with just the BWT string in it
static bool autoMem;           // automatically adjust memory-usage parameters
static bool packed;            // use 2-bit-packed strings internally
static bool writeRef;          // write compact reference (.3/.4) files
static bool justRef;           // write *only* the compact reference, no index
static bool reverseEach;       // reverse each sequence individually (REF_READ_REVERSE_EACH)
static string wrapper;         // name of wrapper script that invoked us, if any
static int across;             // number of characters per line in FASTA output
static size_t minSimLen;  // minimum similar length
static bool printN;            // print original sequence with mask
+
/**
 * Restore every file-scope option variable to its default value.  Called
 * at the top of each invocation so that repeated calls (e.g. via the -A
 * batch mode) start from a clean state.
 */
static void resetOptions() {
	verbose        = true;  // be talkative (default)
	sanityCheck    = 0;     // do slow sanity checks
	format         = FASTA; // input sequence format
	bmax           = OFF_MASK; // max blockwise SA bucket size
	bmaxMultSqrt   = OFF_MASK; // same, as multplier of sqrt(n)
	bmaxDivN       = 4;          // same, as divisor of n
	dcv            = 1024;  // bwise SA difference-cover sample sz
	noDc           = 0;     // disable difference-cover sample
	entireSA       = 0;     // 1 = disable blockwise SA
	seed           = 0;     // srandom seed
	showVersion    = 0;     // just print version and quit?
	//   Ebwt parameters
	lineRate       = Ebwt<TIndexOffU>::default_lineRate;
	linesPerSide   = 1;  // 1 64-byte line on a side
	offRate        = 4;  // sample 1 out of 16 SA elts
	ftabChars      = 10; // 10 chars in initial lookup table
    localOffRate   = 3;
    localFtabChars = 6;
	bigEndian      = 0;  // little endian
	nsToAs         = false; // convert reference Ns to As prior to indexing
    doSaFile       = false; // make a file with just the suffix array in it
    doBwtFile      = false; // make a file with just the BWT string in it
	autoMem        = true;  // automatically adjust memory usage parameters
	packed         = false; //
	writeRef       = true;  // write compact reference to .3.bt2/.4.bt2
	justRef        = false; // *just* write compact reference, don't index
	reverseEach    = false;
    across         = 60; // number of characters across in FASTA output
    minSimLen      = 100;
    printN         = false;
    wrapper.clear();
}
+
// Argument constants for getopts
// Values start at 256 so they can never collide with the single-byte
// character codes returned for short options.
enum {
	ARG_BMAX = 256,
	ARG_BMAX_MULT,
	ARG_BMAX_DIV,
	ARG_DCV,
	ARG_SEED,
	ARG_CUTOFF,
	ARG_PMAP,
	ARG_NTOA,
	ARG_USAGE,
	ARG_REVERSE_EACH,
    ARG_SA,
	ARG_WRAPPER,
    ARG_LOCAL_OFFRATE,
    ARG_LOCAL_FTABCHARS,
    ARG_MIN_SIMLEN,
    ARG_PRINTN,
};
+
+/**
+ * Print a detailed usage message to the provided output stream.
+ */
+static void printUsage(ostream& out) {
+	out << "Centrifuge version " << string(CENTRIFUGE_VERSION).c_str() << " by Daehwan Kim (infphilo at gmail.com, http://www.ccb.jhu.edu/people/infphilo)" << endl;
+    
+#ifdef BOWTIE_64BIT_INDEX
+	string tool_name = "hisat-build-l";
+#else
+	string tool_name = "hisat-build-s";
+#endif
+	if(wrapper == "basic-0") {
+		tool_name = "hisat-build";
+	}
+    
+	out << "Usage: hisat2-build [options]* <reference_in> <bt2_index_base>" << endl
+	    << "    reference_in            comma-separated list of files with ref sequences" << endl
+	    << "    hisat_index_base          write " << gEbwt_ext << " data to files with this dir/basename" << endl
+        << "Options:" << endl
+        << "    -c                      reference sequences given on cmd line (as" << endl
+        << "                            <reference_in>)" << endl;
+    if(wrapper == "basic-0") {
+        out << "    --large-index           force generated index to be 'large', even if ref" << endl
+		<< "                            has fewer than 4 billion nucleotides" << endl;
+	}
+    out << "    -a/--noauto             disable automatic -p/--bmax/--dcv memory-fitting" << endl
+	    << "    -p/--packed             use packed strings internally; slower, uses less mem" << endl
+	    << "    --bmax <int>            max bucket sz for blockwise suffix-array builder" << endl
+	    << "    --bmaxdivn <int>        max bucket sz as divisor of ref len (default: 4)" << endl
+	    << "    --dcv <int>             diff-cover period for blockwise (default: 1024)" << endl
+	    << "    --nodc                  disable diff-cover (algorithm becomes quadratic)" << endl
+	    << "    -r/--noref              don't build .3/.4.bt2 (packed reference) portion" << endl
+	    << "    -3/--justref            just build .3/.4.bt2 (packed reference) portion" << endl
+	    << "    -o/--offrate <int>      SA is sampled every 2^offRate BWT chars (default: 5)" << endl
+	    << "    -t/--ftabchars <int>    # of chars consumed in initial lookup (default: 10)" << endl
+        << "    --localoffrate <int>    SA (local) is sampled every 2^offRate BWT chars (default: 3)" << endl
+        << "    --localftabchars <int>  # of chars consumed in initial lookup in a local index (default: 6)" << endl
+	    << "    --seed <int>            seed for random number generator" << endl
+	    << "    -q/--quiet              verbose output (for debugging)" << endl
+        << "    --printN                print original sequence with mask" << endl
+	    << "    -h/--help               print detailed description of tool and its options" << endl
+	    << "    --usage                 print this usage message" << endl
+	    << "    --version               print version information and quit" << endl
+	    ;
+    
+    if(wrapper.empty()) {
+		cerr << endl
+        << "*** Warning ***" << endl
+        << "'" << tool_name << "' was run directly.  It is recommended "
+        << "that you run the wrapper script 'bowtie2-build' instead."
+        << endl << endl;
+	}
+}
+
// Short-option spec for getopt_long.
// NOTE(review): 'h' appears twice in this string — once as a plain flag
// (after "qrap") and once near the end as "h:" (requiring an argument).
// getopt honors the first occurrence, so the trailing "h:" appears to be
// dead; confirm intent before removing.
static const char *short_options = "qraph?nscfl:i:o:t:h:3C";

// Long-option table for getopt_long.  Entries with a non-NULL flag pointer
// (--little/--big/--nodc/--entiresa/--version) store their value directly
// and make getopt_long return 0; all others return the listed code.
static struct option long_options[] = {
	{(char*)"quiet",          no_argument,       0,            'q'},
	{(char*)"sanity",         no_argument,       0,            's'},
	{(char*)"packed",         no_argument,       0,            'p'},
	{(char*)"little",         no_argument,       &bigEndian,   0},
	{(char*)"big",            no_argument,       &bigEndian,   1},
	{(char*)"bmax",           required_argument, 0,            ARG_BMAX},
	{(char*)"bmaxmultsqrt",   required_argument, 0,            ARG_BMAX_MULT},
	{(char*)"bmaxdivn",       required_argument, 0,            ARG_BMAX_DIV},
	{(char*)"dcv",            required_argument, 0,            ARG_DCV},
	{(char*)"nodc",           no_argument,       &noDc,        1},
	{(char*)"seed",           required_argument, 0,            ARG_SEED},
	{(char*)"entiresa",       no_argument,       &entireSA,    1},
	{(char*)"version",        no_argument,       &showVersion, 1},
	{(char*)"noauto",         no_argument,       0,            'a'},
	{(char*)"noblocks",       required_argument, 0,            'n'},
	{(char*)"linerate",       required_argument, 0,            'l'},
	{(char*)"linesperside",   required_argument, 0,            'i'},
	{(char*)"offrate",        required_argument, 0,            'o'},
	{(char*)"ftabchars",      required_argument, 0,            't'},
    {(char*)"localoffrate",   required_argument, 0,            ARG_LOCAL_OFFRATE},
	{(char*)"localftabchars", required_argument, 0,            ARG_LOCAL_FTABCHARS},
	{(char*)"help",           no_argument,       0,            'h'},
	{(char*)"ntoa",           no_argument,       0,            ARG_NTOA},
	{(char*)"justref",        no_argument,       0,            '3'},
	{(char*)"noref",          no_argument,       0,            'r'},
	{(char*)"color",          no_argument,       0,            'C'},
    {(char*)"sa",             no_argument,       0,            ARG_SA},
	{(char*)"reverse-each",   no_argument,       0,            ARG_REVERSE_EACH},
    {(char*)"min-simlen",     required_argument, 0,            ARG_MIN_SIMLEN},
    {(char*)"printN",         no_argument,       0,            ARG_PRINTN},
	{(char*)"usage",          no_argument,       0,            ARG_USAGE},
    {(char*)"wrapper",        required_argument, 0,            ARG_WRAPPER},
	{(char*)0, 0, 0, 0} // terminator
};
+
+/**
+ * Parse an int out of optarg and enforce that it be at least 'lower';
+ * if it is less than 'lower', then output the given error message and
+ * exit with an error and a usage message.
+ */
+template<typename T>
+static int parseNumber(T lower, const char *errmsg) {
+	char *endPtr= NULL;
+	T t = (T)strtoll(optarg, &endPtr, 10);
+	if (endPtr != NULL) {
+		if (t < lower) {
+			cerr << errmsg << endl;
+			printUsage(cerr);
+			throw 1;
+		}
+		return t;
+	}
+	cerr << errmsg << endl;
+	printUsage(cerr);
+	throw 1;
+	return -1;
+}
+
/**
 * Read command-line arguments.
 *
 * Loops over getopt_long() results and translates each recognized option
 * into an assignment to the corresponding file-scope configuration
 * variable.  Throws int 0 for a clean exit (-h/--usage) and int 1 on any
 * invalid option or out-of-range argument.
 */
static void parseOptions(int argc, const char **argv) {
	int option_index = 0;
	int next_option;
	do {
		next_option = getopt_long(
			argc, const_cast<char**>(argv),
			short_options, long_options, &option_index);
		switch (next_option) {
            case ARG_WRAPPER:
				wrapper = optarg;
				break;
			case 'f': format = FASTA; break;
			case 'c': format = CMDLINE; break;
			case 'p': packed = true; break; // -p means packed here (not threads)
			case 'C':
				// Colorspace was a Bowtie 1 feature; reject it explicitly.
				cerr << "Error: -C specified but Bowtie 2 does not support colorspace input." << endl;
				throw 1;
				break;
			case 'l':
				lineRate = parseNumber<int>(3, "-l/--lineRate arg must be at least 3");
				break;
			case 'i':
				linesPerSide = parseNumber<int>(1, "-i/--linesPerSide arg must be at least 1");
				break;
			case 'o':
				offRate = parseNumber<int>(0, "-o/--offRate arg must be at least 0");
				break;
            case ARG_LOCAL_OFFRATE:
                localOffRate = parseNumber<int>(0, "-o/--localoffrate arg must be at least 0");
                break;
			case '3':
				justRef = true;
				break;
			case 't':
				ftabChars = parseNumber<int>(1, "-t/--ftabChars arg must be at least 1");
				break;
            case ARG_LOCAL_FTABCHARS:
				localFtabChars = parseNumber<int>(1, "-t/--localftabchars arg must be at least 1");
				break;
			case 'n':
				// all f-s is used to mean "not set", so put 'e' on end
				bmax = 0xfffffffe;
				break;
			case 'h':
			case ARG_USAGE:
				printUsage(cout);
				throw 0; // clean exit after printing usage
				break;
			// The three --bmax* options are mutually exclusive; setting one
			// disables the other two by pushing them to their "not set"
			// sentinel values.
			case ARG_BMAX:
				bmax = parseNumber<TIndexOffU>(1, "--bmax arg must be at least 1");
				bmaxMultSqrt = OFF_MASK; // don't use multSqrt
				bmaxDivN = 0xffffffff;     // don't use bmaxDivN
				break;
			case ARG_BMAX_MULT:
				bmaxMultSqrt = parseNumber<TIndexOffU>(1, "--bmaxmultsqrt arg must be at least 1");
				bmax = OFF_MASK;     // don't use bmax
				bmaxDivN = 0xffffffff; // don't use bmaxDivN
				break;
			case ARG_BMAX_DIV:
				bmaxDivN = parseNumber<uint32_t>(1, "--bmaxdivn arg must be at least 1");
				bmax = OFF_MASK;         // don't use bmax
				bmaxMultSqrt = OFF_MASK; // don't use multSqrt
				break;
			case ARG_DCV:
				dcv = parseNumber<int>(3, "--dcv arg must be at least 3");
				break;
			case ARG_SEED:
				seed = parseNumber<int>(0, "--seed arg must be at least 0");
				break;
			case ARG_REVERSE_EACH:
				reverseEach = true;
				break;
            case ARG_SA:
                doSaFile = true;
                break;
			case ARG_NTOA: nsToAs = true; break;
            case ARG_MIN_SIMLEN:
                minSimLen = parseNumber<size_t>(2, "--min-simlen arg must be at least 2");
                break;
            case ARG_PRINTN: printN = true; break;
			case 'a': autoMem = false; break;
			case 'q': verbose = false; break;
			case 's': sanityCheck = true; break;
			case 'r': writeRef = false; break;

			case -1: /* Done with options. */
				break;
			case 0:
				// A long option that stores directly into a flag variable
				// (e.g. --nodc, --big); nothing more to do.
				if (long_options[option_index].flag != 0)
					break;
				// flag == 0 here would be unexpected; fall through to error.
			default:
				printUsage(cerr);
				throw 1;
		}
	} while(next_option != -1);
	// A tiny bmax makes the blockwise SA builder pathological; warn but allow.
	if(bmax < 40) {
		cerr << "Warning: specified bmax is very small (" << bmax << ").  This can lead to" << endl
		     << "extremely slow performance and memory exhaustion.  Perhaps you meant to specify" << endl
		     << "a small --bmaxdivn?" << endl;
	}
}
+
+// Global list of file names (presumably index/output files written by this
+// tool -- not populated in this portion of the file; confirm at call sites).
+EList<string> filesWritten;
+
+static void print_fasta_record(
+                               ostream& fout,
+                               const string& defline,
+                               const SString<char>& seq,
+                               size_t len)
+{
+    fout << ">";
+    fout << defline.c_str() << endl;
+    
+    if(across > 0) {
+        size_t i = 0;
+        while (i + across < len)
+        {
+            for(size_t j = 0; j < (unsigned)across; j++) {
+                int base = seq.get(i + j);
+                assert_lt(base, 4);
+                fout << "ACGTN"[base];
+            }
+            fout << endl;
+            i += across;
+        }
+        if (i < len) {
+            for(size_t j = i; j < len; j++) {
+                int base = seq.get(j);
+                assert_lt(base, 4);
+                fout << "ACGTN"[base];
+            }
+            fout << endl;
+        }
+    } else {
+        for(size_t j = 0; j < len; j++) {
+            int base = seq.get(j);
+            assert_lt(base, 4);
+            fout << "ACGTN"[base];
+        }
+        fout << endl;
+    }
+}
+
/**
 * One stretch of the reference that resembles another stretch.  Stores the
 * canonical position of the similar copy, its orientation, how far the
 * match extends in each direction, and the accumulated differences.
 */
struct RegionSimilar {
    bool fw;             // orientation flag of the similar copy
    size_t pos;          // canonical position (sense strand) of the copy
    uint32_t fw_length;  // forward match length; this includes the base at 'pos'
    uint32_t bw_length;  // backward match length from 'pos'
    uint32_t mismatches; // mismatches accumulated while extending
    uint32_t gaps;       // gaps accumulated while extending

    // Put every field back into its zero/default state.
    void reset() {
        fw = false;
        pos = 0;
        fw_length = 0;
        bw_length = 0;
        mismatches = 0;
        gaps = 0;
    }

    // Sort key: canonical position only.
    bool operator< (const RegionSimilar& o) const {
        return pos < o.pos;
    }
};
+
+struct Region {
+    size_t pos;
+    uint32_t fw_length;
+    uint32_t bw_length; // used for merging
+    bool   low_complexity;
+    
+    uint32_t match_begin;
+    uint32_t match_end;
+    
+    uint32_t match_size() {
+        assert_leq(match_begin, match_end);
+        return match_end - match_begin;
+    }
+    
+    void reset() {
+        pos = fw_length = bw_length = 0, low_complexity = false;
+        match_begin = match_end = 0;
+    }
+    
+    bool operator< (const Region& o) const {
+        return pos < o.pos;
+    }
+};
+
+struct RegionToMerge {
+    bool processed;
+    EList<pair<uint32_t, uint32_t> > list;
+    
+    void reset() {
+        processed = false;
+        list.clear();
+    }
+};
+
+/**
+ * Drive the index construction process and optionally sanity-check the
+ * result.
+ */
+static void driver(
+	const string& fafile,
+	const string& safile,
+	bool packed,
+	int reverse)
+{
+    EList<FileBuf*> is(MISC_CAT);
+	bool bisulfite = false;
+	RefReadInParams refparams(false, reverse, nsToAs, bisulfite);
+    FILE *f = fopen(fafile.c_str(), "r");
+    if (f == NULL) {
+        cerr << "Error: could not open "<<fafile.c_str() << endl;
+        throw 1;
+    }
+    FileBuf *fb = new FileBuf(f);
+    assert(fb != NULL);
+    if(fb->peek() == -1 || fb->eof()) {
+        cerr << "Warning: Empty fasta file: '" << fafile.c_str() << "'" << endl;
+        throw 1;
+    }
+    assert(!fb->eof());
+    assert(fb->get() == '>');
+    ASSERT_ONLY(fb->reset());
+    assert(!fb->eof());
+    is.push_back(fb);
+    if(is.empty()) {
+		cerr << "Warning: All fasta inputs were empty" << endl;
+		throw 1;
+	}
+	// Vector for the ordered list of "records" comprising the input
+	// sequences.  A record represents a stretch of unambiguous
+	// characters in one of the input sequences.
+	EList<RefRecord> szs(MISC_CAT);
+	BitPairReference::szsFromFasta(is, string(), bigEndian, refparams, szs, sanityCheck);
+	assert_gt(szs.size(), 0);
+    
+    EList<string> refnames;
+    
+    SString<char> s;
+    assert_eq(szs.size(), 1);
+    size_t jlen = szs[0].len;
+    try {
+        Timer _t(cerr, "  (1/5) Time reading reference sequence: ", verbose);
+        
+        s.resize(jlen);
+        RefReadInParams rpcp = refparams;
+        // For each filebuf
+        assert_eq(is.size(), 1);
+        FileBuf *fb = is[0];
+        assert(!fb->eof());
+        // For each *fragment* (not necessary an entire sequence) we
+        // can pull out of istream l[i]...
+        if(!fb->eof()) {
+            // Push a new name onto our vector
+            refnames.push_back("");
+            TIndexOffU distoff = 0;
+            fastaRefReadAppend(*fb, true, s, distoff, rpcp, &refnames.back());
+        }
+        fb->reset();
+        assert(!fb->eof());
+        // Joined reference sequence now in 's'
+    } catch(bad_alloc& e) {
+        // If we throw an allocation exception in the try block,
+        // that means that the joined version of the reference
+        // string itself is too large to fit in memory.  The only
+        // alternatives are to tell the user to give us more memory
+        // or to try again with a packed representation of the
+        // reference (if we haven't tried that already).
+        cerr << "Could not allocate space for a joined string of " << jlen << " elements." << endl;
+        // There's no point passing this exception on.  The fact
+        // that we couldn't allocate the joined string means that
+        // --bmax is irrelevant - the user should re-run with
+        // ebwt-build-packed
+        if(packed) {
+            cerr << "Please try running bowtie-build on a computer with more memory." << endl;
+        } else {
+            cerr << "Please try running bowtie-build in packed mode (-p/--packed) or in automatic" << endl
+            << "mode (-a/--auto), or try again on a computer with more memory." << endl;
+        }
+        if(sizeof(void*) == 4) {
+            cerr << "If this computer has more than 4 GB of memory, try using a 64-bit executable;" << endl
+            << "this executable is 32-bit." << endl;
+        }
+        throw 1;
+    }
+    // Successfully obtained joined reference string
+    assert_eq(s.length(), jlen);
+    size_t sense_seq_len = s.length();
+    size_t both_seq_len = sense_seq_len * 2;
+    assert_geq(sense_seq_len, 2);
+    
+    SwAligner sw;
+
+    SimpleFunc scoreMin; scoreMin.init(SIMPLE_FUNC_LINEAR, DEFAULT_MIN_CONST, DEFAULT_MIN_LINEAR);
+    SimpleFunc nCeil; nCeil.init(SIMPLE_FUNC_LINEAR, 0.0f, std::numeric_limits<double>::max(), 2.0f, 0.1f);
+    const int gGapBarrier = 4;
+    Scoring sc(
+               DEFAULT_MATCH_BONUS,          // constant reward for match
+               DEFAULT_MATCH_BONUS_TYPE,     // how to penalize mismatches
+               DEFAULT_MM_PENALTY_MAX,       // max mm penalty
+               DEFAULT_MM_PENALTY_MIN,       // min mm penalty
+               scoreMin,                     // min score as function of read len
+               nCeil,                        // max # Ns as function of read len
+               DEFAULT_N_PENALTY_TYPE,       // how to penalize Ns in the read
+               DEFAULT_N_PENALTY,            // constant if N penalty is a constant
+               DEFAULT_N_CAT_PAIR,           // whether to concat mates before N filtering
+               DEFAULT_READ_GAP_CONST,       // constant coeff for read gap cost
+               DEFAULT_REF_GAP_CONST,        // constant coeff for ref gap cost
+               DEFAULT_READ_GAP_LINEAR,      // linear coeff for read gap cost
+               DEFAULT_REF_GAP_LINEAR,       // linear coeff for ref gap cost
+               gGapBarrier);                 // # rows at top/bot only entered diagonally
+    
+    size_t tmp_sense_seq_len = sense_seq_len;
+    size_t min_kmer = 0;
+    while(tmp_sense_seq_len > 0) {
+        tmp_sense_seq_len >>= 2;
+        min_kmer++;
+    }
+    //
+    min_kmer += 4;
+    const size_t min_seed_length = min_kmer * 2;
+    
+    //
+    min_kmer += 10;
+    
+    EList<Region> regions;
+    EList<RegionSimilar> regions_similar;
+    {
+        EList<size_t> sa;
+        EList<uint16_t> prefix_lengths;
+        prefix_lengths.resizeExact(sense_seq_len);
+        prefix_lengths.fillZero();
+        {
+            Timer _t(cerr, "  (2/5) Time finding seeds using suffix array and reference sequence: ", verbose);
+            
+            ifstream in(safile.c_str(), ios::binary);
+            const size_t sa_size = readIndex<uint64_t>(in, false);
+#if 0
+            for(size_t i = 0; i < sa_size; i++) {
+                size_t pos = (size_t)readIndex<uint64_t>(in, false);
+                if(pos == sa_size) continue;
+                size_t opos = both_seq_len - pos - 1;
+                size_t cpos = min(pos, opos);
+                bool fw = pos == cpos;
+                cout << i << " ";
+                for(size_t j = 0; j < 20; j++) {
+                    int base = 0;
+                    if(fw) {
+                        if(pos + j >= sense_seq_len) break;
+                        base = s[pos+j];
+                    } else {
+                        if(cpos < j) break;
+                        base = 3 - s[cpos-j];
+                    }
+                    cout << "ACGT"[base];
+                }
+                cout << " " << pos << endl;
+            }
+            exit(1);
+#endif
+            
+            assert_eq(both_seq_len + 1, sa_size);
+            size_t sa_begin = 0, sa_end = 0;
+            
+            // Compress sequences by removing redundant sub-sequences
+            size_t last_i1 = 0;
+            for(size_t i1 = 0; i1 < sa_size - 1; i1++) {
+                // daehwan - for debugging purposes
+                if((i1 + 1) % 1000000 == 0) {
+                    cerr << "\t\t" << (i1 + 1) / 1000000 << " million" << endl;
+                }
+                if(i1 >= sa_end) {
+                    assert_leq(sa_begin, sa_end);
+                    assert_eq(i1, sa_end);
+                    size_t sa_pos = (size_t)readIndex<uint64_t>(in, false);
+                    sa.push_back(sa_pos);
+                    sa_end++;
+                    assert_eq(sa_end - sa_begin, sa.size());
+                }
+                
+                assert_geq(i1, sa_begin); assert_lt(i1, sa_end);
+                size_t pos1 = sa[i1-sa_begin];
+                
+                if(pos1 == both_seq_len) continue;
+                if(pos1 + min_seed_length >= sense_seq_len) continue;
+                
+                // Compare with the following sequences
+                bool expanded = false;
+                size_t i2 = last_i1 + 1;
+                for(; i2 < sa_size; i2++) {
+                    if(i1 == i2) continue;
+                    if(i2 >= sa_end) {
+                        assert_leq(sa_begin, sa_end);
+                        assert_eq(i2, sa_end);
+                        size_t sa_pos = (size_t)readIndex<uint64_t>(in, false);
+                        sa.push_back(sa_pos);
+                        sa_end++;
+                        assert_eq(sa_end - sa_begin, sa.size());
+                    }
+                    
+                    assert_geq(i2, sa_begin); assert_lt(i2, sa_end);
+                    size_t pos2 = sa[i2-sa_begin];
+                    if(pos2 == both_seq_len) continue;
+                    // opos2 is relative pos of pos2 on the other strand
+                    size_t opos2 = both_seq_len - pos2 - 1;
+                    // cpos2 is canonical pos on the sense strand
+                    size_t cpos2 = min(pos2, opos2);
+                    bool fw = pos2 == cpos2;
+                    if(fw) {
+                        if(pos2 + min_kmer > sense_seq_len) continue;
+                    } else {
+                        if(pos2 + min_kmer > both_seq_len) continue;
+                    }
+                    
+                    size_t j1 = 0; // includes the base at 'pos1'
+                    while(pos1 + j1 < sense_seq_len && pos2 + j1 < (fw ? sense_seq_len : both_seq_len)) {
+                        if(!fw) {
+                            if(pos1 < cpos2 && pos1 + (j1 * 2) >= cpos2) break;
+                        }
+                        int base1 = s[pos1 + j1];
+                        int base2;
+                        if(fw) {
+                            base2 = s[pos2 + j1];
+                        } else {
+                            assert_geq(cpos2, j1);
+                            base2 = 3 - s[cpos2 - j1];
+                        }
+                        if(base1 > 3 || base2 > 3) break;
+                        if(base1 != base2) break;
+                        j1++;
+                    }
+                    if(j1 < min_kmer) {
+                        if(i2 > i1) break;
+                        else continue;
+                    }
+                    
+                    size_t j2 = 0; // doesn't include the base at 'pos1'
+                    while(j2 <= pos1 && (fw ? 0 : sense_seq_len) + j2 <= pos2) {
+                        if(!fw) {
+                            if(cpos2 < pos1 && cpos2 + (j2 * 2) >= pos1) break;
+                        }
+                        int base1 = s[pos1 - j2];
+                        int base2;
+                        if(fw) {
+                            base2 = s[pos2 - j2];
+                        } else {
+                            assert_lt(cpos2 + j2, s.length());
+                            base2 = 3 - s[cpos2 + j2];
+                        }
+                        if(base1 > 3 || base2 > 3) break;
+                        if(base1 != base2) break;
+                        j2++;
+                    }
+                    if(j2 > 0) j2--;
+                    
+                    size_t j = j1 + j2;
+                    
+                    // Do not proceed if two sequences are not similar
+                    if(j < min_seed_length) continue;
+                    
+                    assert_leq(pos1 + j1, prefix_lengths.size());
+                    if(!expanded && j1 <= prefix_lengths[pos1]) continue;
+                    
+                    if(!expanded) {
+                        regions.expand();
+                        regions.back().reset();
+                        regions.back().pos = pos1;
+                        regions.back().fw_length = j1;
+                        regions.back().match_begin = regions.back().match_end = regions_similar.size();
+                        for(size_t k = 0; k < j1; k++) {
+                            size_t tmp_length = j1 - k;
+                            if(prefix_lengths[pos1 + k] < tmp_length) {
+                                prefix_lengths[pos1 + k] = tmp_length;
+                            }
+                        }
+                        expanded = true;
+                    }
+                    
+                    if(regions.back().fw_length < j1) {
+                        regions.back().fw_length = j1;
+                    }
+                    
+                    regions_similar.expand();
+                    regions_similar.back().reset();
+                    regions_similar.back().fw = fw;
+                    regions_similar.back().pos = cpos2;
+                    if(fw) {
+                        regions_similar.back().fw_length = j1;
+                        regions_similar.back().bw_length = j2;
+                    } else {
+                        regions_similar.back().fw_length = j1 > 0 ? j2 + 1 : 0;
+                        regions_similar.back().bw_length = j1 > 0 ? j1 - 1 : 0;
+                    }
+                    
+                    regions.back().match_end = regions_similar.size();
+                    if(regions.back().match_size() >= 20) break;
+                }
+                
+                last_i1 = i1;
+                assert_geq(last_i1, sa_begin);
+                if(last_i1 >= sa_begin + 1024) {
+                    assert_lt(last_i1, sa_end);
+                    sa.erase(0, last_i1 - sa_begin);
+                    sa_begin = last_i1;                    
+                    assert_eq(sa_end - sa_begin, sa.size());
+                }
+                
+                if(expanded) {
+                    assert_gt(regions.size(), 0);
+                    assert_lt(regions.back().match_begin, regions.back().match_end);
+                    assert_eq(regions.back().match_end, regions_similar.size());
+                    if(regions.back().match_size() > 1) {
+                        regions_similar.sortPortion(regions.back().match_begin, regions.back().match_size());
+                        size_t cur_pos = regions.back().match_begin + 1;
+                        for(size_t i = regions.back().match_begin + 1; i < regions.back().match_end; i++) {
+                            assert_gt(cur_pos, 0);
+                            const RegionSimilar& last_region = regions_similar[cur_pos-1];
+                            const RegionSimilar& new_region = regions_similar[i];
+                            if(last_region.fw == new_region.fw) {
+                                if(last_region.fw) {
+                                    if(last_region.pos + last_region.fw_length >= new_region.pos) {
+                                        continue;
+                                    }
+                                } else {
+                                    if(last_region.pos + last_region.fw_length >= new_region.pos) {
+                                        regions_similar[cur_pos-1] = new_region;
+                                        continue;
+                                    }
+                                }
+                            }
+                            if(cur_pos != i) {
+                                assert_lt(cur_pos, regions_similar.size());
+                                regions_similar[cur_pos] = new_region;
+                            }
+                            cur_pos++;
+                        }
+                        if(cur_pos < regions.back().match_end) {
+                            regions.back().low_complexity = true;
+                        }
+                        regions_similar.resize(cur_pos);
+                        regions.back().match_end = regions_similar.size();
+                    }
+                }
+                
+                // daehwan - for debugging purposes
+#if 1
+                assert_lt(i1, i2);
+                if(i1 + 8 < i2) {
+                    i1 = i1 + (i2 - i1) / 2;
+                }
+#endif
+            }
+        }
+        
+        {
+            Timer _t(cerr, "  (3/5) Time sorting seeds and then removing redundant seeds: ", verbose);
+            regions.sort();
+            
+            if(regions.size() > 1) {
+                size_t cur_pos = 1;
+                for(size_t i = 1; i < regions.size(); i++) {
+                    assert_gt(cur_pos, 0);
+                    const Region& last_region = regions[cur_pos-1];
+                    const Region& new_region = regions[i];
+                    if(last_region.low_complexity && last_region.pos + last_region.fw_length > new_region.pos) continue;
+                    if(last_region.pos + last_region.fw_length >= new_region.pos + new_region.fw_length) continue;
+                    if(cur_pos != i) {
+                        assert_lt(cur_pos, regions.size());
+                        regions[cur_pos] = new_region;
+                    }
+                    cur_pos++;
+                }
+                regions.resizeExact(cur_pos);
+            }
+        }
+    }
+
+    // Print regions
+#if 0
+    cout << "no. of regions: " << regions.size() << endl << endl;
+    for(size_t i = 0; i < regions.size(); i++) {
+        const Region& region = regions[i];
+        cout << "At " << region.pos << "\t" << region.fw_length << " bps" << endl;
+        for(size_t j = 0; j < region.hits.size(); j++) {
+            const RegionSimilar& region2 = region.hits[j];
+            cout << "\t" << (region2.fw ? "+" : "-") << "\tat " << region2.pos
+            << "\t-" << region2.bw_length
+            << "\t+" << region2.fw_length << endl;
+        }
+        cout << endl << endl;
+    }
+#endif
+    
+    const size_t min_sim_length = minSimLen;
+    
+    {
+        Timer _t(cerr, "  (4/5) Time merging seeds and masking sequence: ", verbose);
+        
+        EList<uint8_t> mask;
+        mask.resizeExact(sense_seq_len);
+        mask.fillZero();
+        
+        EList<RegionToMerge> merge_list;
+        for(size_t i = 0; i < regions.size(); i++) {
+            const Region& region = regions[i];
+            if(i == 0) {
+                for(size_t j = region.match_begin; j < region.match_end; j++) {
+                    merge_list.expand();
+                    merge_list.back().reset();
+                    merge_list.back().list.expand();
+                    merge_list.back().list.back().first = i;
+                    merge_list.back().list.back().second = j;
+                }
+            } else {
+                assert_gt(i, 0);
+                for(size_t j = region.match_begin; j < region.match_end; j++) {
+                    assert_lt(j, regions_similar.size());
+                    const RegionSimilar& cmp_region = regions_similar[j];
+                    bool added = false;
+                    for(size_t k = 0; k < merge_list.size(); k++) {
+                        RegionToMerge& merge = merge_list[k];
+                        uint32_t region_id1 = merge.list.back().first;
+                        if(region_id1 >= i) break;
+                        uint32_t region_id2 = merge.list.back().second;
+                        assert_lt(region_id1, regions.size());
+                        const Region& prev_region = regions[region_id1];
+                        assert_lt(region_id2, regions_similar.size());
+                        
+                        assert_lt(prev_region.pos, region.pos);
+                        size_t gap = region.pos - prev_region.pos;
+                        
+                        const RegionSimilar& prev_cmp_region = regions_similar[region_id2];
+                        if(prev_cmp_region.fw != cmp_region.fw) continue;
+                        if(prev_cmp_region.pos + cmp_region.bw_length == cmp_region.pos + prev_cmp_region.bw_length &&
+                           prev_cmp_region.pos + prev_cmp_region.fw_length == cmp_region.pos + cmp_region.fw_length)
+                            continue;
+                        
+                        if(!cmp_region.fw) {
+                            if(cmp_region.pos >= prev_region.pos && cmp_region.pos < region.pos) continue;
+                        }
+                        
+                        size_t cmp_gap = 0;
+                        if(cmp_region.fw) {
+                            if(prev_cmp_region.pos >= cmp_region.pos) continue;
+                            cmp_gap = cmp_region.pos - prev_cmp_region.pos;
+                        } else {
+                            if(prev_cmp_region.pos <= cmp_region.pos) continue;
+                            cmp_gap = prev_cmp_region.pos - cmp_region.pos;
+                        }
+                        if(cmp_gap + 10 < gap || gap + 10 < cmp_gap) continue;
+                        
+                        if(prev_region.fw_length + 200 < gap) continue;
+                        if(cmp_region.fw) {
+                            if(prev_cmp_region.fw_length + 200 < cmp_gap) continue;
+                        } else {
+                            if(cmp_region.fw_length + 200 < cmp_gap) continue;
+                        }
+                        
+                        added = true;
+                        merge.list.expand();
+                        merge.list.back().first = i;
+                        merge.list.back().second = j;
+                    }
+                    
+                    if(!added) {
+                        added = true;
+                        merge_list.expand();
+                        merge_list.back().reset();
+                        merge_list.back().list.expand();
+                        merge_list.back().list.back().first = i;
+                        merge_list.back().list.back().second = j;
+                    }
+                }
+            }
+            
+            for(size_t j = 0; j < merge_list.size(); j++) {
+                RegionToMerge& merge = merge_list[j];
+                uint32_t region_id1 = merge.list.back().first;
+                if(i + 1 < regions.size()) {
+                    if(region_id1 == i) continue;
+                    assert_lt(region_id1, regions.size());
+                    const Region& prev_region = regions[region_id1];
+                    if(prev_region.pos + 200 > region.pos) continue;
+                }
+                merge_list[j].processed = true;
+                
+#if 0
+                bool skip_merge = true;
+                for(size_t k = 0; k < merge.list.size(); k++) {
+                    uint32_t region_id1 = merge.list[k].first;
+                    uint32_t region_id2 = merge.list[k].second;
+                    assert_lt(region_id1, regions.size());
+                    const Region& region = regions[region_id1];
+                    assert_lt(region_id2, region.hits.size());
+                    const RegionSimilar& sim_region = region.hits[region_id2];
+                    assert_lt(region.pos, mask.size()); assert_lt(sim_region.pos, mask.size());
+                    if(mask[region.pos] == 0 || mask[sim_region.pos] == 0) {
+                        skip_merge = false;
+                        break;
+                    }
+                }
+                if(skip_merge) continue;
+#endif
+                
+#if 0
+                bool output_merge = merge.list.size() > 1;
+                if(!output_merge) {
+                    assert_gt(merge.list.size(), 0);
+                    uint32_t region_id2 = merge.list[0].second;
+                    assert_lt(region_id2, regions_similar.size());
+                    const RegionSimilar& sim_region = regions_similar[region_id2];
+                    if(sim_region.bw_length + sim_region.fw_length >= min_sim_length) {
+                        output_merge = true;
+                    }
+                }
+                if(output_merge) {
+                    cout << endl << ":" << endl;
+                    for(size_t k = 0; k < merge.list.size(); k++) {
+                        uint32_t region_id1 = merge.list[k].first;
+                        uint32_t region_id2 = merge.list[k].second;
+                        assert_lt(region_id1, regions.size());
+                        const Region& region = regions[region_id1];
+                        assert_lt(region_id2, regions_similar.size());
+                        const RegionSimilar& sim_region = regions_similar[region_id2];
+                        
+                        cout << "\t";
+                        cout << k << ") at " << region.pos << "\t" << region.fw_length << " bps\t"
+                        << (sim_region.fw ? "+" : "-") << "\tat " << sim_region.pos << "\t-" << sim_region.bw_length << "\t+" << sim_region.fw_length << endl;
+                    }
+                    cout << endl << endl;
+                }
+#endif
+                
+                assert_gt(merge.list.size(), 0);
+                for(size_t k = 0; k < merge.list.size(); k++) {
+                    uint32_t region_id1 = merge.list[k].first;
+                    assert_lt(region_id1, regions.size());
+                    Region& region1 = regions[region_id1];
+                    
+                    uint32_t cmp_region_id1 = merge.list[k].second;
+                    assert_lt(cmp_region_id1, regions_similar.size());
+                    const RegionSimilar& cmp_region1 = regions_similar[cmp_region_id1];
+                    
+                    if(cmp_region1.fw) {
+                        region1.fw_length = cmp_region1.fw_length;
+                        region1.bw_length = cmp_region1.bw_length;
+                    } else {
+                        region1.fw_length = cmp_region1.fw_length > 0 ? cmp_region1.bw_length + 1 : 0;
+                        region1.bw_length = cmp_region1.fw_length > 0 ? cmp_region1.fw_length - 1 : 0;
+                    }
+                }
+                
+                for(size_t k = 0; k < merge.list.size(); k++) {
+                    uint32_t region_id1 = merge.list[k].first;
+                    assert_lt(region_id1, regions.size());
+                    const Region& region1 = regions[region_id1];
+
+                    uint32_t cmp_region_id1 = merge.list[k].second;
+                    assert_lt(cmp_region_id1, regions_similar.size());
+                    const RegionSimilar& cmp_region1 = regions_similar[cmp_region_id1];
+
+                    const bool fw = cmp_region1.fw;
+                    bool combined = false;
+                    if(k + 1 < merge.list.size()) {
+                        uint32_t region_id2 = merge.list[k+1].first;
+                        assert_lt(region_id1, region_id2);
+                        assert_lt(region_id2, regions.size());
+                        Region& region2 = regions[region_id2];
+                        
+                        uint32_t cmp_region_id2 = merge.list[k+1].second;
+                        assert_lt(cmp_region_id2, regions_similar.size());
+                        RegionSimilar& cmp_region2 = regions_similar[cmp_region_id2];
+                        
+                        assert_eq(cmp_region1.fw, cmp_region2.fw);
+                        size_t query_len, left = region1.pos, right = region2.pos, cmp_left, cmp_right;
+                        if(fw) {
+                            assert_lt(cmp_region1.pos, cmp_region2.pos);
+                            query_len = cmp_region2.pos - cmp_region1.pos + cmp_region2.fw_length + cmp_region1.bw_length;
+                            cmp_left = cmp_region1.pos, cmp_right = cmp_region2.pos;
+                            
+                            assert_gt(cmp_region1.fw_length, 0);
+                            left = left + cmp_region1.fw_length - 1;
+                            cmp_left = cmp_left + cmp_region1.fw_length - 1;
+                            
+                            assert_geq(right, cmp_region2.bw_length);
+                            right = right - cmp_region2.bw_length;
+                            assert_geq(cmp_right, cmp_region2.bw_length);
+                            cmp_right = cmp_right - cmp_region2.bw_length;
+                            
+                        } else {
+                            assert_lt(cmp_region2.pos, cmp_region1.pos);
+                            query_len = cmp_region1.pos - cmp_region2.pos + cmp_region1.fw_length + cmp_region2.bw_length;
+                            cmp_left = cmp_region2.pos, cmp_right = cmp_region1.pos;
+                            
+                            left = left + cmp_region1.bw_length;
+                            assert_gt(cmp_region2.fw_length, 0);
+                            cmp_left = cmp_left + cmp_region2.fw_length - 1;
+                            
+                            assert_geq(right + 1, cmp_region2.fw_length);
+                            right = right + 1 - cmp_region2.fw_length;
+                            assert_geq(cmp_right, cmp_region1.bw_length);
+                            cmp_right = cmp_right - cmp_region1.bw_length;
+                        }
+                        
+#if 0
+                        cout << "query length: " << query_len << endl;
+                        cout << "left-right: " << left << "\t" << right << endl;
+                        cout << "cmp left-right: " << cmp_left << "\t" << cmp_right << endl;
+#endif
+                        
+                        size_t max_diffs = (query_len + 9) / 10;
+                        if(max_diffs > cmp_region1.mismatches + cmp_region1.gaps) {
+                            max_diffs -= (cmp_region1.mismatches + cmp_region1.gaps);
+                        } else {
+                            max_diffs = 0;
+                        }
+                        
+                        bool do_swalign = max_diffs > 0;
+                        if(left >= right && cmp_left >= cmp_right) {
+                            combined = true;
+                        } else if(left >= right) {
+                            assert_lt(cmp_left, cmp_right);
+                            size_t gap = cmp_right - cmp_left + 1 + left - right;
+                            if(gap <= max_diffs) {
+                                combined = true;
+                                cmp_region2.gaps += gap;
+                            } else {
+                                do_swalign = false;
+                            }
+                        } else if(cmp_left >= cmp_right) {
+                            assert_lt(left, right);
+                            size_t gap = right - left + 1 + cmp_left - cmp_right;
+                            if(gap <= max_diffs) {
+                                combined = true;
+                                cmp_region2.gaps += gap;
+                            } else {
+                                do_swalign = false;
+                            }
+                        }
+                        /*else if(left + max_diffs >= right && cmp_left + max_diffs >= cmp_right) {
+                            combined = true;
+                        }*/
+                        
+                        if(!combined && do_swalign) {
+                            BTString seq;
+                            BTDnaString cmp_seq;
+                            BTString cmp_qual;
+                            
+                            assert_lt(region1.pos, region2.pos);
+                            for(size_t pos = left; pos <= right; pos++) {
+                                assert_lt(pos, s.length());
+                                seq.append(1 << s[pos]);
+                            }
+                            
+                            for(size_t pos = cmp_left; pos <= cmp_right; pos++) {
+                                assert_lt(pos, s.length());
+                                cmp_seq.append(s[pos]);
+                            }
+                            cmp_qual.resize(cmp_seq.length());
+                            cmp_qual.fill('I');
+                            if(!fw) {
+                                cmp_seq.reverseComp();
+                                cmp_qual.reverse();
+                            }
+                            
+                            sw.initRead(cmp_seq, cmp_seq, cmp_qual, cmp_qual, 0, cmp_seq.length(), sc);
+                            
+                            DPRect rect;
+                            rect.refl = rect.refl_pretrim = rect.corel = 0;
+                            rect.refr = rect.refr_pretrim = rect.corer = seq.length();
+                            rect.triml = rect.trimr = 0;
+                            rect.maxgap = 10;
+                            
+                            TAlScore minsc = -max_diffs * 6;
+                            if(minsc < 0) {
+                                sw.initRef(
+                                           true, // fw
+                                           0, // refidx
+                                           rect,
+                                           const_cast<char *>(seq.toZBuf()),
+                                           0,
+                                           seq.length(),
+                                           seq.length(),
+                                           sc,
+                                           minsc,
+                                           true, // enable8
+                                           2000, // cminlen
+                                           4, // cpow2
+                                           false, // doTri
+                                           true); // extend);
+                                
+                                // Perform dynamic programing
+                                RandomSource rnd(seed);
+                                TAlScore bestCell = std::numeric_limits<TAlScore>::min();
+                                if(seq.length() <= 200) {
+                                    combined = sw.align(rnd, bestCell);
+                                }
+#if 0
+                                if(combined) {
+                                    BTDnaString seqstr;
+                                    for(size_t bi = 0; bi < seq.length(); bi++) {
+                                        seqstr.append(firsts5[(int)seq[bi]]);
+                                    }
+                                    cout << seqstr << endl;
+                                    cout << cmp_seq << endl;
+                                    
+                                    SwResult res;
+                                    res.reset();
+                                    sw.nextAlignment(res, minsc, rnd);
+                                    res.alres.ned().reverse();
+                                    cout << "Succeeded (" << bestCell << "): "; Edit::print(cout, res.alres.ned()); cout << endl;
+                                }
+#endif
+                            }
+                        }
+                        
+                        if(combined) {
+                            assert_lt(region1.pos, region2.pos);
+                            region2.bw_length = region2.pos - region1.pos + region1.bw_length;
+                            if(fw) {
+                                assert_lt(cmp_region1.pos, cmp_region2.pos);
+                                cmp_region2.bw_length = cmp_region2.pos - cmp_region1.pos + cmp_region1.bw_length;
+                            } else {
+                                assert_lt(cmp_region2.pos, cmp_region1.pos);
+                                cmp_region2.fw_length = cmp_region1.pos - cmp_region2.pos + cmp_region1.fw_length;
+                            }
+                        }
+                    }
+                    
+                    // Mask sequence
+                    if(!combined || k + 1 == merge.list.size()) {
+                        if(cmp_region1.bw_length + cmp_region1.fw_length >= min_sim_length) {
+                            size_t mask_begin = 0, mask_end = 0;
+                            if(region1.pos < cmp_region1.pos) {
+                                assert_leq(cmp_region1.bw_length, cmp_region1.pos);
+                                mask_begin = cmp_region1.pos - cmp_region1.bw_length;
+                                assert_leq(cmp_region1.pos + cmp_region1.fw_length, s.length());
+                                mask_end = cmp_region1.pos + cmp_region1.fw_length;
+                            } else {
+                                assert_gt(region1.pos, cmp_region1.pos);
+                                assert_leq(region1.bw_length, region1.pos);
+                                mask_begin = region1.pos - region1.bw_length;
+                                assert_leq(region1.pos + region1.fw_length, s.length());
+                                mask_end = region1.pos + region1.fw_length;
+                            }
+                            for(size_t mask_pos = mask_begin; mask_pos < mask_end; mask_pos ++) {
+                                assert_lt(mask_pos, mask.size());
+                                mask[mask_pos] = 1;
+                            }
+                        }
+                    }
+                }
+                
+                merge.list.resizeExact(0);
+            }
+            
+            size_t cur_pos = 0;
+            for(size_t j = 0; j < merge_list.size(); j++) {
+                if(merge_list[j].processed) continue;
+                if(j != cur_pos) {
+                    merge_list[cur_pos] = merge_list[j];
+                }
+                cur_pos++;
+            }
+             merge_list.resize(cur_pos);
+        }
+        
+        assert_eq(merge_list.size(), 0);
+        assert_eq(mask.size(), sense_seq_len);
+        for(size_t i = 0; i < mask.size(); i++){
+            if(mask[i] != 0) {
+                s.set(4, i);
+            }
+        }
+    }
+    
+    // Output compressed sequence
+    const size_t min_seq_len = 31;
+    size_t cur_pos = 0;
+    {
+        Timer _t(cerr, "  (5/5) Time outputing compressed sequence: ", verbose);
+
+        if(printN) {
+            print_fasta_record(cout, refnames[0], s, s.length());
+        }
+        size_t cur_seq_len = 0;
+        for(size_t i = 0; i < sense_seq_len; i++) {
+            int base = s[i];
+            assert_leq(base, 4);
+            if(base < 4) {
+                s.set(base, cur_pos);
+                cur_pos++;
+                cur_seq_len++;
+            } else {
+                if(cur_seq_len < min_seq_len) {
+                    assert_leq(cur_seq_len, i);
+                    assert_leq(cur_seq_len, cur_pos);
+                    for(size_t j = i - cur_seq_len; j < i; j++) {
+                        assert_lt(s[j], 4);
+                        s.set(4, j);
+                    }
+                    cur_pos -= cur_seq_len;
+                }
+                cur_seq_len = 0;
+            }
+        }
+        if(!printN) {
+            print_fasta_record(cout, refnames[0], s, cur_pos);
+        }
+    }
+    
+    cerr << endl;
+    cerr << "Compressed: " << sense_seq_len << " to " << cur_pos
+         << " bps (" << (sense_seq_len - cur_pos) * 100.0 / sense_seq_len << "%)" << endl;
+}
+
+static const char *argv0 = NULL;
+
+/**
+ * main function for centrifuge-compress.  Parses command-line
+ * arguments, optionally prints version information, then runs the
+ * driver on the input FASTA file / output basename pair.
+ *
+ * Returns 0 on success; 1 (or the value of a thrown internal integer
+ * exception) on error.
+ */
+int centrifuge_compress(int argc, const char **argv) {
+	string outfile;
+	try {
+		// Reset all global state, including getopt state
+		opterr = optind = 1;
+		resetOptions();
+
+		string fafile;
+		string safile;
+
+		parseOptions(argc, argv);
+		argv0 = argv[0];
+		if(showVersion) {
+			// --version: report build metadata and exit immediately.
+			cout << argv0 << " version " << string(CENTRIFUGE_VERSION).c_str() << endl;
+			if(sizeof(void*) == 4) {
+				cout << "32-bit" << endl;
+			} else if(sizeof(void*) == 8) {
+				cout << "64-bit" << endl;
+			} else {
+				cout << "Neither 32- nor 64-bit: sizeof(void*) = " << sizeof(void*) << endl;
+			}
+			cout << "Built on " << BUILD_HOST << endl;
+			cout << BUILD_TIME << endl;
+			cout << "Compiler: " << COMPILER_VERSION << endl;
+			cout << "Options: " << COMPILER_OPTIONS << endl;
+			cout << "Sizeof {int, long, long long, void*, size_t, off_t}: {"
+				 << sizeof(int)
+				 << ", " << sizeof(long) << ", " << sizeof(long long)
+				 << ", " << sizeof(void *) << ", " << sizeof(size_t)
+				 << ", " << sizeof(off_t) << "}" << endl;
+			return 0;
+		}
+
+		// Get input filename
+		if(optind >= argc) {
+			cerr << "No input sequence or sequence file specified!" << endl;
+			printUsage(cerr);
+			return 1;
+		}
+		fafile = argv[optind++];
+
+		// Get output filename
+		if(optind >= argc) {
+			cerr << "No output file specified!" << endl;
+			printUsage(cerr);
+			return 1;
+		}
+		safile = argv[optind++];
+
+		// Optionally summarize
+		if(verbose) {
+#if 0
+			cout << "Settings:" << endl
+				 << "  Output files: \"" << outfile.c_str() << ".*." << gEbwt_ext << "\"" << endl
+				 << "  Line rate: " << lineRate << " (line is " << (1<<lineRate) << " bytes)" << endl
+				 << "  Lines per side: " << linesPerSide << " (side is " << ((1<<lineRate)*linesPerSide) << " bytes)" << endl
+				 << "  Offset rate: " << offRate << " (one in " << (1<<offRate) << ")" << endl
+				 << "  FTable chars: " << ftabChars << endl
+				 << "  Strings: " << (packed? "packed" : "unpacked") << endl
+                 << "  Local offset rate: " << localOffRate << " (one in " << (1<<localOffRate) << ")" << endl
+                 << "  Local fTable chars: " << localFtabChars << endl
+				 ;
+			if(bmax == OFF_MASK) {
+				cout << "  Max bucket size: default" << endl;
+			} else {
+				cout << "  Max bucket size: " << bmax << endl;
+			}
+			if(bmaxMultSqrt == OFF_MASK) {
+				cout << "  Max bucket size, sqrt multiplier: default" << endl;
+			} else {
+				cout << "  Max bucket size, sqrt multiplier: " << bmaxMultSqrt << endl;
+			}
+			if(bmaxDivN == 0xffffffff) {
+				cout << "  Max bucket size, len divisor: default" << endl;
+			} else {
+				cout << "  Max bucket size, len divisor: " << bmaxDivN << endl;
+			}
+			cout << "  Difference-cover sample period: " << dcv << endl;
+			cout << "  Endianness: " << (bigEndian? "big":"little") << endl
+				 << "  Actual local endianness: " << (currentlyBigEndian()? "big":"little") << endl
+				 << "  Sanity checking: " << (sanityCheck? "enabled":"disabled") << endl;
+	#ifdef NDEBUG
+			cout << "  Assertions: disabled" << endl;
+	#else
+			cout << "  Assertions: enabled" << endl;
+	#endif
+			cout << "  Random seed: " << seed << endl;
+			cout << "  Sizeofs: void*:" << sizeof(void*) << ", int:" << sizeof(int) << ", long:" << sizeof(long) << ", size_t:" << sizeof(size_t) << endl;
+			cout << "Input files DNA, " << file_format_names[format].c_str() << ":" << endl;
+			for(size_t i = 0; i < infiles.size(); i++) {
+				cout << "  " << infiles[i].c_str() << endl;
+			}
+#endif
+		}
+		// Seed random number generator
+		srand(seed);
+		{
+			// Run the driver.  If it runs out of memory and auto-mem is
+			// enabled, switch to the packed string representation and retry
+			// once.  (The old catch block set 'packed' but never re-ran the
+			// driver, so the switch had no effect; it also re-threw with
+			// 'throw e', which slices to the static type.)
+			bool retried = false;
+			while(true) {
+				try {
+					driver(fafile, safile, false, REF_READ_FORWARD);
+					break;
+				} catch(bad_alloc&) {
+					if(autoMem && !retried) {
+						cerr << "Switching to a packed string representation." << endl;
+						packed = true;
+						retried = true;
+					} else {
+						throw; // rethrow preserving the dynamic exception type
+					}
+				}
+			}
+		}
+		return 0;
+	} catch(std::exception& e) {
+		cerr << "Error: Encountered exception: '" << e.what() << "'" << endl;
+		cerr << "Command: ";
+		for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
+		cerr << endl;
+		return 1;
+	} catch(int e) {
+		if(e != 0) {
+			cerr << "Error: Encountered internal Centrifuge exception (#" << e << ")" << endl;
+			cerr << "Command: ";
+			for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
+			cerr << endl;
+		}
+		return e;
+	}
+}
+
+/**
+ * centrifuge-compress main function.  It is placed in a separate source
+ * file to make it slightly easier to compile as a library.
+ *
+ * If the user specifies -A <file> as the first two arguments, main
+ * will interpret that file as having one set of command-line arguments
+ * per line, and will dispatch each batch of arguments one at a time to
+ * centrifuge_compress.
+ */
+int main(int argc, const char **argv) {
+    if(argc > 2 && strcmp(argv[1], "-A") == 0) {
+        // Batch mode: each line of <file> is a complete argument string.
+        const char *file = argv[2];
+        ifstream in;
+        in.open(file);
+        char buf[4096];
+        int lastret = -1;
+        while(in.getline(buf, 4095)) {
+            EList<string> args(MISC_CAT);
+            args.push_back(string(argv[0]));
+            tokenize(buf, " \t", args);
+            // Skip argument-less lines BEFORE allocating; the old code
+            // 'continue'd after malloc and leaked the argv array each time.
+            if(args.size() == 1) continue;
+            const char **myargs = (const char**)malloc(sizeof(const char*) * args.size());
+            for(size_t i = 0; i < args.size(); i++) {
+                myargs[i] = args[i].c_str();
+            }
+            lastret = centrifuge_compress((int)args.size(), myargs);
+            free(myargs);
+        }
+        if(lastret == -1) {
+            cerr << "Warning: No arg strings parsed from " << file << endl;
+            return 0;
+        }
+        return lastret;
+    } else {
+        return centrifuge_compress(argc, argv);
+    }
+}
diff --git a/centrifuge_inspect.cpp b/centrifuge_inspect.cpp
new file mode 100644
index 0000000..398a972
--- /dev/null
+++ b/centrifuge_inspect.cpp
@@ -0,0 +1,674 @@
+/*
+ * Copyright 2016
+ *
+ * This file is part of Centrifuge and based on code from Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <getopt.h>
+
+#include <iostream>
+#include <set>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "assert_helpers.h"
+#include "endian_swap.h"
+#include "hier_idx.h"
+#include "reference.h"
+#include "ds.h"
+#include "hyperloglogplus.h"
+
+using namespace std;
+
+static bool showVersion = false; // just print version and quit?
+int verbose             = 0;  // be talkative
+static int names_only   = 0;  // just print the sequence names in the index
+static int summarize_only = 0; // just print summary of index and quit
+static int across       = 60; // number of characters across in FASTA output
+static bool refFromEbwt = false; // true -> when printing reference, decode it from Ebwt instead of reading it from BitPairReference
+static string wrapper;
+static const char *short_options = "vhnsea:";
+static int conversion_table = 0;
+static int taxonomy_tree = 0;
+static int name_table = 0;
+static int size_table = 0;
+static int count_kmers = 0;
+
+// Define to cross-check the HyperLogLog k-mer estimate against an exact
+// std::set-based count (debugging aid; memory-hungry on real indexes).
+//#define TEST_KMER_COUNTING
+
+// Identifiers for long options that have no single-character short form.
+// Values start at 256 so they can never collide with an ASCII short option.
+enum {
+	ARG_VERSION = 256,
+    ARG_WRAPPER,
+	ARG_USAGE,
+    ARG_CONVERSION_TABLE,
+    ARG_TAXONOMY_TREE,
+    ARG_NAME_TABLE,
+    ARG_SIZE_TABLE,
+	ARG_COUNT_KMERS
+};
+
+// getopt_long option table; entries with a short-option character mirror
+// short_options ("vhnsea:") above.
+static struct option long_options[] = {
+	{(char*)"verbose",  no_argument,        0, 'v'},
+	{(char*)"version",  no_argument,        0, ARG_VERSION},
+	{(char*)"usage",    no_argument,        0, ARG_USAGE},
+	{(char*)"names",    no_argument,        0, 'n'},
+	{(char*)"summary",  no_argument,        0, 's'},
+	{(char*)"help",     no_argument,        0, 'h'},
+	{(char*)"across",   required_argument,  0, 'a'},
+	{(char*)"ebwt-ref", no_argument,        0, 'e'},
+    {(char*)"wrapper",  required_argument,  0, ARG_WRAPPER},
+    {(char*)"conversion-table", no_argument,  0, ARG_CONVERSION_TABLE},
+    {(char*)"taxonomy-tree",    no_argument,  0, ARG_TAXONOMY_TREE},
+    {(char*)"name-table",       no_argument,  0, ARG_NAME_TABLE},
+    {(char*)"size-table",       no_argument,  0, ARG_SIZE_TABLE},
+	{(char*)"estimate-n-kmers",      no_argument,  0, ARG_COUNT_KMERS},
+	{(char*)0, 0, 0, 0} // terminator
+};
+
+/**
+ * Print a summary usage message to the provided output stream.
+ */
+static void printUsage(ostream& out) {
+	out << "Centrifuge version " << string(CENTRIFUGE_VERSION).c_str() << " by Daehwan Kim (infphilo at gmail.com, http://www.ccb.jhu.edu/people/infphilo)" << endl;
+	out
+	<< "Usage: centrifuge-inspect [options]* <cf_base>" << endl
+	<< "  <cf_base>         cf filename minus trailing .1." << gEbwt_ext << "/.2." << gEbwt_ext << "/.3." << gEbwt_ext << endl
+	<< endl
+	<< "  By default, prints FASTA records of the indexed nucleotide sequences to" << endl
+	<< "  standard out.  With -n, just prints names.  With -s, just prints a summary of" << endl
+	<< "  the index parameters and sequences.  With -e, preserves colors if applicable." << endl
+	<< endl
+	<< "Options:" << endl;
+    if(wrapper == "basic-0") {
+		out << "  --large-index      force inspection of the 'large' index, even if a" << endl
+        << "                     'small' one is present." << endl;
+	}
+	// The long form of -e is --ebwt-ref (see long_options); the help text
+	// previously advertised a nonexistent --bt2-ref.
+	out << "  -a/--across <int>  Number of characters across in FASTA output (default: 60)" << endl
+	<< "  -n/--names         Print reference sequence names only" << endl
+	<< "  -s/--summary       Print summary incl. ref names, lengths, index properties" << endl
+	<< "  -e/--ebwt-ref      Reconstruct reference from ." << gEbwt_ext << " (slow, preserves colors)" << endl
+    << "  --conversion-table Print conversion table" << endl
+    << "  --taxonomy-tree    Print taxonomy tree" << endl
+    << "  --name-table       Print names corresponding to taxonomic IDs" << endl
+    << "  --size-table       Print the lengths of the sequences belonging to the same taxonomic ID" << endl
+	<< "  -v/--verbose       Verbose output (for debugging)" << endl
+	<< "  -h/--help          print detailed description of tool and its options" << endl
+	<< "  --help             print this usage message" << endl
+	;
+    if(wrapper.empty()) {
+		cerr << endl
+        << "*** Warning ***" << endl
+        << "'centrifuge-inspect-bin' was run directly.  It is recommended "
+        << "to use the wrapper script instead."
+        << endl << endl;
+	}
+}
+
+/**
+ * Parse an int out of optarg and enforce that it be at least 'lower';
+ * if the input is not a number or the value is below 'lower', output
+ * the given error message plus a usage message and throw 1.
+ */
+static int parseInt(int lower, const char *errmsg) {
+	long l;
+	char *endPtr = NULL;
+	l = strtol(optarg, &endPtr, 10);
+	// strtol always sets endPtr to the first unparsed character, so the
+	// old test 'endPtr != NULL' was vacuously true and non-numeric input
+	// (strtol returns 0 with endPtr == optarg) slipped through as 0.
+	// Require that at least one digit was consumed.
+	if (endPtr != optarg) {
+		if (l < lower) {
+			cerr << errmsg << endl;
+			printUsage(cerr);
+			throw 1;
+		}
+		return (int32_t)l;
+	}
+	cerr << errmsg << endl;
+	printUsage(cerr);
+	throw 1;
+	return -1;
+}
+
+/**
+ * Read command-line arguments into the file-level option globals.
+ * Throws 0 after printing usage for -h/--help/--usage, and 1 on an
+ * unrecognized option.
+ */
+static void parseOptions(int argc, char **argv) {
+	int option_index = 0;
+	int next_option;
+	do {
+		next_option = getopt_long(argc, argv, short_options, long_options, &option_index);
+		switch (next_option) {
+            case ARG_WRAPPER:
+				wrapper = optarg;
+				break;
+			case ARG_USAGE:
+			case 'h':
+				printUsage(cout);
+				throw 0;
+				break;
+			case 'v': verbose = true; break;
+			case ARG_VERSION: showVersion = true; break;
+            case ARG_CONVERSION_TABLE:
+                conversion_table = true;
+                break;
+            case ARG_TAXONOMY_TREE:
+                taxonomy_tree = true;
+                break;
+            case ARG_NAME_TABLE:
+                name_table = true;
+                break;
+            case ARG_SIZE_TABLE:
+                size_table = true;
+                break;
+			case ARG_COUNT_KMERS:
+				count_kmers = true;
+				break;
+			case 'e': refFromEbwt = true; break;
+			case 'n': names_only = true; break;
+			case 's': summarize_only = true; break;
+			// -a <= 0 is deliberately allowed: print_fasta_record and
+			// print_ref_sequence treat across <= 0 as "no line wrapping".
+			// The error message previously claimed the bound was 1.
+			case 'a': across = parseInt(-1, "-a/--across arg must be at least -1"); break;
+			case -1: break; /* Done with options. */
+			case 0:
+				// Long option that sets a flag directly; nothing more to do.
+				if (long_options[option_index].flag != 0)
+					break;
+			default:
+				printUsage(cerr);
+				throw 1;
+		}
+	} while(next_option != -1);
+}
+
+// Emit a single FASTA record to 'fout': a '>' defline followed by the
+// sequence wrapped at 'across' columns.  A non-positive 'across' prints
+// the whole sequence on one line.
+static void print_fasta_record(
+	ostream& fout,
+	const string& defline,
+	const string& seq)
+{
+	fout << ">";
+	fout << defline.c_str() << endl;
+
+	if(across <= 0) {
+		fout << seq.c_str() << endl;
+		return;
+	}
+	const size_t len = seq.length();
+	size_t off = 0;
+	for(; off + across < len; off += across) {
+		fout << seq.substr(off, across).c_str() << endl;
+	}
+	if(off < len) {
+		fout << seq.substr(off).c_str() << endl;
+	}
+}
+
+/**
+ * Counts the number of unique k-mers in the reference sequence
+ * that's reconstructed from the index
+ *
+ * Walks the joined reference text base by base, maintaining a rolling
+ * 2-bit-packed window of the last k=32 bases, and feeds every complete
+ * window into a HyperLogLog cardinality estimator.  The window is reset
+ * at sequence boundaries and across elided stretches (Ns).
+ */
+template<typename index_t, typename TStr>
+static uint64_t count_idx_kmers ( Ebwt<index_t>& ebwt)
+{
+	// Rebuild the concatenated reference text from the index.
+	TStr cat_ref;
+	ebwt.restore(cat_ref);
+	cerr << "Index loaded" << endl;
+#ifdef TEST_KMER_COUNTING
+	std::set<uint64_t> my_set;
+#endif
+
+	// Approximate distinct-element counter (16 presumably selects the
+	// sketch precision — confirm against HyperLogLogPlusMinus).
+	HyperLogLogPlusMinus<uint64_t> kmer_counter(16);
+	uint64_t word = 0;        // rolling window: last k bases, 2 bits each
+	uint64_t curr_length = 0; // consecutive valid bases currently in 'word'
+	uint8_t k = 32;           // k-mer size; 32 bases exactly fill 64 bits
+
+	TIndexOffU curr_ref = OFF_MASK;  // reference currently being walked
+	TIndexOffU last_text_off = 0;    // previous in-reference offset seen
+	size_t orig_len = cat_ref.length();
+	TIndexOffU tlen = OFF_MASK;
+	bool first = true;               // first base of the current reference?
+
+	for(size_t i = 0; i < orig_len; i++) {
+		TIndexOffU tidx = OFF_MASK;
+		TIndexOffU textoff = OFF_MASK;
+		tlen = OFF_MASK;
+		bool straddled = false;
+		// Map joined-text offset i to (reference index, offset, ref length).
+		ebwt.joinedToTextOff(1 /* qlen */, (TIndexOffU)i, tidx, textoff, tlen, true, straddled);
+
+		if (tidx != OFF_MASK && textoff < tlen) {
+			if (curr_ref != tidx) {
+				// End of the sequence - reset word and counter
+				curr_ref = tidx;
+				word = 0; curr_length = 0;
+				last_text_off = 0;
+				first = true;
+			}
+
+			TIndexOffU textoff_adj = textoff;
+			if(first && textoff > 0) textoff_adj++;
+			if (textoff_adj - last_text_off > 1) {
+				// there's an N - reset word and counter
+				word = 0; curr_length = 0;
+			}
+			// add another char.
+            int bp = (int)cat_ref[i];
+
+            // shift the first two bits off the word
+            word = word << 2;
+            // put the base-pair code from pos at that position
+            word |= bp;
+			++curr_length;
+			//cerr << "[" << i << "; " << curr_length << "; " << word << ":" << kmer_counter.cardinality()  << "]" << endl;
+			if (curr_length >= k) {
+				// Window holds a full k-mer; record it in the sketch.
+				kmer_counter.add(word);
+#ifdef TEST_KMER_COUNTING
+				my_set.insert(word);
+				cerr << " " << kmer_counter.cardinality()  << " vs " << my_set.size() << endl;
+#endif
+			}
+
+			last_text_off = textoff;
+			first = false;
+
+		}
+	}
+	// NOTE(review): the final window was already added inside the loop, so
+	// this re-adds it; harmless for a cardinality estimator (duplicates
+	// don't change the count).
+	if (curr_length >= k) {
+		kmer_counter.add(word);
+#ifdef TEST_KMER_COUNTING
+		my_set.insert(word);
+#endif
+	}
+
+#ifdef TEST_KMER_COUNTING
+	cerr << "Exact count: " << my_set.size() << endl;
+#endif
+
+	return kmer_counter.cardinality();
+}
+
+/**
+ * Given output stream, BitPairReference, reference index, name and
+ * length, print the whole nucleotide reference with the appropriate
+ * number of columns.
+ */
+static void print_ref_sequence(
+	ostream& fout,
+	BitPairReference& ref,
+	const string& name,
+	size_t refi,
+	size_t len)
+{
+	bool newlines = across > 0;
+	int myacross = across > 0 ? across : 60;
+	size_t incr = myacross * 1000; // fetch the reference ~1000 lines at a time
+	// RAII buffer: released even if getStretch or stream output throws.
+	// (The original raw new[]/delete[] pair leaked on an exception.)
+	std::vector<uint32_t> buf((incr + 128) / 4);
+	fout << ">" << name.c_str() << "\n";
+	ASSERT_ONLY(SStringExpandable<uint32_t> destU32);
+	for(size_t i = 0; i < len; i += incr) {
+		size_t amt = min(incr, len-i);
+		assert_leq(amt, incr);
+		// getStretch returns a byte offset into the buffer where the
+		// unpacked base codes (0-4) begin.
+		int off = ref.getStretch(&buf[0], refi, i, amt ASSERT_ONLY(, destU32));
+		uint8_t *cb = ((uint8_t*)&buf[0]) + off;
+		for(size_t j = 0; j < amt; j++) {
+			if(newlines && j > 0 && (j % myacross) == 0) fout << "\n";
+			assert_range(0, 4, (int)cb[j]);
+			fout << "ACGTN"[(int)cb[j]];
+		}
+		fout << "\n";
+	}
+}
+
+/**
+ * Create a BitPairReference encapsulating the reference portion of the
+ * index at the given basename.  Iterate through the reference
+ * sequences, sending each one to print_ref_sequence to print.
+ */
+static void print_ref_sequences(
+	ostream& fout,
+	bool color,
+	const EList<string>& refnames,
+	const TIndexOffU* plen,
+	const string& adjustedEbwtFileBase)
+{
+	BitPairReference ref(
+		adjustedEbwtFileBase, // input basename
+		color,                // true -> expect colorspace reference
+		false,                // sanity-check reference
+		NULL,                 // infiles
+		NULL,                 // originals
+		false,                // infiles are sequences
+		false,                // memory-map
+		false,                // use shared memory
+		false,                // sweep mm-mapped ref
+		verbose,              // be talkative
+		verbose);             // be talkative at startup
+	assert_eq(ref.numRefs(), refnames.size());
+	for(size_t i = 0; i < ref.numRefs(); i++) {
+		// NOTE(review): presumably plen stores one less than the nucleotide
+		// length in colorspace mode, hence the +1 — confirm against the
+		// index-building code.
+		print_ref_sequence(
+			fout,
+			ref,
+			refnames[i],
+			i,
+			plen[i] + (color ? 1 : 0));
+	}
+}
+
+/**
+ * Given an index, reconstruct the reference by LF mapping through the
+ * entire thing.
+ *
+ * Walks the joined reference text base by base, mapping each position
+ * back to its (reference, offset) pair, re-inserting 'N' runs that were
+ * elided from the joined text, and emitting each finished reference as
+ * a FASTA record.
+ */
+template<typename index_t, typename TStr>
+static void print_index_sequences(ostream& fout, Ebwt<index_t>& ebwt)
+{
+	EList<string>* refnames = &(ebwt.refnames());
+
+	// Rebuild the concatenated reference text from the index.
+	TStr cat_ref;
+	ebwt.restore(cat_ref);
+
+	// (Removed an unused HyperLogLogPlusMinus local that was constructed
+	// here but never referenced.)
+	TIndexOffU curr_ref = OFF_MASK;      // reference currently being emitted
+	string curr_ref_seq = "";            // bases accumulated for curr_ref
+	TIndexOffU curr_ref_len = OFF_MASK;  // full declared length of curr_ref
+	TIndexOffU last_text_off = 0;        // previous in-reference offset seen
+	size_t orig_len = cat_ref.length();
+	TIndexOffU tlen = OFF_MASK;
+	bool first = true;                   // first base of current reference?
+	for(size_t i = 0; i < orig_len; i++) {
+		TIndexOffU tidx = OFF_MASK;
+		TIndexOffU textoff = OFF_MASK;
+		tlen = OFF_MASK;
+		bool straddled = false;
+		// Map joined-text offset i to (reference index, offset, ref length).
+		ebwt.joinedToTextOff(1 /* qlen */, (TIndexOffU)i, tidx, textoff, tlen, true, straddled);
+
+		if (tidx != OFF_MASK && textoff < tlen)
+		{
+			if (curr_ref != tidx)
+			{
+				// Crossed into a new reference; flush the previous one.
+				if (curr_ref != OFF_MASK)
+				{
+					// Add trailing gaps, if any exist
+					if(curr_ref_seq.length() < curr_ref_len) {
+						curr_ref_seq += string(curr_ref_len - curr_ref_seq.length(), 'N');
+					}
+					print_fasta_record(fout, (*refnames)[curr_ref], curr_ref_seq);
+				}
+				curr_ref = tidx;
+				curr_ref_seq = "";
+				curr_ref_len = tlen;
+				last_text_off = 0;
+				first = true;
+			}
+
+			// A jump in the in-reference offset means bases were elided
+			// from the joined text; restore them as 'N's.
+			TIndexOffU textoff_adj = textoff;
+			if(first && textoff > 0) textoff_adj++;
+			if (textoff_adj - last_text_off > 1)
+				curr_ref_seq += string(textoff_adj - last_text_off - 1, 'N');
+
+			curr_ref_seq.push_back("ACGT"[int(cat_ref[i])]);
+			last_text_off = textoff;
+			first = false;
+		}
+	}
+	// Flush the final reference, if any bases were seen at all.
+	if (curr_ref < refnames->size())
+	{
+		// Add trailing gaps, if any exist
+		if(curr_ref_seq.length() < curr_ref_len) {
+			curr_ref_seq += string(curr_ref_len - curr_ref_seq.length(), 'N');
+		}
+		print_fasta_record(fout, (*refnames)[curr_ref], curr_ref_seq);
+	}
+}
+
+static char *argv0 = NULL;
+
+/**
+ * Print one reference-sequence name per line to 'fout'.
+ * (Previously wrote to cout, silently ignoring the 'fout' parameter;
+ * the only caller passes cout, so this fix is behavior-compatible.)
+ */
+template <typename index_t>
+static void print_index_sequence_names(const string& fname, ostream& fout)
+{
+	EList<string> p_refnames;
+	readEbwtRefnames<index_t>(fname, p_refnames);
+	for(size_t i = 0; i < p_refnames.size(); i++) {
+		fout << p_refnames[i].c_str() << endl;
+	}
+}
+
+/**
+ * Print a short summary of what's in the index and its flags.
+ * (Previously wrote to cout, silently ignoring the 'fout' parameter;
+ * the only caller passes cout, so this fix is behavior-compatible.)
+ */
+template <typename index_t>
+static void print_index_summary(
+	const string& fname,
+	ostream& fout)
+{
+	int32_t flags = Ebwt<index_t>::readFlags(fname);
+	bool color = readEbwtColor(fname);
+	// Open the index with only the metadata needed for a summary; the SA
+	// sample, ftab and rstarts are deliberately not loaded.
+	Ebwt<index_t> ebwt(
+					   fname,
+					   color,                // index is colorspace
+					   -1,                   // don't require entire reverse
+					   true,                 // index is for the forward direction
+					   -1,                   // offrate (-1 = index default)
+					   0,                    // offrate-plus (0 = index default)
+					   false,                // use memory-mapped IO
+					   false,                // use shared memory
+					   false,                // sweep memory-mapped memory
+					   true,                 // load names?
+					   false,                // load SA sample?
+					   false,                // load ftab?
+					   false,                // load rstarts?
+					   verbose,              // be talkative?
+					   verbose,              // be talkative at startup?
+					   false,                // pass up memory exceptions?
+					   false);               // sanity check?
+	EList<string> p_refnames;
+	readEbwtRefnames<index_t>(fname, p_refnames);
+	// NOTE(review): flags are stored negated on disk, hence (-flags) —
+	// confirm against Ebwt::readFlags.
+	fout << "Flags" << '\t' << (-flags) << endl;
+	fout << "SA-Sample" << "\t1 in " << (1 << ebwt.eh().offRate()) << endl;
+	fout << "FTab-Chars" << '\t' << ebwt.eh().ftabChars() << endl;
+	assert_eq(ebwt.nPat(), p_refnames.size());
+	for(size_t i = 0; i < p_refnames.size(); i++) {
+		fout << "Sequence-" << (i+1)
+		     << '\t' << p_refnames[i].c_str()
+		     << '\t' << (ebwt.plen()[i] + (color ? 1 : 0))
+		     << endl;
+	}
+}
+
+extern void initializeCntLut();
+
/**
 * Top-level driver for centrifuge-inspect.  Depending on the mode flags
 * set by parseOptions(), prints one of: the reference sequence names, a
 * short index summary, the UID->taxID conversion table, the taxonomy
 * tree, the name table, the size table, an approximate k-mer count, or
 * the reconstructed reference sequences themselves.
 *
 * @param ebwtFileBase  basename of the index files
 * @param query         query filename (currently unused here)
 */
static void driver(
	const string& ebwtFileBase,
	const string& query)
{
    initializeCntLut();
    
	// Adjust
	string adjustedEbwtFileBase = adjustEbwtBase(argv0, ebwtFileBase, verbose);

	if(names_only) {
		print_index_sequence_names<TIndexOffU>(adjustedEbwtFileBase, cout);
	} else if(summarize_only) {
		print_index_summary<TIndexOffU>(adjustedEbwtFileBase, cout);
    } else {
        // Initialize Ebwt object
		bool color = readEbwtColor(adjustedEbwtFileBase);
		HierEbwt<TIndexOffU, uint16_t> ebwt(
                                            adjustedEbwtFileBase,
                                            color,                // index is colorspace
                                            -1,                   // don't care about entire-reverse
                                            true,                 // index is for the forward direction
                                            -1,                   // offrate (-1 = index default)
                                            0,                    // offrate-plus (0 = index default)
                                            false,                // use memory-mapped IO
                                            false,                // use shared memory
                                            false,                // sweep memory-mapped memory
                                            true,                 // load names?
                                            true,                 // load SA sample?
                                            true,                 // load ftab?
                                            true,                 // load rstarts?
                                            false,                // be talkative?
                                            false,                // be talkative at startup?
                                            false,                // pass up memory exceptions?
                                            false);               // sanity check?        
        
        if(conversion_table) {
            // Print "<uid>\t<taxID>" per sequence.  taxIDs with a value
            // in the upper 32 bits carry a secondary id, printed "x.y".
            const EList<pair<string, uint64_t> >& uid_to_tid = ebwt.uid_to_tid();
            for(size_t i = 0; i < uid_to_tid.size(); i++) {
                uint64_t tid = uid_to_tid[i].second;
                cout << uid_to_tid[i].first << "\t"
                     << (tid & 0xffffffff);
                tid >>= 32;
                if(tid > 0) {
                    cout << "." << tid;
                }
                cout << endl;
            }
        } else if(taxonomy_tree) {
            // Print the taxonomy tree as "taxID | parent taxID | rank"
            const map<uint64_t, TaxonomyNode>& tree = ebwt.tree();
            for(map<uint64_t, TaxonomyNode>::const_iterator itr = tree.begin(); itr != tree.end(); itr++) {
                string rank = get_tax_rank_string(itr->second.rank);
                cout << itr->first << "\t|\t" << itr->second.parent_tid << "\t|\t" << rank << endl;
            }
        } else if(name_table) {
            // Print "taxID\tname" for each taxon (same x.y encoding of
            // the upper 32 bits as in the conversion table above)
            const std::map<uint64_t, string>& name_map = ebwt.name();
            for(std::map<uint64_t, string>::const_iterator itr = name_map.begin(); itr != name_map.end(); itr++) {
                uint64_t tid = itr->first;
                cout << (tid & 0xffffffff);
                tid >>= 32;
                if(tid > 0) {
                    cout << "." << tid;
                }
                cout << "\t" << itr->second << endl;
            }
        } else if(size_table) {
            // Print "taxID\tsummed sequence length" for each taxon
            const std::map<uint64_t, uint64_t>& size_map = ebwt.size();
            for(std::map<uint64_t, uint64_t>::const_iterator itr = size_map.begin(); itr != size_map.end(); itr++) {
                uint64_t tid = itr->first;
                uint64_t size = itr->second;
                cout << (tid & 0xffffffff);
                tid >>= 32;
                if(tid > 0) {
                    cout << "." << tid;
                }
                cout << "\t" << size << endl;
            }
        } else if (count_kmers) {
            // k-mer counting requires the full index loaded in memory
        	ebwt.loadIntoMemory(
        	                                -1,     // color
        	                                -1,     // need entire reverse
        	                                true,   // load SA sample
        	                                true,   // load ftab
        	                                true,   // load rstarts
        	                                true,   // load names
        	                                verbose);  // verbose
        	uint64_t n_kmers = count_idx_kmers<TIndexOffU, SString<char> >(ebwt);
        	cout << "Approximate number of kmers in the reference sequence: " << n_kmers << endl;

        } else {
            ebwt.loadIntoMemory(
                                -1,     // color
                                -1,     // need entire reverse
                                true,   // load SA sample
                                true,   // load ftab
                                true,   // load rstarts
                                true,   // load names
                                verbose);  // verbose
            
            // Load whole index into memory
            // NOTE: "|| true" makes the else branch unreachable, so the
            // sequences are always reconstructed from the index itself.
            if(refFromEbwt || true) {
                print_index_sequences<TIndexOffU, SString<char> >(cout, ebwt);
            } else {
                EList<string> refnames;
                readEbwtRefnames<TIndexOffU>(adjustedEbwtFileBase, refnames);
                print_ref_sequences(
                                    cout,
                                    readEbwtColor(ebwtFileBase),
                                    refnames,
                                    ebwt.plen(),
                                    adjustedEbwtFileBase);
            }
        }
		// Evict any loaded indexes from memory
		if(ebwt.isInMemory()) {
			ebwt.evictFromMemory();
		}
	}
}
+
+/**
+ * main function.  Parses command-line arguments.
+ */
+int main(int argc, char **argv) {
+	try {
+		string ebwtFile;  // read serialized Ebwt from this file
+		string query;   // read query string(s) from this file
+		EList<string> queries;
+		string outfile; // write query results to this file
+		argv0 = argv[0];
+		parseOptions(argc, argv);
+		if(showVersion) {
+			cout << argv0 << " version " << CENTRIFUGE_VERSION << endl;
+			if(sizeof(void*) == 4) {
+				cout << "32-bit" << endl;
+			} else if(sizeof(void*) == 8) {
+				cout << "64-bit" << endl;
+			} else {
+				cout << "Neither 32- nor 64-bit: sizeof(void*) = " << sizeof(void*) << endl;
+			}
+			cout << "Built on " << BUILD_HOST << endl;
+			cout << BUILD_TIME << endl;
+			cout << "Compiler: " << COMPILER_VERSION << endl;
+			cout << "Options: " << COMPILER_OPTIONS << endl;
+			cout << "Sizeof {int, long, long long, void*, size_t, off_t}: {"
+				 << sizeof(int)
+				 << ", " << sizeof(long) << ", " << sizeof(long long)
+				 << ", " << sizeof(void *) << ", " << sizeof(size_t)
+				 << ", " << sizeof(off_t) << "}" << endl;
+			return 0;
+		}
+
+		// Get input filename
+		if(optind >= argc) {
+			cerr << "No index name given!" << endl;
+			printUsage(cerr);
+			return 1;
+		}
+		ebwtFile = argv[optind++];
+
+		// Optionally summarize
+		if(verbose) {
+			cout << "Input ebwt file: \"" << ebwtFile.c_str() << "\"" << endl;
+			cout << "Output file: \"" << outfile.c_str() << "\"" << endl;
+			cout << "Local endianness: " << (currentlyBigEndian()? "big":"little") << endl;
+#ifdef NDEBUG
+			cout << "Assertions: disabled" << endl;
+#else
+			cout << "Assertions: enabled" << endl;
+#endif
+		}
+		driver(ebwtFile, query);
+		return 0;
+	} catch(std::exception& e) {
+		cerr << "Error: Encountered exception: '" << e.what() << "'" << endl;
+		cerr << "Command: ";
+		for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
+		cerr << endl;
+		return 1;
+	} catch(int e) {
+		if(e != 0) {
+			cerr << "Error: Encountered internal Centrifuge exception (#" << e << ")" << endl;
+			cerr << "Command: ";
+			for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
+			cerr << endl;
+		}
+		return e;
+	}
+}
diff --git a/centrifuge_main.cpp b/centrifuge_main.cpp
new file mode 100644
index 0000000..a182b6c
--- /dev/null
+++ b/centrifuge_main.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <iostream>
+#include <fstream>
+#include <string.h>
+#include <stdlib.h>
+#include "tokenize.h"
+#include "ds.h"
+
+using namespace std;
+
+extern "C" {
+	int centrifuge(int argc, const char **argv);
+}
+
+/**
+ * Bowtie main function.  It is placed in a separate source file to
+ * make it slightly easier to compile Bowtie as a library.
+ *
+ * If the user specifies -A <file> as the first two arguments, main
+ * will interpret that file as having one set of command-line arguments
+ * per line, and will dispatch each batch of arguments one at a time to
+ * bowtie.
+ */
+int main(int argc, const char **argv) {
+	if(argc > 2 && strcmp(argv[1], "-A") == 0) {
+		const char *file = argv[2];
+		ifstream in;
+		in.open(file);
+		char buf[4096];
+		int lastret = -1;
+		while(in.getline(buf, 4095)) {
+			EList<string> args;
+			args.push_back(string(argv[0]));
+			tokenize(buf, " \t", args);
+			const char **myargs = (const char**)malloc(sizeof(char*)*args.size());
+			for(size_t i = 0; i < args.size(); i++) {
+				myargs[i] = args[i].c_str();
+			}
+			if(args.size() == 1) continue;
+			lastret = centrifuge((int)args.size(), myargs);
+			free(myargs);
+		}
+		if(lastret == -1) {
+			cerr << "Warning: No arg strings parsed from " << file << endl;
+			return 0;
+		}
+		return lastret;
+	} else {
+		return centrifuge(argc, argv);
+	}
+}
diff --git a/centrifuge_report.cpp b/centrifuge_report.cpp
new file mode 100644
index 0000000..ee94f43
--- /dev/null
+++ b/centrifuge_report.cpp
@@ -0,0 +1,186 @@
+/*
+ * centrifuge_report.cpp
+ *
+ *  Created on: Apr 8, 2015
+ *      Author: fbreitwieser
+ */
+
+#include<iostream>
+#include<fstream>
+#include<sstream>
+#include<map>
+#include<vector>
+#include "assert_helpers.h"
+#include "sstring.h"
+#include "ds.h"      // EList
+#include "bt2_idx.h" // Ebwt
+#include "bt2_io.h"
+#include "util.h"
+
+using namespace std;
+typedef TIndexOffU index_t;
+
static bool startVerbose = true; // be talkative at startup
int gVerbose = 1; // be talkative always
static const char *argv0 = NULL; // name by which the program was invoked (argv[0])
static string adjIdxBase; // index basename after adjustEbwtBase() resolution
static bool useShmem				= false; // use shared memory to hold the index
static bool useMm					= false; // use memory-mapped files to hold the index
static bool mmSweep					= false; // sweep through memory-mapped files immediately after mapping
static int offRate					= -1;    // keep default offRate
static bool noRefNames				= false; // true -> print reference indexes; not names
static int sanityCheck				= 0;  // enable expensive sanity checks
+/**
+ * Print a summary usage message to the provided output stream.
+ */
+static void printUsage(ostream& out) {
+	out << "Centrifuge version " << string(CENTRIFUGE_VERSION).c_str() << " by Daehwan Kim (infphilo at gmail.com, www.ccb.jhu.edu/people/infphilo)" << endl;
+	string tool_name = "centrifuge-class";
+
+	out << "Usage: " << endl
+	    << "  " << tool_name.c_str() << " <bt2-idx> <centrifuge-out>" << endl
+	    << endl
+		<<     "  <bt2-idx>  Index filename prefix (minus trailing .X." << gEbwt_ext << ")." << endl
+		<<     "  <centrifuge-out>  Centrifuge result file." << endl;
+}
+
+template <typename T>
+class Pair2ndComparator{
+public:
+     bool operator()(const pair<T,T> &left, const pair<T,T> &right){
+    	 return left.second < right.second;
+     }
+};
+
+
+template<typename TStr>
+static void driver(
+	const char * type,
+	const string& bt2indexBase,
+	const string& cf_out)
+{
+	if(gVerbose || startVerbose)  {
+		cerr << "Entered driver(): "; logTime(cerr, true);
+	}
+
+    //initializeCntLut();  // FB: test commenting
+
+	// Vector of the reference sequences; used for sanity-checking
+	EList<SString<char> > names, os;
+	EList<size_t> nameLens, seqLens;
+
+	// Initialize Ebwt object and read in header
+	if(gVerbose || startVerbose) {
+		cerr << "About to initialize fw Ebwt: "; logTime(cerr, true);
+	}
+	adjIdxBase = adjustEbwtBase(argv0, bt2indexBase, gVerbose);
+	Ebwt<index_t> ebwt(
+		adjIdxBase,
+	    0,        // index is colorspace
+		-1,       // fw index
+	    true,     // index is for the forward direction
+	    /* overriding: */ offRate,
+		0, // amount to add to index offrate or <= 0 to do nothing
+	    useMm,    // whether to use memory-mapped files
+	    useShmem, // whether to use shared memory
+	    mmSweep,  // sweep memory-mapped files
+	    !noRefNames, // load names?
+		true,        // load SA sample?
+		true,        // load ftab?
+		true,        // load rstarts?
+	    gVerbose, // whether to be talkative
+	    startVerbose, // talkative during initialization
+	    false /*passMemExc*/,
+	    sanityCheck);
+	//Ebwt<index_t>* ebwtBw = NULL;
+
+
+	EList<size_t> reflens;
+	EList<string> refnames;
+	readEbwtRefnames<index_t>(adjIdxBase, refnames);
+	map<uint32_t,pair<string,uint64_t> > speciesID_to_name_len;
+	for(size_t i = 0; i < ebwt.nPat(); i++) {
+		// cerr << "Push back to reflens: "<<  refnames[i] << " is so long: " << ebwt.plen()[i] << endl;
+		reflens.push_back(ebwt.plen()[i]);
+
+		// extract numeric id from refName
+		const string& refName = refnames[i];
+		uint64_t id = extractIDFromRefName(refName);
+		uint32_t speciesID = (uint32_t)(id >> 32);
+
+		// extract name from refName
+		const string& name_part = refName.substr(refName.find_first_of(' '));
+
+		//uint32_t genusID = (uint32_t)(id & 0xffffffff);
+		speciesID_to_name_len[speciesID] = pair<string,uint64_t>(name_part,ebwt.plen()[i]);
+
+	}
+//	EList<string> refnames;
+//	readEbwtRefnames<index_t>(adjIdxBase, refnames);
+
+	// Read Centrifuge output file
+	ifstream infile(cf_out.c_str());
+
+	string line;
+	map<uint32_t,uint32_t> species_to_score;
+
+	while (getline(infile,line)) {
+		string rd_name;
+		uint32_t genusID;
+		uint32_t speciesID;
+		uint32_t score;
+		uint32_t secbest_score;
+
+		istringstream iss(line);
+		iss >> rd_name >> genusID >> speciesID >> score >> secbest_score;
+		// cerr << rd_name << " -> " << genusID << " -> " << speciesID << " -> " << score << " -> " << secbest_score << "\n";
+		species_to_score[speciesID] += score;
+	}
+
+	// Sort the species by their score
+	vector<pair<uint32_t,uint32_t> > species_to_score_v(species_to_score.begin(), species_to_score.end());
+
+	sort(species_to_score_v.begin(),species_to_score_v.end(),Pair2ndComparator<uint32_t>());
+
+	cout << "Name\tTaxonID\tLength\tSummed Score\tNormalized Score\n";
+	// Output the summed species scores
+	for (vector<pair<uint32_t,uint32_t> >::iterator species_score = species_to_score_v.begin();
+			species_score != species_to_score_v.end();
+			++species_score) {
+		uint32_t speciesID = species_score->first;
+		pair<string,uint64_t> name_len = speciesID_to_name_len[speciesID];
+		uint64_t slength = name_len.second;
+		uint64_t sumscore = species_score->second;
+
+		cout << name_len.first << "\t" <<
+				speciesID << "\t" <<
+				slength << "\t" <<
+				sumscore << "\t" <<
+				(float)sumscore/slength << "\n";
+	}
+
+
+
+}
+
+//int centrifuge_report(int argc, const char **argv) {
+int main(int argc, const char **argv) {
+
+	if (argc < 3) {
+		cerr << "Number of arguments is " << argc << endl;
+		printUsage(cerr);
+		exit(1);
+	}
+
+	argv0 = argv[0];
+	const string bt2index = argv[1];
+	const string cf_out = argv[2];
+	//static string outfile;        // write SAM output to this file
+
+	cout << "Input bt2 file: \"" << bt2index.c_str() << "\"" << endl;
+	cout << "Centrifuge results file: \"" << cf_out.c_str() << "\"" << endl;
+
+	driver<SString<char> >("DNA", bt2index, cf_out);
+	return 0;
+}
+
diff --git a/classifier.h b/classifier.h
new file mode 100644
index 0000000..4c47b6d
--- /dev/null
+++ b/classifier.h
@@ -0,0 +1,1053 @@
+/*
+ * Copyright 2014, Daehwan Kim <infphilo at gmail.com>
+ *
+ * This file is part of HISAT.
+ *
+ * HISAT is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * HISAT is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with HISAT.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef CLASSIFIER_H_
+#define CLASSIFIER_H_
+
+//#define LI_DEBUG
+
+#include <algorithm>
+#include <vector>
+#include "hi_aligner.h"
+#include "util.h"
+
/**
 * Per-taxon accumulator for partial-hit evidence collected while
 * classifying a single read (or read pair).  Scores and summed hit
 * lengths are tracked separately per mate (rdi) and per strand (fwi)
 * and combined by finalize().
 */
template<typename index_t>
struct HitCount {
    uint64_t uniqueID;            // unique sequence id within the index
    uint64_t taxID;               // taxonomy id this hit count belongs to
    uint32_t count;               // number of partial hits recorded
    uint32_t score;               // combined score (filled in by finalize())
    uint32_t scores[2][2];      // scores[rdi][fwi]
    double   summedHitLen;        // combined hit length (filled in by finalize())
    double   summedHitLens[2][2]; // summedHitLens[rdi][fwi]
    uint32_t timeStamp;           // time stamp of the most recent partial hit
    EList<pair<uint32_t,uint32_t> > readPositions; // (offset, length) of partial hits on the read
    bool     leaf;                // true if this taxon is a leaf in the taxonomy tree
    uint32_t num_leaves;          // number of leaves at or below this taxon
    
    uint8_t rank;                 // taxonomic rank id
    EList<uint64_t> path;         // taxonomy path leading to this taxon
    
    // Reset every field so the object can be reused for the next read.
    void reset() {
        uniqueID = taxID = count = score = timeStamp = 0;
        scores[0][0] = scores[0][1] = scores[1][0] = scores[1][1] = 0;
        summedHitLen = 0.0;
        summedHitLens[0][0] = summedHitLens[0][1] = summedHitLens[1][0] = summedHitLens[1][1] = 0.0;
        readPositions.clear();
        rank = 0;
        path.clear();
        leaf = true;
        num_leaves = 1;
    }
    
    // Memberwise copy; explicitly self-assignment safe.
    HitCount& operator=(const HitCount& o) {
        if(this == &o)
            return *this;
        
        uniqueID = o.uniqueID;
        taxID = o.taxID;
        count = o.count;
        score = o.score;
        scores[0][0] = o.scores[0][0];
        scores[0][1] = o.scores[0][1];
        scores[1][0] = o.scores[1][0];
        scores[1][1] = o.scores[1][1];
        summedHitLen = o.summedHitLen;
        summedHitLens[0][0] = o.summedHitLens[0][0];
        summedHitLens[0][1] = o.summedHitLens[0][1];
        summedHitLens[1][0] = o.summedHitLens[1][0];
        summedHitLens[1][1] = o.summedHitLens[1][1];
        timeStamp = o.timeStamp;
        readPositions = o.readPositions;
        leaf = o.leaf;
        num_leaves = o.num_leaves;
        rank = o.rank;
        path = o.path;
        
        return *this;
    }

    /**
     * Combine the per-mate/per-strand tallies into 'score' and
     * 'summedHitLen'.  In the enabled (#if 1) branch the best strand is
     * chosen independently for each mate; the disabled branch instead
     * chose one consistent strand combination from the mates'
     * orientations.  For unpaired reads only mate 0 is considered.
     */
    void finalize(
                  bool paired,
                  bool mate1fw,
                  bool mate2fw) {
        if(paired) {
#if 1
            score = max(scores[0][0], scores[0][1]) + max(scores[1][0], scores[1][1]);
            summedHitLen = max(summedHitLens[0][0], summedHitLens[0][1]) + max(summedHitLens[1][0], summedHitLens[1][1]);
#else
            uint32_t score1 = 0, score2 = 0;
            double summedHitLen1 = 0.0, summedHitLen2 = 0.0;
            if(mate1fw == mate2fw) {
                score1 = scores[0][0] + scores[1][0];
                score2 = scores[0][1] + scores[1][1];
                summedHitLen1 = summedHitLens[0][0] + summedHitLens[1][0];
                summedHitLen2 = summedHitLens[0][1] + summedHitLens[1][1];
            } else {
                score1 = scores[0][0] + scores[1][1];
                score2 = scores[0][1] + scores[1][0];
                summedHitLen1 = summedHitLens[0][0] + summedHitLens[1][1];
                summedHitLen2 = summedHitLens[0][1] + summedHitLens[1][0];
            }
            if(score1 >= score2) {
                score = score1;
                summedHitLen = summedHitLen1;
            } else {
                score = score2;
                summedHitLen = summedHitLen2;
            }
#endif
        } else {
            score = max(scores[0][0], scores[0][1]);
            summedHitLen = max(summedHitLens[0][0], summedHitLens[0][1]);
        }
    }
};
+
+/**
+ * With a hierarchical indexing, SplicedAligner provides several alignment strategies
+ * , which enable effective alignment of RNA-seq reads
+ */
+template <typename index_t, typename local_index_t>
+class Classifier : public HI_Aligner<index_t, local_index_t> {
+    
+public:
+    
    /**
     * Initialize with index.
     *
     * Besides forwarding to the HI_Aligner base, this pre-computes two
     * taxID sets from the index's taxonomy tree:
     *  - _host_taxIDs: every node that is, or descends from, one of
     *    'hostGenomes'
     *  - _excluded_taxIDs: every node that is, or descends from, one of
     *    'excluded_taxIDs'
     */
    Classifier(const Ebwt<index_t>& ebwt,
               const EList<string>& refnames,
               bool mate1fw,
               bool mate2fw,
               index_t minHitLen,
               bool tree_traverse,
               const string& classification_rank,
               const EList<uint64_t>& hostGenomes,
               const EList<uint64_t>& excluded_taxIDs) :
    HI_Aligner<index_t, local_index_t>(
                                       ebwt,
                                       0,    // don't make use of splice sites found by earlier reads
                                       true), // no spliced alignment
    _refnames(refnames),
    _minHitLen(minHitLen),
    _mate1fw(mate1fw),
    _mate2fw(mate2fw),
    _tree_traverse(tree_traverse)
    {
        // Convert the rank name (e.g. "species") to the internal path id
        _classification_rank = get_tax_rank_id(classification_rank.c_str());
        _classification_rank = TaxonomyPathTable::rank_to_pathID(_classification_rank);
        
        const map<uint64_t, TaxonomyNode>& tree = ebwt.tree();
        _host_taxIDs.clear();
        if(hostGenomes.size() > 0) {
            // For every node in the tree, walk up toward the root; if the
            // node or any ancestor matches a host genome, record the node.
            for(map<uint64_t, TaxonomyNode>::const_iterator itr = tree.begin(); itr != tree.end(); itr++) {
                uint64_t tmp_taxID = itr->first;
                while(true) {
                    bool found = false;
                    for(size_t t = 0; t < hostGenomes.size(); t++) {
                        if(tmp_taxID == hostGenomes[t]) {
                            _host_taxIDs.insert(itr->first);
                            found = true;
                            break;
                        }
                    }
                    if(found) break;
                    map<uint64_t, TaxonomyNode>::const_iterator itr2 = tree.find(tmp_taxID);
                    if(itr2 == tree.end()) break; // parent not present in tree
                    const TaxonomyNode& node = itr2->second;
                    if(tmp_taxID == node.parent_tid) break; // reached the root
                    tmp_taxID = node.parent_tid;
                }
            }
        }
        
        _excluded_taxIDs.clear();
        if(excluded_taxIDs.size() > 0) {
            // Same ancestor walk as above, for the excluded taxIDs.
            for(map<uint64_t, TaxonomyNode>::const_iterator itr = tree.begin(); itr != tree.end(); itr++) {
                uint64_t tmp_taxID = itr->first;
                while(true) {
                    bool found = false;
                    for(size_t t = 0; t < excluded_taxIDs.size(); t++) {
                        if(tmp_taxID == excluded_taxIDs[t]) {
                            _excluded_taxIDs.insert(itr->first);
                            found = true;
                            break;
                        }
                    }
                    if(found) break;
                    map<uint64_t, TaxonomyNode>::const_iterator itr2 = tree.find(tmp_taxID);
                    if(itr2 == tree.end()) break; // parent not present in tree
                    if(tmp_taxID == itr2->second.parent_tid) break; // reached the root
                    tmp_taxID = itr2->second.parent_tid;
                }
            }
        }
    }
+    
    // Nothing to release explicitly; members clean up in their own destructors.
    ~Classifier() {
    }
+
+    /**
+     * Aligns a read or a pair
+     * This funcion is called per read or pair
+     */
+    virtual
+    int go(
+           const Scoring&           sc,
+           const Ebwt<index_t>&     ebwtFw,
+           const Ebwt<index_t>&     ebwtBw,
+           const BitPairReference&  ref,
+           WalkMetrics&             wlm,
+           PerReadMetrics&          prm,
+           HIMetrics&               him,
+		   SpeciesMetrics&          spm,
+           RandomSource&            rnd,
+           AlnSinkWrap<index_t>&    sink)
+    {
+        _hitMap.clear();
+        
+        const index_t increment = (2 * _minHitLen <= 33) ? 10 : (2 * _minHitLen - 33);
+        const ReportingParams& rp = sink.reportingParams();
+        index_t maxGenomeHitSize = rp.khits;
+		bool isFw = false;
+        
+        //
+        uint32_t ts = 0; // time stamp
+        // for each mate. only called once for unpaired data
+        for(int rdi = 0; rdi < (this->_paired ? 2 : 1); rdi++) {
+            assert(this->_rds[rdi] != NULL);
+            
+            // search for partial hits on the forward and reverse strand (saved in this->_hits[rdi])
+            searchForwardAndReverse(rdi, ebwtFw, sc, rnd, rp, increment);
+            
+            // get forward or reverse hits for this read from this->_hits[rdi]
+            //  the strand is chosen based on higher average hit length in either direction
+            pair<int, int> fwp = getForwardOrReverseHit(rdi);
+            for(int fwi = fwp.first; fwi < fwp.second; fwi++) {
+                ReadBWTHit<index_t>& hit = this->_hits[rdi][fwi];
+                assert(hit.done());
+                isFw = hit._fw;  // TODO: Sync between mates!
+                
+                // choose candidate partial alignments for further alignment
+                index_t offsetSize = hit.offsetSize();
+                this->_genomeHits.clear();
+                
+                // sort partial hits by size (number of genome positions), ascending, and then length, descending
+                for(size_t hi = 0; hi < offsetSize; hi++) {
+                    const BWTHit<index_t> partialHit = hit.getPartialHit(hi);
+#ifdef LI_DEBUG
+                    cout << partialHit.len() << " " << partialHit.size() << endl;
+#endif
+                    if(partialHit.len() >= _minHitLen && partialHit.size() > maxGenomeHitSize) {
+                        maxGenomeHitSize = partialHit.size();
+                    }
+                }
+                
+                if(maxGenomeHitSize > (index_t)rp.khits) {
+                    maxGenomeHitSize += rp.khits;
+                }
+                
+                hit._partialHits.sort(compareBWTHits());
+                size_t usedPortion = 0;
+                size_t genomeHitCnt = 0;
+                for(size_t hi = 0; hi < offsetSize; hi++, ts++) {
+                    const BWTHit<index_t>& partialHit = hit.getPartialHit(hi);
+                    size_t partialHitLen = partialHit.len();
+                    if(partialHitLen <= _minHitLen) continue;                    
+                    if(partialHit.size() == 0) continue;
+                    
+                    // only keep this partial hit if it is equal to or bigger than minHitLen (default: 22 bp)
+                    // TODO: consider not requiring minHitLen when we have already hits to the same genome
+                    bool considerOnlyIfPreviouslyObserved = partialHitLen < _minHitLen;
+                    
+                    // get all coordinates of the hit
+                    EList<Coord>& coords = getCoords(
+                                                     hit,
+                                                     hi,
+                                                     ebwtFw,
+                                                     ref,
+                                                     rnd,
+                                                     maxGenomeHitSize,
+                                                     wlm,
+                                                     prm,
+                                                     him);
+                    if(coords.empty())
+                        continue;
+                    
+                    usedPortion += partialHitLen;
+                    assert_gt(coords.size(), 0);
+                    
+                    // the maximum number of hits per read is maxGenomeHitSize (change with parameter -k)
+                    size_t nHitsToConsider = coords.size();
+                    if(coords.size() > rp.ihits) {
+                        continue;
+                    }
+
+                    // find the genome id for all coordinates, and count the number of genomes
+                    EList<pair<uint64_t, uint64_t> > coord_ids;
+                    for(index_t k = 0; k < nHitsToConsider; k++, genomeHitCnt++) {
+                        const Coord& coord = coords[k];
+                        assert_lt(coord.ref(), _refnames.size()); // gives a warning - coord.ref() is signed integer. why?
+                        
+                        // extract numeric id from refName
+                        const EList<pair<string, uint64_t> >& uid_to_tid = ebwtFw.uid_to_tid();
+                        assert_lt(coord.ref(), uid_to_tid.size());
+                        uint64_t taxID = uid_to_tid[coord.ref()].second;
+                        bool found = false;
+                        for(index_t k2 = 0; k2 < coord_ids.size(); k2++) {
+                            // count the genome if it is not in coord_ids, yet
+                            if(coord_ids[k2].first == (uint64_t)coord.ref()) {
+                                found = true;
+                                break;
+                            }
+                        }
+                        if(found) continue;
+                        // add to coord_ids
+                        coord_ids.expand();
+                        coord_ids.back().first = coord.ref();
+                        coord_ids.back().second = taxID;
+                    }
+                    
+                    ASSERT_ONLY(size_t n_genomes = coord_ids.size());
+                    // scoring function: calculate the weight of this partial hit
+                    assert_gt(partialHitLen, 15);
+                    assert_gt(n_genomes, 0);
+                    uint32_t partialHitScore = (uint32_t)((partialHitLen - 15) * (partialHitLen - 15)) ; // / n_genomes;
+                    double weightedHitLen = double(partialHitLen) ; // / double(n_genomes) ;
+                    
+                    // go through all coordinates reported for partial hit
+                    for(index_t k = 0; k < coord_ids.size(); ++k) {
+                        uint64_t uniqueID = coord_ids[k].first;
+                        uint64_t taxID = coord_ids[k].second;
+                        if(_excluded_taxIDs.find(taxID) != _excluded_taxIDs.end())
+                            break;
+                        // add hit to genus map and get new index in the map
+                        size_t idx = addHitToHitMap(
+                                                    ebwtFw,
+                                                    _hitMap,
+                                                    rdi,
+                                                    fwi,
+                                                    uniqueID,
+                                                    taxID,
+                                                    ts,
+                                                    partialHitScore,
+                                                    weightedHitLen,
+                                                    considerOnlyIfPreviouslyObserved,
+                                                    partialHit._bwoff,
+                                                    partialHit.len());
+                        
+                        //if considerOnlyIfPreviouslyObserved and it was not found, genus Idx size is equal to the genus Map size
+                        if(idx >= _hitMap.size()) {
+                            continue;
+                        }
+                        
+#ifdef FLORIAN_DEBUG
+                        std::cerr << speciesID << ';';
+#endif
+                    }
+                    
+                    if(genomeHitCnt >= maxGenomeHitSize)
+                        break;
+                    
+#ifdef FLORIAN_DEBUG
+                    std::cerr << "  partialHits-done";
+#endif
+                } // partialHits
+            } // fwi
+            
+#ifdef FLORIAN_DEBUG
+            std::cerr << "  rdi-done" << endl;
+#endif
+        } // rdi
+        
+        for(size_t i = 0; i < _hitMap.size(); i++) {
+            _hitMap[i].finalize(this->_paired, this->_mate1fw, this->_mate2fw);
+        }
+        
+        // If the number of hits is more than -k,
+        //   traverse up the taxonomy tree to reduce the number
+        if(_hitMap.size() > (size_t)rp.khits) {
+            // Count the number of the best hits
+            uint32_t best_score = _hitMap[0].score;
+            for(size_t i = 1; i < _hitMap.size(); i++) {
+                if(best_score < _hitMap[i].score) {
+                    best_score = _hitMap[i].score;
+                }
+            }
+            
+            // Remove secondary hits
+            for(int i = 0; i < (int)_hitMap.size(); i++) {
+                if(_hitMap[i].score < best_score) {
+                    if(i + 1 < _hitMap.size()) {
+                        _hitMap[i] = _hitMap.back();
+                    }
+                    _hitMap.pop_back();
+                    i--;
+                }
+            }
+            
+            if(!_tree_traverse) {
+                if(_hitMap.size() > (size_t)rp.khits)
+                    return 0;
+            }
+            
+            uint8_t rank = 0;
+            while(_hitMap.size() > (size_t)rp.khits) {
+                _hitTaxCount.clear();
+                for(size_t i = 0; i < _hitMap.size(); i++) {
+                    while(_hitMap[i].rank < rank) {
+                        if(_hitMap[i].rank + 1 >= _hitMap[i].path.size()) {
+                            _hitMap[i].rank = std::numeric_limits<uint8_t>::max();
+                            break;
+                        }
+                        _hitMap[i].rank += 1;
+                        _hitMap[i].taxID = _hitMap[i].path[_hitMap[i].rank];
+                        _hitMap[i].leaf = false;
+                    }
+                    if(_hitMap[i].rank > rank) continue;
+                    if(rank + 1 >= _hitMap[i].path.size()) continue;
+                    uint64_t parent_taxID = _hitMap[i].path[rank + 1];
+                    
+                    // Daehwan: we may want to traverse up the tree more until we get non-zero taxID.
+                    if(parent_taxID == 0) continue;
+                    
+                    size_t j = 0;
+                    for(; j < _hitTaxCount.size(); j++) {
+                        if(_hitTaxCount[j].second == parent_taxID) {
+                            _hitTaxCount[j].first += 1;
+                            break;
+                        }
+                    }
+                    if(j == _hitTaxCount.size()) {
+                        _hitTaxCount.expand();
+                        _hitTaxCount.back().first = 1;
+                        _hitTaxCount.back().second = parent_taxID;
+                    }
+                }
+                if(_hitTaxCount.size() <= 0) break;
+                _hitTaxCount.sort();
+                size_t j = _hitTaxCount.size();
+                while(j-- > 0) {
+                    uint64_t parent_taxID = _hitTaxCount[j].second;
+                    int64_t max_score = 0;
+                    for(size_t i = 0; i < _hitMap.size(); i++) {
+                        assert_geq(_hitMap[i].rank, rank);
+                        if(_hitMap[i].rank != rank) continue;
+                        if(rank + 1 >= _hitMap[i].path.size()) continue;
+                        if(parent_taxID == _hitMap[i].path[rank + 1]) {
+                            _hitMap[i].uniqueID = 0;
+                            _hitMap[i].rank = rank + 1;
+                            _hitMap[i].taxID = parent_taxID;
+                            _hitMap[i].leaf = false;
+                        }
+                        if(parent_taxID == _hitMap[i].taxID) {
+                            if(_hitMap[i].score > max_score) {
+                                max_score = _hitMap[i].score;
+                            }
+                        }
+                    }
+                    
+                    bool first = true;
+                    size_t rep_i = _hitMap.size();
+                    for(size_t i = 0; i < _hitMap.size(); i++) {
+                        if(parent_taxID == _hitMap[i].taxID) {
+                            if(!first) {
+                                assert_lt(rep_i, _hitMap.size());
+                                _hitMap[rep_i].num_leaves += _hitMap[i].num_leaves;
+                                if(i + 1 < _hitMap.size()) {
+                                    _hitMap[i] = _hitMap.back();
+                                }
+                                _hitMap.pop_back();
+                                i--;
+                            } else {
+                                first = false;
+                                rep_i = i;
+                            }
+                        }
+                    }
+                    
+                    if(_hitMap.size() <= (size_t)rp.khits)
+                        break;
+                }
+                rank += 1;
+            }
+        }
+        if(_hitMap.size() > (size_t)rp.khits)
+            return 0;
+       
+#if 0
+       	// boost up the score if the assignment is unique
+        if(_hitMap.size() == 1) {
+            HitCount& hitCount = _hitMap[0];
+            hitCount.score = (hitCount.summedHitLen - 15) * (hitCount.summedHitLen - 15);
+        }
+#endif
+        
+        index_t rdlen = this->_rds[0]->length();
+        int64_t max_score = (rdlen > 15 ? (rdlen - 15) * (rdlen - 15) : 0);
+        if(this->_paired) {
+            rdlen = this->_rds[1]->length();
+            max_score += (rdlen > 15 ? (rdlen - 15) * (rdlen - 15) : 0);
+        }
+        
+        // See if some of the assignments correspond to host taxIDs
+        int64_t best_score = 0;
+        bool only_host_taxIDs = false;
+        for(size_t gi = 0; gi < _hitMap.size(); gi++) {
+            if(_hitMap[gi].score > best_score) {
+                best_score = _hitMap[gi].score;
+                only_host_taxIDs = (_host_taxIDs.find(_hitMap[gi].taxID) != _host_taxIDs.end());
+            } else if(_hitMap[gi].score == best_score) {
+                only_host_taxIDs |= (_host_taxIDs.find(_hitMap[gi].taxID) != _host_taxIDs.end());
+            }
+        }
+        
+        for(size_t gi = 0; gi < _hitMap.size(); gi++) {
+            assert_gt(_hitMap[gi].score, 0);
+            HitCount<index_t>& hitCount = _hitMap[gi];
+            if(only_host_taxIDs) {
+                if(_host_taxIDs.find(_hitMap[gi].taxID) == _host_taxIDs.end())
+                    continue;
+            }
+            const EList<pair<string, uint64_t> >& uid_to_tid = ebwtFw.uid_to_tid();
+            assert_lt(hitCount.uniqueID, uid_to_tid.size());
+            const std::map<uint64_t, TaxonomyNode>& tree = ebwtFw.tree();
+            uint8_t taxRank = RANK_UNKNOWN;
+            std::map<uint64_t, TaxonomyNode>::const_iterator itr = tree.find(hitCount.taxID);
+            if(itr != tree.end()) {
+                taxRank = itr->second.rank;
+            }
+            // report
+            AlnRes rs;
+            rs.init(
+                    hitCount.score,
+                    max_score,
+                    uid_to_tid[hitCount.uniqueID].first,
+                    hitCount.taxID,
+                    taxRank,
+                    hitCount.summedHitLen,
+                    hitCount.readPositions,
+                    isFw);
+            sink.report(0, &rs);
+        }
+        return 0;
+    }
+    
+    // Resolve a BWT suffix-array range [top, bot) into genome coordinates.
+    // At most `maxelt` elements of the range are resolved; each resolved
+    // reference offset is appended to `coords` as a Coord anchored at the
+    // pasted-reference position (wr.toff), with the given strand `fw`.
+    // Uses the GroupWalk machinery (this->_offs / _sas / _gws / _gwstate)
+    // as scratch space, so this is not reentrant.
+    // NOTE(review): `straddled` is set to false and never updated, and
+    // `rejectStraddle` is not consulted in this body — confirm whether
+    // straddle detection was intended here or handled inside advanceElement.
+    // Always returns true.
    bool getGenomeIdx(
+                      const Ebwt<index_t>&       ebwt,      // FM index to walk
+                      const BitPairReference&    ref,       // 2-bit reference sequence
+                      RandomSource&              rnd,       // randomness for the walk
+                      index_t                    top,       // SA range start (inclusive)
+                      index_t                    bot,       // SA range end (exclusive); must be > top
+                      bool                       fw,        // strand recorded in each Coord
+                      index_t                    maxelt,    // cap on how many elements to resolve
+                      index_t                    rdoff,     // read offset (unused in this body)
+                      index_t                    rdlen,     // hit length used to init the SA slice
+                      EList<Coord>&              coords,    // out: resolved coordinates (cleared first)
+                      WalkMetrics&               met,
+                      PerReadMetrics&            prm,
+                      HIMetrics&                 him,
+                      bool                       rejectStraddle, // see NOTE above: currently unused
+                      bool&                      straddled)      // out: always left false
+    {
+        straddled = false;
+        assert_gt(bot, top);
+        // Number of elements to resolve: whole range, capped at maxelt.
+        index_t nelt = bot - top;
+        nelt = min<index_t>(nelt, maxelt);
+        coords.clear();
+        him.globalgenomecoords += (bot - top);
+        // Prepare scratch offsets (max() marks "unresolved") and walk state.
+        this->_offs.resize(nelt);
+        this->_offs.fill(std::numeric_limits<index_t>::max());
+        this->_sas.init(top, rdlen, EListSlice<index_t, 16>(this->_offs, 0, nelt));
+        this->_gws.init(ebwt, ref, this->_sas, rnd, met);
+        for(index_t off = 0; off < nelt; off++) {
+            WalkResult<index_t> wr;
+            this->_gws.advanceElement(
+                                off,
+                                ebwt,         // forward Bowtie index for walking left
+                                ref,          // bitpair-encoded reference
+                                this->_sas,   // SA range with offsets
+                                this->_gwstate,     // GroupWalk state; scratch space
+                                wr,           // put the result here
+                                met,          // metrics
+                                prm);         // per-read metrics
+            // Coordinate of the seed hit w/r/t the pasted reference string
+            coords.expand();
+            coords.back().init(wr.toff, 0, fw);
+        }
+        
+        return true;
+    }
+private:
+    EList<string>                _refnames;   // reference sequence names from the index
+    EList<HitCount<index_t> >    _hitMap;     // per-read accumulated hits, keyed by uniqueID/taxID
+    index_t                      _minHitLen;  // minimum partial-hit length to count toward a score
+    EList<uint16_t>              _tempTies;   // scratch list for tie-breaking
+    bool                         _mate1fw;    // expected orientation of mate 1
+    bool                         _mate2fw;    // expected orientation of mate 2
+    
+    bool                         _tree_traverse;       // if true, promote hits up the taxonomy tree when > -k assignments
+    uint8_t                      _classification_rank; // minimum taxonomic rank at which to report (0 = leaf/sequence level)
+    set<uint64_t>                _host_taxIDs; // favor these genomes
+    set<uint64_t>                _excluded_taxIDs;     // taxa whose hits are suppressed
+    
+    // Temporary variables
+    ReadBWTHit<index_t>          _tempHit;    // scratch for re-running partialSearch during hit extension
+    EList<pair<uint32_t, uint64_t> > _hitTaxCount;  // pair of count and taxID
+    EList<uint64_t>              _tempPath;   // scratch taxonomy path (leaf -> root) for the current taxID
+    
+    // Find exact partial (BWT) hits for read `rdi` on both strands, then
+    // post-process them in three phases:
+    //   1) alternately run partialSearch on forward (fwi==0) and reverse
+    //      (fwi==1) until both strands are exhausted;
+    //   2) if both strands collected enough hit length, extend partial hits
+    //      that overlap their reverse-strand counterparts;
+    //   3) drop over-mapped duplicate hits (size > rp.ihits) and trim
+    //      overlapping hits on each strand so positions are not double-counted.
+    // Results are accumulated in this->_hits[rdi][0..1]; nothing is returned.
+    // `increment` controls how far the search restarts after a short hit.
    void searchForwardAndReverse(
+                                 index_t rdi,
+                                 const Ebwt<index_t>& ebwtFw,
+                                 const Scoring& sc,
+                                 RandomSource& rnd,
+                                 const ReportingParams& rp,
+                                 const index_t increment)
+    {
+        const Read& rd = *(this->_rds[rdi]);
+
+        // done[fwi]: strand fwi fully searched; cur[fwi]: current read offset.
+        bool done[2] = {false, false};
+        size_t cur[2] = {0, 0} ;
+        
+        index_t rdlen = rd.length();
+        //const size_t maxDiff = (rdlen / 2 > 2 * _minHitLen) ? rdlen / 2 : (2 * _minHitLen);
+        // sum[fwi]: total length of qualifying (>= _minHitLen) hits per strand.
+        size_t sum[2] = {0, 0} ;
+        
+        // search for partial hits on the forward and reverse strand
+        while(!done[0] || !done[1]) {
+            for(index_t fwi = 0; fwi < 2; fwi++) {
+                if(done[fwi])
+                    continue;
+                
+                size_t mineFw = 0, mineRc = 0;
+                bool fw = (fwi == 0);
+                ReadBWTHit<index_t>& hit = this->_hits[rdi][fwi];
+                // Extend the current partial hit as far as possible from the
+                // strand's current offset; appends a new BWTHit to `hit`.
+                this->partialSearch(
+                                    ebwtFw,
+                                    rd,
+                                    sc,
+                                    fw,
+                                    0,
+                                    mineFw,
+                                    mineRc,
+                                    hit,
+                                    rnd);
+                
+                BWTHit<index_t>& lastHit = hit.getPartialHit(hit.offsetSize() - 1);
+                if(hit.done()) {
+                    // Strand exhausted: credit the last hit if long enough.
+                    done[fwi] = true;
+                    cur[fwi] = rdlen;
+                    if(lastHit.len() >= _minHitLen) {
+                        sum[fwi] += lastHit.len();
+                        // Dead code: disabled re-search that would try to
+                        // lengthen a short unique hit to >= 31 bp.
+                        if(0) //lastHit.len() < 31 && rdlen > 31 && lastHit.size() == 1 )
+                        {
+                            ReadBWTHit<index_t> testHit ;
+                            testHit.init( fw, rdlen ) ;
+                            testHit.setOffset(hit.cur() - 1 - 31 + 1);
+                            this->partialSearch(ebwtFw,
+                                                rd,
+                                                sc,
+                                                fw,
+                                                0,
+                                                mineFw,
+                                                mineRc,
+                                                testHit,
+                                                rnd);
+                            index_t tmpLen = testHit.getPartialHit( testHit.offsetSize() - 1 ).len();
+#ifdef LI_DEBUG
+                            cout << "(adjust: " << tmpLen << ")";
+#endif
+                            if(tmpLen >= 31) {
+                                lastHit._len = tmpLen;
+                            }
+                        }
+                    }
+                    
+                    continue;
+                }
+                
+                cur[fwi] = hit.cur();
+#ifdef LI_DEBUG
+                cout << fwi << ":" << lastHit.len() << " " << cur[fwi] << " ";
+#endif
+                if(lastHit.len() >= _minHitLen)
+                    sum[fwi] += lastHit.len();
+                
+                // Decide where the next partial search on this strand starts.
+                if(lastHit.len() > increment) {
+                    if(lastHit.len() < _minHitLen) {
+                        // daehwan - for debugging purposes
+#if 1
+                        hit.setOffset(hit.cur() + 1);
+#else
+                        hit.setOffset(hit.cur() - increment);
+#endif
+                    } else {
+                        hit.setOffset(hit.cur() + 1);
+                        // Dead code: disabled 31-bp re-anchoring, as above.
+                        if(0) //lastHit.len() < 31 && hit.cur() >= 31 && lastHit.size() == 1 )
+                        {
+                            ReadBWTHit<index_t> testHit;
+                            testHit.init(fw, rdlen);
+                            testHit.setOffset(hit.cur() - 1 - 31); // why not hit.cur() - 1 - 31 + 1? because we "+1" before the if!
+                            
+                            this->partialSearch(ebwtFw,
+                                                rd,
+                                                sc,
+                                                fw,
+                                                0,
+                                                mineFw,
+                                                mineRc,
+                                                testHit,
+                                                rnd);
+                            index_t tmpLen = testHit.getPartialHit(testHit.offsetSize() - 1 ).len();
+#ifdef LI_DEBUG
+                            cout << "(adjust: " << tmpLen << ")";
+#endif
+                            if(tmpLen >= 31) {
+                                lastHit._len = tmpLen;
+                            }
+                        }
+                    }
+                }
+                // Too little read left for another qualifying hit: finish strand.
+                if(hit.cur() + _minHitLen >= rdlen) {
+                    hit.done(true);
+                    done[fwi] = true;
+                    continue;
+                }
+
+                if(lastHit.len() <= 3) {
+                    // This happens most likely due to the Ns in the read
+                    // NOTE(review): fwi is unsigned (index_t); this relies on
+                    // wrap-around plus the loop's fwi++ to re-run the same
+                    // strand, including when fwi == 0 — confirm intended.
+                    --fwi ; // Repeat this strand again.
+                }
+            }
+#ifdef LI_DEBUG
+            cout << endl;
+#endif
+
+            // No early termination
+#if 0
+            if(sum[0] > sum[1] + (rdlen - cur[1] + 1)) {
+                this->_hits[rdi][1].done(true);
+                done[1] = true;
+            } else if(sum[1] > sum[0] + (rdlen - cur[0] + 1)) {
+                this->_hits[rdi][0].done(true);
+                done[0] = true;
+            }
+#endif
+        }
+        
+        // Extend partial hits
+        // Phase 2: when both strands have qualifying hits, compare each
+        // forward hit [l, r) against each reverse hit mapped into forward
+        // read coordinates [rc_l, rc_r); where one properly overhangs the
+        // other, re-run partialSearch from the earlier start and adopt the
+        // longer hit if it spans the full expected length.
+        if(sum[0] >= _minHitLen && sum[1] >= _minHitLen) {
+            ReadBWTHit<index_t>& hits = this->_hits[rdi][0];
+            ReadBWTHit<index_t>& rchits = this->_hits[rdi][1];
+            for(size_t i = 0; i < hits.offsetSize(); i++) {
+                BWTHit<index_t>& hit = hits.getPartialHit(i);
+                index_t len = hit.len();
+                //if(len < _minHitLen) continue;
+                index_t l = hit._bwoff;
+                index_t r = hit._bwoff + len;
+                for(size_t j = 0; j < rchits.offsetSize(); j++) {
+                    BWTHit<index_t>& rchit = rchits.getPartialHit(j);
+                    index_t rclen = rchit.len();
+                    if(len < _minHitLen && rclen < _minHitLen) continue;
+                    // Reverse-strand hit interval in forward read coordinates.
+                    index_t rc_l = rdlen - rchit._bwoff - rchit._len;
+                    index_t rc_r = rc_l + rclen;
+                    // Skip: disjoint, identical, or one fully containing the other.
+                    if(r <= rc_l) continue;
+                    if(rc_r <= l) continue;
+                    if(l == rc_l && r == rc_r) continue;
+                    if(l < rc_l && r > rc_r) continue;
+                    if(l > rc_l && r < rc_r) continue;
+                    if(l > rc_l) {
+                        // Reverse hit starts earlier: try extending the forward hit left.
+                        _tempHit.init(true /* fw */, rdlen);
+                        _tempHit.setOffset(rc_l);
+                        size_t mineFw = 0, mineRc = 0;
+                        this->partialSearch(ebwtFw,
+                                            rd,
+                                            sc,
+                                            true, // fw
+                                            0,
+                                            mineFw,
+                                            mineRc,
+                                            _tempHit,
+                                            rnd);
+                        BWTHit<index_t>& tmphit = _tempHit.getPartialHit(0);
+                        if(tmphit.len() == len + l - rc_l) {
+                            hit = tmphit;
+                        }
+                    }
+                    if(r > rc_r) {
+                        // Forward hit ends later: try extending the reverse hit.
+                        _tempHit.init(false /* fw */, rdlen);
+                        _tempHit.setOffset(rdlen - r);
+                        size_t mineFw = 0, mineRc = 0;
+                        this->partialSearch(ebwtFw,
+                                            rd,
+                                            sc,
+                                            false, // fw
+                                            0,
+                                            mineFw,
+                                            mineRc,
+                                            _tempHit,
+                                            rnd);
+                        BWTHit<index_t>& tmphit = _tempHit.getPartialHit(0);
+                        if(tmphit.len() == rclen + r - rc_r) {
+                            rchit = tmphit;
+                        }
+                    }
+                }
+            }
+            
+            // Remove partial hits that are mapped more than user-specified number
+            // (rp.ihits): identical fw/rc intervals whose combined SA-range
+            // size exceeds the cap are reset on both strands.
+            for(size_t i = 0; i < hits.offsetSize(); i++) {
+                BWTHit<index_t>& hit = hits.getPartialHit(i);
+                index_t len = hit.len();
+                index_t l = hit._bwoff;
+                index_t r = hit._bwoff + len;
+                for(size_t j = 0; j < rchits.offsetSize(); j++) {
+                    BWTHit<index_t>& rchit = rchits.getPartialHit(j);
+                    index_t rclen = rchit.len();
+                    index_t rc_l = rdlen - rchit._bwoff - rchit._len;
+                    index_t rc_r = rc_l + rclen;
+                    if(rc_l < l) break;
+                    if(len != rclen) continue;
+                    if(l == rc_l &&
+                       r == rc_r &&
+                       hit.size() + rchit.size() > rp.ihits) {
+                        hit.reset();
+                        rchit.reset();
+                        break;
+                    }
+                }
+            }
+        }
+        
+        // Trim partial hits
+        // Phase 3: per strand, shrink or zero out later hits that overlap
+        // earlier ones so each read position contributes to at most one hit.
+        for(int fwi = 0; fwi < 2; fwi++) {
+            ReadBWTHit<index_t>& hits = this->_hits[rdi][fwi];
+            if(hits.offsetSize() < 2) continue;
+            for(size_t i = 0; i < hits.offsetSize() - 1; i++) {
+                BWTHit<index_t>& hit = hits.getPartialHit(i);
+                for(size_t j = i + 1; j < hits.offsetSize(); j++) {
+                    BWTHit<index_t>& hit2 = hits.getPartialHit(j);
+                    // Out of order: discard the earlier hit entirely.
+                    if(hit._bwoff >= hit2._bwoff) {
+                        hit._len = 0;
+                        break;
+                    }
+                    if(hit._bwoff + hit._len <= hit2._bwoff) break;
+                    // Overlap: trim the shorter of the two hits.
+                    if(hit._len >= hit2._len) {
+                        index_t hit2_end = hit2._bwoff + hit2._len;
+                        hit2._bwoff = hit._bwoff + hit._len;
+                        hit2._len = hit2_end - hit2._bwoff;
+                    } else {
+                        hit._len = hit2._bwoff - hit._bwoff;
+                    }
+                }
+            }
+        }
+    }
+    
+    // Choose which strand(s) of read `rdi` to process downstream, based on
+    // the partial hits already collected in this->_hits[rdi][0..1].
+    // Returns a half-open range of fwi values: (fwi, fwi+1) selects the
+    // single better strand, (0, 2) means a tie — process both strands.
+    // Primary key: per-strand score; tie-break: longest single hit.
    pair<int, int> getForwardOrReverseHit(index_t rdi) {
+        // NOTE: despite the name, avgHitLength holds the SUM of squared
+        // (len - 15) terms per strand — the division by numHits is commented
+        // out below, so this is a score, not an average.
+        index_t avgHitLength[2] = {0, 0};
+        index_t hitSize[2] = {0, 0} ;
+        index_t maxHitLength[2] = {0, 0} ;
+        for(index_t fwi = 0; fwi < 2; fwi++) {
+            ReadBWTHit<index_t>& hit = this->_hits[rdi][fwi];
+            index_t numHits = 0;
+            index_t totalHitLength = 0;
+#ifdef LI_DEBUG
+            cout << fwi << ": ";
+#endif
+            for(size_t i = 0; i < hit.offsetSize(); i++) {
+                index_t len = hit.getPartialHit(i).len();
+#ifdef LI_DEBUG
+                cout << len << " ";
+#endif
+                
+                // Only hits meeting the minimum length contribute.
+                if(len < _minHitLen) continue;
+                totalHitLength += (len - 15) * (len - 15);
+                hitSize[fwi] += hit.getPartialHit(i).size();
+                if(len > maxHitLength[fwi])
+                    maxHitLength[fwi] = len;
+                numHits++;
+            }
+#ifdef LI_DEBUG
+            cout << endl;
+#endif
+            if(numHits > 0) {
+                avgHitLength[fwi] = totalHitLength ; /// numHits;
+            }
+        }
+        
+        // choose read direction with a higher average hit length
+        //cout<<"strand choosing: "<<avgHitLength[0]<<" "<<avgHitLength[1]<<endl ;
+        index_t fwi;//= (avgHitLength[0] > avgHitLength[1])? 0 : 1;
+        if(avgHitLength[0] != avgHitLength[1])
+            fwi = (avgHitLength[0] > avgHitLength[1]) ? 0 : 1;
+        else if(maxHitLength[0] != maxHitLength[1])
+            fwi = (maxHitLength[0] > maxHitLength[1])? 0 : 1;
+        else
+            // Complete tie: caller should iterate over both strands.
+            return pair<int, int>(0, 2);
+        
+        return pair<int, int>((int)fwi, (int)fwi + 1);
+    }
+    
+    // Resolve genome coordinates for partial hit `hi` of `hit` by delegating
+    // to getGenomeIdx over the hit's BWT range [_top, _bot). Fills and
+    // returns the hit's own coordinate list (partialHit._coords); the hit
+    // must not already have coordinates (asserted below).
+    // The number of coordinates requested is capped by maxGenomeHitSize
+    // minus the genome hits already collected for this read.
    EList<Coord>& getCoords(
+                            ReadBWTHit<index_t>& hit,
+                            size_t hi,
+                            const Ebwt<index_t>& ebwtFw,
+                            const BitPairReference& ref,
+                            RandomSource& rnd,
+                            const index_t maxGenomeHitSize,
+                            WalkMetrics& wlm,
+                            PerReadMetrics& prm,
+                            HIMetrics& him)
+    {
+        BWTHit<index_t>& partialHit = hit.getPartialHit(hi);
+	assert(!partialHit.hasGenomeCoords());
+        bool straddled = false;
+        this->getGenomeIdx(
+                           ebwtFw,     // FB: Why is it called ...FW here?
+                           ref,
+                           rnd,
+                           partialHit._top,
+                           partialHit._bot,
+                           hit._fw == 0, // FIXME: fwi and hit._fw are defined differently
+                           maxGenomeHitSize - this->_genomeHits.size(),
+                           hit._len - partialHit._bwoff - partialHit._len,
+                           partialHit._len,
+                           partialHit._coords,
+                           wlm,       // why is it called wlm here?
+                           prm,
+                           him,
+                           false, // reject straddled
+                           straddled);
+#ifdef FLORIAN_DEBUG
+        std::cerr <<  partialHit.len() << ':';
+#endif
+        // get all coordinates of the hit
+        return partialHit._coords;
+    }
+
+
+    // Append a hit to the hit map, or update the matching existing entry.
+    // Matching key is uniqueID when classifying at the sequence level
+    // (_classification_rank == 0), otherwise the taxID promoted to the
+    // configured rank via the taxonomy path. `hi` acts as a timestamp so a
+    // given partial hit updates each map entry at most once.
+    // Returns the entry's index; when the hit is not found and
+    // considerOnlyIfPreviouslyObserved is true, no entry is added and the
+    // returned index equals hitMap.size() (callers test idx >= size()).
    size_t addHitToHitMap(
+                          const Ebwt<index_t>& ebwt,
+                          EList<HitCount<index_t> >& hitMap,
+                          int rdi,                          // mate index (0 or 1)
+                          int fwi,                          // strand index (0 = fw, 1 = rc)
+                          uint64_t uniqueID,                // sequence-level identifier
+                          uint64_t taxID,                   // taxonomy node of the hit
+                          size_t hi,                        // partial-hit index, used as timestamp
+                          uint32_t partialHitScore,         // score contribution of this hit
+                          double weightedHitLen,            // length contribution of this hit
+                          bool considerOnlyIfPreviouslyObserved, // update-only mode
+                          size_t offset,                    // hit offset within the read
+                          size_t length)                    // hit length within the read
+    {
+        size_t idx = 0;
+#ifdef LI_DEBUG
+        cout << "Add " << taxID << " " << partialHitScore << " " << weightedHitLen << endl;
+#endif
+        // Look up the leaf-to-root taxonomy path for this taxID.
+        const TaxonomyPathTable& pathTable = ebwt.paths();
+        pathTable.getPath(taxID, _tempPath);
+        uint8_t rank = _classification_rank;
+        if(rank > 0) {
+            // Promote taxID to the first non-zero ancestor at or above the
+            // requested classification rank.
+            for(; rank < _tempPath.size(); rank++) {
+                if(_tempPath[rank] != 0) {
+                    taxID = _tempPath[rank];
+                    break;
+                }
+            }
+        }
+        
+        // Linear scan for an existing entry with the same key.
+        for(; idx < hitMap.size(); ++idx) {
+            bool same = false;
+            if(rank == 0) {
+                same = (uniqueID == hitMap[idx].uniqueID);
+            } else {
+                same = (taxID == hitMap[idx].taxID);
+            }
+            if(same) {
+                // Timestamp check: count each partial hit only once per entry.
+                if(hitMap[idx].timeStamp != hi) {
+                    hitMap[idx].count += 1;
+                    hitMap[idx].scores[rdi][fwi] += partialHitScore;
+                    hitMap[idx].summedHitLens[rdi][fwi] += weightedHitLen;
+                    hitMap[idx].timeStamp = (uint32_t)hi;
+                    hitMap[idx].readPositions.push_back(make_pair(offset, length));
+                }
+                break;
+            }
+        }
+        
+        // Not found: create a fresh entry unless we are in update-only mode.
+        if(idx >= hitMap.size() && !considerOnlyIfPreviouslyObserved) {
+            hitMap.expand();
+            HitCount<index_t>& hitCount = hitMap.back();
+            hitCount.reset();
+            hitCount.uniqueID = uniqueID;
+            hitCount.count = 1;
+            hitCount.scores[rdi][fwi] = partialHitScore;
+            hitCount.summedHitLens[rdi][fwi] = weightedHitLen;
+            hitCount.timeStamp = (uint32_t)hi;
+            hitCount.readPositions.clear();
+            hitCount.readPositions.push_back(make_pair(offset, length));
+            hitCount.path = _tempPath;
+            hitCount.rank = rank;
+            hitCount.taxID = taxID;
+        }
+
+        //if considerOnlyIfPreviouslyObserved and it was not found, genus Idx size is equal to the genus Map size
+        //assert_lt(genusIdx, genusMap.size());
+        return idx;
+    }
+
+    // compare BWTHits by size, ascending, first, then by length, descending
+    //   TODO: move this operator into BWTHits if that is the standard way we would like to sort
+    //   TODO: this ordering does not necessarily give the best results
+    // Ordering summary: hits >= 22 bp are ranked first by SA-range size
+    // (smaller = more specific = earlier) then by length (longer = earlier);
+    // otherwise hits are ranked by weighted length (len/size, compared via
+    // cross-multiplication to avoid division), with size and length as
+    // tie-breakers. Returns true iff `a` sorts before `b`.
+    struct compareBWTHits {
+        bool operator()(const BWTHit<index_t>& a, const BWTHit<index_t>& b) const {
+            if(a.len() >= 22 || b.len() >= 22) {
+                if(a.len() >= 22 && b.len() >= 22) {
+                    // sort ascending by size
+                    if (a.size() < b.size()) return true;
+                    if (a.size() > b.size()) return false;
+                }
+                
+                // sort descending by length
+                if (b.len() < a.len()) return true;
+                if (b.len() > a.len()) return false;
+            }
+            
+            // sort by the weighted len
+            // NOTE(review): len*size cross products could overflow for very
+            // large SA ranges depending on the width of len()/size() — confirm.
+            if(b.len() * a.size() < a.len() * b.size()) return true;
+            if(b.len() * a.size() > a.len() * b.size()) return false;
+            
+            // sort ascending by size
+            if(a.size() < b.size()) return true;
+            if(a.size() > b.size()) return false;
+            
+            // sort descending by length
+            if(b.len() < a.len()) return true;
+            if(b.len() > a.len()) return false;
+            
+            return false;
+        }
+    };
+};
+
+
+#endif /*CLASSIFIER_H_*/
diff --git a/diff_sample.cpp b/diff_sample.cpp
new file mode 100644
index 0000000..b722702
--- /dev/null
+++ b/diff_sample.cpp
@@ -0,0 +1,117 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "diff_sample.h"
+
struct sampleEntry clDCs[16];  // filled lazily by calcColbournAndLingDCs()
bool clDCs_calced = false; /// have clDCs been calculated?
+
+/**
+ * Entries 4-57 are transcribed from page 6 of Luk and Wong's paper
+ * "Two New Quorum Based Algorithms for Distributed Mutual Exclusion",
+ * which is also used and cited in the Burkhardt and Karkkainen's
+ * papers on difference covers for sorting.  These samples are optimal
+ * according to Luk and Wong.
+ *
+ * All other entries are generated via the exhaustive algorithm in
+ * calcExhaustiveDC().
+ *
+ * The 0 is stored at the end of the sample as an end-of-list marker,
+ * but 0 is also an element of each.
+ *
+ * Note that every difference cover has a 0 and a 1.  Intuitively,
+ * any optimal difference cover sample can be oriented (i.e. rotated)
+ * such that it includes 0 and 1 as elements.
+ *
+ * All samples in this list have been verified to be complete covers.
+ *
+ * A value of 0xffffffff in the first column indicates that there is no
+ * sample for that value of v.  We do not keep samples for values of v
+ * less than 3, since they are trivial (and the caller probably didn't
+ * mean to ask for it).
+ */
// dc0to64[v] = the difference cover for periodicity v (see the block
// comment above): elements after the implicit leading 0, 0-terminated.
uint32_t dc0to64[65][10] = {
	{0xffffffff},                     // 0
	{0xffffffff},                     // 1
	{0xffffffff},                     // 2
	{1, 0},                           // 3
	{1, 2, 0},                        // 4
	{1, 2, 0},                        // 5
	{1, 3, 0},                        // 6
	{1, 3, 0},                        // 7
	{1, 2, 4, 0},                     // 8
	{1, 2, 4, 0},                     // 9
	{1, 2, 5, 0},                     // 10
	{1, 2, 5, 0},                     // 11
	{1, 3, 7, 0},                     // 12
	{1, 3, 9, 0},                     // 13
	{1, 2, 3, 7, 0},                  // 14
	{1, 2, 3, 7, 0},                  // 15
	{1, 2, 5, 8, 0},                  // 16
	{1, 2, 4, 12, 0},                 // 17
	{1, 2, 5, 11, 0},                 // 18
	{1, 2, 6, 9, 0},                  // 19
	{1, 2, 3, 6, 10, 0},              // 20
	{1, 4, 14, 16, 0},                // 21
	{1, 2, 3, 7, 11, 0},              // 22
	{1, 2, 3, 7, 11, 0},              // 23
	{1, 2, 3, 7, 15, 0},              // 24
	{1, 2, 3, 8, 12, 0},              // 25
	{1, 2, 5, 9, 15, 0},              // 26
	{1, 2, 5, 13, 22, 0},             // 27
	{1, 4, 15, 20, 22, 0},            // 28
	{1, 2, 3, 4, 9, 14, 0},           // 29
	{1, 2, 3, 4, 9, 19, 0},           // 30
	{1, 3, 8, 12, 18, 0},             // 31
	{1, 2, 3, 7, 11, 19, 0},          // 32
	{1, 2, 3, 6, 16, 27, 0},          // 33
	{1, 2, 3, 7, 12, 20, 0},          // 34
	{1, 2, 3, 8, 12, 21, 0},          // 35
	{1, 2, 5, 12, 14, 20, 0},         // 36
	{1, 2, 4, 10, 15, 22, 0},         // 37
	{1, 2, 3, 4, 8, 14, 23, 0},       // 38
	{1, 2, 4, 13, 18, 33, 0},         // 39
	{1, 2, 3, 4, 9, 14, 24, 0},       // 40
	{1, 2, 3, 4, 9, 15, 25, 0},       // 41
	{1, 2, 3, 4, 9, 15, 25, 0},       // 42
	{1, 2, 3, 4, 10, 15, 26, 0},      // 43
	{1, 2, 3, 6, 16, 27, 38, 0},      // 44
	{1, 2, 3, 5, 12, 18, 26, 0},      // 45
	{1, 2, 3, 6, 18, 25, 38, 0},      // 46
	{1, 2, 3, 5, 16, 22, 40, 0},      // 47
	{1, 2, 5, 9, 20, 26, 36, 0},      // 48
	{1, 2, 5, 24, 33, 36, 44, 0},     // 49
	{1, 3, 8, 17, 28, 32, 38, 0},     // 50
	{1, 2, 5, 11, 18, 30, 38, 0},     // 51
	{1, 2, 3, 4, 6, 14, 21, 30, 0},   // 52
	{1, 2, 3, 4, 7, 21, 29, 44, 0},   // 53
	{1, 2, 3, 4, 9, 15, 21, 31, 0},   // 54
	{1, 2, 3, 4, 6, 19, 26, 47, 0},   // 55
	{1, 2, 3, 4, 11, 16, 33, 39, 0},  // 56
	{1, 3, 13, 32, 36, 43, 52, 0},    // 57

	// Generated by calcExhaustiveDC()
	{1, 2, 3, 7, 21, 33, 37, 50, 0},  // 58
	{1, 2, 3, 6, 13, 21, 35, 44, 0},  // 59
	{1, 2, 4, 9, 15, 25, 30, 42, 0},  // 60
	{1, 2, 3, 7, 15, 25, 36, 45, 0},  // 61
	{1, 2, 4, 10, 32, 39, 46, 51, 0}, // 62
	{1, 2, 6, 8, 20, 38, 41, 54, 0},  // 63
	{1, 2, 5, 14, 16, 34, 42, 59, 0}  // 64
};
diff --git a/diff_sample.h b/diff_sample.h
new file mode 100644
index 0000000..f293024
--- /dev/null
+++ b/diff_sample.h
@@ -0,0 +1,1000 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef DIFF_SAMPLE_H_
+#define DIFF_SAMPLE_H_
+
+#include <stdint.h>
+#include <string.h>
+#include "assert_helpers.h"
+#include "multikey_qsort.h"
+#include "timer.h"
+#include "ds.h"
+#include "mem_ids.h"
+#include "ls.h"
+#include "btypes.h"
+
+using namespace std;
+
+#ifndef VMSG_NL
+#define VMSG_NL(...) \
+if(this->verbose()) { \
+	stringstream tmp; \
+	tmp << __VA_ARGS__ << endl; \
+	this->verbose(tmp.str()); \
+}
+#endif
+
+#ifndef VMSG
+#define VMSG(...) \
+if(this->verbose()) { \
+	stringstream tmp; \
+	tmp << __VA_ARGS__; \
+	this->verbose(tmp.str()); \
+}
+#endif
+
+/**
+ * Routines for calculating, sanity-checking, and dispensing difference
+ * cover samples to clients.
+ */
+
/**
 * One precomputed difference-cover sample: the largest periodicity it
 * serves and the sample offsets themselves (at most 128 of them).
 */
struct sampleEntry {
	uint32_t maxV;         // maximum periodicity v this sample covers
	uint32_t numSamples;   // number of valid entries in samples[]
	uint32_t samples[128]; // the sample offsets
};
+
+/// Array of Colbourn and Ling calculated difference covers up to
+/// r = 16 (maxV = 5953)
+extern struct sampleEntry clDCs[16];
+extern bool clDCs_calced; /// have clDCs been calculated?
+
/**
 * Check that the given difference cover 'ds' actually covers all
 * differences for a periodicity of v.
 */
template<typename T>
static bool dcRepOk(T v, EList<T>& ds) {
	// covered[i] records whether difference i has been observed
	AutoArray<bool> covered(v, EBWT_CAT);
	for(T i = 1; i < v; i++) {
		covered[i] = false;
	}
	// For every ordered pair of cover elements, mark both the forward
	// difference and the wrap-around (mod-v) difference as covered
	for(T di = T(); di < ds.size(); di++) {
		for(T dj = di+1; dj < ds.size(); dj++) {
			assert_lt(ds[di], ds[dj]);
			T d1 = (ds[dj] - ds[di]);
			T d2 = (ds[di] + v - ds[dj]);
			assert_lt(d1, v);
			assert_lt(d2, v);
			covered[d1] = true;
			covered[d2] = true;
		}
	}
	// Every difference 1..v-1 must have been observed
	bool ok = true;
	for(T i = 1; i < v; i++) {
		if(covered[i] == false) {
			ok = false;
			break;
		}
	}
	return ok;
}
+
/**
 * Return true iff each element of ts (with length 'limit') is strictly
 * greater than the previous one.  Empty and single-element arrays are
 * trivially increasing.
 */
template<typename T>
static bool increasing(T* ts, size_t limit) {
	// Guard: with limit == 0 the old 'i < limit-1' bound underflowed
	// (size_t wraps to SIZE_MAX) and walked far off the end of ts
	if(limit <= 1) return true;
	for(size_t i = 0; i < limit-1; i++) {
		if(ts[i+1] <= ts[i]) return false;
	}
	return true;
}
+
/**
 * Return true iff the difference cover 'ds' (of size 'd', with
 * periodicity 'v') covers the difference 'diff' mod 'v'.
 */
template<typename T>
static inline bool hasDifference(T *ds, T d, T v, T diff) {
	// Enumerate every ordered pair (lo, hi) of cover elements and test
	// both differences the pair induces: the forward one and the
	// wrap-around (mod-v) one.
	for(T lo = T(); lo < d; lo++) {
		for(T hi = lo+1; hi < d; hi++) {
			assert_lt(ds[lo], ds[hi]);
			T fwd = ds[hi] - ds[lo];
			T wrap = ds[lo] + v - ds[hi];
			assert_lt(fwd, v);
			assert_lt(wrap, v);
			if(fwd == diff) return true;
			if(wrap == diff) return true;
		}
	}
	return false;
}
+
+/**
+ * Exhaustively calculate optimal difference cover samples for v = 4,
+ * 8, 16, 32, 64, 128, 256 and store results in p2DCs[]
+ */
+template<typename T>
+void calcExhaustiveDC(T i, bool verbose = false, bool sanityCheck = false) {
+	T v = i;
+	AutoArray<bool> diffs(v, EBWT_CAT);
+	// v is the target period
+	T ld = (T)ceil(sqrt(v));
+	// ud is the upper bound on |D|
+	T ud = v / 2;
+	// for all possible |D|s
+	bool ok = true;
+	T *ds = NULL;
+	T d;
+	for(d = ld; d <= ud+1; d++) {
+		// for all possible |D| samples
+		AutoArray<T> ds(d, EBWT_CAT);
+		for(T j = 0; j < d; j++) {
+			ds[j] = j;
+		}
+		assert(increasing(ds, d));
+		while(true) {
+			// reset diffs[]
+			for(T t = 1; t < v; t++) {
+				diffs[t] = false;
+			}
+			T diffCnt = 0;
+			// diffs[] records all the differences observed
+			for(T di = 0; di < d; di++) {
+				for(T dj = di+1; dj < d; dj++) {
+					assert_lt(ds[di], ds[dj]);
+					T d1 = (ds[dj] - ds[di]);
+					T d2 = (ds[di] + v - ds[dj]);
+					assert_lt(d1, v);
+					assert_lt(d2, v);
+					assert_gt(d1, 0);
+					assert_gt(d2, 0);
+					if(!diffs[d1]) diffCnt++; diffs[d1] = true;
+					if(!diffs[d2]) diffCnt++; diffs[d2] = true;
+				}
+			}
+			// Do we observe all possible differences (except 0)
+			ok = diffCnt == v-1;
+			if(ok) {
+				// Yes, all differences are covered
+				break;
+			} else {
+				// Advance ds
+				// (Following is commented out because it turns out
+				// it's slow)
+				// Find a missing difference
+				//uint32_t missing = 0xffffffff;
+				//for(uint32_t t = 1; t < v; t++) {
+				//	if(diffs[t] == false) {
+				//		missing = diffs[t];
+				//		break;
+				//	}
+				//}
+				//assert_neq(missing, 0xffffffff);
+				assert(increasing(ds, d));
+				bool advanced = false;
+				bool keepGoing = false;
+				do {
+					keepGoing = false;
+					for(T bd = d-1; bd > 1; bd--) {
+						T dif = (d-1)-bd;
+						if(ds[bd] < v-1-dif) {
+							ds[bd]++;
+							assert_neq(0, ds[bd]);
+							// Reset subsequent ones
+							for(T bdi = bd+1; bdi < d; bdi++) {
+								assert_eq(0, ds[bdi]);
+								ds[bdi] = ds[bdi-1]+1;
+								assert_gt(ds[bdi], ds[bdi-1]);
+							}
+							assert(increasing(ds, d));
+							// (Following is commented out because
+							// it turns out it's slow)
+							// See if the new DC has the missing value
+							//if(!hasDifference(ds, d, v, missing)) {
+							//	keepGoing = true;
+							//	break;
+							//}
+							advanced = true;
+							break;
+						} else {
+							ds[bd] = 0;
+							// keep going
+						}
+					}
+				} while(keepGoing);
+				// No solution for this |D|
+				if(!advanced) break;
+				assert(increasing(ds, d));
+			}
+		} // next sample assignment
+		if(ok) {
+			break;
+		}
+	} // next |D|
+	assert(ok);
+	cout << "Did exhaustive v=" << v << " |D|=" << d << endl;
+	cout << "  ";
+	for(T i = 0; i < d; i++) {
+		cout << ds[i];
+		if(i < d-1) cout << ",";
+	}
+	cout << endl;
+}
+
/**
 * Routine for calculating the elements of clDCs up to r = 16 using the
 * technique of Colbourn and Ling.
 *
 * See http://citeseer.ist.psu.edu/211575.html
 */
template <typename T>
void calcColbournAndLingDCs(bool verbose = false, bool sanityCheck = false) {
	for(T r = 0; r < 16; r++) {
		T maxv = 24*r*r + 36*r + 13; // Corollary 2.3
		T numsamp = 6*r + 4;
		clDCs[r].maxV = maxv;
		clDCs[r].numSamples = numsamp;
		// samples[] is uint32_t[128], so 4 * 128 == sizeof(clDCs[r].samples)
		memset(clDCs[r].samples, 0, 4 * 128);
		T i;
		// The sample is built as runs of constant increments:
		// +1 (r times), +(r+1) (once), +(2r+1) (r times),
		// +(4r+3) (2r+1 times), +(2r+2) (r+1 times), +1 (r times).
		// clDCs[r].samples[0] = 0;
		// Fill in the 1^r part of the B series
		for(i = 1; i < r+1; i++) {
			clDCs[r].samples[i] = clDCs[r].samples[i-1] + 1;
		}
		// Fill in the (r + 1)^1 part
		clDCs[r].samples[r+1] = clDCs[r].samples[r] + r + 1;
		// Fill in the (2r + 1)^r part
		for(i = r+2; i < r+2+r; i++) {
			clDCs[r].samples[i] = clDCs[r].samples[i-1] + 2*r + 1;
		}
		// Fill in the (4r + 3)^(2r + 1) part
		for(i = r+2+r; i < r+2+r+2*r+1; i++) {
			clDCs[r].samples[i] = clDCs[r].samples[i-1] + 4*r + 3;
		}
		// Fill in the (2r + 2)^(r + 1) part
		for(i = r+2+r+2*r+1; i < r+2+r+2*r+1+r+1; i++) {
			clDCs[r].samples[i] = clDCs[r].samples[i-1] + 2*r + 2;
		}
		// Fill in the last 1^r part
		for(i = r+2+r+2*r+1+r+1; i < r+2+r+2*r+1+r+1+r; i++) {
			clDCs[r].samples[i] = clDCs[r].samples[i-1] + 1;
		}
		assert_eq(i, numsamp);
		assert_lt(i, 128);
		if(sanityCheck) {
			// diffs[] records all the differences observed
			AutoArray<bool> diffs(maxv, EBWT_CAT);
			for(T i = 0; i < numsamp; i++) { // note: shadows outer 'i'
				for(T j = i+1; j < numsamp; j++) {
					T d1 = (clDCs[r].samples[j] - clDCs[r].samples[i]);
					T d2 = (clDCs[r].samples[i] + maxv - clDCs[r].samples[j]);
					assert_lt(d1, maxv);
					assert_lt(d2, maxv);
					diffs[d1] = true;
					diffs[d2] = true;
				}
			}
			// Should have observed all possible differences (except 0)
			for(T i = 1; i < maxv; i++) {
				if(diffs[i] == false) cout << r << ", " << i << endl;
				assert(diffs[i] == true);
			}
		}
	}
	clDCs_calced = true;
}
+
+/**
+ * A precalculated list of difference covers.
+ */
+extern uint32_t dc0to64[65][10];
+
/**
 * Get a difference cover for the requested periodicity v.
 *
 * Sources, in order: the hardcoded dc0to64 table for v <= 64, then the
 * Colbourn & Ling constructions.  Returns an empty list when the table
 * has no cover for this v; throws 1 when v exceeds every known cover.
 */
template <typename T>
static EList<T> getDiffCover(
	T v,
	bool verbose = false,
	bool sanityCheck = false)
{
	assert_gt(v, 2);
	EList<T> ret;
	ret.clear();
	// Can we look it up in our hardcoded array?
	if(v <= 64 && dc0to64[v][0] == 0xffffffff) {
		if(verbose) cout << "v in hardcoded area, but hardcoded entry was all-fs" << endl;
		return ret;
	} else if(v <= 64) {
		// 0 is an implicit member of every cover; table rows are 0-terminated
		ret.push_back(0);
		for(size_t i = 0; i < 10; i++) {
			if(dc0to64[v][i] == 0) break;
			ret.push_back(dc0to64[v][i]);
		}
		if(sanityCheck) assert(dcRepOk(v, ret));
		return ret;
	}

	// Can we look it up in our calcColbournAndLingDCs array?
	if(!clDCs_calced) {
		calcColbournAndLingDCs<uint32_t>(verbose, sanityCheck);
		assert(clDCs_calced);
	}
	for(size_t i = 0; i < 16; i++) {
		if(v <= clDCs[i].maxV) {
			for(size_t j = 0; j < clDCs[i].numSamples; j++) {
				T s = clDCs[i].samples[j];
				if(s >= v) {
					// Reduce mod v and insert into sorted position,
					// skipping duplicates.
					// NOTE(review): if s % v is larger than every element
					// already in ret, this loop falls off the end without
					// inserting, silently dropping the sample — confirm
					// the sanity check still passes for all v that reach
					// this branch.
					s %= v;
					for(size_t k = 0; k < ret.size(); k++) {
						if(s == ret[k]) break;
						if(s < ret[k]) {
							ret.insert(s, k);
							break;
						}
					}
				} else {
					ret.push_back(s % v);
				}
			}
			if(sanityCheck) assert(dcRepOk(v, ret));
			return ret;
		}
	}
	cerr << "Error: Could not find a difference cover sample for v=" << v << endl;
	throw 1;
}
+
+/**
+ * Calculate and return a delta map based on the given difference cover
+ * and periodicity v.
+ */
+template <typename T>
+static EList<T> getDeltaMap(T v, const EList<T>& dc) {
+	// Declare anchor-map-related items
+	EList<T> amap;
+	size_t amapEnts = 1;
+	amap.resizeExact((size_t)v);
+	amap.fill(0xffffffff);
+	amap[0] = 0;
+	// Print out difference cover (and optionally calculate
+	// anchor map)
+	for(size_t i = 0; i < dc.size(); i++) {
+		for(size_t j = i+1; j < dc.size(); j++) {
+			assert_gt(dc[j], dc[i]);
+			T diffLeft  = dc[j] - dc[i];
+			T diffRight = dc[i] + v - dc[j];
+			assert_lt(diffLeft, v);
+			assert_lt(diffRight, v);
+			if(amap[diffLeft] == 0xffffffff) {
+				amap[diffLeft] = dc[i];
+				amapEnts++;
+			}
+			if(amap[diffRight] == 0xffffffff) {
+				amap[diffRight] = dc[j];
+				amapEnts++;
+			}
+		}
+	}
+	return amap;
+}
+
/**
 * Return population count (number of bits set to 1) of i, examining
 * each of the sizeof(T)*8 bit positions in turn.
 */
template<typename T>
static unsigned int popCount(T i) {
	unsigned int cnt = 0;
	const size_t nbits = sizeof(T) * 8;
	for(size_t b = 0; b < nbits; b++) {
		cnt += (unsigned int)(i & 1);
		i >>= 1;
	}
	return cnt;
}
+
/**
 * Calculate log-base-2 of i.  i must be a power of 2 (i.e. have
 * exactly one bit set); the result is the index of that bit.
 */
template<typename T>
static unsigned int myLog2(T i) {
	assert_eq(1, popCount(i)); // precondition: exactly one bit set
	const size_t nbits = sizeof(T) * 8;
	for(size_t shift = 0; shift < nbits; shift++) {
		// Found the (single) set bit; its position is the log
		if((i >> shift) & 1) return (unsigned int)shift;
	}
	assert(false);
	return 0xffffffff;
}
+
/**
 * A difference-cover sample of the suffixes of _text with periodicity
 * _v (which must be a power of 2 so mod/div reduce to mask/shift).
 * After build() is called, rank() reports the lexicographical rank of
 * any covered suffix via array lookups.
 */
template<typename TStr>
class DifferenceCoverSample {
public:

	DifferenceCoverSample(const TStr& __text,
	                      uint32_t __v,
	                      bool __verbose = false,
	                      bool __sanity = false,
	                      ostream& __logger = cout) :
		_text(__text),
		_v(__v),
		_verbose(__verbose),
		_sanity(__sanity),
		_ds(getDiffCover(_v, _verbose, _sanity)),
		_dmap(getDeltaMap(_v, _ds)),
		_d((uint32_t)_ds.size()),
		_doffs(),
		_isaPrime(),
		_dInv(),
		_log2v(myLog2(_v)),
		_vmask(OFF_MASK << _log2v),
		_logger(__logger)
	{
		assert_gt(_d, 0);
		assert_eq(1, popCount(_v)); // must be power of 2
		// Build map from d's to idx's
		_dInv.resizeExact((size_t)v());
		_dInv.fill(0xffffffff);
		uint32_t lim = (uint32_t)_ds.size();
		for(uint32_t i = 0; i < lim; i++) {
			_dInv[_ds[i]] = i;
		}
	}
	
	/**
	 * Allocate an amount of memory that simulates the peak memory
	 * usage of the DifferenceCoverSample with the given text and v.
	 * Throws bad_alloc if it's not going to fit in memory.  Returns
	 * the approximate number of bytes the Cover takes at all times.
	 */
	static size_t simulateAllocs(const TStr& text, uint32_t v) {
		EList<uint32_t> ds(getDiffCover(v, false /*verbose*/, false /*sanity*/));
		size_t len = text.length();
		size_t sPrimeSz = (len / v) * ds.size();
		// sPrime, sPrimeOrder, _isaPrime all exist in memory at
		// once and that's the peak
		AutoArray<TIndexOffU> aa(sPrimeSz * 3 + (1024 * 1024 /*out of caution*/), EBWT_CAT);
		return sPrimeSz * 4; // sPrime array
	}

	uint32_t v() const                   { return _v; }     // periodicity
	uint32_t log2v() const               { return _log2v; } // log2 of periodicity
	uint32_t vmask() const               { return _vmask; }
	uint32_t modv(TIndexOffU i) const    { return (uint32_t)(i & ~_vmask); } // i % v via mask
	TIndexOffU divv(TIndexOffU i) const  { return i >> _log2v; }             // i / v via shift
	uint32_t d() const                   { return _d; } // size of the cover
	bool verbose() const                 { return _verbose; }
	bool sanityCheck() const             { return _sanity; }
	const TStr& text() const             { return _text; }
	const EList<uint32_t>& ds() const    { return _ds; }
	const EList<uint32_t>& dmap() const  { return _dmap; }
	ostream& log() const                 { return _logger; }

	void     build(int nthreads);
	uint32_t tieBreakOff(TIndexOffU i, TIndexOffU j) const;
	int64_t  breakTie(TIndexOffU i, TIndexOffU j) const;
	bool     isCovered(TIndexOffU i) const;
	TIndexOffU rank(TIndexOffU i) const;

	/**
	 * Print out the suffix array such that every sample offset has its
	 * rank filled in and every non-sample offset is shown as '-'.
	 */
	void print(ostream& out) {
		for(size_t i = 0; i < _text.length(); i++) {
			if(isCovered(i)) {
				out << rank(i);
			} else {
				out << "-";
			}
			if(i < _text.length()-1) {
				out << ",";
			}
		}
		out << endl;
	}

private:

	void doBuiltSanityCheck() const;
	void buildSPrime(EList<TIndexOffU>& sPrime, size_t padding);

	// True iff build() has completed (ranks are available)
	bool built() const {
		return _isaPrime.size() > 0;
	}

	// Emit s to the log stream when verbose mode is enabled
	void verbose(const string& s) const {
		if(this->verbose()) {
			this->log() << s.c_str();
			this->log().flush();
		}
	}

	const TStr&      _text;     // text to sample
	uint32_t         _v;        // periodicity of sample
	bool             _verbose;  //
	bool             _sanity;   //
	EList<uint32_t>  _ds;       // samples: idx -> d
	EList<uint32_t>  _dmap;     // delta map
	uint32_t         _d;        // |D| - size of sample
	EList<TIndexOffU>  _doffs;    // offsets into sPrime/isaPrime for each d idx
	EList<TIndexOffU>  _isaPrime; // ISA' array
	EList<uint32_t>  _dInv;     // Map from d -> idx
	uint32_t         _log2v;    // log2(_v)
	TIndexOffU         _vmask;    // bits at or above _log2v
	ostream&         _logger;   // destination for verbose output
};
+
/**
 * Sanity-check the difference cover by first inverting _isaPrime then
 * checking that each successive suffix really is less than the next.
 */
template <typename TStr>
void DifferenceCoverSample<TStr>::doBuiltSanityCheck() const {
	uint32_t v = this->v();
	assert(built());
	VMSG_NL("  Doing sanity check");
	TIndexOffU added = 0;
	EList<TIndexOffU> sorted;
	sorted.resizeExact(_isaPrime.size());
	sorted.fill(OFF_MASK);
	// Invert: sorted[rank] = text offset of the suffix with that rank
	for(size_t di = 0; di < this->d(); di++) {
		uint32_t d = _ds[di];
		size_t i = 0;
		for(size_t doi = _doffs[di]; doi < _doffs[di+1]; doi++, i++) {
			// Each rank must be assigned exactly once
			assert_eq(OFF_MASK, sorted[_isaPrime[doi]]);
			// Maps the offset of the suffix to its rank
			sorted[_isaPrime[doi]] = (TIndexOffU)(v*i + d);
			added++;
		}
	}
	assert_eq(added, _isaPrime.size());
#ifndef NDEBUG
	// Suffixes listed in rank order must be strictly increasing
	for(size_t i = 0; i < sorted.size()-1; i++) {
		assert(sstr_suf_lt(this->text(), sorted[i], this->text(), sorted[i+1], false));
	}
#endif
}
+
/**
 * Build the s' array by sampling suffixes (suffix offsets, actually)
 * from t according to the difference-cover sample and pack them into
 * an array of machine words in the order dictated by the "mu" mapping
 * described in Burkhardt.
 *
 * Also builds _doffs map.
 *
 * @param sPrime  output: receives the sampled suffix offsets
 * @param padding number of extra sentinel slots to append
 */
template <typename TStr>
void DifferenceCoverSample<TStr>::buildSPrime(
	EList<TIndexOffU>& sPrime,
	size_t padding)
{
	const TStr& t = this->text();
	const EList<uint32_t>& ds = this->ds();
	TIndexOffU tlen = (TIndexOffU)t.length();
	uint32_t v = this->v();
	uint32_t d = this->d();
	assert_gt(v, 2);
	assert_lt(d, v);
	// Record where each d section should begin in sPrime
	TIndexOffU tlenDivV = this->divv(tlen);
	uint32_t tlenModV = this->modv(tlen);
	TIndexOffU sPrimeSz = 0;
	assert(_doffs.empty());
	_doffs.resizeExact((size_t)d+1);
	for(uint32_t di = 0; di < d; di++) {
		// mu mapping: each cover element contributes one sample per full
		// period, plus one more if it falls within the final partial period
		TIndexOffU sz = tlenDivV + ((ds[di] <= tlenModV) ? 1 : 0);
		assert_geq(sz, 0);
		_doffs[di] = sPrimeSz;
		sPrimeSz += sz;
	}
	// _doffs[d] is a sentinel marking the end of the last section
	_doffs[d] = sPrimeSz;
#ifndef NDEBUG
	if(tlenDivV > 0) {
		for(size_t i = 0; i < d; i++) {
			assert_gt(_doffs[i+1], _doffs[i]);
			TIndexOffU diff = _doffs[i+1] - _doffs[i];
			assert(diff == tlenDivV || diff == tlenDivV+1);
		}
	}
#endif
	assert_eq(_doffs.size(), d+1);
	// Size sPrime appropriately
	sPrime.resizeExact((size_t)sPrimeSz + padding);
	sPrime.fill(OFF_MASK);
	// Slot suffixes from text into sPrime according to the mu
	// mapping; where the mapping would leave a blank, insert a 0
	TIndexOffU added = 0;
	TIndexOffU i = 0;
	for(uint64_t ti = 0; ti <= tlen; ti += v) {
		for(uint32_t di = 0; di < d; di++) {
			TIndexOffU tti = ti + ds[di];
			if(tti > tlen) break;
			TIndexOffU spi = _doffs[di] + i;
			assert_lt(spi, _doffs[di+1]);
			assert_leq(tti, tlen);
			assert_lt(spi, sPrimeSz);
			assert_eq(OFF_MASK, sPrime[spi]);
			sPrime[spi] = tti; added++;
		}
		i++;
	}
	// Every slot counted above must have been filled exactly once
	assert_eq(added, sPrimeSz);
}
+
+/**
+ * Return true iff suffixes with offsets suf1 and suf2 out of host
+ * string 'host' are identical up to depth 'v'.
+ */
+template <typename TStr>
+static inline bool suffixSameUpTo(
+	const TStr& host,
+	TIndexOffU suf1,
+	TIndexOffU suf2,
+	TIndexOffU v)
+{
+	for(TIndexOffU i = 0; i < v; i++) {
+		bool endSuf1 = suf1+i >= host.length();
+		bool endSuf2 = suf2+i >= host.length();
+		if((endSuf1 && !endSuf2) || (!endSuf1 && endSuf2)) return false;
+		if(endSuf1 && endSuf2) return true;
+		if(host[suf1+i] != host[suf2+i]) return false;
+	}
+	return true;
+}
+
/**
 * Shared parameter bundle handed to each parallel V-sorting worker.
 */
template<typename TStr>
struct VSortingParam {
    DifferenceCoverSample<TStr>* dcs;            // sample being built
    TIndexOffU*                  sPrimeArr;      // suffix offsets to sort
    size_t                       sPrimeSz;       // number of elements in sPrimeArr
    TIndexOffU*                  sPrimeOrderArr; // swap partner recording the permutation
    size_t                       depth;          // depth already handled by the serial pre-sort
    const EList<size_t>*         boundaries;     // bucket boundaries to sort in parallel
    size_t*                      cur;            // shared index of next unclaimed bucket
    MUTEX_T*                     mutex;          // guards *cur
};
+
/**
 * Worker thread body for the parallel V-sort: repeatedly claims the
 * next bucket index (under the shared mutex) and multikey-quicksorts
 * that bucket of sample suffixes up to depth v.  Returns once all
 * buckets have been claimed.
 */
template<typename TStr>
static void VSorting_worker(void *vp)
{
    VSortingParam<TStr>* param = (VSortingParam<TStr>*)vp;
    DifferenceCoverSample<TStr>* dcs = param->dcs;
    const TStr& host = dcs->text();
    const size_t hlen = host.length();
    uint32_t v = dcs->v();
    while(true) {
        size_t cur = 0;
        {
            // Atomically claim the next unprocessed bucket
            ThreadSafe ts(param->mutex, true);
            cur = *(param->cur);
            (*param->cur)++;
        }
        if(cur >= param->boundaries->size()) return; // no buckets left
        size_t begin = (cur == 0 ? 0 : (*param->boundaries)[cur-1]);
        size_t end = (*param->boundaries)[cur];
        assert_leq(begin, end);
        if(end - begin <= 1) continue; // 0/1-element bucket: nothing to sort
        mkeyQSortSuf2(
                      host,
                      hlen,
                      param->sPrimeArr,
                      param->sPrimeSz,
                      param->sPrimeOrderArr,
                      4,
                      begin,
                      end,
                      param->depth,
                      v);
    }
}
+
/**
 * Calculates a ranking of all suffixes in the sample and stores them,
 * packed according to the mu mapping, in _isaPrime.
 *
 * Pipeline: buildSPrime() collects sample suffix offsets; a multikey
 * quicksort (serial or parallel over nthreads) orders them to depth v;
 * ranks are assigned from that order; Larsson-Sadakane then completes
 * the ranking beyond depth v.
 */
template <typename TStr>
void DifferenceCoverSample<TStr>::build(int nthreads) {
	// Local names for relevant types
	VMSG_NL("Building DifferenceCoverSample");
	// Local names for relevant data
	const TStr& t = this->text();
	uint32_t v = this->v();
	assert_gt(v, 2);
	// Build s'
	EList<TIndexOffU> sPrime;
	// Need to allocate 2 extra elements at the end of the sPrime and _isaPrime
	// arrays.  One element that's less than all others, and another that acts
	// as needed padding for the Larsson-Sadakane sorting code.
	size_t padding = 1;
	VMSG_NL("  Building sPrime");
	buildSPrime(sPrime, padding);
	size_t sPrimeSz = sPrime.size() - padding;
	assert_gt(sPrime.size(), padding);
	assert_leq(sPrime.size(), t.length() + padding + 1);
	TIndexOffU nextRank = 0;
	{
		VMSG_NL("  Building sPrimeOrder");
		EList<TIndexOffU> sPrimeOrder;
		sPrimeOrder.resizeExact(sPrimeSz);
		for(TIndexOffU i = 0; i < sPrimeSz; i++) {
			sPrimeOrder[i] = i;
		}
		// sPrime now holds suffix-offsets for DC samples.
		{
			Timer timer(cout, "  V-Sorting samples time: ", this->verbose());
			VMSG_NL("  V-Sorting samples");
			// Extract backing-store array from sPrime and sPrimeOrder;
			// the mkeyQSortSuf2 routine works on the array for maximum
			// efficiency
			TIndexOffU *sPrimeArr = (TIndexOffU*)sPrime.ptr();
			assert_eq(sPrimeArr[0], sPrime[0]);
			assert_eq(sPrimeArr[sPrimeSz-1], sPrime[sPrimeSz-1]);
			TIndexOffU *sPrimeOrderArr = (TIndexOffU*)sPrimeOrder.ptr();
			assert_eq(sPrimeOrderArr[0], sPrimeOrder[0]);
			assert_eq(sPrimeOrderArr[sPrimeSz-1], sPrimeOrder[sPrimeSz-1]);
            // Sort sample suffixes up to the vth character using a
			// multikey quicksort.  Sort time is proportional to the
			// number of samples times v.  It isn't quadratic.
			// sPrimeOrder is passed in as a swapping partner for
			// sPrimeArr, i.e., every time the multikey qsort swaps
			// elements in sPrime, it swaps the same elements in
			// sPrimeOrder too.  This allows us to easily reconstruct
			// what the sort did.
            if(nthreads == 1) {
                mkeyQSortSuf2(t, sPrimeArr, sPrimeSz, sPrimeOrderArr, 4,
                              this->verbose(), this->sanityCheck(), v);
            } else {
                // query_depth = number of significant bits in nthreads;
                // the serial pre-sort to this depth yields buckets that
                // the workers can then sort independently
                int query_depth = 0;
                int tmp_nthreads = nthreads;
                while(tmp_nthreads > 0) {
                    query_depth++;
                    tmp_nthreads >>= 1;
                }
                EList<size_t> boundaries; // bucket boundaries for parallelization
                TIndexOffU *sOrig = NULL;
                if(this->sanityCheck()) {
                    // Keep a copy of the unsorted offsets so the
                    // permutation recorded in sPrimeOrderArr can be checked
                    sOrig = new TIndexOffU[sPrimeSz];
                    memcpy(sOrig, sPrimeArr, OFF_SIZE * sPrimeSz);
                }
                mkeyQSortSuf2(t, sPrimeArr, sPrimeSz, sPrimeOrderArr, 4,
                              this->verbose(), false, query_depth, &boundaries);
                if(boundaries.size() > 0) {
                    AutoArray<tthread::thread*> threads(nthreads);
                    EList<VSortingParam<TStr> > tparams;
                    size_t cur = 0;
                    MUTEX_T mutex;
                    for(int tid = 0; tid < nthreads; tid++) {
                        // Calculate bucket sizes by doing a binary search for each
                        // suffix and noting where it lands
                        tparams.expand();
                        tparams.back().dcs = this;
                        tparams.back().sPrimeArr = sPrimeArr;
                        tparams.back().sPrimeSz = sPrimeSz;
                        tparams.back().sPrimeOrderArr = sPrimeOrderArr;
                        tparams.back().depth = query_depth;
                        tparams.back().boundaries = &boundaries;
                        tparams.back().cur = &cur;
                        tparams.back().mutex = &mutex;
                        threads[tid] = new tthread::thread(VSorting_worker<TStr>, (void*)&tparams.back());
                    }
                    for (int tid = 0; tid < nthreads; tid++) {
                        threads[tid]->join();
                    }
                }
                if(this->sanityCheck()) {
                    sanityCheckOrderedSufs(t, t.length(), sPrimeArr, sPrimeSz, v);
                    for(size_t i = 0; i < sPrimeSz; i++) {
                        assert_eq(sPrimeArr[i], sOrig[sPrimeOrderArr[i]]);
                    }
                    delete[] sOrig;
                }
            }
			// Make sure sPrime and sPrimeOrder are consistent with
			// their respective backing-store arrays
			assert_eq(sPrimeArr[0], sPrime[0]);
			assert_eq(sPrimeArr[sPrimeSz-1], sPrime[sPrimeSz-1]);
			assert_eq(sPrimeOrderArr[0], sPrimeOrder[0]);
			assert_eq(sPrimeOrderArr[sPrimeSz-1], sPrimeOrder[sPrimeSz-1]);
		}
		// Now assign the ranking implied by the sorted sPrime/sPrimeOrder
		// arrays back into sPrime.
		VMSG_NL("  Allocating rank array");
		_isaPrime.resizeExact(sPrime.size());
		ASSERT_ONLY(_isaPrime.fill(OFF_MASK));
		assert_gt(_isaPrime.size(), 0);
		{
			Timer timer(cout, "  Ranking v-sort output time: ", this->verbose());
			VMSG_NL("  Ranking v-sort output");
			for(size_t i = 0; i < sPrimeSz-1; i++) {
				// Place the appropriate ranking
				_isaPrime[sPrimeOrder[i]] = nextRank;
				// If sPrime[i] and sPrime[i+1] are identical up to v, then we
				// should give the next suffix the same rank
				if(!suffixSameUpTo(t, sPrime[i], sPrime[i+1], v)) nextRank++;
			}
			_isaPrime[sPrimeOrder[sPrimeSz-1]] = nextRank; // finish off
#ifndef NDEBUG
			for(size_t i = 0; i < sPrimeSz; i++) {
				assert_neq(OFF_MASK, _isaPrime[i]);
				assert_lt(_isaPrime[i], sPrimeSz);
			}
#endif
		}
		// sPrimeOrder is destroyed
		// All the information we need is now in _isaPrime
	}
	_isaPrime[_isaPrime.size()-1] = (TIndexOffU)sPrimeSz;
	sPrime[sPrime.size()-1] = (TIndexOffU)sPrimeSz;
	// _isaPrime[_isaPrime.size()-1] and sPrime[sPrime.size()-1] are just
	// spacer for the Larsson-Sadakane routine to use
	{
		Timer timer(cout, "  Invoking Larsson-Sadakane on ranks time: ", this->verbose());
		VMSG_NL("  Invoking Larsson-Sadakane on ranks");
		if(sPrime.size() >= LS_SIZE) {
			cerr << "Error; sPrime array has so many elements that it can't be converted to a signed array without overflow." << endl;
			throw 1;
		}
		// Complete the ranking beyond depth v (LS works on signed offsets,
		// hence the overflow check above)
		LarssonSadakane<TIndexOff> ls;
		ls.suffixsort(
			(TIndexOff*)_isaPrime.ptr(),
			(TIndexOff*)sPrime.ptr(),
			(TIndexOff)sPrimeSz,
			(TIndexOff)sPrime.size(),
			0);
	}
	// chop off final character of _isaPrime
	_isaPrime.resizeExact(sPrimeSz);
	// Shift all ranks down by one
	for(size_t i = 0; i < _isaPrime.size(); i++) {
		_isaPrime[i]--;
	}
#ifndef NDEBUG
	// Ranks must be in range and, at this point, all distinct
	for(size_t i = 0; i < sPrimeSz-1; i++) {
		assert_lt(_isaPrime[i], sPrimeSz);
		assert(i == 0 || _isaPrime[i] != _isaPrime[i-1]);
	}
#endif
	VMSG_NL("  Sanity-checking and returning");
	if(this->sanityCheck()) doBuiltSanityCheck();
}
+
+/**
+ * Return true iff index i within the text is covered by the difference
+ * cover sample.  Allow i to be off the end of the text; simplifies
+ * logic elsewhere.
+ */
+template <typename TStr>
+bool DifferenceCoverSample<TStr>::isCovered(TIndexOffU i) const {
+	assert(built());
+	// Map i to its residue class mod v; a residue class belongs to the
+	// difference cover iff its _dInv entry is not the all-ones sentinel.
+	const uint32_t residue = this->modv(i);
+	assert_lt(residue, _dInv.size());
+	const bool inSample = (_dInv[residue] != 0xffffffff);
+	return inSample;
+}
+
+/**
+ * Given a text offset that's covered, return its lexicographical rank
+ * among the sample suffixes.
+ */
+template <typename TStr>
+TIndexOffU DifferenceCoverSample<TStr>::rank(TIndexOffU i) const {
+	assert(built());
+	assert_lt(i, this->text().length());
+	// Residue class of i; it must belong to the difference cover
+	const uint32_t residue = this->modv(i);
+	assert_neq(0xffffffff, _dInv[residue]); // must be in the sample
+	// Position of i within its residue class's run of sampled suffixes
+	const TIndexOffU within = this->divv(i);
+	assert_lt(within, _doffs[_dInv[residue]+1] - _doffs[_dInv[residue]]);
+	// Slot in _isaPrime holding the lexicographical rank of suffix i
+	const TIndexOffU slot = _doffs[_dInv[residue]] + within;
+	assert_lt(slot, _isaPrime.size());
+	const TIndexOffU sampleRank = _isaPrime[slot];
+	assert_leq(sampleRank, _isaPrime.size());
+	return sampleRank;
+}
+
+/**
+ * Return: < 0 if suffix i is lexicographically less than suffix j; > 0
+ * if suffix j is lexicographically greater.
+ */
+template <typename TStr>
+int64_t DifferenceCoverSample<TStr>::breakTie(TIndexOffU i, TIndexOffU j) const {
+	assert(built());
+	assert_neq(i, j);
+	assert_lt(i, this->text().length());
+	assert_lt(j, this->text().length());
+	// Residue classes of i and j; both must be sampled offsets
+	const uint32_t resI = this->modv(i);
+	const uint32_t resJ = this->modv(j);
+	assert_neq(0xffffffff, _dInv[resI]); // must be in the sample
+	assert_neq(0xffffffff, _dInv[resJ]); // must be in the sample
+	const uint32_t classI = _dInv[resI];
+	const uint32_t classJ = _dInv[resJ];
+	// Positions of i and j within their residue classes
+	const TIndexOffU withinI = this->divv(i);
+	const TIndexOffU withinJ = this->divv(j);
+	assert_lt(classI+1, _doffs.size());
+	assert_lt(classJ+1, _doffs.size());
+	assert_lt(withinI, _doffs[classI+1] - _doffs[classI]);
+	assert_lt(withinJ, _doffs[classJ+1] - _doffs[classJ]);
+	// Slots in _isaPrime holding the two sampled ranks
+	const TIndexOffU slotI = _doffs[classI] + withinI;
+	const TIndexOffU slotJ = _doffs[classJ] + withinJ;
+	assert_lt(slotI, _isaPrime.size());
+	assert_lt(slotJ, _isaPrime.size());
+	assert_neq(slotI, slotJ); // distinct suffixes occupy distinct slots
+	const TIndexOffU rankI = _isaPrime[slotI];
+	const TIndexOffU rankJ = _isaPrime[slotJ];
+	assert_neq(rankI, rankJ); // ranks must be unique
+	assert_leq(rankI, _isaPrime.size());
+	assert_leq(rankJ, _isaPrime.size());
+	// Negative iff suffix i sorts before suffix j
+	return (int64_t)rankI - (int64_t)rankJ;
+}
+
+/**
+ * Given i, j, return the number of additional characters that need to
+ * be compared before the difference cover can break the tie.
+ */
+template <typename TStr>
+uint32_t DifferenceCoverSample<TStr>::tieBreakOff(TIndexOffU i, TIndexOffU j) const {
+	const TStr& t = this->text();
+	const EList<uint32_t>& dmap = this->dmap();
+	assert(built());
+	// It's convenient to allow calling this when t[i] != t[j]; there is
+	// no tie to break in that case, so we're permitted to return the
+	// all-ones sentinel (nonsense) instead of a real offset
+	if(t[i] != t[j]) return 0xffffffff;
+	//assert_eq(t[i], t[j]); // if they're unequal, there's no tie to break
+	uint32_t v = this->v();
+	assert_neq(i, j);
+	assert_lt(i, t.length());
+	assert_lt(j, t.length());
+	// Residue classes (mod v) of the two text offsets
+	uint32_t imod = this->modv(i);
+	uint32_t jmod = this->modv(j);
+	// Circular distances between the residue classes, measured both ways
+	// around the ring of size v
+	uint32_t diffLeft = (jmod >= imod)? (jmod - imod) : (jmod + v - imod);
+	uint32_t diffRight = (imod >= jmod)? (imod - jmod) : (imod + v - jmod);
+	assert_lt(diffLeft, dmap.size());
+	assert_lt(diffRight, dmap.size());
+	// dmap[d] yields a covered offset x such that x+d is also covered
+	// (a defining property of the difference cover)
+	uint32_t destLeft = dmap[diffLeft];   // offset where i needs to be
+	uint32_t destRight = dmap[diffRight]; // offset where j needs to be
+	assert(isCovered(destLeft));
+	assert(isCovered(destLeft+diffLeft));
+	assert(isCovered(destRight));
+	assert(isCovered(destRight+diffRight));
+	assert_lt(destLeft, v);
+	assert_lt(destRight, v);
+	// Characters to advance so i (resp. j) lands on its destination
+	// residue; wrap v back to 0 since advancing a full period is a no-op
+	uint32_t deltaLeft = (destLeft >= imod)? (destLeft - imod) : (destLeft + v - imod);
+	if(deltaLeft == v) deltaLeft = 0;
+	uint32_t deltaRight = (destRight >= jmod)? (destRight - jmod) : (destRight + v - jmod);
+	if(deltaRight == v) deltaRight = 0;
+	assert_lt(deltaLeft, v);
+	assert_lt(deltaRight, v);
+	// After either delta, both advanced offsets fall in the sample, so
+	// the tie can be broken by comparing sampled ranks (see breakTie)
+	assert(isCovered(i+deltaLeft));
+	assert(isCovered(j+deltaLeft));
+	assert(isCovered(i+deltaRight));
+	assert(isCovered(j+deltaRight));
+	return min(deltaLeft, deltaRight);
+}
+
+#endif /*DIFF_SAMPLE_H_*/
diff --git a/doc/README b/doc/README
new file mode 100644
index 0000000..f2ca3e9
--- /dev/null
+++ b/doc/README
@@ -0,0 +1,4 @@
+To populate this directory, change to the centrifuge directory and type
+'make doc'.  You must have pandoc installed:
+
+  http://johnmacfarlane.net/pandoc/
diff --git a/doc/add.css b/doc/add.css
new file mode 100644
index 0000000..849f79d
--- /dev/null
+++ b/doc/add.css
@@ -0,0 +1,57 @@
+.pageStyle #leftside { 
+  color: #666;
+}
+
+.pageStyle #leftside a { 
+  color: #0066B3;
+  text-decoration: none;
+}
+
+.pageStyle #leftside h1 {
+  background: none;
+  margin: 0 0 10px;
+  padding: 10px 0;
+  font: bold 1.9em Arial,Verdana,sans-serif;
+}
+
+.pageStyle #leftside h2 { 
+  background: none;
+  margin: 0 0 10px;
+  padding: 10px 0;
+  font: bold 1.2em Arial,Verdana,sans-serif;
+}  
+.pageStyle #leftside h3 { 
+  background: none;
+  margin: 0 0 10px 5px;
+  padding: 10px 0;
+  font: 1.2em Arial,Verdana,sans-serif;
+}
+
+.pageStyle #leftside table {
+  margin: 15px 0 0;
+}
+
+.pageStyle #leftside td {
+ vertical-align: top;
+}
+
+.pageStyle #leftside p { color:#444; }
+
+
+.pageStyle #leftside td p {
+ margin-left:15px;
+}
+
+.pageStyle #leftside h4 {
+    margin: 0px 15px 10px 10px;
+    padding: 10px 0px;
+    font: 1.1em Arial,Verdana,sans-serif;
+    background: none;
+}
+
+.pageStyle #leftside ul { margin:0; padding-left:0; list-style-type: circle; }
+.pageStyle #leftside #TOC ul { margin:0; padding-left:0; list-style-type: none; }
+.pageStyle #leftside li { color:#444; margin-left:14px; }
+.pageStyle #leftside #TOC li { margin-left:0; }
+.pageStyle #leftside #TOC li li { margin-left:14px; }
+.pageStyle #leftside p { padding: 0; margin:0 0 10px; }
diff --git a/doc/faq.shtml b/doc/faq.shtml
new file mode 100644
index 0000000..fa337a9
--- /dev/null
+++ b/doc/faq.shtml
@@ -0,0 +1,45 @@
+<!--#set var="Title" value="Centrifuge" -->
+<!--#set var="NoCrumbs" value="1" -->
+<!--#set var="SubTitle" value="Classifier for metagenomic sequences"-->
+<!--#set var="ExtraCSS" value="/software/hisat/add.css"-->
+<!--#include virtual="/iheader_r.shtml"-->
+<div id="mainContent">
+  <div id="main">
+	<div id="rightside">
+	<!--#include virtual="sidebar.inc.shtml"-->
+	</div> <!-- End of "rightside" -->
+    <div id="leftside">
+  	<h1>Frequently Asked Questions</h1><br>
+      <div id="toc">
+  	    <ul>
+	    <!--
+	      <li><a href="#edit_dist"><br>
+		</a></li>
+  	      </ul><br>
+		
+<h2 id="edit_dist"><br>
+		  </h2>
+		  <p><br>
+		  
+  </p>
+-->
+
+      </div>
+   </div>
+</div>
+</div>
+
+<!--#include virtual="footer.inc.html"-->
+
+<!-- Google analytics code -->
+<script type="text/javascript">
+var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
+document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
+</script>
+<script type="text/javascript">
+var pageTracker = _gat._getTracker("UA-6101038-1");
+pageTracker._trackPageview();
+</script>
+
+</body>
+</html>
diff --git a/doc/footer.inc.html b/doc/footer.inc.html
new file mode 100644
index 0000000..8520a2d
--- /dev/null
+++ b/doc/footer.inc.html
@@ -0,0 +1,7 @@
+<div id="footer">
+  <table cellspacing="15" width="100%"><tbody><tr><td>
+   This research was supported in part by NIH grants R01-LM06845 and R01-GM083873 and NSF grant CCF-0347992.
+   </td><td align="right">
+   Administrator: <a href="mailto:infphilo at gmail.com">Daehwan Kim</a>. Design by <a href="http://www.free-css-templates.com" title="Design by David Herreman">David Herreman</a>
+   </td></tr></tbody></table>
+</div>
diff --git a/doc/index.shtml b/doc/index.shtml
new file mode 100644
index 0000000..cf078b8
--- /dev/null
+++ b/doc/index.shtml
@@ -0,0 +1,87 @@
+<!--#set var="Title" value="Centrifuge" -->
+<!--#set var="NoCrumbs" value="1" -->
+<!--#set var="SubTitle" value="Classifier for metagenomic sequences"-->
+<!--#set var="ExtraCSS" value="/software/centrifuge/add.css"-->
+<!--#include virtual="/iheader_r.shtml"-->
+<div id="mainContent">
+  <div id="subheader">
+    <table width="100%"><tbody><tr>
+	  <td>
+	    <strong>Centrifuge</strong> is a very rapid and memory-efficient system for the classification of DNA sequences from microbial samples, with better sensitivity than and comparable accuracy to other leading systems. The system uses a novel indexing scheme based on the Burrows-Wheeler transform (BWT) and the Ferragina-Manzini (FM) index, optimized specifically for the metagenomic classification problem. Centrifuge requires a relatively small index (e.g., 4.3 GB for ~4,100 bacterial ge [...]
+	    <br>
+	  </td>
+	  <td valign="middle" align="right">
+	    <a href="http://opensource.org"><img alt="Open Source Software" src="images/osi-certified.gif" border="0"></a>
+	</td></tr>
+    </tbody></table>
+  </div>
+  <div id="main">
+    <div id="rightside">
+	<!--#include virtual="sidebar.inc.shtml"-->
+	</div> <!-- End of "rightside" -->
+	<div id="leftside">
+	  <h2>Centrifuge 1.0.2-beta release 5/25/2016</h2>
+          <ul>
+	    <li>Fixed a runtime error during abundance analysis.</li>
+	    <li>Changed a default report file name from centrifuge_report.csv to centrifuge_report.tsv. </li>
+          </ul>
+          <br/>
+
+	  <h2>Centrifuge preprint is available <a href="http://biorxiv.org/content/early/2016/05/25/054965">here</a> at bioRxiv 5/24/2016</h2>
+	  
+	  <h2>Centrifuge 1.0.1-beta release 3/8/2016</h2>
+          <ul>
+	    <li>
+	      Centrifuge is now able to work directly with SRA data: both downloaded on demand over internet and prefetched to local disks.
+	      <ul>
+              <li>
+                For example, you can run Centrifuge with SRA data (SRR353653) as follows. <br/>
+                <i>centrifuge -x /path/to/index --sra-acc SRR353653</i>
+              </li>
+              <li> This eliminates the need to download SRA reads manually and to convert them into fasta/fastq format without affecting the run time. </li>
+            </ul>
+	    </li>
+	    <li>
+	      We provide a Centrifuge index (<i>nt</i> index) for NCBI nucleotide non-redundant sequences collected from plasmids, organelles, viruses, archaea, bacteria, and eukaryotes, totaling ~109 billion bps. Centrifuge is a very good alternative to Megablast (or Blast) for searching through this huge database.
+	    </li>
+	    <li>
+	      Fixed Centrifuge's scripts related to sequence downloading and index building.
+	    </li>
+          </ul>
+          <br/>
+	  <h2>Centrifuge 1.0.0-beta release 2/19/2016 - first release</h2>
+          <ul>
+	    <li>
+	      The first release of Centrifuge features a dramatically reduced database size,  higher classification accuracy and sensitivity, and comparably rapid classification speed.
+	    </li>
+	    <li>
+	      Please refer to the manual for details on how to run Centrifuge and interpret Centrifuge’s classification results.
+	    </li>
+	    <li>
+	      We provide several standard indexes designed to meet the needs of most users (see the side panel - Indexes)
+	      <ul>
+		<li> For compressed indexes, we first combined bacterial genomes belonging to the same species and removed redundant sequences, and built indexes using the combined sequences.
+		As a result, those compressed indexes are much smaller than uncompressed indexes.  Centrifuge classifies reads at the species level when using the compressed indexes and at the strain level (or the genome level) when using the uncompressed indexes. </li>
+	      </ul>
+	    </li>
+          </ul>
+          <br/>
+	  <h2>The Centrifuge source code is available in a <a href="https://github.com/infphilo/centrifuge">public GitHub repository</a> (7/14/2015).</h2>
+	</div>
+  </div>
+</div>
+
+<!--#include virtual="footer.inc.html"-->
+
+<!-- Google analytics code -->
+<script type="text/javascript">
+var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
+document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
+</script>
+<script type="text/javascript">
+var pageTracker = _gat._getTracker("UA-6101038-1");
+pageTracker._trackPageview();
+</script>
+<!-- End google analytics code -->
+</body>
+</html>
diff --git a/doc/manual.html b/doc/manual.html
new file mode 100644
index 0000000..c3cdbd2
--- /dev/null
+++ b/doc/manual.html
@@ -0,0 +1,1060 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+  <meta http-equiv="Content-Style-Type" content="text/css" />
+  <meta name="generator" content="pandoc" />
+  <title>HISAT Manual - </title>
+  <style type="text/css">code{white-space: pre;}</style>
+  <link rel="stylesheet" href="style.css" type="text/css" />
+</head>
+<body>
+<h1>Table of Contents</h1>
+<div id="TOC">
+<ul>
+<li><a href="#introduction">Introduction</a><ul>
+<li><a href="#what-is-hisat">What is HISAT?</a></li>
+</ul></li>
+<li><a href="#obtaining-bowtie-2">Obtaining Bowtie 2</a><ul>
+<li><a href="#building-from-source">Building from source</a></li>
+<li><a href="#adding-to-path">Adding to PATH</a></li>
+<li><a href="#reporting">Reporting</a><ul>
+<li><a href="#distinct-alignments-map-a-read-to-different-places">Distinct alignments map a read to different places</a></li>
+<li><a href="#default-mode-search-for-multiple-alignments-report-the-best-one">Default mode: search for multiple alignments, report the best one</a></li>
+<li><a href="#k-mode-search-for-one-or-more-alignments-report-each">-k mode: search for one or more alignments, report each</a></li>
+</ul></li>
+<li><a href="#alignment-summmary">Alignment summary</a></li>
+<li><a href="#wrapper">Wrapper</a></li>
+<li><a href="#performance-tuning">Performance tuning</a></li>
+<li><a href="#command-line">Command Line</a><ul>
+<li><a href="#setting-function-options">Setting function options</a></li>
+<li><a href="#usage">Usage</a></li>
+<li><a href="#main-arguments">Main arguments</a></li>
+<li><a href="#options">Options</a></li>
+</ul></li>
+<li><a href="#sam-output">SAM output</a></li>
+</ul></li>
+<li><a href="#the-bowtie2-build-indexer">The <code>bowtie2-build</code> indexer</a><ul>
+<li><a href="#command-line-1">Command Line</a><ul>
+<li><a href="#main-arguments-1">Main arguments</a></li>
+<li><a href="#options-1">Options</a></li>
+</ul></li>
+</ul></li>
+<li><a href="#the-bowtie2-inspect-index-inspector">The <code>bowtie2-inspect</code> index inspector</a><ul>
+<li><a href="#command-line-2">Command Line</a><ul>
+<li><a href="#main-arguments-2">Main arguments</a></li>
+<li><a href="#options-2">Options</a></li>
+</ul></li>
+</ul></li>
+<li><a href="#getting-started-with-bowtie-2-lambda-phage-example">Getting started with Bowtie 2: Lambda phage example</a><ul>
+<li><a href="#indexing-a-reference-genome">Indexing a reference genome</a></li>
+<li><a href="#aligning-example-reads">Aligning example reads</a></li>
+<li><a href="#paired-end-example">Paired-end example</a></li>
+<li><a href="#local-alignment-example">Local alignment example</a></li>
+<li><a href="#using-samtoolsbcftools-downstream">Using SAMtools/BCFtools downstream</a></li>
+</ul></li>
+</ul>
+</div>
+<!--
+ ! This manual is written in "markdown" format and thus contains some
+ ! distracting formatting clutter.  See 'MANUAL' for an easier-to-read version
+ ! of this text document, or see the HTML manual online.
+ ! -->
+
+<h1 id="introduction">Introduction</h1>
+<h2 id="what-is-hisat">What is HISAT?</h2>
+<p><a href="http://www.ccb.jhu.edu/software/hisat">HISAT</a> is an ultrafast and memory-efficient tool for aligning sequencing reads to long reference sequences. It is particularly good at aligning reads of about 50 up to 100s or 1,000s of characters to relatively long (e.g. mammalian) genomes. Bowtie 2 indexes the genome with an <a href="http://portal.acm.org/citation.cfm?id=796543">FM Index</a> (based on the <a href="http://en.wikipedia.org/wiki/Burrows-Wheeler_transform">Burrows-Wheel [...]
+<p><a href="http://bowtie-bio.sf.net/bowtie2">Bowtie 2</a> is often the first step in pipelines for comparative genomics, including for variation calling, ChIP-seq, RNA-seq, BS-seq. <a href="http://bowtie-bio.sf.net/bowtie2">Bowtie 2</a> and <a href="http://bowtie-bio.sf.net">Bowtie</a> (also called "<a href="http://bowtie-bio.sf.net">Bowtie 1</a>" here) are also tightly integrated into some tools, including <a href="http://tophat.cbcb.umd.edu/">TopHat</a>: a fast splice juncti [...]
+<h1 id="obtaining-bowtie-2">Obtaining Bowtie 2</h1>
+<p>Download Bowtie 2 sources and binaries from the <a href="https://sourceforge.net/projects/bowtie-bio/files/bowtie2/">Download</a> section of the Sourceforge site. Binaries are available for Intel architectures (<code>i386</code> and <code>x86_64</code>) running Linux, and Mac OS X. A 32-bit version is available for Windows. If you plan to compile Bowtie 2 yourself, make sure to get the source package, i.e., the filename that ends in "-source.zip".</p>
+<h2 id="building-from-source">Building from source</h2>
+<p>Building Bowtie 2 from source requires a GNU-like environment with GCC, GNU Make and other basics. It should be possible to build Bowtie 2 on most vanilla Linux installations or on a Mac installation with <a href="http://developer.apple.com/xcode/">Xcode</a> installed. Bowtie 2 can also be built on Windows using <a href="http://www.cygwin.com/">Cygwin</a> or <a href="http://www.mingw.org/">MinGW</a> (MinGW recommended). For a MinGW build the choice of what compiler is to be used is im [...]
+<p>First, download the source package from the <a href="https://sourceforge.net/projects/bowtie-bio/files/bowtie2/">sourceforge site</a>. Make sure you're getting the source package; the file downloaded should end in <code>-source.zip</code>. Unzip the file, change to the unzipped directory, and build the Bowtie 2 tools by running GNU <code>make</code> (usually with the command <code>make</code>, but sometimes with <code>gmake</code>) with no arguments. If building with MinGW, run <code> [...]
+<p>Bowtie 2 is using the multithreading software model in order to speed up execution times on SMP architectures where this is possible. On POSIX platforms (like linux, Mac OS, etc) it needs the pthread library. Although it is possible to use pthread library on non-POSIX platform like Windows, due to performance reasons bowtie 2 will try to use Windows native multithreading if possible.</p>
+<h2 id="adding-to-path">Adding to PATH</h2>
+<p>By adding your new Bowtie 2 directory to your <a href="http://en.wikipedia.org/wiki/PATH_(variable)">PATH environment variable</a>, you ensure that whenever you run <code>bowtie2</code>, <code>bowtie2-build</code> or <code>bowtie2-inspect</code> from the command line, you will get the version you just installed without having to specify the entire path. This is recommended for most users. To do this, follow your operating system's instructions for adding the directory to your <a href= [...]
+<p>If you would like to install Bowtie 2 by copying the Bowtie 2 executable files to an existing directory in your <a href="http://en.wikipedia.org/wiki/PATH_(variable)">PATH</a>, make sure that you copy all the executables, including <code>bowtie2</code>, <code>bowtie2-align</code>, <code>bowtie2-build</code> and <code>bowtie2-inspect</code>.</p>
+<h2 id="reporting">Reporting</h2>
+<p>The reporting mode governs how many alignments Bowtie 2 looks for, and how to report them. Bowtie 2 has three distinct reporting modes. The default reporting mode is similar to the default reporting mode of many other read alignment tools, including <a href="http://bio-bwa.sourceforge.net/">BWA</a>. It is also similar to Bowtie 1's <code>-M</code> alignment mode.</p>
+<p>In general, when we say that a read has an alignment, we mean that it has a <a href="#valid-alignments-meet-or-exceed-the-minimum-score-threshold">valid alignment</a>. When we say that a read has multiple alignments, we mean that it has multiple alignments that are valid and distinct from one another.</p>
+<h3 id="distinct-alignments-map-a-read-to-different-places">Distinct alignments map a read to different places</h3>
+<p>Two alignments for the same individual read are "distinct" if they map the same read to different places. Specifically, we say that two alignments are distinct if there are no alignment positions where a particular read offset is aligned opposite a particular reference offset in both alignments with the same orientation. E.g. if the first alignment is in the forward orientation and aligns the read character at read offset 10 to the reference character at chromosome 3, offset [...]
+<p>Two alignments for the same pair are distinct if either the mate 1s in the two paired-end alignments are distinct or the mate 2s in the two alignments are distinct or both.</p>
+<h3 id="default-mode-search-for-multiple-alignments-report-the-best-one">Default mode: search for multiple alignments, report the best one</h3>
+<p>By default, Bowtie 2 searches for distinct, valid alignments for each read. When it finds a valid alignment, it generally will continue to look for alignments that are nearly as good or better. It will eventually stop looking, either because it exceeded a limit placed on search effort (see [<code>-D</code>] and <a href="#bowtie2-options-r"><code>-R</code></a>) or because it already knows all it needs to know to report an alignment. Information from the best alignments are used to esti [...]
+<p>See also: [<code>-D</code>], which puts an upper limit on the number of dynamic programming problems (i.e. seed extensions) that can "fail" in a row before Bowtie 2 stops searching. Increasing [<code>-D</code>] makes Bowtie 2 slower, but increases the likelihood that it will report the correct alignment for a read that aligns many places.</p>
+<p>See also: <a href="#bowtie2-options-r"><code>-R</code></a>, which sets the maximum number of times Bowtie 2 will "re-seed" when attempting to align a read with repetitive seeds. Increasing <a href="#bowtie2-options-r"><code>-R</code></a> makes Bowtie 2 slower, but increases the likelihood that it will report the correct alignment for a read that aligns many places.</p>
+<h3 id="k-mode-search-for-one-or-more-alignments-report-each">-k mode: search for one or more alignments, report each</h3>
+<p>In <a href="#bowtie2-options-k"><code>-k</code></a> mode, Bowtie 2 searches for up to N distinct, valid alignments for each read, where N equals the integer specified with the <code>-k</code> parameter. That is, if <code>-k 2</code> is specified, Bowtie 2 will search for at most 2 distinct alignments. It reports all alignments found, in descending order by alignment score. The alignment score for a paired-end alignment equals the sum of the alignment scores of the individual mates. Ea [...]
+<p>Bowtie 2 does not "find" alignments in any specific order, so for reads that have more than N distinct, valid alignments, Bowtie 2 does not guarantee that the N alignments reported are the best possible in terms of alignment score. Still, this mode can be effective and fast in situations where the user cares more about whether a read aligns (or aligns a certain number of times) than where exactly it originated.</p>
+<h2 id="alignment-summmary">Alignment summary</h2>
+<p>When Bowtie 2 finishes running, it prints messages summarizing what happened. These messages are printed to the "standard error" ("stderr") filehandle. For datasets consisting of unpaired reads, the summary might look like this:</p>
+<pre><code>20000 reads; of these:
+  20000 (100.00%) were unpaired; of these:
+    1247 (6.24%) aligned 0 times
+    18739 (93.69%) aligned exactly 1 time
+    14 (0.07%) aligned >1 times
+93.77% overall alignment rate</code></pre>
+<p>For datasets consisting of pairs, the summary might look like this:</p>
+<pre><code>10000 reads; of these:
+  10000 (100.00%) were paired; of these:
+    650 (6.50%) aligned concordantly 0 times
+    8823 (88.23%) aligned concordantly exactly 1 time
+    527 (5.27%) aligned concordantly >1 times
+    ----
+    650 pairs aligned concordantly 0 times; of these:
+      34 (5.23%) aligned discordantly 1 time
+    ----
+    616 pairs aligned 0 times concordantly or discordantly; of these:
+      1232 mates make up the pairs; of these:
+        660 (53.57%) aligned 0 times
+        571 (46.35%) aligned exactly 1 time
+        1 (0.08%) aligned >1 times
+96.70% overall alignment rate</code></pre>
+<p>The indentation indicates how subtotals relate to totals.</p>
+<h2 id="wrapper">Wrapper</h2>
+<p>The <code>bowtie2</code> executable is actually a Perl wrapper script that calls the compiled <code>bowtie2-align</code> binary. It is recommended that you always run the <code>bowtie2</code> wrapper and not run <code>bowtie2-align</code> directly.</p>
+<h2 id="performance-tuning">Performance tuning</h2>
+<ol style="list-style-type: decimal">
+<li><p>Use 64-bit version if possible</p>
+<p>The 64-bit version of Bowtie 2 is faster than the 32-bit version, owing to its use of 64-bit arithmetic. If possible, download the 64-bit binaries for Bowtie 2 and run on a 64-bit computer. If you are building Bowtie 2 from sources, you may need to pass the <code>-m64</code> option to <code>g++</code> to compile the 64-bit version; you can do this by including <code>BITS=64</code> in the arguments to the <code>make</code> command; e.g.: <code>make BITS=64 bowtie2</code>. To determine  [...]
+<li><p>If your computer has multiple processors/cores, use <code>-p</code></p>
+<p>The <a href="#bowtie2-options-p"><code>-p</code></a> option causes Bowtie 2 to launch a specified number of parallel search threads. Each thread runs on a different processor/core and all threads find alignments in parallel, increasing alignment throughput by approximately a multiple of the number of threads (though in practice, speedup is somewhat worse than linear).</p></li>
+</ol>
+<h2 id="command-line">Command Line</h2>
+<h3 id="setting-function-options">Setting function options</h3>
+<p>Some Bowtie 2 options specify a function rather than an individual number or setting. In these cases the user specifies three parameters: (a) a function type <code>F</code>, (b) a constant term <code>B</code>, and (c) a coefficient <code>A</code>. The available function types are constant (<code>C</code>), linear (<code>L</code>), square-root (<code>S</code>), and natural log (<code>G</code>). The parameters are specified as <code>F,B,A</code> - that is, the function type, the constan [...]
+<p>For example, if the function specification is <code>L,-0.4,-0.6</code>, then the function defined is:</p>
+<pre><code>f(x) = -0.4 + -0.6 * x</code></pre>
+<p>If the function specification is <code>G,1,5.4</code>, then the function defined is:</p>
+<pre><code>f(x) = 1.0 + 5.4 * ln(x)</code></pre>
+<p>See the documentation for the option in question to learn what the parameter <code>x</code> is for. For example, in the case if the <a href="#bowtie2-options-score-min"><code>--score-min</code></a> option, the function <code>f(x)</code> sets the minimum alignment score necessary for an alignment to be considered valid, and <code>x</code> is the read length.</p>
+<h3 id="usage">Usage</h3>
+<pre><code>bowtie2 [options]* -x <bt2-idx> {-1 <m1> -2 <m2> | -U <r>} -S [<hit>]</code></pre>
+<h3 id="main-arguments">Main arguments</h3>
+<table><tr><td>
+
+<pre><code>-x <bt2-idx></code></pre>
+</td><td>
+
+<p>The basename of the index for the reference genome. The basename is the name of any of the index files up to but not including the final <code>.1.bt2</code> / <code>.rev.1.bt2</code> / etc. <code>bowtie2</code> looks for the specified index first in the current directory, then in the directory specified in the <code>BOWTIE2_INDEXES</code> environment variable.</p>
+</td></tr><tr><td>
+
+<pre><code>-1 <m1></code></pre>
+</td><td>
+
+<p>Comma-separated list of files containing mate 1s (filename usually includes <code>_1</code>), e.g. <code>-1 flyA_1.fq,flyB_1.fq</code>. Sequences specified with this option must correspond file-for-file and read-for-read with those specified in <code><m2></code>. Reads may be a mix of different lengths. If <code>-</code> is specified, <code>bowtie2</code> will read the mate 1s from the "standard in" or "stdin" filehandle.</p>
+</td></tr><tr><td>
+
+<pre><code>-2 <m2></code></pre>
+</td><td>
+
+<p>Comma-separated list of files containing mate 2s (filename usually includes <code>_2</code>), e.g. <code>-2 flyA_2.fq,flyB_2.fq</code>. Sequences specified with this option must correspond file-for-file and read-for-read with those specified in <code><m1></code>. Reads may be a mix of different lengths. If <code>-</code> is specified, <code>bowtie2</code> will read the mate 2s from the "standard in" or "stdin" filehandle.</p>
+</td></tr><tr><td>
+
+<pre><code>-U <r></code></pre>
+</td><td>
+
+<p>Comma-separated list of files containing unpaired reads to be aligned, e.g. <code>lane1.fq,lane2.fq,lane3.fq,lane4.fq</code>. Reads may be a mix of different lengths. If <code>-</code> is specified, <code>bowtie2</code> gets the reads from the "standard in" or "stdin" filehandle.</p>
+</td></tr><tr><td>
+
+<pre><code>-S <hit></code></pre>
+</td><td>
+
+<p>File to write SAM alignments to. By default, alignments are written to the "standard out" or "stdout" filehandle (i.e. the console).</p>
+</td></tr></table>
+
+<h3 id="options">Options</h3>
+<h4 id="input-options">Input options</h4>
+<table>
+<tr><td id="bowtie2-options-q">
+
+<pre><code>-q</code></pre>
+</td><td>
+
+<p>Reads (specified with <code><m1></code>, <code><m2></code>, <code><s></code>) are FASTQ files. FASTQ files usually have extension <code>.fq</code> or <code>.fastq</code>. FASTQ is the default format. See also: <a href="#bowtie2-options-solexa-quals"><code>--solexa-quals</code></a> and <a href="#bowtie2-options-int-quals"><code>--int-quals</code></a>.</p>
+</td></tr>
+<tr><td id="bowtie2-options-qseq">
+
+<pre><code>--qseq</code></pre>
+</td><td>
+
+<p>Reads (specified with <code><m1></code>, <code><m2></code>, <code><s></code>) are QSEQ files. QSEQ files usually end in <code>_qseq.txt</code>. See also: <a href="#bowtie2-options-solexa-quals"><code>--solexa-quals</code></a> and <a href="#bowtie2-options-int-quals"><code>--int-quals</code></a>.</p>
+</td></tr>
+<tr><td id="bowtie2-options-f">
+
+<pre><code>-f</code></pre>
+</td><td>
+
+<p>Reads (specified with <code><m1></code>, <code><m2></code>, <code><s></code>) are FASTA files. FASTA files usually have extension <code>.fa</code>, <code>.fasta</code>, <code>.mfa</code>, <code>.fna</code> or similar. FASTA files do not have a way of specifying quality values, so when <code>-f</code> is set, the result is as if <code>--ignore-quals</code> is also set.</p>
+</td></tr>
+<tr><td id="bowtie2-options-r">
+
+<pre><code>-r</code></pre>
+</td><td>
+
+<p>Reads (specified with <code><m1></code>, <code><m2></code>, <code><s></code>) are files with one input sequence per line, without any other information (no read names, no qualities). When <code>-r</code> is set, the result is as if <code>--ignore-quals</code> is also set.</p>
+</td></tr>
+<tr><td id="bowtie2-options-c">
+
+<pre><code>-c</code></pre>
+</td><td>
+
+<p>The read sequences are given on command line. I.e. <code><m1></code>, <code><m2></code> and <code><singles></code> are comma-separated lists of reads rather than lists of read files. There is no way to specify read names or qualities, so <code>-c</code> also implies <code>--ignore-quals</code>.</p>
+</td></tr>
+<tr><td id="bowtie2-options-s">
+
+<pre><code>-s/--skip <int></code></pre>
+</td><td>
+
+<p>Skip (i.e. do not align) the first <code><int></code> reads or pairs in the input.</p>
+</td></tr>
+<tr><td id="bowtie2-options-u">
+
+<pre><code>-u/--qupto <int></code></pre>
+</td><td>
+
+<p>Align the first <code><int></code> reads or read pairs from the input (after the <a href="#bowtie2-options-s"><code>-s</code>/<code>--skip</code></a> reads or pairs have been skipped), then stop. Default: no limit.</p>
+</td></tr>
+<tr><td id="bowtie2-options-5">
+
+<pre><code>-5/--trim5 <int></code></pre>
+</td><td>
+
+<p>Trim <code><int></code> bases from 5' (left) end of each read before alignment (default: 0).</p>
+</td></tr>
+<tr><td id="bowtie2-options-3">
+
+<pre><code>-3/--trim3 <int></code></pre>
+</td><td>
+
+<p>Trim <code><int></code> bases from 3' (right) end of each read before alignment (default: 0).</p>
+</td></tr><tr><td id="bowtie2-options-phred33-quals">
+
+<pre><code>--phred33</code></pre>
+</td><td>
+
+<p>Input qualities are ASCII chars equal to the <a href="http://en.wikipedia.org/wiki/Phred_quality_score">Phred quality</a> plus 33. This is also called the "Phred+33" encoding, which is used by the very latest Illumina pipelines.</p>
+</td></tr>
+<tr><td id="bowtie2-options-phred64-quals">
+
+<pre><code>--phred64</code></pre>
+</td><td>
+
+<p>Input qualities are ASCII chars equal to the <a href="http://en.wikipedia.org/wiki/Phred_quality_score">Phred quality</a> plus 64. This is also called the "Phred+64" encoding.</p>
+</td></tr>
+<tr><td id="bowtie2-options-solexa-quals">
+
+<pre><code>--solexa-quals</code></pre>
+</td><td>
+
+<p>Convert input qualities from <a href="http://en.wikipedia.org/wiki/Phred_quality_score">Solexa</a> (which can be negative) to <a href="http://en.wikipedia.org/wiki/Phred_quality_score">Phred</a> (which can't). This scheme was used in older Illumina GA Pipeline versions (prior to 1.3). Default: off.</p>
+</td></tr>
+<tr><td id="bowtie2-options-int-quals">
+
+<pre><code>--int-quals</code></pre>
+</td><td>
+
+<p>Quality values are represented in the read input file as space-separated ASCII integers, e.g., <code>40 40 30 40</code>..., rather than ASCII characters, e.g., <code>II?I</code>.... Integers are treated as being on the <a href="http://en.wikipedia.org/wiki/Phred_quality_score">Phred quality</a> scale unless <a href="#bowtie2-options-solexa-quals"><code>--solexa-quals</code></a> is also specified. Default: off.</p>
+</td></tr></table>
+
+<h4 id="alignment-options">Alignment options</h4>
+<table>
+
+<tr><td id="bowtie2-options-n-ceil">
+
+<pre><code>--n-ceil <func></code></pre>
+</td><td>
+
+<p>Sets a function governing the maximum number of ambiguous characters (usually <code>N</code>s and/or <code>.</code>s) allowed in a read as a function of read length. For instance, specifying <code>L,0,0.15</code> sets the N-ceiling function <code>f</code> to <code>f(x) = 0 + 0.15 * x</code>, where x is the read length. See also: [setting function options]. Reads exceeding this ceiling are <a href="#filtering">filtered out</a>. Default: <code>L,0,0.15</code>.</p>
+</td></tr>
+
+<tr><td id="bowtie2-options-ignore-quals">
+
+<pre><code>--ignore-quals</code></pre>
+</td><td>
+
+<p>When calculating a mismatch penalty, always consider the quality value at the mismatched position to be the highest possible, regardless of the actual value. I.e. input is treated as though all quality values are high. This is also the default behavior when the input doesn't specify quality values (e.g. in <a href="#bowtie2-options-f"><code>-f</code></a>, <a href="#bowtie2-options-r"><code>-r</code></a>, or <a href="#bowtie2-options-c"><code>-c</code></a> modes).</p>
+</td></tr>
+<tr><td id="bowtie2-options-nofw">
+
+<pre><code>--nofw/--norc</code></pre>
+</td><td>
+
+<p>If <code>--nofw</code> is specified, <code>bowtie2</code> will not attempt to align unpaired reads to the forward (Watson) reference strand. If <code>--norc</code> is specified, <code>bowtie2</code> will not attempt to align unpaired reads against the reverse-complement (Crick) reference strand. In paired-end mode, <code>--nofw</code> and <code>--norc</code> pertain to the fragments; i.e. specifying <code>--nofw</code> causes <code>bowtie2</code> to explore only those paired-end confi [...]
+</td></tr>
+<tr><td id="bowtie2-options-end-to-end">
+
+<pre><code>--end-to-end</code></pre>
+</td><td>
+
+<p>In this mode, Bowtie 2 requires that the entire read align from one end to the other, without any trimming (or "soft clipping") of characters from either end. The match bonus <a href="#bowtie2-options-ma"><code>--ma</code></a> always equals 0 in this mode, so all alignment scores are less than or equal to 0, and the greatest possible alignment score is 0. This is mutually exclusive with <a href="#bowtie2-options-local"><code>--local</code></a>. <code>--end-to-end</code> is t [...]
+</td></tr>
+<tr><td id="bowtie2-options-local">
+
+<pre><code>--local</code></pre>
+</td><td>
+
+<p>In this mode, Bowtie 2 does not require that the entire read align from one end to the other. Rather, some characters may be omitted ("soft clipped") from the ends in order to achieve the greatest possible alignment score. The match bonus <a href="#bowtie2-options-ma"><code>--ma</code></a> is used in this mode, and the best possible alignment score is equal to the match bonus (<a href="#bowtie2-options-ma"><code>--ma</code></a>) times the length of the read. Specifying <code [...]
+</td></tr>
+</table>
+
+<h4 id="scoring-options">Scoring options</h4>
+<table>
+
+<tr><td id="bowtie2-options-ma">
+
+<pre><code>--ma <int></code></pre>
+</td><td>
+
+<p>Sets the match bonus. In <a href="#bowtie2-options-local"><code>--local</code></a> mode <code><int></code> is added to the alignment score for each position where a read character aligns to a reference character and the characters match. Not used in <a href="#bowtie2-options-end-to-end"><code>--end-to-end</code></a> mode. Default: 2.</p>
+</td></tr>
+<tr><td id="bowtie2-options-mp">
+
+<pre><code>--mp MX,MN</code></pre>
+</td><td>
+
+<p>Sets the maximum (<code>MX</code>) and minimum (<code>MN</code>) mismatch penalties, both integers. A number less than or equal to <code>MX</code> and greater than or equal to <code>MN</code> is subtracted from the alignment score for each position where a read character aligns to a reference character, the characters do not match, and neither is an <code>N</code>. If <a href="#bowtie2-options-ignore-quals"><code>--ignore-quals</code></a> is specified, the number subtracted quals <cod [...]
+</td></tr>
+<tr><td id="bowtie2-options-np">
+
+<pre><code>--np <int></code></pre>
+</td><td>
+
+<p>Sets penalty for positions where the read, reference, or both, contain an ambiguous character such as <code>N</code>. Default: 1.</p>
+</td></tr>
+<tr><td id="bowtie2-options-rdg">
+
+<pre><code>--rdg <int1>,<int2></code></pre>
+</td><td>
+
+<p>Sets the read gap open (<code><int1></code>) and extend (<code><int2></code>) penalties. A read gap of length N gets a penalty of <code><int1></code> + N * <code><int2></code>. Default: 5, 3.</p>
+</td></tr>
+<tr><td id="bowtie2-options-rfg">
+
+<pre><code>--rfg <int1>,<int2></code></pre>
+</td><td>
+
+<p>Sets the reference gap open (<code><int1></code>) and extend (<code><int2></code>) penalties. A reference gap of length N gets a penalty of <code><int1></code> + N * <code><int2></code>. Default: 5, 3.</p>
+</td></tr>
+<tr><td id="bowtie2-options-score-min">
+
+<pre><code>--score-min <func></code></pre>
+</td><td>
+
+<p>Sets a function governing the minimum alignment score needed for an alignment to be considered "valid" (i.e. good enough to report). This is a function of read length. For instance, specifying <code>L,0,-0.6</code> sets the minimum-score function <code>f</code> to <code>f(x) = 0 + -0.6 * x</code>, where <code>x</code> is the read length. See also: [setting function options]. The default in <a href="#bowtie2-options-end-to-end"><code>--end-to-end</code></a> mode is <code>L,-0 [...]
+</td></tr>
+</table>
+
+<h4 id="reporting-options">Reporting options</h4>
+<table>
+
+<tr><td id="bowtie2-options-k">
+
+<pre><code>-k <int></code></pre>
+</td><td>
+
+<p>By default, <code>bowtie2</code> searches for distinct, valid alignments for each read. When it finds a valid alignment, it continues looking for alignments that are nearly as good or better. The best alignment found is reported (randomly selected from among best if tied). Information about the best alignments is used to estimate mapping quality and to set SAM optional fields, such as <a href="#bowtie2-build-opt-fields-as"><code>AS:i</code></a> and <a href="#bowtie2-build-opt-fields-x [...]
+<p>When <code>-k</code> is specified, however, <code>bowtie2</code> behaves differently. Instead, it searches for at most <code><int></code> distinct, valid alignments for each read. The search terminates when it can't find more distinct valid alignments, or when it finds <code><int></code>, whichever happens first. All alignments found are reported in descending order by alignment score. The alignment score for a paired-end alignment equals the sum of the alignment scores of [...]
+<p>Note: Bowtie 2 is not designed with large values for <code>-k</code> in mind, and when aligning reads to long, repetitive genomes large <code>-k</code> can be very, very slow.</p>
+</td></tr>
+<tr><td id="bowtie2-options-a">
+
+<pre><code>-a</code></pre>
+</td><td>
+
+<p>Like <a href="#bowtie2-options-k"><code>-k</code></a> but with no upper limit on number of alignments to search for. <code>-a</code> is mutually exclusive with <a href="#bowtie2-options-k"><code>-k</code></a>.</p>
+<p>Note: Bowtie 2 is not designed with <code>-a</code> mode in mind, and when aligning reads to long, repetitive genomes this mode can be very, very slow.</p>
+</td></tr>
+</table>
+
+<h4 id="paired-end-options">Paired-end options</h4>
+<table>
+
+<tr><td id="bowtie2-options-I">
+
+<pre><code>-I/--minins <int></code></pre>
+</td><td>
+
+<p>The minimum fragment length for valid paired-end alignments. E.g. if <code>-I 60</code> is specified and a paired-end alignment consists of two 20-bp alignments in the appropriate orientation with a 20-bp gap between them, that alignment is considered valid (as long as <a href="#bowtie2-options-X"><code>-X</code></a> is also satisfied). A 19-bp gap would not be valid in that case. If trimming options <a href="#bowtie2-options-3"><code>-3</code></a> or <a href="#bowtie2-options-5"><cod [...]
+<p>The larger the difference between <a href="#bowtie2-options-I"><code>-I</code></a> and <a href="#bowtie2-options-X"><code>-X</code></a>, the slower Bowtie 2 will run. This is because larger differences between <a href="#bowtie2-options-I"><code>-I</code></a> and <a href="#bowtie2-options-X"><code>-X</code></a> require that Bowtie 2 scan a larger window to determine if a concordant alignment exists. For typical fragment length ranges (200 to 400 nucleotides), Bowtie 2 is very efficient.</p>
+<p>Default: 0 (essentially imposing no minimum)</p>
+</td></tr>
+<tr><td id="bowtie2-options-X">
+
+<pre><code>-X/--maxins <int></code></pre>
+</td><td>
+
+<p>The maximum fragment length for valid paired-end alignments. E.g. if <code>-X 100</code> is specified and a paired-end alignment consists of two 20-bp alignments in the proper orientation with a 60-bp gap between them, that alignment is considered valid (as long as <a href="#bowtie2-options-I"><code>-I</code></a> is also satisfied). A 61-bp gap would not be valid in that case. If trimming options <a href="#bowtie2-options-3"><code>-3</code></a> or <a href="#bowtie2-options-5"><code>-5 [...]
+<p>The larger the difference between <a href="#bowtie2-options-I"><code>-I</code></a> and <a href="#bowtie2-options-X"><code>-X</code></a>, the slower Bowtie 2 will run. This is because larger differences between <a href="#bowtie2-options-I"><code>-I</code></a> and <a href="#bowtie2-options-X"><code>-X</code></a> require that Bowtie 2 scan a larger window to determine if a concordant alignment exists. For typical fragment length ranges (200 to 400 nucleotides), Bowtie 2 is very efficient.</p>
+<p>Default: 500.</p>
+</td></tr>
+<tr><td id="bowtie2-options-fr">
+
+<pre><code>--fr/--rf/--ff</code></pre>
+</td><td>
+
+<p>The upstream/downstream mate orientations for a valid paired-end alignment against the forward reference strand. E.g., if <code>--fr</code> is specified and there is a candidate paired-end alignment where mate 1 appears upstream of the reverse complement of mate 2 and the fragment length constraints (<a href="#bowtie2-options-I"><code>-I</code></a> and <a href="#bowtie2-options-X"><code>-X</code></a>) are met, that alignment is valid. Also, if mate 2 appears upstream of the reverse co [...]
+</td></tr>
+<tr><td id="bowtie2-options-no-mixed">
+
+<pre><code>--no-mixed</code></pre>
+</td><td>
+
+<p>By default, when <code>bowtie2</code> cannot find a concordant or discordant alignment for a pair, it then tries to find alignments for the individual mates. This option disables that behavior.</p>
+</td></tr>
+<tr><td id="bowtie2-options-no-discordant">
+
+<pre><code>--no-discordant</code></pre>
+</td><td>
+
+<p>By default, <code>bowtie2</code> looks for discordant alignments if it cannot find any concordant alignments. A discordant alignment is an alignment where both mates align uniquely, but that does not satisfy the paired-end constraints (<a href="#bowtie2-options-fr"><code>--fr</code>/<code>--rf</code>/<code>--ff</code></a>, <a href="#bowtie2-options-I"><code>-I</code></a>, <a href="#bowtie2-options-X"><code>-X</code></a>). This option disables that behavior.</p>
+</td></tr>
+<tr><td id="bowtie2-options-dovetail">
+
+<pre><code>--dovetail</code></pre>
+</td><td>
+
+<p>If the mates "dovetail", that is if one mate alignment extends past the beginning of the other such that the wrong mate begins upstream, consider that to be concordant. See also: <a href="#mates-can-overlap-contain-or-dovetail-each-other">Mates can overlap, contain or dovetail each other</a>. Default: mates cannot dovetail in a concordant alignment.</p>
+</td></tr>
+<tr><td id="bowtie2-options-no-contain">
+
+<pre><code>--no-contain</code></pre>
+</td><td>
+
+<p>If one mate alignment contains the other, consider that to be non-concordant. See also: <a href="#mates-can-overlap-contain-or-dovetail-each-other">Mates can overlap, contain or dovetail each other</a>. Default: a mate can contain the other in a concordant alignment.</p>
+</td></tr>
+<tr><td id="bowtie2-options-no-overlap">
+
+<pre><code>--no-overlap</code></pre>
+</td><td>
+
+<p>If one mate alignment overlaps the other at all, consider that to be non-concordant. See also: <a href="#mates-can-overlap-contain-or-dovetail-each-other">Mates can overlap, contain or dovetail each other</a>. Default: mates can overlap in a concordant alignment.</p>
+</td></tr></table>
+
+<h4 id="output-options">Output options</h4>
+<table>
+
+<tr><td id="bowtie2-options-t">
+
+<pre><code>-t/--time</code></pre>
+</td><td>
+
+<p>Print the wall-clock time required to load the index files and align the reads. This is printed to the "standard error" ("stderr") filehandle. Default: off.</p>
+</td></tr>
+<tr><td id="bowtie2-options-un">
+
+<pre><code>--un <path>
+--un-gz <path>
+--un-bz2 <path></code></pre>
+</td><td>
+
+<p>Write unpaired reads that fail to align to file at <code><path></code>. These reads correspond to the SAM records with the FLAGS <code>0x4</code> bit set and neither the <code>0x40</code> nor <code>0x80</code> bits set. If <code>--un-gz</code> is specified, output will be gzip compressed. If <code>--un-bz2</code> is specified, output will be bzip2 compressed. Reads written in this way will appear exactly as they did in the input file, without any modification (same sequence, sam [...]
+</td></tr>
+<tr><td id="bowtie2-options-al">
+
+<pre><code>--al <path>
+--al-gz <path>
+--al-bz2 <path></code></pre>
+</td><td>
+
+<p>Write unpaired reads that align at least once to file at <code><path></code>. These reads correspond to the SAM records with the FLAGS <code>0x4</code>, <code>0x40</code>, and <code>0x80</code> bits unset. If <code>--al-gz</code> is specified, output will be gzip compressed. If <code>--al-bz2</code> is specified, output will be bzip2 compressed. Reads written in this way will appear exactly as they did in the input file, without any modification (same sequence, same name, same q [...]
+</td></tr>
+<tr><td id="bowtie2-options-un-conc">
+
+<pre><code>--un-conc <path>
+--un-conc-gz <path>
+--un-conc-bz2 <path></code></pre>
+</td><td>
+
+<p>Write paired-end reads that fail to align concordantly to file(s) at <code><path></code>. These reads correspond to the SAM records with the FLAGS <code>0x4</code> bit set and either the <code>0x40</code> or <code>0x80</code> bit set (depending on whether it's mate #1 or #2). <code>.1</code> and <code>.2</code> strings are added to the filename to distinguish which file contains mate #1 and mate #2. If a percent symbol, <code>%</code>, is used in <code><path></code>, the p [...]
+</td></tr>
+<tr><td id="bowtie2-options-al-conc">
+
+<pre><code>--al-conc <path>
+--al-conc-gz <path>
+--al-conc-bz2 <path></code></pre>
+</td><td>
+
+<p>Write paired-end reads that align concordantly at least once to file(s) at <code><path></code>. These reads correspond to the SAM records with the FLAGS <code>0x4</code> bit unset and either the <code>0x40</code> or <code>0x80</code> bit set (depending on whether it's mate #1 or #2). <code>.1</code> and <code>.2</code> strings are added to the filename to distinguish which file contains mate #1 and mate #2. If a percent symbol, <code>%</code>, is used in <code><path></code [...]
+</td></tr>
+<tr><td id="bowtie2-options-quiet">
+
+<pre><code>--quiet</code></pre>
+</td><td>
+
+<p>Print nothing besides alignments and serious errors.</p>
+</td></tr>
+<tr><td id="bowtie2-options-met-file">
+
+<pre><code>--met-file <path></code></pre>
+</td><td>
+
+<p>Write <code>bowtie2</code> metrics to file <code><path></code>. Having alignment metric can be useful for debugging certain problems, especially performance issues. See also: <a href="#bowtie2-options-met"><code>--met</code></a>. Default: metrics disabled.</p>
+</td></tr>
+<tr><td id="bowtie2-options-met-stderr">
+
+<pre><code>--met-stderr <path></code></pre>
+</td><td>
+
+<p>Write <code>bowtie2</code> metrics to the "standard error" ("stderr") filehandle. This is not mutually exclusive with <a href="#bowtie2-options-met-file"><code>--met-file</code></a>. Having alignment metric can be useful for debugging certain problems, especially performance issues. See also: <a href="#bowtie2-options-met"><code>--met</code></a>. Default: metrics disabled.</p>
+</td></tr>
+<tr><td id="bowtie2-options-met">
+
+<pre><code>--met <int></code></pre>
+</td><td>
+
+<p>Write a new <code>bowtie2</code> metrics record every <code><int></code> seconds. Only matters if either <a href="#bowtie2-options-met-stderr"><code>--met-stderr</code></a> or <a href="#bowtie2-options-met-file"><code>--met-file</code></a> are specified. Default: 1.</p>
+</td></tr>
+</table>
+
+<h4 id="sam-options">SAM options</h4>
+<table>
+
+<tr><td id="bowtie2-options-no-unal">
+
+<pre><code>--no-unal</code></pre>
+</td><td>
+
+<p>Suppress SAM records for reads that failed to align.</p>
+</td></tr>
+<tr><td id="bowtie2-options-no-hd">
+
+<pre><code>--no-hd</code></pre>
+</td><td>
+
+<p>Suppress SAM header lines (starting with <code>@</code>).</p>
+</td></tr>
+<tr><td id="bowtie2-options-no-sq">
+
+<pre><code>--no-sq</code></pre>
+</td><td>
+
+<p>Suppress <code>@SQ</code> SAM header lines.</p>
+</td></tr>
+<tr><td id="bowtie2-options-rg-id">
+
+<pre><code>--rg-id <text></code></pre>
+</td><td>
+
+<p>Set the read group ID to <code><text></code>. This causes the SAM <code>@RG</code> header line to be printed, with <code><text></code> as the value associated with the <code>ID:</code> tag. It also causes the <code>RG:Z:</code> extra field to be attached to each SAM output record, with value set to <code><text></code>.</p>
+</td></tr>
+<tr><td id="bowtie2-options-rg">
+
+<pre><code>--rg <text></code></pre>
+</td><td>
+
+<p>Add <code><text></code> (usually of the form <code>TAG:VAL</code>, e.g. <code>SM:Pool1</code>) as a field on the <code>@RG</code> header line. Note: in order for the <code>@RG</code> line to appear, <a href="#bowtie2-options-rg-id"><code>--rg-id</code></a> must also be specified. This is because the <code>ID</code> tag is required by the <a href="http://samtools.sourceforge.net/SAM1.pdf">SAM Spec</a>. Specify <code>--rg</code> multiple times to set multiple fields. See the <a hr [...]
+</td></tr>
+<tr><td id="bowtie2-options-omit-sec-seq">
+
+<pre><code>--omit-sec-seq</code></pre>
+</td><td>
+
+<p>When printing secondary alignments, Bowtie 2 by default will write out the <code>SEQ</code> and <code>QUAL</code> strings. Specifying this option causes Bowtie 2 to print an asterisk in those fields instead.</p>
+</td></tr>
+
+
+</table>
+
+<h4 id="performance-options">Performance options</h4>
+<table><tr>
+
+<td id="bowtie2-options-o">
+
+<pre><code>-o/--offrate <int></code></pre>
+</td><td>
+
+<p>Override the offrate of the index with <code><int></code>. If <code><int></code> is greater than the offrate used to build the index, then some row markings are discarded when the index is read into memory. This reduces the memory footprint of the aligner but requires more time to calculate text offsets. <code><int></code> must be greater than the value used to build the index.</p>
+</td></tr>
+<tr><td id="bowtie2-options-p">
+
+<pre><code>-p/--threads NTHREADS</code></pre>
+</td><td>
+
+<p>Launch <code>NTHREADS</code> parallel search threads (default: 1). Threads will run on separate processors/cores and synchronize when parsing reads and outputting alignments. Searching for alignments is highly parallel, and speedup is close to linear. Increasing <code>-p</code> increases Bowtie 2's memory footprint. E.g. when aligning to a human genome index, increasing <code>-p</code> from 1 to 8 increases the memory footprint by a few hundred megabytes. This option is only available [...]
+</td></tr>
+<tr><td id="bowtie2-options-reorder">
+
+<pre><code>--reorder</code></pre>
+</td><td>
+
+<p>Guarantees that output SAM records are printed in an order corresponding to the order of the reads in the original input file, even when <a href="#bowtie2-options-p"><code>-p</code></a> is set greater than 1. Specifying <code>--reorder</code> and setting <a href="#bowtie2-options-p"><code>-p</code></a> greater than 1 causes Bowtie 2 to run somewhat slower and use somewhat more memory than if <code>--reorder</code> were not specified. Has no effect if <a href="#bowtie2-options-p"><code [...]
+</td></tr>
+<tr><td id="bowtie2-options-mm">
+
+<pre><code>--mm</code></pre>
+</td><td>
+
+<p>Use memory-mapped I/O to load the index, rather than typical file I/O. Memory-mapping allows many concurrent <code>bowtie</code> processes on the same computer to share the same memory image of the index (i.e. you pay the memory overhead just once). This facilitates memory-efficient parallelization of <code>bowtie</code> in situations where using <a href="#bowtie2-options-p"><code>-p</code></a> is not possible or not preferable.</p>
+</td></tr></table>
+
+<h4 id="other-options">Other options</h4>
+<table>
+<tr><td id="bowtie2-options-qc-filter">
+
+<pre><code>--qc-filter</code></pre>
+</td><td>
+
+<p>Filter out reads for which the QSEQ filter field is non-zero. Only has an effect when read format is <a href="#bowtie2-options-qseq"><code>--qseq</code></a>. Default: off.</p>
+</td></tr>
+<tr><td id="bowtie2-options-seed">
+
+<pre><code>--seed <int></code></pre>
+</td><td>
+
+<p>Use <code><int></code> as the seed for pseudo-random number generator. Default: 0.</p>
+</td></tr>
+<tr><td id="bowtie2-options-non-deterministic">
+
+<pre><code>--non-deterministic</code></pre>
+</td><td>
+
+<p>Normally, Bowtie 2 re-initializes its pseudo-random generator for each read. It seeds the generator with a number derived from (a) the read name, (b) the nucleotide sequence, (c) the quality sequence, (d) the value of the <a href="#bowtie2-options-seed"><code>--seed</code></a> option. This means that if two reads are identical (same name, same nucleotides, same qualities) Bowtie 2 will find and report the same alignment(s) for both, even if there was ambiguity. When <code>--non-determ [...]
+</td></tr>
+<tr><td id="bowtie2-options-version">
+
+<pre><code>--version</code></pre>
+</td><td>
+
+<p>Print version information and quit.</p>
+</td></tr>
+<tr><td id="bowtie2-options-h">
+
+<pre><code>-h/--help</code></pre>
+</td><td>
+
+<p>Print usage information and quit.</p>
+</td></tr></table>
+
+<h2 id="sam-output">SAM output</h2>
+<p>Following is a brief description of the <a href="http://samtools.sourceforge.net/SAM1.pdf">SAM</a> format as output by <code>bowtie2</code>. For more details, see the <a href="http://samtools.sourceforge.net/SAM1.pdf">SAM format specification</a>.</p>
+<p>By default, <code>bowtie2</code> prints a SAM header with <code>@HD</code>, <code>@SQ</code> and <code>@PG</code> lines. When one or more <a href="#bowtie2-options-rg"><code>--rg</code></a> arguments are specified, <code>bowtie2</code> will also print an <code>@RG</code> line that includes all user-specified <a href="#bowtie2-options-rg"><code>--rg</code></a> tokens separated by tabs.</p>
+<p>Each subsequent line describes an alignment or, if the read failed to align, a read. Each line is a collection of at least 12 fields separated by tabs; from left to right, the fields are:</p>
+<ol style="list-style-type: decimal">
+<li><p>Name of read that aligned.</p>
+<p>Note that the <a href="http://samtools.sourceforge.net/SAM1.pdf">SAM specification</a> disallows whitespace in the read name. If the read name contains any whitespace characters, Bowtie 2 will truncate the name at the first whitespace character. This is similar to the behavior of other tools.</p></li>
+<li><p>Sum of all applicable flags. Flags relevant to Bowtie are:</p>
+<table><tr><td>
+
+<pre><code>1</code></pre>
+</td><td>
+
+<p>The read is one of a pair</p>
+</td></tr><tr><td>
+
+<pre><code>2</code></pre>
+</td><td>
+
+<p>The alignment is one end of a proper paired-end alignment</p>
+</td></tr><tr><td>
+
+<pre><code>4</code></pre>
+</td><td>
+
+<p>The read has no reported alignments</p>
+</td></tr><tr><td>
+
+<pre><code>8</code></pre>
+</td><td>
+
+<p>The read is one of a pair and has no reported alignments</p>
+</td></tr><tr><td>
+
+<pre><code>16</code></pre>
+</td><td>
+
+<p>The alignment is to the reverse reference strand</p>
+</td></tr><tr><td>
+
+<pre><code>32</code></pre>
+</td><td>
+
+<p>The other mate in the paired-end alignment is aligned to the reverse reference strand</p>
+</td></tr><tr><td>
+
+<pre><code>64</code></pre>
+</td><td>
+
+<p>The read is mate 1 in a pair</p>
+</td></tr><tr><td>
+
+<pre><code>128</code></pre>
+</td><td>
+
+<p>The read is mate 2 in a pair</p>
+</td></tr></table>
+
+<p>Thus, an unpaired read that aligns to the reverse reference strand will have flag 16. A paired-end read that aligns and is the first mate in the pair will have flag 83 (= 64 + 16 + 2 + 1).</p></li>
+<li><p>Name of reference sequence where alignment occurs</p></li>
+<li><p>1-based offset into the forward reference strand where leftmost character of the alignment occurs</p></li>
+<li><p>Mapping quality</p></li>
+<li><p>CIGAR string representation of alignment</p></li>
+<li><p>Name of reference sequence where mate's alignment occurs. Set to <code>=</code> if the mate's reference sequence is the same as this alignment's, or <code>*</code> if there is no mate.</p></li>
+<li><p>1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate.</p></li>
+<li><p>Inferred fragment length. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if the mates did not align concordantly. However, size is non-0 if the mates aligned discordantly to the same chromosome.</p></li>
+<li><p>Read sequence (reverse-complemented if aligned to the reverse strand)</p></li>
+<li><p>ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the <a href="http://en.wikipedia.org/wiki/Phred_quality_score">Phred quality</a> scale and the encoding is ASCII-offset by 33 (ASCII char <code>!</code>), similarly to a <a href="http://en.wikipedia.org/wiki/FASTQ_format">FASTQ</a> file.</p></li>
+<li><p>Optional fields. Fields are tab-separated. <code>bowtie2</code> outputs zero or more of these optional fields for each alignment, depending on the type of the alignment:</p>
+<table>
+<tr><td id="bowtie2-build-opt-fields-as">
+</li>
+</ol>
+<pre><code>    AS:i:<N>
+
+</td>
+<td>
+
+Alignment score.  Can be negative.  Can be greater than 0 in [`--local`]
+mode (but not in [`--end-to-end`] mode).  Only present if SAM record is for
+an aligned read.
+
+</td></tr>
+<tr><td id="bowtie2-build-opt-fields-xs"></code></pre>
+<pre><code>    XS:i:<N>
+
+</td>
+<td>
+
+Alignment score for second-best alignment.  Can be negative.  Can be greater
+than 0 in [`--local`] mode (but not in [`--end-to-end`] mode).  Only present
+if the SAM record is for an aligned read and more than one alignment was
+found for the read.
+
+</td></tr>
+<tr><td id="bowtie2-build-opt-fields-ys"></code></pre>
+<pre><code>    YS:i:<N>
+
+</td>
+<td>
+
+Alignment score for opposite mate in the paired-end alignment.  Only present
+if the SAM record is for a read that aligned as part of a paired-end
+alignment.
+
+</td></tr>
+<tr><td id="bowtie2-build-opt-fields-xn"></code></pre>
+<pre><code>    XN:i:<N>
+
+</td>
+<td>
+
+The number of ambiguous bases in the reference covering this alignment. 
+Only present if SAM record is for an aligned read.
+
+</td></tr>
+<tr><td id="bowtie2-build-opt-fields-xm"></code></pre>
+<pre><code>    XM:i:<N>
+
+</td>
+<td>
+
+The number of mismatches in the alignment.  Only present if SAM record is
+for an aligned read.
+
+</td></tr>
+<tr><td id="bowtie2-build-opt-fields-xo"></code></pre>
+<pre><code>    XO:i:<N>
+
+</td>
+<td>
+
+The number of gap opens, for both read and reference gaps, in the alignment.
+Only present if SAM record is for an aligned read.
+
+</td></tr>
+<tr><td id="bowtie2-build-opt-fields-xg"></code></pre>
+<pre><code>    XG:i:<N>
+
+</td>
+<td>
+
+The number of gap extensions, for both read and reference gaps, in the
+alignment. Only present if SAM record is for an aligned read.
+
+</td></tr>
+<tr><td id="bowtie2-build-opt-fields-nm"></code></pre>
+<pre><code>    NM:i:<N>
+
+</td>
+<td>
+
+The edit distance; that is, the minimal number of one-nucleotide edits
+(substitutions, insertions and deletions) needed to transform the read
+string into the reference string.  Only present if SAM record is for an
+aligned read.
+
+</td></tr>
+<tr><td id="bowtie2-build-opt-fields-yf"></code></pre>
+<pre><code>    YF:Z:<S>
+
+</td><td>
+
+String indicating reason why the read was filtered out.  See also:
+[Filtering].  Only appears for reads that were filtered out.
+
+</td></tr>
+<tr><td id="bowtie2-build-opt-fields-yt"></code></pre>
+<pre><code>    YT:Z:<S>
+
+</td><td>
+
+Value of `UU` indicates the read was not part of a pair.  Value of `CP`
+indicates the read was part of a pair and the pair aligned concordantly.
+Value of `DP` indicates the read was part of a pair and the pair aligned
+discordantly.  Value of `UP` indicates the read was part of a pair but the
+pair failed to align either concordantly or discordantly.</code></pre>
+<pre><code></td></tr>
+<tr><td id="bowtie2-build-opt-fields-md"></code></pre>
+<pre><code>    MD:Z:<S>
+
+</td><td>
+
+A string representation of the mismatched reference bases in the alignment. 
+See [SAM] format specification for details.  Only present if SAM record is
+for an aligned read.
+
+</td></tr>
+</table></code></pre>
+<h1 id="the-bowtie2-build-indexer">The <code>bowtie2-build</code> indexer</h1>
+<p><code>bowtie2-build</code> builds a Bowtie index from a set of DNA sequences. <code>bowtie2-build</code> outputs a set of 6 files with suffixes <code>.1.bt2</code>, <code>.2.bt2</code>, <code>.3.bt2</code>, <code>.4.bt2</code>, <code>.rev.1.bt2</code>, and <code>.rev.2.bt2</code>. These files together constitute the index: they are all that is needed to align reads to that reference. The original sequence FASTA files are no longer used by Bowtie 2 once the index is built.</p>
+<p>Bowtie 2's <code>.bt2</code> index format is different from Bowtie 1's <code>.ebwt</code> format, and they are not compatible with each other.</p>
+<p>Use of Karkkainen's <a href="http://portal.acm.org/citation.cfm?id=1314852">blockwise algorithm</a> allows <code>bowtie2-build</code> to trade off between running time and memory usage. <code>bowtie2-build</code> has three options governing how it makes this trade: <a href="#bowtie2-build-options-p"><code>-p</code>/<code>--packed</code></a>, <a href="#bowtie2-build-options-bmax"><code>--bmax</code></a>/<a href="#bowtie2-build-options-bmaxdivn"><code>--bmaxdivn</code></a>, and <a href= [...]
+<p>The indexer provides options pertaining to the "shape" of the index, e.g. <a href="#bowtie2-build-options-o"><code>--offrate</code></a> governs the fraction of <a href="http://en.wikipedia.org/wiki/Burrows-Wheeler_transform">Burrows-Wheeler</a> rows that are "marked" (i.e., the density of the suffix-array sample; see the original <a href="http://portal.acm.org/citation.cfm?id=796543">FM Index</a> paper for details). All of these options are potentially profitable t [...]
+<p>Because <code>bowtie2-build</code> uses 32-bit pointers internally, it can handle up to a theoretical maximum of 2^32-1 (somewhat more than 4 billion) characters in an index, though, with other constraints, the actual ceiling is somewhat less than that. If your reference exceeds 2^32-1 characters, <code>bowtie2-build</code> will print an error message and abort. To resolve this, divide your reference sequences into smaller batches and/or chunks and build a separate index for each.</p>
+<p>If your computer has more than 3-4 GB of memory and you would like to exploit that fact to make index building faster, use a 64-bit version of the <code>bowtie2-build</code> binary. The 32-bit version of the binary is restricted to using less than 4 GB of memory. If a 64-bit pre-built binary does not yet exist for your platform on the sourceforge download site, you will need to build one from source.</p>
+<p>The Bowtie 2 index is based on the <a href="http://portal.acm.org/citation.cfm?id=796543">FM Index</a> of Ferragina and Manzini, which in turn is based on the <a href="http://en.wikipedia.org/wiki/Burrows-Wheeler_transform">Burrows-Wheeler</a> transform. The algorithm used to build the index is based on the <a href="http://portal.acm.org/citation.cfm?id=1314852">blockwise algorithm</a> of Karkkainen.</p>
+<h2 id="command-line-1">Command Line</h2>
+<p>Usage:</p>
+<pre><code>bowtie2-build [options]* <reference_in> <bt2_base></code></pre>
+<h3 id="main-arguments-1">Main arguments</h3>
+<table><tr><td>
+
+<pre><code><reference_in></code></pre>
+</td><td>
+
+<p>A comma-separated list of FASTA files containing the reference sequences to be aligned to, or, if <a href="#bowtie2-build-options-c"><code>-c</code></a> is specified, the sequences themselves. E.g., <code><reference_in></code> might be <code>chr1.fa,chr2.fa,chrX.fa,chrY.fa</code>, or, if <a href="#bowtie2-build-options-c"><code>-c</code></a> is specified, this might be <code>GGTCATCCT,ACGGGTCGT,CCGTTCTATGCGGCTTA</code>.</p>
+</td></tr><tr><td>
+
+<pre><code><bt2_base></code></pre>
+</td><td>
+
+<p>The basename of the index files to write. By default, <code>bowtie2-build</code> writes files named <code>NAME.1.bt2</code>, <code>NAME.2.bt2</code>, <code>NAME.3.bt2</code>, <code>NAME.4.bt2</code>, <code>NAME.rev.1.bt2</code>, and <code>NAME.rev.2.bt2</code>, where <code>NAME</code> is <code><bt2_base></code>.</p>
+</td></tr></table>
+
+<h3 id="options-1">Options</h3>
+<table><tr><td>
+
+<pre><code>-f</code></pre>
+</td><td>
+
+<p>The reference input files (specified as <code><reference_in></code>) are FASTA files (usually having extension <code>.fa</code>, <code>.mfa</code>, <code>.fna</code> or similar).</p>
+</td></tr><tr><td id="bowtie2-build-options-c">
+
+<pre><code>-c</code></pre>
+</td><td>
+
+<p>The reference sequences are given on the command line. I.e. <code><reference_in></code> is a comma-separated list of sequences rather than a list of FASTA files.</p>
+</td></tr>
+<tr><td id="bowtie2-build-options-a">
+
+<pre><code>-a/--noauto</code></pre>
+</td><td>
+
+<p>Disable the default behavior whereby <code>bowtie2-build</code> automatically selects values for the <a href="#bowtie2-build-options-bmax"><code>--bmax</code></a>, <a href="#bowtie2-build-options-dcv"><code>--dcv</code></a> and <a href="#bowtie2-build-options-p"><code>--packed</code></a> parameters according to available memory. Instead, user may specify values for those parameters. If memory is exhausted during indexing, an error message will be printed; it is up to the user to try n [...]
+</td></tr><tr><td id="bowtie2-build-options-p">
+
+<pre><code>-p/--packed</code></pre>
+</td><td>
+
+<p>Use a packed (2-bits-per-nucleotide) representation for DNA strings. This saves memory but makes indexing 2-3 times slower. Default: off. This is configured automatically by default; use <a href="#bowtie2-build-options-a"><code>-a</code>/<code>--noauto</code></a> to configure manually.</p>
+</td></tr><tr><td id="bowtie2-build-options-bmax">
+
+<pre><code>--bmax <int></code></pre>
+</td><td>
+
+<p>The maximum number of suffixes allowed in a block. Allowing more suffixes per block makes indexing faster, but increases peak memory usage. Setting this option overrides any previous setting for <a href="#bowtie2-build-options-bmax"><code>--bmax</code></a>, or <a href="#bowtie2-build-options-bmaxdivn"><code>--bmaxdivn</code></a>. Default (in terms of the <a href="#bowtie2-build-options-bmaxdivn"><code>--bmaxdivn</code></a> parameter) is <a href="#bowtie2-build-options-bmaxdivn"><code> [...]
+</td></tr><tr><td id="bowtie2-build-options-bmaxdivn">
+
+<pre><code>--bmaxdivn <int></code></pre>
+</td><td>
+
+<p>The maximum number of suffixes allowed in a block, expressed as a fraction of the length of the reference. Setting this option overrides any previous setting for <a href="#bowtie2-build-options-bmax"><code>--bmax</code></a>, or <a href="#bowtie2-build-options-bmaxdivn"><code>--bmaxdivn</code></a>. Default: <a href="#bowtie2-build-options-bmaxdivn"><code>--bmaxdivn</code></a> 4. This is configured automatically by default; use <a href="#bowtie2-build-options-a"><code>-a</code>/<code>-- [...]
+</td></tr><tr><td id="bowtie2-build-options-dcv">
+
+<pre><code>--dcv <int></code></pre>
+</td><td>
+
+<p>Use <code><int></code> as the period for the difference-cover sample. A larger period yields less memory overhead, but may make suffix sorting slower, especially if repeats are present. Must be a power of 2 no greater than 4096. Default: 1024. This is configured automatically by default; use <a href="#bowtie2-build-options-a"><code>-a</code>/<code>--noauto</code></a> to configure manually.</p>
+</td></tr><tr><td id="bowtie2-build-options-nodc">
+
+<pre><code>--nodc</code></pre>
+</td><td>
+
+<p>Disable use of the difference-cover sample. Suffix sorting becomes quadratic-time in the worst case (where the worst case is an extremely repetitive reference). Default: off.</p>
+</td></tr><tr><td>
+
+<pre><code>-r/--noref</code></pre>
+</td><td>
+
+<p>Do not build the <code>NAME.3.bt2</code> and <code>NAME.4.bt2</code> portions of the index, which contain a bitpacked version of the reference sequences and are used for paired-end alignment.</p>
+</td></tr><tr><td>
+
+<pre><code>-3/--justref</code></pre>
+</td><td>
+
+<p>Build only the <code>NAME.3.bt2</code> and <code>NAME.4.bt2</code> portions of the index, which contain a bitpacked version of the reference sequences and are used for paired-end alignment.</p>
+</td></tr><tr><td id="bowtie2-build-options-o">
+
+<pre><code>-o/--offrate <int></code></pre>
+</td><td>
+
+<p>To map alignments back to positions on the reference sequences, it's necessary to annotate ("mark") some or all of the <a href="http://en.wikipedia.org/wiki/Burrows-Wheeler_transform">Burrows-Wheeler</a> rows with their corresponding location on the genome. <a href="#bowtie2-build-options-o"><code>-o</code>/<code>--offrate</code></a> governs how many rows get marked: the indexer will mark every 2^<code><int></code> rows. Marking more rows makes reference-position looku [...]
+</td></tr><tr><td>
+
+<pre><code>-t/--ftabchars <int></code></pre>
+</td><td>
+
+<p>The ftab is the lookup table used to calculate an initial <a href="http://en.wikipedia.org/wiki/Burrows-Wheeler_transform">Burrows-Wheeler</a> range with respect to the first <code><int></code> characters of the query. A larger <code><int></code> yields a larger lookup table but faster query times. The ftab has size 4^(<code><int></code>+1) bytes. The default setting is 10 (ftab is 4MB).</p>
+</td></tr><tr><td>
+
+<pre><code>--seed <int></code></pre>
+</td><td>
+
+<p>Use <code>&lt;int&gt;</code> as the seed for the pseudo-random number generator.</p>
+</td></tr><tr><td>
+
+<pre><code>--cutoff <int></code></pre>
+</td><td>
+
+<p>Index only the first <code><int></code> bases of the reference sequences (cumulative across sequences) and ignore the rest.</p>
+</td></tr><tr><td>
+
+<pre><code>-q/--quiet</code></pre>
+</td><td>
+
+<p><code>bowtie2-build</code> is verbose by default. With this option <code>bowtie2-build</code> will print only error messages.</p>
+</td></tr><tr><td>
+
+<pre><code>-h/--help</code></pre>
+</td><td>
+
+<p>Print usage information and quit.</p>
+</td></tr><tr><td>
+
+<pre><code>--version</code></pre>
+</td><td>
+
+<p>Print version information and quit.</p>
+</td></tr></table>
+
+<h1 id="the-bowtie2-inspect-index-inspector">The <code>bowtie2-inspect</code> index inspector</h1>
+<p><code>bowtie2-inspect</code> extracts information from a Bowtie index about what kind of index it is and what reference sequences were used to build it. When run without any options, the tool will output a FASTA file containing the sequences of the original references (with all non-<code>A</code>/<code>C</code>/<code>G</code>/<code>T</code> characters converted to <code>N</code>s). It can also be used to extract just the reference sequence names using the <a href="#bowtie2-inspect-opt [...]
+<h2 id="command-line-2">Command Line</h2>
+<p>Usage:</p>
+<pre><code>bowtie2-inspect [options]* <bt2_base></code></pre>
+<h3 id="main-arguments-2">Main arguments</h3>
+<table><tr><td>
+
+<pre><code><bt2_base></code></pre>
+</td><td>
+
+<p>The basename of the index to be inspected. The basename is name of any of the index files but with the <code>.X.bt2</code> or <code>.rev.X.bt2</code> suffix omitted. <code>bowtie2-inspect</code> first looks in the current directory for the index files, then in the directory specified in the <code>BOWTIE2_INDEXES</code> environment variable.</p>
+</td></tr></table>
+
+<h3 id="options-2">Options</h3>
+<table><tr><td>
+
+<pre><code>-a/--across <int></code></pre>
+</td><td>
+
+<p>When printing FASTA output, output a newline character every <code><int></code> bases (default: 60).</p>
+</td></tr><tr><td id="bowtie2-inspect-options-n">
+
+<pre><code>-n/--names</code></pre>
+</td><td>
+
+<p>Print reference sequence names, one per line, and quit.</p>
+</td></tr><tr><td id="bowtie2-inspect-options-s">
+
+<pre><code>-s/--summary</code></pre>
+</td><td>
+
+<p>Print a summary that includes information about index settings, as well as the names and lengths of the input sequences. The summary has this format:</p>
+<pre><code>Colorspace  <0 or 1>
+SA-Sample   1 in <sample>
+FTab-Chars  <chars>
+Sequence-1  <name>  <len>
+Sequence-2  <name>  <len>
+...
+Sequence-N  <name>  <len></code></pre>
+<p>Fields are separated by tabs. Colorspace is always set to 0 for Bowtie 2.</p>
+</td></tr><tr><td>
+
+<pre><code>-v/--verbose</code></pre>
+</td><td>
+
+<p>Print verbose output (for debugging).</p>
+</td></tr><tr><td>
+
+<pre><code>--version</code></pre>
+</td><td>
+
+<p>Print version information and quit.</p>
+</td></tr><tr><td>
+
+<pre><code>-h/--help</code></pre>
+</td><td>
+
+<p>Print usage information and quit.</p>
+</td></tr></table>
+
+<h1 id="getting-started-with-bowtie-2-lambda-phage-example">Getting started with Bowtie 2: Lambda phage example</h1>
+<p>Bowtie 2 comes with some example files to get you started. The example files are not scientifically significant; we use the <a href="http://en.wikipedia.org/wiki/Lambda_phage">Lambda phage</a> reference genome simply because it's short, and the reads were generated by a computer program, not a sequencer. However, these files will let you start running Bowtie 2 and downstream tools right away.</p>
+<p>First follow the manual instructions to <a href="#obtaining-bowtie-2">obtain Bowtie 2</a>. Set the <code>BT2_HOME</code> environment variable to point to the new Bowtie 2 directory containing the <code>bowtie2</code>, <code>bowtie2-build</code> and <code>bowtie2-inspect</code> binaries. This is important, as the <code>BT2_HOME</code> variable is used in the commands below to refer to that directory.</p>
+<h2 id="indexing-a-reference-genome">Indexing a reference genome</h2>
+<p>To create an index for the <a href="http://en.wikipedia.org/wiki/Lambda_phage">Lambda phage</a> reference genome included with Bowtie 2, create a new temporary directory (it doesn't matter where), change into that directory, and run:</p>
+<pre><code>$BT2_HOME/bowtie2-build $BT2_HOME/example/reference/lambda_virus.fa lambda_virus</code></pre>
+<p>The command should print many lines of output then quit. When the command completes, the current directory will contain six new files that all start with <code>lambda_virus</code> and end with <code>.1.bt2</code>, <code>.2.bt2</code>, <code>.3.bt2</code>, <code>.4.bt2</code>, <code>.rev.1.bt2</code>, and <code>.rev.2.bt2</code>. These files constitute the index - you're done!</p>
+<p>You can use <code>bowtie2-build</code> to create an index for a set of FASTA files obtained from any source, including sites such as <a href="http://genome.ucsc.edu/cgi-bin/hgGateway">UCSC</a>, <a href="http://www.ncbi.nlm.nih.gov/sites/genome">NCBI</a>, and <a href="http://www.ensembl.org/">Ensembl</a>. When indexing multiple FASTA files, specify all the files using commas to separate file names. For more details on how to create an index with <code>bowtie2-build</code>, see the <a h [...]
+<h2 id="aligning-example-reads">Aligning example reads</h2>
+<p>Stay in the directory created in the previous step, which now contains the <code>lambda_virus</code> index files. Next, run:</p>
+<pre><code>$BT2_HOME/bowtie2 -x lambda_virus -U $BT2_HOME/example/reads/reads_1.fq -S eg1.sam</code></pre>
+<p>This runs the Bowtie 2 aligner, which aligns a set of unpaired reads to the <a href="http://en.wikipedia.org/wiki/Lambda_phage">Lambda phage</a> reference genome using the index generated in the previous step. The alignment results in SAM format are written to the file <code>eg1.sam</code>, and a short alignment summary is written to the console. (Actually, the summary is written to the "standard error" or "stderr" filehandle, which is typically printed to the cons [...]
+<p>To see the first few lines of the SAM output, run:</p>
+<pre><code>head eg1.sam</code></pre>
+<p>You will see something like this:</p>
+<pre><code>@HD VN:1.0  SO:unsorted
+ at SQ SN:gi|9626243|ref|NC_001416.1|  LN:48502
+ at PG ID:bowtie2  PN:bowtie2  VN:2.0.1
+r1  0   gi|9626243|ref|NC_001416.1| 18401   42  122M    *   0   0   TGAATGCGAACTCCGGGACGCTCAGTAATGTGACGATAGCTGAAAACTGTACGATAAACNGTACGCTGAGGGCAGAAAAAATCGTCGGGGACATTNTAAAGGCGGCGAGCGCGGCTTTTCCG  +"@6<:27(F&5)9)"B:%B+A-%5A?2$HCB0B+0=D<7E/<.03#!.F77 at 6B==?C"7>;))%;,3-$.A06+<-1/@@?,26">=?*@'0;$:;??G+:#+(A?9+10!8!?()?7C>  AS:i:-5 XN:i:0  XM:i:3  XO:i:0  XG:i:0  NM:i:3  MD:Z:59G13G21G26    YT:Z:UU
+r2  0   gi|9626243|ref|NC_001416.1| 8886    42  275M    *   0   0   NTTNTGATGCGGGCTTGTGGAGTTCAGCCGATCTGACTTATGTCATTACCTATGAAATGTGAGGACGCTATGCCTGTACCAAATCCTACAATGCCGGTGAAAGGTGCCGGGATCACCCTGTGGGTTTATAAGGGGATCGGTGACCCCTACGCGAATCCGCTTTCAGACGTTGACTGGTCGCGTCTGGCAAAAGTTAAAGACCTGACGCCCGGCGAACTGACCGCTGAGNCCTATGACGACAGCTATCTCGATGATGAAGATGCAGACTGGACTGC (#!!'+!$""%+(+)'%)%!+!(&++)''"#"#&#"!'!("%'""("+&%$%*%%#$%#%#!)*&#3 [...]
+r3  16  gi|9626243|ref|NC_001416.1| 11599   42  338M    *   0   0   GGGCGCGTTACTGGGATGATCGTGAAAAGGCCCGTCTTGCGCTTGAAGCCGCCCGAAAGAAGGCTGAGCAGCAGACTCAAGAGGAGAAAAATGCGCAGCAGCGGAGCGATACCGAAGCGTCACGGCTGAAATATACCGAAGAGGCGCAGAAGGCTNACGAACGGCTGCAGACGCCGCTGCAGAAATATACCGCCCGTCAGGAAGAACTGANCAAGGCACNGAAAGACGGGAAAATCCTGCAGGCGGATTACAACACGCTGATGGCGGCGGCGAAAAAGGATTATGAAGCGACGCTGTAAAAGCCGAAACAGTCCAGCGTGAAGGTGTCTGCGGGCGAT  7F$%6=$:9B@/F'>=?!D?@0(:A*)7/>9C>6#1<6:C(.CC;#.;>;2'$4D:?&amp [...]
+r4  0   gi|9626243|ref|NC_001416.1| 40075   42  184M    *   0   0   GGGCCAATGCGCTTACTGATGCGGAATTACGCCGTAAGGCCGCAGATGAGCTTGTCCATATGACTGCGAGAATTAACNGTGGTGAGGCGATCCCTGAACCAGTAAAACAACTTCCTGTCATGGGCGGTAGACCTCTAAATCGTGCACAGGCTCTGGCGAAGATCGCAGAAATCAAAGCTAAGT(=8B)GD04*G%&4F,1'A>.C&7=F$,+#6!))43C,5/5+)?-/0>/D3=-,2/+.1?@->;)00!'3!7BH$G)HG+ADC'#-9F)7<7"$?&.>0)@5;4,!0-#C!15CF8&HB+B==H>7,/)C5)5*+(F5A%D,EA<(>G9E0>7&/E?4%;#'92)<5+ at 7 [...]
+r5  0   gi|9626243|ref|NC_001416.1| 48010   42  138M    *   0   0   GTCAGGAAAGTGGTAAAACTGCAACTCAATTACTGCAATGCCCTCGTAATTAAGTGAATTTACAATATCGTCCTGTTCGGAGGGAAGAACGCGGGATGTTCATTCTTCATCACTTTTAATTGATGTATATGCTCTCTT  9''%<D)A03E1-*7=),:F/0!6,D9:H,<9D%:0B(%'E,(8EFG$E89B$27G8F*2+4,-!,0D5()&=(FGG:5;3*@/.0F-G#5#3->('FDFEG?)5.!)"AGADB3?6(@H(:B<>6!>;>6>G,."?%  AS:i:0  XN:i:0  XM:i:0  XO:i:0  XG:i:0  NM:i:0  MD:Z:138    YT:Z:UU
+r6  16  gi|9626243|ref|NC_001416.1| 41607   42  72M2D119M   *   0   0   TCGATTTGCAAATACCGGAACATCTCGGTAACTGCATATTCTGCATTAAAAAATCAACGCAAAAAATCGGACGCCTGCAAAGATGAGGAGGGATTGCAGCGTGTTTTTAATGAGGTCATCACGGGATNCCATGTGCGTGACGGNCATCGGGAAACGCCAAAGGAGATTATGTACCGAGGAAGAATGTCGCT 1H#G;H"$E*E#&"*)2%66?=9/9'=;4)4/>@%+5#@#$4A*!<D=="8#1*A9BA=:(1+#C&.#(3#H=9E)AC*5,AC#E'536*2?)H14?>9'B=7(3H/B:+A:8%1-+#(E%&$$&14"76D?>7(&20H5%*&CF8!G5B+A4F$7(:&q [...]
+r7  16  gi|9626243|ref|NC_001416.1| 4692    42  143M    *   0   0   TCAGCCGGACGCGGGCGCTGCAGCCGTACTCGGGGATGACCGGTTACAACGGCATTATCGCCCGTCTGCAACAGGCTGCCAGCGATCCGATGGTGGACAGCATTCTGCTCGATATGGACANGCCCGGCGGGATGGTGGCGGGG -"/@*7A0)>2,AAH@&"%B)*5*23B/,)90.B@%=FE,E063C9?,:26$-0:,.,1849'4.;F>FA;76+5&$<C":$!A*,<B,<)@<'85D%C*:)30 at 85;?.B$05=@95DCDH<53!8G:F:B7/A.E':434> AS:i:-6 XN:i:0  XM:i:2  XO:i:0  XG:i:0  NM:i:2  MD:Z:98G21C22   YT:Z:UU</code></pre>
+<p>The first few lines (beginning with <code>@</code>) are SAM header lines, and the rest of the lines are SAM alignments, one line per read or mate. See the <a href="#sam-output">Bowtie 2 manual section on SAM output</a> and the <a href="http://samtools.sourceforge.net/SAM1.pdf">SAM specification</a> for details about how to interpret the SAM file format.</p>
+<h2 id="paired-end-example">Paired-end example</h2>
+<p>To align paired-end reads included with Bowtie 2, stay in the same directory and run:</p>
+<pre><code>$BT2_HOME/bowtie2 -x lambda_virus -1 $BT2_HOME/example/reads/reads_1.fq -2 $BT2_HOME/example/reads/reads_2.fq -S eg2.sam</code></pre>
+<p>This aligns a set of paired-end reads to the reference genome, with results written to the file <code>eg2.sam</code>.</p>
+<h2 id="local-alignment-example">Local alignment example</h2>
+<p>To use <a href="#end-to-end-alignment-versus-local-alignment">local alignment</a> to align some longer reads included with Bowtie 2, stay in the same directory and run:</p>
+<pre><code>$BT2_HOME/bowtie2 --local -x lambda_virus -U $BT2_HOME/example/reads/longreads.fq -S eg3.sam</code></pre>
+<p>This aligns the long reads to the reference genome using local alignment, with results written to the file <code>eg3.sam</code>.</p>
+<h2 id="using-samtoolsbcftools-downstream">Using SAMtools/BCFtools downstream</h2>
+<p><a href="http://samtools.sourceforge.net/">SAMtools</a> is a collection of tools for manipulating and analyzing SAM and BAM alignment files. <a href="http://samtools.sourceforge.net/mpileup.shtml">BCFtools</a> is a collection of tools for calling variants and manipulating VCF and BCF files, and it is typically distributed with <a href="http://samtools.sourceforge.net/">SAMtools</a>. Using these tools together allows you to get from alignments in SAM format to variant calls in VCF form [...]
+<p>Run the paired-end example:</p>
+<pre><code>$BT2_HOME/bowtie2 -x $BT2_HOME/example/index/lambda_virus -1 $BT2_HOME/example/reads/reads_1.fq -2 $BT2_HOME/example/reads/reads_2.fq -S eg2.sam</code></pre>
+<p>Use <code>samtools view</code> to convert the SAM file into a BAM file. BAM is the binary format corresponding to the SAM text format. Run:</p>
+<pre><code>samtools view -bS eg2.sam > eg2.bam</code></pre>
+<p>Use <code>samtools sort</code> to convert the BAM file to a sorted BAM file.</p>
+<pre><code>samtools sort eg2.bam eg2.sorted</code></pre>
+<p>We now have a sorted BAM file called <code>eg2.sorted.bam</code>. Sorted BAM is a useful format because the alignments are (a) compressed, which is convenient for long-term storage, and (b) sorted, which is convenient for variant discovery. To generate variant calls in VCF format, run:</p>
+<pre><code>samtools mpileup -uf $BT2_HOME/example/reference/lambda_virus.fa eg2.sorted.bam | bcftools view -bvcg - > eg2.raw.bcf</code></pre>
+<p>Then to view the variants, run:</p>
+<pre><code>bcftools view eg2.raw.bcf</code></pre>
+<p>See the official SAMtools guide to <a href="http://samtools.sourceforge.net/mpileup.shtml">Calling SNPs/INDELs with SAMtools/BCFtools</a> for more details and variations on this process.</p>
+</body>
+</html>
diff --git a/doc/manual.inc.html b/doc/manual.inc.html
new file mode 100644
index 0000000..d778a78
--- /dev/null
+++ b/doc/manual.inc.html
@@ -0,0 +1,884 @@
+<div id="TOC">
+<ul>
+<li><a href="#introduction">Introduction</a><ul>
+<li><a href="#what-is-centrifuge">What is Centrifuge?</a></li>
+</ul></li>
+<li><a href="#obtaining-centrifuge">Obtaining Centrifuge</a><ul>
+<li><a href="#building-from-source">Building from source</a></li>
+</ul></li>
+<li><a href="#running-centrifuge">Running Centrifuge</a><ul>
+<li><a href="#adding-to-path">Adding to PATH</a></li>
+<li><a href="#before-running-centrifuge">Before running Centrifuge</a></li>
+<li><a href="#database-download-and-index-building">Database download and index building</a><ul>
+<li><a href="#building-index-on-all-complete-bacterial-and-viral-genomes">Building index on all complete bacterial and viral genomes</a></li>
+<li><a href="#adding-human-or-mouse-genome-to-the-index">Adding human or mouse genome to the index</a></li>
+<li><a href="#nt-database">nt database</a></li>
+<li><a href="#custom-database">Custom database</a></li>
+<li><a href="#centrifuge-classification-output">Centrifuge classification output</a></li>
+<li><a href="#centrifuge-summary-output-the-default-filename-is-centrifuge_report.tsv">Centrifuge summary output (the default filename is centrifuge_report.tsv)</a></li>
+</ul></li>
+<li><a href="#inspecting-the-centrifuge-index">Inspecting the Centrifuge index</a></li>
+<li><a href="#wrapper">Wrapper</a></li>
+<li><a href="#performance-tuning">Performance tuning</a></li>
+<li><a href="#command-line">Command Line</a><ul>
+<li><a href="#usage">Usage</a></li>
+<li><a href="#main-arguments">Main arguments</a></li>
+<li><a href="#options">Options</a></li>
+</ul></li>
+</ul></li>
+<li><a href="#the-centrifuge-build-indexer">The <code>centrifuge-build</code> indexer</a><ul>
+<li><a href="#command-line-1">Command Line</a><ul>
+<li><a href="#main-arguments-1">Main arguments</a></li>
+<li><a href="#options-1">Options</a></li>
+</ul></li>
+</ul></li>
+<li><a href="#the-centrifuge-inspect-index-inspector">The <code>centrifuge-inspect</code> index inspector</a><ul>
+<li><a href="#command-line-2">Command Line</a><ul>
+<li><a href="#main-arguments-2">Main arguments</a></li>
+<li><a href="#options-2">Options</a></li>
+</ul></li>
+</ul></li>
+<li><a href="#getting-started-with-centrifuge">Getting started with Centrifuge</a><ul>
+<li><a href="#indexing-a-reference-genome">Indexing a reference genome</a></li>
+<li><a href="#classifying-example-reads">Classifying example reads</a></li>
+</ul></li>
+</ul>
+</div>
+<!--
+ ! This manual is written in "markdown" format and thus contains some
+ ! distracting formatting clutter.  See 'MANUAL' for an easier-to-read version
+ ! of this text document, or see the HTML manual online.
+ ! -->
+
+<h1 id="introduction">Introduction</h1>
+<h2 id="what-is-centrifuge">What is Centrifuge?</h2>
+<p><a href="http://www.ccb.jhu.edu/software/centrifuge">Centrifuge</a> is a novel microbial classification engine that enables rapid, accurate, and sensitive labeling of reads and quantification of species on desktop computers. The system uses a novel indexing scheme based on the Burrows-Wheeler transform (BWT) and the Ferragina-Manzini (FM) index, optimized specifically for the metagenomic classification problem. Centrifuge requires a relatively small index (5.8 GB for all complete bact [...]
+<h1 id="obtaining-centrifuge">Obtaining Centrifuge</h1>
+<p>Download Centrifuge and binaries from the Releases sections on the right side. Binaries are available for Intel architectures (<code>x86_64</code>) running Linux, and Mac OS X.</p>
+<h2 id="building-from-source">Building from source</h2>
+<p>Building Centrifuge from source requires a GNU-like environment with GCC, GNU Make and other basics. It should be possible to build Centrifuge on most vanilla Linux installations or on a Mac installation with <a href="http://developer.apple.com/xcode/">Xcode</a> installed. Centrifuge can also be built on Windows using <a href="http://www.cygwin.com/">Cygwin</a> or <a href="http://www.mingw.org/">MinGW</a> (MinGW recommended). For a MinGW build the choice of what compiler is to be used [...]
+<p>First, download the [source package] from the Releases section on the right side. Unzip the file, change to the unzipped directory, and build the Centrifuge tools by running GNU <code>make</code> (usually with the command <code>make</code>, but sometimes with <code>gmake</code>) with no arguments. If building with MinGW, run <code>make</code> from the MSYS environment.</p>
+<p>Centrifuge is using the multithreading software model in order to speed up execution times on SMP architectures where this is possible. On POSIX platforms (like linux, Mac OS, etc) it needs the pthread library. Although it is possible to use pthread library on non-POSIX platform like Windows, due to performance reasons Centrifuge will try to use Windows native multithreading if possible.</p>
+<p>For the support of SRA data access in HISAT2, please download and install the <a href="https://github.com/ncbi/ngs/wiki/Downloads">NCBI-NGS</a> toolkit. When running <code>make</code>, specify additional variables as follow. <code>make USE_SRA=1 NCBI_NGS_DIR=/path/to/NCBI-NGS-directory NCBI_VDB_DIR=/path/to/NCBI-NGS-directory</code>, where <code>NCBI_NGS_DIR</code> and <code>NCBI_VDB_DIR</code> will be used in Makefile for -I and -L compilation options. For example, $(NCBI_NGS_DIR)/in [...]
+<h1 id="running-centrifuge">Running Centrifuge</h1>
+<h2 id="adding-to-path">Adding to PATH</h2>
+<p>By adding your new Centrifuge directory to your <a href="http://en.wikipedia.org/wiki/PATH_(variable)">PATH environment variable</a>, you ensure that whenever you run <code>centrifuge</code>, <code>centrifuge-build</code>, <code>centrifuge-download</code> or <code>centrifuge-inspect</code> from the command line, you will get the version you just installed without having to specify the entire path. This is recommended for most users. To do this, follow your operating system's instructi [...]
+<p>If you would like to install Centrifuge by copying the Centrifuge executable files to an existing directory in your <a href="http://en.wikipedia.org/wiki/PATH_(variable)">PATH</a>, make sure that you copy all the executables, including <code>centrifuge</code>, <code>centrifuge-class</code>, <code>centrifuge-build</code>, <code>centrifuge-build-bin</code>, <code>centrifuge-download</code> <code>centrifuge-inspect</code> and <code>centrifuge-inspect-bin</code>. Furthermore you need the  [...]
+<h2 id="before-running-centrifuge">Before running Centrifuge</h2>
+<p>Classification is considerably different from alignment in that classification is performed on a large set of genomes as opposed to on just one reference genome as in alignment. Currently, an enormous number of complete genomes are available at the GenBank (e.g. >4,000 bacterial genomes, >10,000 viral genomes, …). These genomes are organized in a taxonomic tree where each genome is located at the bottom of the tree, at the strain or subspecies level. On the taxonomic tree, genom [...]
+<p>Given the gigantic number of genomes available, which continues to expand at a rapid rate, and the development of the taxonomic tree, which continues to evolve with new advancements in research, we have designed Centrifuge to be flexible and general enough to reflect this huge database. We provide several standard indexes that will meet most of users’ needs (see the side panel - Indexes). In our approach our indexes not only include raw genome sequences, but also genome names/sizes an [...]
+<p>We encourage first time users to take a look at and follow a <a href="#centrifuge-example"><code>small example</code></a> that illustrates how to build an index, how to run Centrifuge using the index, how to interpret the classification results, and how to extract additional genomic information from the index. For those who choose to build customized indexes, please take a close look at the following description.</p>
+<h2 id="database-download-and-index-building">Database download and index building</h2>
+<p>Centrifuge indexes can be built with arbitrary sequences. Standard choices are all of the complete bacterial and viral genomes, or using the sequences that are part of the BLAST nt database. Centrifuge always needs the nodes.dmp file from the NCBI taxonomy dump to build the taxonomy tree, as well as a sequence ID to taxonomy ID map. The map is a tab-separated file with the sequence ID to taxonomy ID map.</p>
+<p>To download all of the complete archaeal, viral, and bacterial genomes from RefSeq, and build the index:</p>
+<p>Centrifuge indices can be built on arbitrary sequences. Usually an ensemble of genomes is used - such as all complete microbial genomes in the RefSeq database, or all sequences in the BLAST nt database.</p>
+<p>To map sequence identifiers to taxonomy IDs, and taxonomy IDs to names and its parents, three files are necessary in addition to the sequence files:</p>
+<ul>
+<li>taxonomy tree: typically nodes.dmp from the NCBI taxonomy dump. Links taxonomy IDs to their parents</li>
+<li>names file: typically names.dmp from the NCBI taxonomy dump. Links taxonomy IDs to their scientific name</li>
+<li>a tab-separated sequence ID to taxonomy ID mapping</li>
+</ul>
+<p>When using the provided scripts to download the genomes, these files are automatically downloaded or generated. When using a custom taxonomy or sequence files, please refer to the section <code>TODO</code> to learn more about their format.</p>
+<h3 id="building-index-on-all-complete-bacterial-and-viral-genomes">Building index on all complete bacterial and viral genomes</h3>
+<p>Use <code>centrifuge-download</code> to download genomes from NCBI. The following two commands download the NCBI taxonomy to <code>taxonomy/</code> in the current directory, and all complete archaeal, bacterial and viral genomes to <code>library/</code>. Low-complexity regions in the genomes are masked after download (parameter <code>-m</code>) using blast+'s <code>dustmasker</code>. <code>centrifuge-download</code> outputs tab-separated sequence ID to taxonomy ID mappings to standard [...]
+<pre><code>centrifuge-download -o taxonomy taxonomy
+centrifuge-download -o library -m -d "archaea,bacteria,viral" refseq > seqid2taxid.map</code></pre>
+<p>To build the index, first concatenate all downloaded sequences into a single file, and then run <code>centrifuge-build</code>:</p>
+<pre><code>cat library/*/*.fna > input-sequences.fna
+
+## build centrifuge index with 4 threads
+centrifuge-build -p 4 --conversion-table seqid2taxid.map \
+                 --taxonomy-tree taxonomy/nodes.dmp --name-table taxonomy/names.dmp \
+                 input-sequences.fna abv</code></pre>
+<p>After building the index, all files except the index *.[123].cf files may be removed. If you also want to include the human and/or the mouse genome, add their sequences to the library folder before building the index with one of the following commands:</p>
+<p>After the index building, all but the *.[123].cf index files may be removed. I.e. the files in the <code>library/</code> and <code>taxonomy/</code> directories are no longer needed.</p>
+<h3 id="adding-human-or-mouse-genome-to-the-index">Adding human or mouse genome to the index</h3>
+<p>The human and mouse genomes can also be downloaded using <code>centrifuge-download</code>. They are in the domain "vertebrate_mammalian" (argument <code>-d</code>), are assembled at the chromosome level (argument <code>-a</code>) and categorized as reference genomes by RefSeq (<code>-c</code>). The argument <code>-t</code> takes a comma-separated list of taxonomy IDs - e.g. <code>9606</code> for human and <code>10090</code> for mouse:</p>
+<pre><code># download mouse and human reference genomes
+centrifuge-download -o library -d "vertebrate_mammalian" -a "Chromosome" -t 9606,10090 -c 'reference genome' >> seqid2taxid.map
+# only human
+centrifuge-download -o library -d "vertebrate_mammalian" -a "Chromosome" -t 9606 -c 'reference genome' >> seqid2taxid.map
+# only mouse
+centrifuge-download -o library -d "vertebrate_mammalian" -a "Chromosome" -t 10090 -c 'reference genome' >> seqid2taxid.map</code></pre>
+<h3 id="nt-database">nt database</h3>
+<p>NCBI BLAST's nt database contains all spliced non-redundant coding sequences from multiple databases, inferred from genomic sequences. Traditionally used with BLAST, a download of the FASTA is provided on the NCBI homepage. Building an index with any database requires the user to create a sequence ID to taxonomy ID map that can be generated from a GI taxid dump:</p>
+<pre><code>wget ftp://ftp.ncbi.nih.gov/blast/db/FASTA/nt.gz
+gunzip nt.gz && mv -v nt nt.fa
+
+# Get mapping file
+wget ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_nucl.dmp.gz
+gunzip -c gi_taxid_nucl.dmp.gz | sed 's/^/gi|/' > gi_taxid_nucl.map
+
+# build index using 16 cores and a small bucket size, which will require less memory
+centrifuge-build -p 16 --bmax 1342177280 --conversion-table gi_taxid_nucl.map \
+                 --taxonomy-tree taxonomy/nodes.dmp --name-table taxonomy/names.dmp \ 
+                 nt.fa nt</code></pre>
+<h3 id="custom-database">Custom database</h3>
+<p>TODO: Add toy example for nodes.dmp, names.dmp and seqid2taxid.map</p>
+<h3 id="centrifuge-classification-output">Centrifuge classification output</h3>
+<p>The following example shows classification assignments for a read. The assignment output has 8 columns.</p>
+<pre><code>readID    seqID   taxID score      2ndBestScore    hitLength    queryLength numMatches
+1_1       gi|4    9646  4225       0               80   80      1
+
+The first column is the read ID from a raw sequencing read (e.g., 1_1 in the example).
+The second column is the sequence ID of the genomic sequence, where the read is classified (e.g., gi|4).
+The third column is the taxonomic ID of the genomic sequence in the second column (e.g., 9646).
+The fourth column is the score for the classification, which is the weighted sum of hits (e.g., 4225)
+The fifth column is the score for the next best classification (e.g., 0).
+The sixth column is an approximate number of base pairs of the read that match the genomic sequence (hitLength; e.g., 80).
+The seventh column is the length of a read or the combined length of mate pairs (queryLength; e.g., 80).
+The eighth column is the number of classifications, indicating how many assignments were made (e.g.,1).</code></pre>
+<h3 id="centrifuge-summary-output-the-default-filename-is-centrifuge_report.tsv">Centrifuge summary output (the default filename is centrifuge_report.tsv)</h3>
+<p>The following example shows a classification summary for each genome or taxonomic unit. The assignment output has 7 columns.</p>
+<pre><code>name                                                            taxID   taxRank    genomeSize   numReads   numUniqueReads   abundance
+Wigglesworthia glossinidia endosymbiont of Glossina brevipalpis 36870   leaf       703004       5981       5964             0.0152317
+
+The first column is the name of a genome, or the name corresponding to a taxonomic ID (the second column) at a rank higher than the strain (e.g., Wigglesworthia glossinidia endosymbiont of Glossina brevipalpis).
+The second column is the taxonomic ID (e.g., 36870).
+The third column is the taxonomic rank (e.g., leaf).
+The fourth column is the length of the genome sequence (e.g., 703004).
+The fifth column is the number of reads classified to this genomic sequence including multi-classified reads (e.g., 5981).
+The sixth column is the number of reads uniquely classified to this genomic sequence (e.g., 5964).
+The seventh column is the proportion of this genome normalized by its genomic length (e.g., 0.0152317).</code></pre>
+<h2 id="inspecting-the-centrifuge-index">Inspecting the Centrifuge index</h2>
+<p>The index can be inspected with <code>centrifuge-inspect</code>. To extract raw sequences:</p>
+<pre><code>centrifuge-inspect <centrifuge index></code></pre>
+<p>Extract the sequence ID to taxonomy ID conversion table from the index</p>
+<pre><code>centrifuge-inspect --conversion-table <centrifuge index></code></pre>
+<p>Extract the taxonomy tree from the index:</p>
+<pre><code>centrifuge-inspect --taxonomy-tree <centrifuge index></code></pre>
+<p>Extract the lengths of the sequences from the index (each row has two columns: taxonomic ID and length):</p>
+<pre><code>centrifuge-inspect --size-table <centrifuge index></code></pre>
+<p>Extract the names from the index (each row has two columns: taxonomic ID and name):</p>
+<pre><code>centrifuge-inspect --name-table <centrifuge index></code></pre>
+<h2 id="wrapper">Wrapper</h2>
+<p>The <code>centrifuge</code>, <code>centrifuge-build</code> and <code>centrifuge-inspect</code> executables are actually wrapper scripts that call binary programs as appropriate. Also, the <code>centrifuge</code> wrapper provides some key functionality, like the ability to handle compressed inputs, and the functionality for [<code>--un</code>], [<code>--al</code>] and related options.</p>
+<p>It is recommended that you always run the centrifuge wrappers and not run the binaries directly.</p>
+<h2 id="performance-tuning">Performance tuning</h2>
+<ol style="list-style-type: decimal">
+<li><p>If your computer has multiple processors/cores, use <code>-p NTHREADS</code></p>
+<p>The <a href="#centrifuge-build-options-p"><code>-p</code></a> option causes Centrifuge to launch a specified number of parallel search threads. Each thread runs on a different processor/core and all threads find alignments in parallel, increasing alignment throughput by approximately a multiple of the number of threads (though in practice, speedup is somewhat worse than linear).</p></li>
+</ol>
+<h2 id="command-line">Command Line</h2>
+<h3 id="usage">Usage</h3>
+<pre><code>centrifuge [options]* -x <centrifuge-idx> {-1 <m1> -2 <m2> | -U <r> | --sra-acc <SRA accession number>} [--report-file <report file name> -S <classification output file name>]</code></pre>
+<h3 id="main-arguments">Main arguments</h3>
+<table><tr><td>
+
+<pre><code>-x <centrifuge-idx></code></pre>
+</td><td>
+
+<p>The basename of the index for the reference genomes. The basename is the name of any of the index files up to but not including the final <code>.1.cf</code> / etc.<br /><code>centrifuge</code> looks for the specified index first in the current directory, then in the directory specified in the <code>CENTRIFUGE_INDEXES</code> environment variable.</p>
+</td></tr><tr><td>
+
+<pre><code>-1 <m1></code></pre>
+</td><td>
+
+<p>Comma-separated list of files containing mate 1s (filename usually includes <code>_1</code>), e.g. <code>-1 flyA_1.fq,flyB_1.fq</code>. Sequences specified with this option must correspond file-for-file and read-for-read with those specified in <code><m2></code>. Reads may be a mix of different lengths. If <code>-</code> is specified, <code>centrifuge</code> will read the mate 1s from the "standard in" or "stdin" filehandle.</p>
+</td></tr><tr><td>
+
+<pre><code>-2 <m2></code></pre>
+</td><td>
+
+<p>Comma-separated list of files containing mate 2s (filename usually includes <code>_2</code>), e.g. <code>-2 flyA_2.fq,flyB_2.fq</code>. Sequences specified with this option must correspond file-for-file and read-for-read with those specified in <code><m1></code>. Reads may be a mix of different lengths. If <code>-</code> is specified, <code>centrifuge</code> will read the mate 2s from the "standard in" or "stdin" filehandle.</p>
+</td></tr><tr><td>
+
+<pre><code>-U <r></code></pre>
+</td><td>
+
+<p>Comma-separated list of files containing unpaired reads to be aligned, e.g. <code>lane1.fq,lane2.fq,lane3.fq,lane4.fq</code>. Reads may be a mix of different lengths. If <code>-</code> is specified, <code>centrifuge</code> gets the reads from the "standard in" or "stdin" filehandle.</p>
+</td></tr><tr><td>
+
+<pre><code>--sra-acc <SRA accession number></code></pre>
+</td><td>
+
+<p>Comma-separated list of SRA accession numbers, e.g. <code>--sra-acc SRR353653,SRR353654</code>. Information about read types is available at http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?sp=runinfo&acc=<b>sra-acc</b>&retmode=xml, where <b>sra-acc</b> is SRA accession number. If users run HISAT2 on a computer cluster, it is recommended to disable SRA-related caching (see the instruction at <a href="https://github.com/ncbi/sra-tools/wiki/Toolkit-Configuration">SRA-MANUAL</a>).</p>
+</td></tr><tr><td>
+
+<pre><code>-S <filename></code></pre>
+</td><td>
+
+<p>File to write classification results to. By default, assignments are written to the "standard out" or "stdout" filehandle (i.e. the console).</p>
+</td></tr><tr><td>
+
+<pre><code>--report-file <filename></code></pre>
+</td><td>
+
+<p>File to write a classification summary to (default: centrifuge_report.tsv).</p>
+</td></tr></table>
+
+<h3 id="options">Options</h3>
+<h4 id="input-options">Input options</h4>
+<table>
+<tr><td id="centrifuge-options-q">
+
+<pre><code>-q</code></pre>
+</td><td>
+
+<p>Reads (specified with <code><m1></code>, <code><m2></code>, <code><s></code>) are FASTQ files. FASTQ files usually have extension <code>.fq</code> or <code>.fastq</code>. FASTQ is the default format. See also: <a href="#centrifuge-options-solexa-quals"><code>--solexa-quals</code></a> and <a href="#centrifuge-options-int-quals"><code>--int-quals</code></a>.</p>
+</td></tr>
+<tr><td id="centrifuge-options-qseq">
+
+<pre><code>--qseq</code></pre>
+</td><td>
+
+<p>Reads (specified with <code><m1></code>, <code><m2></code>, <code><s></code>) are QSEQ files. QSEQ files usually end in <code>_qseq.txt</code>. See also: <a href="#centrifuge-options-solexa-quals"><code>--solexa-quals</code></a> and <a href="#centrifuge-options-int-quals"><code>--int-quals</code></a>.</p>
+</td></tr>
+<tr><td id="centrifuge-options-f">
+
+<pre><code>-f</code></pre>
+</td><td>
+
+<p>Reads (specified with <code><m1></code>, <code><m2></code>, <code><s></code>) are FASTA files. FASTA files usually have extension <code>.fa</code>, <code>.fasta</code>, <code>.mfa</code>, <code>.fna</code> or similar. FASTA files do not have a way of specifying quality values, so when <code>-f</code> is set, the result is as if <code>--ignore-quals</code> is also set.</p>
+</td></tr>
+<tr><td id="centrifuge-options-r">
+
+<pre><code>-r</code></pre>
+</td><td>
+
+<p>Reads (specified with <code><m1></code>, <code><m2></code>, <code><s></code>) are files with one input sequence per line, without any other information (no read names, no qualities). When <code>-r</code> is set, the result is as if <code>--ignore-quals</code> is also set.</p>
+</td></tr>
+<tr><td id="centrifuge-options-c">
+
+<pre><code>-c</code></pre>
+</td><td>
+
+<p>The read sequences are given on command line. I.e. <code><m1></code>, <code><m2></code> and <code><singles></code> are comma-separated lists of reads rather than lists of read files. There is no way to specify read names or qualities, so <code>-c</code> also implies <code>--ignore-quals</code>.</p>
+</td></tr>
+<tr><td id="centrifuge-options-s">
+
+<pre><code>-s/--skip <int></code></pre>
+</td><td>
+
+<p>Skip (i.e. do not align) the first <code><int></code> reads or pairs in the input.</p>
+</td></tr>
+<tr><td id="centrifuge-options-u">
+
+<pre><code>-u/--qupto <int></code></pre>
+</td><td>
+
+<p>Align the first <code><int></code> reads or read pairs from the input (after the <a href="#centrifuge-options-s"><code>-s</code>/<code>--skip</code></a> reads or pairs have been skipped), then stop. Default: no limit.</p>
+</td></tr>
+<tr><td id="centrifuge-options-5">
+
+<pre><code>-5/--trim5 <int></code></pre>
+</td><td>
+
+<p>Trim <code><int></code> bases from 5' (left) end of each read before alignment (default: 0).</p>
+</td></tr>
+<tr><td id="centrifuge-options-3">
+
+<pre><code>-3/--trim3 <int></code></pre>
+</td><td>
+
+<p>Trim <code><int></code> bases from 3' (right) end of each read before alignment (default: 0).</p>
+</td></tr><tr><td id="centrifuge-options-phred33-quals">
+
+<pre><code>--phred33</code></pre>
+</td><td>
+
+<p>Input qualities are ASCII chars equal to the <a href="http://en.wikipedia.org/wiki/Phred_quality_score">Phred quality</a> plus 33. This is also called the "Phred+33" encoding, which is used by the very latest Illumina pipelines.</p>
+</td></tr>
+<tr><td id="centrifuge-options-phred64-quals">
+
+<pre><code>--phred64</code></pre>
+</td><td>
+
+<p>Input qualities are ASCII chars equal to the <a href="http://en.wikipedia.org/wiki/Phred_quality_score">Phred quality</a> plus 64. This is also called the "Phred+64" encoding.</p>
+</td></tr>
+<tr><td id="centrifuge-options-solexa-quals">
+
+<pre><code>--solexa-quals</code></pre>
+</td><td>
+
+<p>Convert input qualities from <a href="http://en.wikipedia.org/wiki/Phred_quality_score">Solexa</a> (which can be negative) to <a href="http://en.wikipedia.org/wiki/Phred_quality_score">Phred</a> (which can't). This scheme was used in older Illumina GA Pipeline versions (prior to 1.3). Default: off.</p>
+</td></tr>
+<tr><td id="centrifuge-options-int-quals">
+
+<pre><code>--int-quals</code></pre>
+</td><td>
+
+<p>Quality values are represented in the read input file as space-separated ASCII integers, e.g., <code>40 40 30 40</code>..., rather than ASCII characters, e.g., <code>II?I</code>.... Integers are treated as being on the <a href="http://en.wikipedia.org/wiki/Phred_quality_score">Phred quality</a> scale unless <a href="#centrifuge-options-solexa-quals"><code>--solexa-quals</code></a> is also specified. Default: off.</p>
+</td></tr></table>
+
+<h4 id="classification">Classification</h4>
+<table>
+
+<tr><td id="centrifuge-options-k">
+
+<pre><code>-k <int></code></pre>
+</td><td>
+
+<p>It searches for at most <code><int></code> distinct, primary assignments for each read or pair.<br />Primary assignments mean assignments whose assignment score is equal or higher than any other assignments. If there are more primary assignments than this value, the search will merge some of the assignments into a higher taxonomic rank. The assignment score for a paired-end assignment equals the sum of the assignment scores of the individual mates. Default: 5</p>
+</td></tr>
+
+<tr><td id="centrifuge-options-host-taxids">
+
+<pre><code>--host-taxids</code></pre>
+</td><td>
+
+<p>A comma-separated list of taxonomic IDs that will be preferred in classification procedure. The descendants from these IDs will also be preferred. In case some of a read's assignments correspond to these taxonomic IDs, only those corresponding assignments will be reported.</p>
+</td></tr>
+
+<tr><td id="centrifuge-options-exclude-taxids">
+
+<pre><code>--exclude-taxids</code></pre>
+</td><td>
+
+<p>A comma-separated list of taxonomic IDs that will be excluded in classification procedure. The descendants from these IDs will also be excluded.</p>
+</td></tr>
+
+</table>
+
+
+<!--
+#### Alignment options
+
+<table>
+
+<tr><td id="centrifuge-options-n-ceil">
+
+[`--n-ceil`]: #centrifuge-options-n-ceil
+
+    --n-ceil <func>
+
+</td><td>
+
+Sets a function governing the maximum number of ambiguous characters (usually
+`N`s and/or `.`s) allowed in a read as a function of read length.  For instance,
+specifying `-L,0,0.15` sets the N-ceiling function `f` to `f(x) = 0 + 0.15 * x`,
+where x is the read length.  See also: [setting function options].  Reads
+exceeding this ceiling are [filtered out].  Default: `L,0,0.15`.
+
+[filtered out]: #filtering
+
+</td></tr>
+
+<tr><td id="centrifuge-options-ignore-quals">
+
+[`--ignore-quals`]: #centrifuge-options-ignore-quals
+
+    --ignore-quals
+
+</td><td>
+
+When calculating a mismatch penalty, always consider the quality value at the
+mismatched position to be the highest possible, regardless of the actual value. 
+I.e. input is treated as though all quality values are high.  This is also the
+default behavior when the input doesn't specify quality values (e.g. in [`-f`],
+[`-r`], or [`-c`] modes).
+
+</td></tr>
+<tr><td id="centrifuge-options-nofw">
+
+[`--nofw`]: #centrifuge-options-nofw
+
+    --nofw/--norc
+
+</td><td>
+
+If `--nofw` is specified, `centrifuge` will not attempt to align unpaired reads to
+the forward (Watson) reference strand.  If `--norc` is specified, `centrifuge` will
+not attempt to align unpaired reads against the reverse-complement (Crick)
+reference strand. In paired-end mode, `--nofw` and `--norc` pertain to the
+fragments; i.e. specifying `--nofw` causes `centrifuge` to explore only those
+paired-end configurations corresponding to fragments from the reverse-complement
+(Crick) strand.  Default: both strands enabled. 
+
+</td></tr>
+
+</table>
+
+#### Paired-end options
+
+<table>
+
+<tr><td id="centrifuge-options-fr">
+
+[`--fr`/`--rf`/`--ff`]: #centrifuge-options-fr
+[`--fr`]: #centrifuge-options-fr
+[`--rf`]: #centrifuge-options-fr
+[`--ff`]: #centrifuge-options-fr
+
+    --fr/--rf/--ff
+
+</td><td>
+
+The upstream/downstream mate orientations for a valid paired-end alignment
+against the forward reference strand.  E.g., if `--fr` is specified and there is
+a candidate paired-end alignment where mate 1 appears upstream of the reverse
+complement of mate 2 and the fragment length constraints ([`-I`] and [`-X`]) are
+met, that alignment is valid.  Also, if mate 2 appears upstream of the reverse
+complement of mate 1 and all other constraints are met, that too is valid.
+`--rf` likewise requires that an upstream mate1 be reverse-complemented and a
+downstream mate2 be forward-oriented. ` --ff` requires both an upstream mate 1
+and a downstream mate 2 to be forward-oriented.  Default: `--fr` (appropriate
+for Illumina's Paired-end Sequencing Assay).
+
+</td></tr></table>
+-->
+
+<h4 id="output-options">Output options</h4>
+<table>
+
+<tr><td id="centrifuge-options-t">
+
+<pre><code>-t/--time</code></pre>
+</td><td>
+
+<p>Print the wall-clock time required to load the index files and align the reads. This is printed to the "standard error" ("stderr") filehandle. Default: off.</p>
+</td></tr>
+
+<!--
+<tr><td id="centrifuge-options-un">
+
+[`--un`]: #centrifuge-options-un
+[`--un-gz`]: #centrifuge-options-un
+[`--un-bz2`]: #centrifuge-options-un
+
+    --un <path>
+    --un-gz <path>
+    --un-bz2 <path>
+
+</td><td>
+
+Write unpaired reads that fail to align to file at `<path>`.  These reads
+correspond to the SAM records with the FLAGS `0x4` bit set and neither the
+`0x40` nor `0x80` bits set.  If `--un-gz` is specified, output will be gzip
+compressed. If `--un-bz2` is specified, output will be bzip2 compressed.  Reads
+written in this way will appear exactly as they did in the input file, without
+any modification (same sequence, same name, same quality string, same quality
+encoding).  Reads will not necessarily appear in the same order as they did in
+the input.
+
+</td></tr>
+<tr><td id="centrifuge-options-al">
+
+[`--al`]: #centrifuge-options-al
+[`--al-gz`]: #centrifuge-options-al
+[`--al-bz2`]: #centrifuge-options-al
+
+    --al <path>
+    --al-gz <path>
+    --al-bz2 <path>
+
+</td><td>
+
+Write unpaired reads that align at least once to file at `<path>`.  These reads
+correspond to the SAM records with the FLAGS `0x4`, `0x40`, and `0x80` bits
+unset.  If `--al-gz` is specified, output will be gzip compressed. If `--al-bz2`
+is specified, output will be bzip2 compressed.  Reads written in this way will
+appear exactly as they did in the input file, without any modification (same
+sequence, same name, same quality string, same quality encoding).  Reads will
+not necessarily appear in the same order as they did in the input.
+
+</td></tr>
+<tr><td id="centrifuge-options-un-conc">
+
+[`--un-conc`]: #centrifuge-options-un-conc
+[`--un-conc-gz`]: #centrifuge-options-un-conc
+[`--un-conc-bz2`]: #centrifuge-options-un-conc
+
+    --un-conc <path>
+    --un-conc-gz <path>
+    --un-conc-bz2 <path>
+
+</td><td>
+
+Write paired-end reads that fail to align concordantly to file(s) at `<path>`.
+These reads correspond to the SAM records with the FLAGS `0x4` bit set and
+either the `0x40` or `0x80` bit set (depending on whether it's mate #1 or #2).
+`.1` and `.2` strings are added to the filename to distinguish which file
+contains mate #1 and mate #2.  If a percent symbol, `%`, is used in `<path>`,
+the percent symbol is replaced with `1` or `2` to make the per-mate filenames.
+Otherwise, `.1` or `.2` are added before the final dot in `<path>` to make the
+per-mate filenames.  Reads written in this way will appear exactly as they did
+in the input files, without any modification (same sequence, same name, same
+quality string, same quality encoding).  Reads will not necessarily appear in
+the same order as they did in the inputs.
+
+</td></tr>
+<tr><td id="centrifuge-options-al-conc">
+
+[`--al-conc`]: #centrifuge-options-al-conc
+[`--al-conc-gz`]: #centrifuge-options-al-conc
+[`--al-conc-bz2`]: #centrifuge-options-al-conc
+
+    --al-conc <path>
+    --al-conc-gz <path>
+    --al-conc-bz2 <path>
+
+</td><td>
+
+Write paired-end reads that align concordantly at least once to file(s) at
+`<path>`. These reads correspond to the SAM records with the FLAGS `0x4` bit
+unset and either the `0x40` or `0x80` bit set (depending on whether it's mate #1
+or #2). `.1` and `.2` strings are added to the filename to distinguish which
+file contains mate #1 and mate #2.  If a percent symbol, `%`, is used in
+`<path>`, the percent symbol is replaced with `1` or `2` to make the per-mate
+filenames. Otherwise, `.1` or `.2` are added before the final dot in `<path>` to
+make the per-mate filenames.  Reads written in this way will appear exactly as
+they did in the input files, without any modification (same sequence, same name,
+same quality string, same quality encoding).  Reads will not necessarily appear
+in the same order as they did in the inputs.
+
+</td></tr>
+-->
+
+<tr><td id="centrifuge-options-quiet">
+
+<pre><code>--quiet</code></pre>
+</td><td>
+
+<p>Print nothing besides alignments and serious errors.</p>
+</td></tr>
+<tr><td id="centrifuge-options-met-file">
+
+<pre><code>--met-file <path></code></pre>
+</td><td>
+
+<p>Write <code>centrifuge</code> metrics to file <code><path></code>. Having alignment metric can be useful for debugging certain problems, especially performance issues. See also: <a href="#centrifuge-options-met"><code>--met</code></a>. Default: metrics disabled.</p>
+</td></tr>
+<tr><td id="centrifuge-options-met-stderr">
+
+<pre><code>--met-stderr</code></pre>
+</td><td>
+
+<p>Write <code>centrifuge</code> metrics to the "standard error" ("stderr") filehandle. This is not mutually exclusive with <a href="#centrifuge-options-met-file"><code>--met-file</code></a>. Having alignment metric can be useful for debugging certain problems, especially performance issues. See also: <a href="#centrifuge-options-met"><code>--met</code></a>. Default: metrics disabled.</p>
+</td></tr>
+<tr><td id="centrifuge-options-met">
+
+<pre><code>--met <int></code></pre>
+</td><td>
+
+<p>Write a new <code>centrifuge</code> metrics record every <code><int></code> seconds. Only matters if either <a href="#centrifuge-options-met-stderr"><code>--met-stderr</code></a> or <a href="#centrifuge-options-met-file"><code>--met-file</code></a> are specified. Default: 1.</p>
+</td></tr>
+</table>
+
+<h4 id="performance-options">Performance options</h4>
+<table><tr>
+
+<td id="centrifuge-options-o">
+
+<pre><code>-o/--offrate <int></code></pre>
+</td><td>
+
+<p>Override the offrate of the index with <code><int></code>. If <code><int></code> is greater than the offrate used to build the index, then some row markings are discarded when the index is read into memory. This reduces the memory footprint of the aligner but requires more time to calculate text offsets. <code><int></code> must be greater than the value used to build the index.</p>
+</td></tr>
+<tr><td id="centrifuge-options-p">
+
+<pre><code>-p/--threads NTHREADS</code></pre>
+</td><td>
+
+<p>Launch <code>NTHREADS</code> parallel search threads (default: 1). Threads will run on separate processors/cores and synchronize when parsing reads and outputting alignments. Searching for alignments is highly parallel, and speedup is close to linear. Increasing <code>-p</code> increases Centrifuge's memory footprint. E.g. when aligning to a human genome index, increasing <code>-p</code> from 1 to 8 increases the memory footprint by a few hundred megabytes. This option is only availab [...]
+</td></tr>
+<tr><td id="centrifuge-options-reorder">
+
+<pre><code>--reorder</code></pre>
+</td><td>
+
+<p>Guarantees that output records are printed in an order corresponding to the order of the reads in the original input file, even when <a href="#centrifuge-build-options-p"><code>-p</code></a> is set greater than 1. Specifying <code>--reorder</code> and setting <a href="#centrifuge-build-options-p"><code>-p</code></a> greater than 1 causes Centrifuge to run somewhat slower and use somewhat more memory then if <code>--reorder</code> were not specified. Has no effect if <a href="#centrifu [...]
+</td></tr>
+<tr><td id="centrifuge-options-mm">
+
+<pre><code>--mm</code></pre>
+</td><td>
+
+<p>Use memory-mapped I/O to load the index, rather than typical file I/O. Memory-mapping allows many concurrent <code>centrifuge</code> processes on the same computer to share the same memory image of the index (i.e. you pay the memory overhead just once). This facilitates memory-efficient parallelization of <code>centrifuge</code> in situations where using <a href="#centrifuge-options-p"><code>-p</code></a> is not possible or not preferable.</p>
+</td></tr></table>
+
+<h4 id="other-options">Other options</h4>
+<table>
+<tr><td id="centrifuge-options-qc-filter">
+
+<pre><code>--qc-filter</code></pre>
+</td><td>
+
+<p>Filter out reads for which the QSEQ filter field is non-zero. Only has an effect when read format is <a href="#centrifuge-options-qseq"><code>--qseq</code></a>. Default: off.</p>
+</td></tr>
+<tr><td id="centrifuge-options-seed">
+
+<pre><code>--seed <int></code></pre>
+</td><td>
+
+<p>Use <code><int></code> as the seed for pseudo-random number generator. Default: 0.</p>
+</td></tr>
+<tr><td id="centrifuge-options-non-deterministic">
+
+<pre><code>--non-deterministic</code></pre>
+</td><td>
+
+<p>Normally, Centrifuge re-initializes its pseudo-random generator for each read. It seeds the generator with a number derived from (a) the read name, (b) the nucleotide sequence, (c) the quality sequence, (d) the value of the <a href="#centrifuge-options-seed"><code>--seed</code></a> option. This means that if two reads are identical (same name, same nucleotides, same qualities) Centrifuge will find and report the same classification(s) for both, even if there was ambiguity. When <code> [...]
+</td></tr>
+<tr><td id="centrifuge-options-version">
+
+<pre><code>--version</code></pre>
+</td><td>
+
+<p>Print version information and quit.</p>
+</td></tr>
+<tr><td id="centrifuge-options-h">
+
+<pre><code>-h/--help</code></pre>
+</td><td>
+
+<p>Print usage information and quit.</p>
+</td></tr></table>
+
+
+<h1 id="the-centrifuge-build-indexer">The <code>centrifuge-build</code> indexer</h1>
+<p><code>centrifuge-build</code> builds a Centrifuge index from a set of DNA sequences. <code>centrifuge-build</code> outputs a set of 3 files with suffixes <code>.1.cf</code>, <code>.2.cf</code>, and <code>.3.cf</code>. These files together constitute the index: they are all that is needed to align reads to that reference. The original sequence FASTA files are no longer used by Centrifuge once the index is built.</p>
+<p>Use of Karkkainen's <a href="http://portal.acm.org/citation.cfm?id=1314852">blockwise algorithm</a> allows <code>centrifuge-build</code> to trade off between running time and memory usage. <code>centrifuge-build</code> has two options governing how it makes this trade: <a href="#centrifuge-build-options-bmax"><code>--bmax</code></a>/<a href="#centrifuge-build-options-bmaxdivn"><code>--bmaxdivn</code></a>, and <a href="#centrifuge-build-options-dcv"><code>--dcv</code></a>. By default,  [...]
+<p>The indexer provides options pertaining to the "shape" of the index, e.g. <a href="#centrifuge-build-options-o"><code>--offrate</code></a> governs the fraction of <a href="http://en.wikipedia.org/wiki/Burrows-Wheeler_transform">Burrows-Wheeler</a> rows that are "marked" (i.e., the density of the suffix-array sample; see the original <a href="http://en.wikipedia.org/wiki/FM-index">FM Index</a> paper for details). All of these options are potentially profitable trade [...]
+<p>The Centrifuge index is based on the <a href="http://en.wikipedia.org/wiki/FM-index">FM Index</a> of Ferragina and Manzini, which in turn is based on the <a href="http://en.wikipedia.org/wiki/Burrows-Wheeler_transform">Burrows-Wheeler</a> transform. The algorithm used to build the index is based on the <a href="http://portal.acm.org/citation.cfm?id=1314852">blockwise algorithm</a> of Karkkainen.</p>
+<h2 id="command-line-1">Command Line</h2>
+<p>Usage:</p>
+<pre><code>centrifuge-build [options]* --conversion-table <table_in> --taxonomy-tree <taxonomy_in> --name-table <table_in2> <reference_in> <cf_base></code></pre>
+<h3 id="main-arguments-1">Main arguments</h3>
+<table><tr><td>
+
+<pre><code><reference_in></code></pre>
+</td><td>
+
+<p>A comma-separated list of FASTA files containing the reference sequences to be aligned to, or, if <a href="#centrifuge-build-options-c"><code>-c</code></a> is specified, the sequences themselves. E.g., <code><reference_in></code> might be <code>chr1.fa,chr2.fa,chrX.fa,chrY.fa</code>, or, if <a href="#centrifuge-build-options-c"><code>-c</code></a> is specified, this might be <code>GGTCATCCT,ACGGGTCGT,CCGTTCTATGCGGCTTA</code>.</p>
+</td></tr><tr><td>
+
+<pre><code><cf_base></code></pre>
+</td><td>
+
+<p>The basename of the index files to write. By default, <code>centrifuge-build</code> writes files named <code>NAME.1.cf</code>, <code>NAME.2.cf</code>, and <code>NAME.3.cf</code>, where <code>NAME</code> is <code><cf_base></code>.</p>
+</td></tr></table>
+
+<h3 id="options-1">Options</h3>
+<table><tr><td>
+
+<pre><code>-f</code></pre>
+</td><td>
+
+<p>The reference input files (specified as <code><reference_in></code>) are FASTA files (usually having extension <code>.fa</code>, <code>.mfa</code>, <code>.fna</code> or similar).</p>
+</td></tr><tr><td id="centrifuge-build-options-c">
+
+<pre><code>-c</code></pre>
+</td><td>
+
+<p>The reference sequences are given on the command line. I.e. <code><reference_in></code> is a comma-separated list of sequences rather than a list of FASTA files.</p>
+</td></tr>
+<tr><td id="centrifuge-build-options-a">
+
+<pre><code>-a/--noauto</code></pre>
+</td><td>
+
+<p>Disable the default behavior whereby <code>centrifuge-build</code> automatically selects values for the <a href="#centrifuge-build-options-bmax"><code>--bmax</code></a>, <a href="#centrifuge-build-options-dcv"><code>--dcv</code></a> and [<code>--packed</code>] parameters according to available memory. Instead, user may specify values for those parameters. If memory is exhausted during indexing, an error message will be printed; it is up to the user to try new parameters.</p>
+</td></tr><tr><td id="centrifuge-build-options-p">
+
+<pre><code>-p/--threads <int></code></pre>
+</td><td>
+
+<p>Launch <code><int></code> parallel threads (default: 1).</p>
+</td></tr><tr><td id="centrifuge-build-options-conversion-table">
+
+<pre><code>--conversion-table <file></code></pre>
+</td><td>
+
+<p>List of UIDs (unique ID) and corresponding taxonomic IDs.</p>
+</td></tr><tr><td id="centrifuge-build-options-taxonomy-tree">
+
+<pre><code>--taxonomy-tree <file></code></pre>
+</td><td>
+
+<p>Taxonomic tree (e.g. nodes.dmp).</p>
+</td></tr><tr><td id="centrifuge-build-options-name-table">
+
+<pre><code>--name-table <file></code></pre>
+</td><td>
+
+<p>Name table (e.g. names.dmp).</p>
+</td></tr><tr><td id="centrifuge-build-options-size-table">
+
+<pre><code>--size-table <file></code></pre>
+</td><td>
+
+<p>List of taxonomic IDs and lengths of the sequences belonging to the same taxonomic IDs.</p>
+</td></tr><tr><td id="centrifuge-build-options-bmax">
+
+<pre><code>--bmax <int></code></pre>
+</td><td>
+
+<p>The maximum number of suffixes allowed in a block. Allowing more suffixes per block makes indexing faster, but increases peak memory usage. Setting this option overrides any previous setting for <a href="#centrifuge-build-options-bmax"><code>--bmax</code></a>, or <a href="#centrifuge-build-options-bmaxdivn"><code>--bmaxdivn</code></a>. Default (in terms of the <a href="#centrifuge-build-options-bmaxdivn"><code>--bmaxdivn</code></a> parameter) is <a href="#centrifuge-build-options-bmax [...]
+</td></tr><tr><td id="centrifuge-build-options-bmaxdivn">
+
+<pre><code>--bmaxdivn <int></code></pre>
+</td><td>
+
+<p>The maximum number of suffixes allowed in a block, expressed as a fraction of the length of the reference. Setting this option overrides any previous setting for <a href="#centrifuge-build-options-bmax"><code>--bmax</code></a>, or <a href="#centrifuge-build-options-bmaxdivn"><code>--bmaxdivn</code></a>. Default: <a href="#centrifuge-build-options-bmaxdivn"><code>--bmaxdivn</code></a> 4. This is configured automatically by default; use <a href="#centrifuge-build-options-a"><code>-a</co [...]
+</td></tr><tr><td id="centrifuge-build-options-dcv">
+
+<pre><code>--dcv <int></code></pre>
+</td><td>
+
+<p>Use <code><int></code> as the period for the difference-cover sample. A larger period yields less memory overhead, but may make suffix sorting slower, especially if repeats are present. Must be a power of 2 no greater than 4096. Default: 1024. This is configured automatically by default; use <a href="#centrifuge-build-options-a"><code>-a</code>/<code>--noauto</code></a> to configure manually.</p>
+</td></tr><tr><td id="centrifuge-build-options-nodc">
+
+<pre><code>--nodc</code></pre>
+</td><td>
+
+<p>Disable use of the difference-cover sample. Suffix sorting becomes quadratic-time in the worst case (where the worst case is an extremely repetitive reference). Default: off.</p>
+</td></tr><tr><td id="centrifuge-build-options-o">
+
+<pre><code>-o/--offrate <int></code></pre>
+</td><td>
+
+<p>To map alignments back to positions on the reference sequences, it's necessary to annotate ("mark") some or all of the <a href="http://en.wikipedia.org/wiki/Burrows-Wheeler_transform">Burrows-Wheeler</a> rows with their corresponding location on the genome. <a href="#centrifuge-build-options-o"><code>-o</code>/<code>--offrate</code></a> governs how many rows get marked: the indexer will mark every 2^<code><int></code> rows. Marking more rows makes reference-position lo [...]
+</td></tr><tr><td>
+
+<pre><code>-t/--ftabchars <int></code></pre>
+</td><td>
+
+<p>The ftab is the lookup table used to calculate an initial <a href="http://en.wikipedia.org/wiki/Burrows-Wheeler_transform">Burrows-Wheeler</a> range with respect to the first <code><int></code> characters of the query. A larger <code><int></code> yields a larger lookup table but faster query times. The ftab has size 4^(<code><int></code>+1) bytes. The default setting is 10 (ftab is 4MB).</p>
+</td></tr><tr><td>
+
+<pre><code>--seed <int></code></pre>
+</td><td>
+
+<p>Use <code><int></code> as the seed for pseudo-random number generator.</p>
+</td></tr><tr><td>
+
+<pre><code>--kmer-count <int></code></pre>
+</td><td>
+
+<p>Use <code><int></code> as kmer-size for counting the distinct number of k-mers in the input sequences.</p>
+</td></tr><tr><td>
+
+<pre><code>-q/--quiet</code></pre>
+</td><td>
+
+<p><code>centrifuge-build</code> is verbose by default. With this option <code>centrifuge-build</code> will print only error messages.</p>
+</td></tr><tr><td>
+
+<pre><code>-h/--help</code></pre>
+</td><td>
+
+<p>Print usage information and quit.</p>
+</td></tr><tr><td>
+
+<pre><code>--version</code></pre>
+</td><td>
+
+<p>Print version information and quit.</p>
+</td></tr></table>
+
+<h1 id="the-centrifuge-inspect-index-inspector">The <code>centrifuge-inspect</code> index inspector</h1>
+<p><code>centrifuge-inspect</code> extracts information from a Centrifuge index about what kind of index it is and what reference sequences were used to build it. When run without any options, the tool will output a FASTA file containing the sequences of the original references (with all non-<code>A</code>/<code>C</code>/<code>G</code>/<code>T</code> characters converted to <code>N</code>s). It can also be used to extract just the reference sequence names using the <a href="#centrifuge-i [...]
+<h2 id="command-line-2">Command Line</h2>
+<p>Usage:</p>
+<pre><code>centrifuge-inspect [options]* <cf_base></code></pre>
+<h3 id="main-arguments-2">Main arguments</h3>
+<table><tr><td>
+
+<pre><code><cf_base></code></pre>
+</td><td>
+
+<p>The basename of the index to be inspected. The basename is name of any of the index files but with the <code>.X.cf</code> suffix omitted. <code>centrifuge-inspect</code> first looks in the current directory for the index files, then in the directory specified in the <code>CENTRIFUGE_INDEXES</code> environment variable.</p>
+</td></tr></table>
+
+<h3 id="options-2">Options</h3>
+<table><tr><td>
+
+<pre><code>-a/--across <int></code></pre>
+</td><td>
+
+<p>When printing FASTA output, output a newline character every <code><int></code> bases (default: 60).</p>
+</td></tr><tr><td id="centrifuge-inspect-options-n">
+
+<pre><code>-n/--names</code></pre>
+</td><td>
+
+<p>Print reference sequence names, one per line, and quit.</p>
+</td></tr><tr><td id="centrifuge-inspect-options-s">
+
+<pre><code>-s/--summary</code></pre>
+</td><td>
+
+<p>Print a summary that includes information about index settings, as well as the names and lengths of the input sequences. The summary has this format:</p>
+<pre><code>Colorspace  <0 or 1>
+SA-Sample   1 in <sample>
+FTab-Chars  <chars>
+Sequence-1  <name>  <len>
+Sequence-2  <name>  <len>
+...
+Sequence-N  <name>  <len></code></pre>
+<p>Fields are separated by tabs. Colorspace is always set to 0 for Centrifuge.</p>
+</td></tr><tr><td id="centrifuge-inspect-options-conversion-table">
+
+<pre><code>--conversion-table</code></pre>
+</td><td>
+
+<p>Print a list of UIDs (unique ID) and corresponding taxonomic IDs.</p>
+</td></tr><tr><td id="centrifuge-inspect-options-taxonomy-tree">
+
+<pre><code>--taxonomy-tree</code></pre>
+</td><td>
+
+<p>Print taxonomic tree.</p>
+</td></tr><tr><td id="centrifuge-inspect-options-name-table">
+
+<pre><code>--name-table</code></pre>
+</td><td>
+
+<p>Print name table.</p>
+</td></tr><tr><td id="centrifuge-inspect-options-size-table">
+
+<pre><code>--size-table</code></pre>
+</td><td>
+
+<p>Print a list of taxonomic IDs and lengths of the sequences belonging to the same taxonomic IDs.</p>
+</td></tr><tr><td>
+
+<pre><code>-v/--verbose</code></pre>
+</td><td>
+
+<p>Print verbose output (for debugging).</p>
+</td></tr><tr><td>
+
+<pre><code>--version</code></pre>
+</td><td>
+
+<p>Print version information and quit.</p>
+</td></tr><tr><td>
+
+<pre><code>-h/--help</code></pre>
+</td><td>
+
+<p>Print usage information and quit.</p>
+</td></tr></table>
+
+<h1 id="getting-started-with-centrifuge">Getting started with Centrifuge</h1>
+<p>Centrifuge comes with some example files to get you started. The example files are not scientifically significant; these files will simply let you start running Centrifuge and downstream tools right away.</p>
+<p>First follow the manual instructions to <a href="#obtaining-centrifuge">obtain Centrifuge</a>. Set the <code>CENTRIFUGE_HOME</code> environment variable to point to the new Centrifuge directory containing the <code>centrifuge</code>, <code>centrifuge-build</code> and <code>centrifuge-inspect</code> binaries. This is important, as the <code>CENTRIFUGE_HOME</code> variable is used in the commands below to refer to that directory.</p>
+<h2 id="indexing-a-reference-genome">Indexing a reference genome</h2>
+<p>To create an index for two small sequences included with Centrifuge, create a new temporary directory (it doesn't matter where), change into that directory, and run:</p>
+<pre><code>$CENTRIFUGE_HOME/centrifuge-build --conversion-table $CENTRIFUGE_HOME/example/reference/gi_to_tid.dmp --taxonomy-tree $CENTRIFUGE_HOME/example/reference/nodes.dmp --name-table $CENTRIFUGE_HOME/example/reference/names.dmp $CENTRIFUGE_HOME/example/reference/test.fa test</code></pre>
+<p>The command should print many lines of output then quit. When the command completes, the current directory will contain three new files that all start with <code>test</code> and end with <code>.1.cf</code>, <code>.2.cf</code>, <code>.3.cf</code>. These files constitute the index - you're done!</p>
+<p>You can use <code>centrifuge-build</code> to create an index for a set of FASTA files obtained from any source, including sites such as <a href="http://genome.ucsc.edu/cgi-bin/hgGateway">UCSC</a>, <a href="http://www.ncbi.nlm.nih.gov/sites/genome">NCBI</a>, and <a href="http://www.ensembl.org/">Ensembl</a>. When indexing multiple FASTA files, specify all the files using commas to separate file names. For more details on how to create an index with <code>centrifuge-build</code>, see th [...]
+<h2 id="classifying-example-reads">Classifying example reads</h2>
+<p>Stay in the directory created in the previous step, which now contains the <code>test</code> index files. Next, run:</p>
+<pre><code>$CENTRIFUGE_HOME/centrifuge -f -x test $CENTRIFUGE_HOME/example/reads/input.fa</code></pre>
+<p>This runs the Centrifuge classifier, which classifies a set of unpaired reads to the genomes using the index generated in the previous step. The classification results are reported to stdout, and a short classification summary is written to centrifuge-species_report.tsv.</p>
+<p>You will see something like this:</p>
+<pre><code>readID  seqID taxID     score   2ndBestScore    hitLength   numMatches
+C_1 gi|7     9913      4225 4225        80      2
+C_1 gi|4     9646      4225 4225        80      2
+C_2 gi|4     9646      4225 4225        80      2
+C_2 gi|7     9913      4225 4225        80      2
+C_3 gi|7     9913      4225 4225        80      2
+C_3 gi|4     9646      4225 4225        80      2
+C_4 gi|4     9646      4225 4225        80      2
+C_4 gi|7     9913      4225 4225        80      2
+1_1 gi|4     9646      4225 0       80      1
+1_2 gi|4     9646      4225 0       80      1
+2_1 gi|7     9913      4225 0       80      1
+2_2 gi|7     9913      4225 0       80      1
+2_3 gi|7     9913      4225 0       80      1
+2_4 gi|7     9913      4225 0       80      1
+2_5 gi|7     9913      4225 0       80      1
+2_6 gi|7     9913      4225 0       80      1</code></pre>
diff --git a/doc/manual.inc.html.old b/doc/manual.inc.html.old
new file mode 100644
index 0000000..fab32df
--- /dev/null
+++ b/doc/manual.inc.html.old
@@ -0,0 +1,1186 @@
+<div id="TOC">
+<ul>
+<li><a href="#introduction">Introduction</a><ul>
+<li><a href="#what-is-hisat">What is HISAT?</a></li>
+</ul></li>
+<li><a href="#obtaining-hisat">Obtaining HISAT</a><ul>
+<li><a href="#building-from-source">Building from source</a></li>
+</ul></li>
+<li><a href="#running-hisat">Running HISAT</a><ul>
+<li><a href="#adding-to-path">Adding to PATH</a></li>
+<li><a href="#reporting">Reporting</a></li>
+<li><a href="#alignment-summmary">Alignment summary</a></li>
+<li><a href="#wrapper">Wrapper</a></li>
+<li><a href="#small-and-large-indexes">Small and large indexes</a></li>
+<li><a href="#performance-tuning">Performance tuning</a></li>
+<li><a href="#command-line">Command Line</a><ul>
+<li><a href="#setting-function-options">Setting function options</a></li>
+<li><a href="#usage">Usage</a></li>
+<li><a href="#main-arguments">Main arguments</a></li>
+<li><a href="#options">Options</a></li>
+</ul></li>
+<li><a href="#sam-output">SAM output</a></li>
+</ul></li>
+<li><a href="#the-hisat-build-indexer">The <code>hisat-build</code> indexer</a><ul>
+<li><a href="#command-line-1">Command Line</a><ul>
+<li><a href="#main-arguments-1">Main arguments</a></li>
+<li><a href="#options-1">Options</a></li>
+</ul></li>
+</ul></li>
+<li><a href="#the-hisat-inspect-index-inspector">The <code>hisat-inspect</code> index inspector</a><ul>
+<li><a href="#command-line-2">Command Line</a><ul>
+<li><a href="#main-arguments-2">Main arguments</a></li>
+<li><a href="#options-2">Options</a></li>
+</ul></li>
+</ul></li>
+<li><a href="#getting-started-with-hisat">Getting started with HISAT</a><ul>
+<li><a href="#indexing-a-reference-genome">Indexing a reference genome</a></li>
+<li><a href="#aligning-example-reads">Aligning example reads</a></li>
+<li><a href="#paired-end-example">Paired-end example</a></li>
+<li><a href="#using-samtoolsbcftools-downstream">Using SAMtools/BCFtools downstream</a></li>
+</ul></li>
+</ul>
+</div>
+<!--
+ ! This manual is written in "markdown" format and thus contains some
+ ! distracting formatting clutter.  See 'MANUAL' for an easier-to-read version
+ ! of this text document, or see the HTML manual online.
+ ! -->
+
+<h1 id="introduction">Introduction</h1>
+<h2 id="what-is-hisat">What is HISAT?</h2>
+<p><a href="http://ccb.jhu.edu/software/hisat">HISAT</a> is a fast and sensitive spliced alignment program. As part of HISAT, we have developed a new indexing scheme based on the Burrows-Wheeler transform (<a href="http://en.wikipedia.org/wiki/Burrows-Wheeler_transform">BWT</a>) and the <a href="http://en.wikipedia.org/wiki/FM-index">FM index</a>, called hierarchical indexing, that employs two types of indexes: (1) one global FM index representing the whole genome, and (2) many separate  [...]
+<h1 id="obtaining-hisat">Obtaining HISAT</h1>
+<p>Download HISAT sources and binaries from the Releases sections on the right side. Binaries are available for Intel architectures (<code>x86_64</code>) running Linux, and Mac OS X.</p>
+<h2 id="building-from-source">Building from source</h2>
+<p>Building HISAT from source requires a GNU-like environment with GCC, GNU Make and other basics. It should be possible to build HISAT on most vanilla Linux installations or on a Mac installation with <a href="http://developer.apple.com/xcode/">Xcode</a> installed. HISAT can also be built on Windows using <a href="http://www.cygwin.com/">Cygwin</a> or <a href="http://www.mingw.org/">MinGW</a> (MinGW recommended). For a MinGW build the choice of what compiler is to be used is important s [...]
+<p>First, download the <a href="http://ccb.jhu.edu/software/hisat/downloads/hisat-0.1.0-beta.zip">source package</a> from the Releases section on the right side. Unzip the file, change to the unzipped directory, and build the HISAT tools by running GNU <code>make</code> (usually with the command <code>make</code>, but sometimes with <code>gmake</code>) with no arguments. If building with MinGW, run <code>make</code> from the MSYS environment.</p>
+<p>HISAT is using the multithreading software model in order to speed up execution times on SMP architectures where this is possible. On POSIX platforms (like linux, Mac OS, etc) it needs the pthread library. Although it is possible to use pthread library on non-POSIX platform like Windows, due to performance reasons HISAT will try to use Windows native multithreading if possible.</p>
+<h1 id="running-hisat">Running HISAT</h1>
+<h2 id="adding-to-path">Adding to PATH</h2>
+<p>By adding your new HISAT directory to your <a href="http://en.wikipedia.org/wiki/PATH_(variable)">PATH environment variable</a>, you ensure that whenever you run <code>hisat</code>, <code>hisat-build</code> or <code>hisat-inspect</code> from the command line, you will get the version you just installed without having to specify the entire path. This is recommended for most users. To do this, follow your operating system's instructions for adding the directory to your <a href="http://e [...]
+<p>If you would like to install HISAT by copying the HISAT executable files to an existing directory in your <a href="http://en.wikipedia.org/wiki/PATH_(variable)">PATH</a>, make sure that you copy all the executables, including <code>hisat</code>, <code>hisat-align-s</code>, <code>hisat-align-l</code>, <code>hisat-build</code>, <code>hisat-build-s</code>, <code>hisat-build-l</code>, <code>hisat-inspect</code>, <code>hisat-inspect-s</code> and <code>hisat-inspect-l</code>.</p>
+<h2 id="reporting">Reporting</h2>
+<!--
+The reporting mode governs how many alignments HISAT looks for, and how to
+report them.  HISAT has three distinct reporting modes.  The default
+reporting mode is similar to the default reporting mode of many other read
+alignment tools, including [BWA].
+
+In general, when we say that a read has an alignment, we mean that it has a
+[valid alignment].  When we say that a read has multiple alignments, we mean
+that it has multiple alignments that are valid and distinct from one another. 
+
+[valid alignment]: #valid-alignments-meet-or-exceed-the-minimum-score-threshold
+[BWA]: http://bio-bwa.sourceforge.net/
+
+### Distinct alignments map a read to different places
+
+Two alignments for the same individual read are "distinct" if they map the same
+read to different places.  Specifically, we say that two alignments are distinct
+if there are no alignment positions where a particular read offset is aligned
+opposite a particular reference offset in both alignments with the same
+orientation.  E.g. if the first alignment is in the forward orientation and
+aligns the read character at read offset 10 to the reference character at
+chromosome 3, offset 3,445,245, and the second alignment is also in the forward
+orientation and also aligns the read character at read offset 10 to the
+reference character at chromosome 3, offset 3,445,245, they are not distinct
+alignments.
+
+Two alignments for the same pair are distinct if either the mate 1s in the two
+paired-end alignments are distinct or the mate 2s in the two alignments are
+distinct or both.
+
+### Default mode: search for multiple alignments, report the best one
+
+By default, HISAT searches for distinct, valid alignments for each read. When
+it finds a valid alignment, it generally will continue to look for alignments
+that are nearly as good or better.  It will eventually stop looking, either
+because it exceeded a limit placed on search effort (see [`-D`] and [`-R`]) or
+because it already knows all it needs to know to report an alignment.
+Information from the best alignments are used to estimate mapping quality (the
+`MAPQ` [SAM] field) and to set SAM optional fields, such as [`AS:i`] and
+[`XS:i`].  HISAT does not guarantee that the alignment reported is the best
+possible in terms of alignment score.
+
+See also: [`-D`], which puts an upper limit on the number of dynamic programming
+problems (i.e. seed extensions) that can "fail" in a row before HISAT stops
+searching.  Increasing [`-D`] makes HISAT slower, but increases the
+likelihood that it will report the correct alignment for a read that aligns many
+places.
+
+See also: [`-R`], which sets the maximum number of times HISAT will "re-seed"
+when attempting to align a read with repetitive seeds.  Increasing [`-R`] makes
+HISAT slower, but increases the likelihood that it will report the correct
+alignment for a read that aligns many places.
+
+### -k mode: search for one or more alignments, report each
+
+In [`-k`] mode, HISAT searches for up to N distinct, valid alignments for
+each read, where N equals the integer specified with the `-k` parameter.  That
+is, if `-k 2` is specified, HISAT will search for at most 2 distinct
+alignments.  It reports all alignments found, in descending order by alignment
+score.  The alignment score for a paired-end alignment equals the sum of the
+alignment scores of the individual mates.  Each reported read or pair alignment
+beyond the first has the SAM 'secondary' bit (which equals 256) set in its FLAGS
+field.  See the [SAM specification] for details.
+
+HISAT does not "find" alignments in any specific order, so for reads that
+have more than N distinct, valid alignments, HISAT does not guarantee that
+the N alignments reported are the best possible in terms of alignment score.
+Still, this mode can be effective and fast in situations where the user cares
+more about whether a read aligns (or aligns a certain number of times) than
+where exactly it originated.
+-->
+
+<h2 id="alignment-summmary">Alignment summary</h2>
+<p>When HISAT finishes running, it prints messages summarizing what happened. These messages are printed to the "standard error" ("stderr") filehandle. For datasets consisting of unpaired reads, the summary might look like this:</p>
+<pre><code>20000 reads; of these:
+  20000 (100.00%) were unpaired; of these:
+    1247 (6.24%) aligned 0 times
+    18739 (93.69%) aligned exactly 1 time
+    14 (0.07%) aligned >1 times
+93.77% overall alignment rate</code></pre>
+<p>For datasets consisting of pairs, the summary might look like this:</p>
+<pre><code>10000 reads; of these:
+  10000 (100.00%) were paired; of these:
+    650 (6.50%) aligned concordantly 0 times
+    8823 (88.23%) aligned concordantly exactly 1 time
+    527 (5.27%) aligned concordantly >1 times
+    ----
+    650 pairs aligned concordantly 0 times; of these:
+      34 (5.23%) aligned discordantly 1 time
+    ----
+    616 pairs aligned 0 times concordantly or discordantly; of these:
+      1232 mates make up the pairs; of these:
+        660 (53.57%) aligned 0 times
+        571 (46.35%) aligned exactly 1 time
+        1 (0.08%) aligned >1 times
+96.70% overall alignment rate</code></pre>
+<p>The indentation indicates how subtotals relate to totals.</p>
+<h2 id="wrapper">Wrapper</h2>
+<p>The <code>hisat</code>, <code>hisat-build</code> and <code>hisat-inspect</code> executables are actually wrapper scripts that call binary programs as appropriate. The wrappers shield users from having to distinguish between "small" and "large" index formats, discussed briefly in the following section. Also, the <code>hisat</code> wrapper provides some key functionality, like the ability to handle compressed inputs, and the fucntionality for <a href="#hisat-options- [...]
+<p>It is recommended that you always run the hisat wrappers and not run the binaries directly.</p>
+<h2 id="small-and-large-indexes">Small and large indexes</h2>
+<p><code>hisat-build</code> can index reference genomes of any size. For genomes less than about 4 billion nucleotides in length, <code>hisat-build</code> builds a "small" index using 32-bit numbers in various parts of the index. When the genome is longer, <code>hisat-build</code> builds a "large" index using 64-bit numbers. Small indexes are stored in files with the <code>.bt2</code> extension, and large indexes are stored in files with the <code>.bt2l</code> extensi [...]
+<h2 id="performance-tuning">Performance tuning</h2>
+<ol style="list-style-type: decimal">
+<li><p>If your computer has multiple processors/cores, use <code>-p</code></p>
+<p>The <a href="#hisat-options-p"><code>-p</code></a> option causes HISAT to launch a specified number of parallel search threads. Each thread runs on a different processor/core and all threads find alignments in parallel, increasing alignment throughput by approximately a multiple of the number of threads (though in practice, speedup is somewhat worse than linear).</p></li>
+</ol>
+<h2 id="command-line">Command Line</h2>
+<h3 id="setting-function-options">Setting function options</h3>
+<p>Some HISAT options specify a function rather than an individual number or setting. In these cases the user specifies three parameters: (a) a function type <code>F</code>, (b) a constant term <code>B</code>, and (c) a coefficient <code>A</code>. The available function types are constant (<code>C</code>), linear (<code>L</code>), square-root (<code>S</code>), and natural log (<code>G</code>). The parameters are specified as <code>F,B,A</code> - that is, the function type, the constant t [...]
+<p>For example, if the function specification is <code>L,-0.4,-0.6</code>, then the function defined is:</p>
+<pre><code>f(x) = -0.4 + -0.6 * x</code></pre>
+<p>If the function specification is <code>G,1,5.4</code>, then the function defined is:</p>
+<pre><code>f(x) = 1.0 + 5.4 * ln(x)</code></pre>
+<p>See the documentation for the option in question to learn what the parameter <code>x</code> is for. For example, in the case of the <a href="#hisat-options-score-min"><code>--score-min</code></a> option, the function <code>f(x)</code> sets the minimum alignment score necessary for an alignment to be considered valid, and <code>x</code> is the read length.</p>
+<h3 id="usage">Usage</h3>
+<pre><code>hisat [options]* -x <hisat-idx> {-1 <m1> -2 <m2> | -U <r>} -S [<hit>]</code></pre>
+<h3 id="main-arguments">Main arguments</h3>
+<table><tr><td>
+
+<pre><code>-x <hisat-idx></code></pre>
+</td><td>
+
+<p>The basename of the index for the reference genome. The basename is the name of any of the index files up to but not including the final <code>.1.bt2</code> / <code>.rev.1.bt2</code> / etc. <code>hisat</code> looks for the specified index first in the current directory, then in the directory specified in the <code>HISAT_INDEXES</code> environment variable.</p>
+</td></tr><tr><td>
+
+<pre><code>-1 <m1></code></pre>
+</td><td>
+
+<p>Comma-separated list of files containing mate 1s (filename usually includes <code>_1</code>), e.g. <code>-1 flyA_1.fq,flyB_1.fq</code>. Sequences specified with this option must correspond file-for-file and read-for-read with those specified in <code><m2></code>. Reads may be a mix of different lengths. If <code>-</code> is specified, <code>hisat</code> will read the mate 1s from the "standard in" or "stdin" filehandle.</p>
+</td></tr><tr><td>
+
+<pre><code>-2 <m2></code></pre>
+</td><td>
+
+<p>Comma-separated list of files containing mate 2s (filename usually includes <code>_2</code>), e.g. <code>-2 flyA_2.fq,flyB_2.fq</code>. Sequences specified with this option must correspond file-for-file and read-for-read with those specified in <code><m1></code>. Reads may be a mix of different lengths. If <code>-</code> is specified, <code>hisat</code> will read the mate 2s from the "standard in" or "stdin" filehandle.</p>
+</td></tr><tr><td>
+
+<pre><code>-U <r></code></pre>
+</td><td>
+
+<p>Comma-separated list of files containing unpaired reads to be aligned, e.g. <code>lane1.fq,lane2.fq,lane3.fq,lane4.fq</code>. Reads may be a mix of different lengths. If <code>-</code> is specified, <code>hisat</code> gets the reads from the "standard in" or "stdin" filehandle.</p>
+</td></tr><tr><td>
+
+<pre><code>-S <hit></code></pre>
+</td><td>
+
+<p>File to write SAM alignments to. By default, alignments are written to the "standard out" or "stdout" filehandle (i.e. the console).</p>
+</td></tr></table>
+
+<h3 id="options">Options</h3>
+<h4 id="input-options">Input options</h4>
+<table>
+<tr><td id="hisat-options-q">
+
+<pre><code>-q</code></pre>
+</td><td>
+
+<p>Reads (specified with <code><m1></code>, <code><m2></code>, <code><s></code>) are FASTQ files. FASTQ files usually have extension <code>.fq</code> or <code>.fastq</code>. FASTQ is the default format. See also: <a href="#hisat-options-solexa-quals"><code>--solexa-quals</code></a> and <a href="#hisat-options-int-quals"><code>--int-quals</code></a>.</p>
+</td></tr>
+<tr><td id="hisat-options-qseq">
+
+<pre><code>--qseq</code></pre>
+</td><td>
+
+<p>Reads (specified with <code><m1></code>, <code><m2></code>, <code><s></code>) are QSEQ files. QSEQ files usually end in <code>_qseq.txt</code>. See also: <a href="#hisat-options-solexa-quals"><code>--solexa-quals</code></a> and <a href="#hisat-options-int-quals"><code>--int-quals</code></a>.</p>
+</td></tr>
+<tr><td id="hisat-options-f">
+
+<pre><code>-f</code></pre>
+</td><td>
+
+<p>Reads (specified with <code><m1></code>, <code><m2></code>, <code><s></code>) are FASTA files. FASTA files usually have extension <code>.fa</code>, <code>.fasta</code>, <code>.mfa</code>, <code>.fna</code> or similar. FASTA files do not have a way of specifying quality values, so when <code>-f</code> is set, the result is as if <code>--ignore-quals</code> is also set.</p>
+</td></tr>
+<tr><td id="hisat-options-r">
+
+<pre><code>-r</code></pre>
+</td><td>
+
+<p>Reads (specified with <code><m1></code>, <code><m2></code>, <code><s></code>) are files with one input sequence per line, without any other information (no read names, no qualities). When <code>-r</code> is set, the result is as if <code>--ignore-quals</code> is also set.</p>
+</td></tr>
+<tr><td id="hisat-options-c">
+
+<pre><code>-c</code></pre>
+</td><td>
+
+<p>The read sequences are given on command line. I.e. <code><m1></code>, <code><m2></code> and <code><singles></code> are comma-separated lists of reads rather than lists of read files. There is no way to specify read names or qualities, so <code>-c</code> also implies <code>--ignore-quals</code>.</p>
+</td></tr>
+<tr><td id="hisat-options-s">
+
+<pre><code>-s/--skip <int></code></pre>
+</td><td>
+
+<p>Skip (i.e. do not align) the first <code><int></code> reads or pairs in the input.</p>
+</td></tr>
+<tr><td id="hisat-options-u">
+
+<pre><code>-u/--qupto <int></code></pre>
+</td><td>
+
+<p>Align the first <code><int></code> reads or read pairs from the input (after the <a href="#hisat-options-s"><code>-s</code>/<code>--skip</code></a> reads or pairs have been skipped), then stop. Default: no limit.</p>
+</td></tr>
+<tr><td id="hisat-options-5">
+
+<pre><code>-5/--trim5 <int></code></pre>
+</td><td>
+
+<p>Trim <code><int></code> bases from 5' (left) end of each read before alignment (default: 0).</p>
+</td></tr>
+<tr><td id="hisat-options-3">
+
+<pre><code>-3/--trim3 <int></code></pre>
+</td><td>
+
+<p>Trim <code><int></code> bases from 3' (right) end of each read before alignment (default: 0).</p>
+</td></tr><tr><td id="hisat-options-phred33-quals">
+
+<pre><code>--phred33</code></pre>
+</td><td>
+
+<p>Input qualities are ASCII chars equal to the <a href="http://en.wikipedia.org/wiki/Phred_quality_score">Phred quality</a> plus 33. This is also called the "Phred+33" encoding, which is used by the very latest Illumina pipelines.</p>
+</td></tr>
+<tr><td id="hisat-options-phred64-quals">
+
+<pre><code>--phred64</code></pre>
+</td><td>
+
+<p>Input qualities are ASCII chars equal to the <a href="http://en.wikipedia.org/wiki/Phred_quality_score">Phred quality</a> plus 64. This is also called the "Phred+64" encoding.</p>
+</td></tr>
+<tr><td id="hisat-options-solexa-quals">
+
+<pre><code>--solexa-quals</code></pre>
+</td><td>
+
+<p>Convert input qualities from <a href="http://en.wikipedia.org/wiki/Phred_quality_score">Solexa</a> (which can be negative) to <a href="http://en.wikipedia.org/wiki/Phred_quality_score">Phred</a> (which can't). This scheme was used in older Illumina GA Pipeline versions (prior to 1.3). Default: off.</p>
+</td></tr>
+<tr><td id="hisat-options-int-quals">
+
+<pre><code>--int-quals</code></pre>
+</td><td>
+
+<p>Quality values are represented in the read input file as space-separated ASCII integers, e.g., <code>40 40 30 40</code>..., rather than ASCII characters, e.g., <code>II?I</code>.... Integers are treated as being on the <a href="http://en.wikipedia.org/wiki/Phred_quality_score">Phred quality</a> scale unless <a href="#hisat-options-solexa-quals"><code>--solexa-quals</code></a> is also specified. Default: off.</p>
+</td></tr></table>
+
+<h4 id="alignment-options">Alignment options</h4>
+<table>
+
+<tr><td id="hisat-options-n-ceil">
+
+<pre><code>--n-ceil <func></code></pre>
+</td><td>
+
+<p>Sets a function governing the maximum number of ambiguous characters (usually <code>N</code>s and/or <code>.</code>s) allowed in a read as a function of read length. For instance, specifying <code>-L,0,0.15</code> sets the N-ceiling function <code>f</code> to <code>f(x) = 0 + 0.15 * x</code>, where x is the read length. See also: <a href="#setting-function-options">setting function options</a>. Reads exceeding this ceiling are <a href="#filtering">filtered out</a>. Default: <code>L,0,0.15</code>.</p>
+</td></tr>
+
+<tr><td id="hisat-options-ignore-quals">
+
+<pre><code>--ignore-quals</code></pre>
+</td><td>
+
+<p>When calculating a mismatch penalty, always consider the quality value at the mismatched position to be the highest possible, regardless of the actual value. I.e. input is treated as though all quality values are high. This is also the default behavior when the input doesn't specify quality values (e.g. in <a href="#hisat-options-f"><code>-f</code></a>, <a href="#hisat-options-r"><code>-r</code></a>, or <a href="#hisat-options-c"><code>-c</code></a> modes).</p>
+</td></tr>
+<tr><td id="hisat-options-nofw">
+
+<pre><code>--nofw/--norc</code></pre>
+</td><td>
+
+<p>If <code>--nofw</code> is specified, <code>hisat</code> will not attempt to align unpaired reads to the forward (Watson) reference strand. If <code>--norc</code> is specified, <code>hisat</code> will not attempt to align unpaired reads against the reverse-complement (Crick) reference strand. In paired-end mode, <code>--nofw</code> and <code>--norc</code> pertain to the fragments; i.e. specifying <code>--nofw</code> causes <code>hisat</code> to explore only those paired-end configurati [...]
+</td></tr>
+
+<!--
+<tr><td id="hisat-options-end-to-end">
+
+[`--end-to-end`]: #hisat-options-end-to-end
+
+    --end-to-end
+
+</td><td>
+
+In this mode, HISAT requires that the entire read align from one end to the
+other, without any trimming (or "soft clipping") of characters from either end.
+The match bonus [`--ma`] always equals 0 in this mode, so all alignment scores
+are less than or equal to 0, and the greatest possible alignment score is 0.
+This is mutually exclusive with [`--local`].  `--end-to-end` is the default mode.
+
+</td></tr>
+<tr><td id="hisat-options-local">
+
+[`--local`]: #hisat-options-local
+
+    --local
+
+</td><td>
+
+In this mode, HISAT does not require that the entire read align from one end
+to the other.  Rather, some characters may be omitted ("soft clipped") from the
+ends in order to achieve the greatest possible alignment score.  The match bonus
+[`--ma`] is used in this mode, and the best possible alignment score is equal to
+the match bonus ([`--ma`]) times the length of the read.  Specifying `--local`
+and one of the presets (e.g. `--local --very-fast`) is equivalent to specifying
+the local version of the preset (`--very-fast-local`).  This is mutually
+exclusive with [`--end-to-end`].  `--end-to-end` is the default mode.
+
+</td></tr>
+-->
+
+</table>
+
+<h4 id="scoring-options">Scoring options</h4>
+<table>
+
+<tr><td id="hisat-options-ma">
+
+<pre><code>--ma <int></code></pre>
+</td><td>
+
+<p>Sets the match bonus. In [<code>--local</code>] mode <code><int></code> is added to the alignment score for each position where a read character aligns to a reference character and the characters match. Not used in [<code>--end-to-end</code>] mode. Default: 2.</p>
+</td></tr>
+<tr><td id="hisat-options-mp">
+
+<pre><code>--mp MX,MN</code></pre>
+</td><td>
+
+<p>Sets the maximum (<code>MX</code>) and minimum (<code>MN</code>) mismatch penalties, both integers. A number less than or equal to <code>MX</code> and greater than or equal to <code>MN</code> is subtracted from the alignment score for each position where a read character aligns to a reference character, the characters do not match, and neither is an <code>N</code>. If <a href="#hisat-options-ignore-quals"><code>--ignore-quals</code></a> is specified, the number subtracted quals <code> [...]
+</td></tr>
+<tr><td id="hisat-options-np">
+
+<pre><code>--np <int></code></pre>
+</td><td>
+
+<p>Sets penalty for positions where the read, reference, or both, contain an ambiguous character such as <code>N</code>. Default: 1.</p>
+</td></tr>
+<tr><td id="hisat-options-rdg">
+
+<pre><code>--rdg <int1>,<int2></code></pre>
+</td><td>
+
+<p>Sets the read gap open (<code><int1></code>) and extend (<code><int2></code>) penalties. A read gap of length N gets a penalty of <code><int1></code> + N * <code><int2></code>. Default: 5, 3.</p>
+</td></tr>
+<tr><td id="hisat-options-rfg">
+
+<pre><code>--rfg <int1>,<int2></code></pre>
+</td><td>
+
+<p>Sets the reference gap open (<code><int1></code>) and extend (<code><int2></code>) penalties. A reference gap of length N gets a penalty of <code><int1></code> + N * <code><int2></code>. Default: 5, 3.</p>
+</td></tr>
+<tr><td id="hisat-options-score-min">
+
+<pre><code>--score-min <func></code></pre>
+</td><td>
+
+<p>Sets a function governing the minimum alignment score needed for an alignment to be considered "valid" (i.e. good enough to report). This is a function of read length. For instance, specifying <code>L,0,-0.6</code> sets the minimum-score function <code>f</code> to <code>f(x) = 0 + -0.6 * x</code>, where <code>x</code> is the read length. See also: <a href="#setting-function-options">setting function options</a>. The default is <code>C,-18,0</code>.</p>
+</td></tr>
+</table>
+
+<h4 id="spliced-alignment-options">Spliced alignment options</h4>
+<table>
+
+<tr><td id="hisat-options-pen-cansplice">
+
+<pre><code>--pen-cansplice <int></code></pre>
+</td><td>
+
+<p>Sets the penalty for a canonical splice site. Default: 0.</p>
+</td></tr>
+
+<tr><td id="hisat-options-pen-noncansplice">
+
+<pre><code>--pen-noncansplice <int></code></pre>
+</td><td>
+
+<p>Sets the penalty for a non-canonical splice site. Default: 3.</p>
+</td></tr>
+<tr><td id="hisat-options-pen-intronlen">
+
+<pre><code>--pen-intronlen <func></code></pre>
+</td><td>
+
+<p>Sets the penalty for long introns so that alignments with shorter introns are preferred to those with longer introns. Default: G,-8,1</p>
+</td></tr>
+
+<tr><td id="hisat-options-known-splicesite-infile">
+
+<pre><code>--known-splicesite-infile <path></code></pre>
+</td><td>
+
+<p>With this mode, you can provide a list of known splice sites, which HISAT makes use of them to align reads with small anchors.<br />You can create such a list using "python extract_splice_sites.py genes.gtf > splicesites.txt", where "extract_splice_sites.py" is included in the HISAT package, "genes.gtf" is a gene annotation file, and "splicesites.txt" is a list of splice sites with which you provide HISAT in this mode.</p>
+</td></tr>
+
+<tr><td id="hisat-options-novel-splice-outfile">
+
+<pre><code>--novel-splicesite-outfile <path></code></pre>
+</td><td>
+
+<p>In this mode, HISAT reports a list of splice sites in the file "path":<br /> chromosome name "tab" genomic position of the flanking base on the left side of an intron "tab" genomic position of the flanking base on the right "tab" strand</p>
+</td></tr>
+
+<tr><td id="hisat-options-novel-splicesite-infile">
+
+<pre><code>--novel-splicesite-infile <path></code></pre>
+</td><td>
+
+<p>With this mode, you can provide a list of novel splice sites that were generated from the above option "--novel-splicesite-outfile".</p>
+</td></tr>
+
+<tr><td id="hisat-options-no-temp-splicesite">
+
+<pre><code>--no-temp-splicesite</code></pre>
+</td><td>
+
+<p>HISAT, by default, makes use of splice sites found by earlier reads to align later reads in the same run, in particular, reads with small anchors (<= 15 bp).<br />The option disables this default alignment strategy.</p>
+</td></tr>
+
+<tr><td id="hisat-options-no-spliced-alignment">
+
+<pre><code>--no-spliced-alignment</code></pre>
+</td><td>
+
+<p>Disable spliced alignment.</p>
+</td></tr>
+
+
+<tr><td id="hisat-options-rna-strandness">
+
+<pre><code>--rna-strandness <string></code></pre>
+</td><td>
+
+<p>Specify strand-specific information: the default is unstranded.<br />For single-end reads, use F or R. 'F' means a read corresponds to a transcript. 'R' means a read corresponds to the reverse complemented counterpart of a transcript. For paired-end reads, use either FR or RF.<br />Every read alignment will have an XS attribute tag: '+' means a read belongs to a transcript on '+' strand of genome. '-' means a read belongs to a transcript on '-' strand of genome. <br />
+(TopHat has a similar option, --library-type option, where fr-firststrand corresponds to R and RF; fr-secondstrand corresponds to F and FR.)</p>
+</td></tr>
+
+</table>
+
+<h4 id="reporting-options">Reporting options</h4>
+<table>
+
+<tr><td id="hisat-options-k">
+
+<pre><code>-k <int></code></pre>
+</td><td>
+
+<p>It searches for at most <code><int></code> distinct, valid alignments for each read. The search terminates when it can't find more distinct valid alignments, or when it finds <code><int></code>, whichever happens first. All alignments found are reported in descending order by alignment score. The alignment score for a paired-end alignment equals the sum of the alignment scores of the individual mates. Each reported read or pair alignment beyond the first has the SAM 'secon [...]
+<p>Note: HISAT is not designed with large values for <code>-k</code> in mind, and when aligning reads to long, repetitive genomes large <code>-k</code> can be very, very slow.</p>
+</td></tr>
+
+</table>
+
+<h4 id="paired-end-options">Paired-end options</h4>
+<table>
+
+<tr><td id="hisat-options-I">
+
+<pre><code>-I/--minins <int></code></pre>
+</td><td>
+
+<p>The minimum fragment length for valid paired-end alignments. E.g. if <code>-I 60</code> is specified and a paired-end alignment consists of two 20-bp alignments in the appropriate orientation with a 20-bp gap between them, that alignment is considered valid (as long as <a href="#hisat-options-X"><code>-X</code></a> is also satisfied). A 19-bp gap would not be valid in that case. If trimming options <a href="#hisat-options-3"><code>-3</code></a> or <a href="#hisat-options-5"><code>-5</ [...]
+<p>The larger the difference between <a href="#hisat-options-I"><code>-I</code></a> and <a href="#hisat-options-X"><code>-X</code></a>, the slower HISAT will run. This is because larger differences between <a href="#hisat-options-I"><code>-I</code></a> and <a href="#hisat-options-X"><code>-X</code></a> require that HISAT scan a larger window to determine if a concordant alignment exists. For typical fragment length ranges (200 to 400 nucleotides), HISAT is very efficient.</p>
+<p>Default: 0 (essentially imposing no minimum)</p>
+</td></tr>
+<tr><td id="hisat-options-X">
+
+<pre><code>-X/--maxins <int></code></pre>
+</td><td>
+
+<p>The maximum fragment length for valid paired-end alignments. E.g. if <code>-X 100</code> is specified and a paired-end alignment consists of two 20-bp alignments in the proper orientation with a 60-bp gap between them, that alignment is considered valid (as long as <a href="#hisat-options-I"><code>-I</code></a> is also satisfied). A 61-bp gap would not be valid in that case. If trimming options <a href="#hisat-options-3"><code>-3</code></a> or <a href="#hisat-options-5"><code>-5</code [...]
+<p>The larger the difference between <a href="#hisat-options-I"><code>-I</code></a> and <a href="#hisat-options-X"><code>-X</code></a>, the slower HISAT will run. This is because larger differences between <a href="#hisat-options-I"><code>-I</code></a> and <a href="#hisat-options-X"><code>-X</code></a> require that HISAT scan a larger window to determine if a concordant alignment exists. For typical fragment length ranges (200 to 400 nucleotides), HISAT is very efficient.</p>
+<p>Default: 500.</p>
+</td></tr>
+<tr><td id="hisat-options-fr">
+
+<pre><code>--fr/--rf/--ff</code></pre>
+</td><td>
+
+<p>The upstream/downstream mate orientations for a valid paired-end alignment against the forward reference strand. E.g., if <code>--fr</code> is specified and there is a candidate paired-end alignment where mate 1 appears upstream of the reverse complement of mate 2 and the fragment length constraints (<a href="#hisat-options-I"><code>-I</code></a> and <a href="#hisat-options-X"><code>-X</code></a>) are met, that alignment is valid. Also, if mate 2 appears upstream of the reverse comple [...]
+</td></tr>
+<tr><td id="hisat-options-no-mixed">
+
+<pre><code>--no-mixed</code></pre>
+</td><td>
+
+<p>By default, when <code>hisat</code> cannot find a concordant or discordant alignment for a pair, it then tries to find alignments for the individual mates. This option disables that behavior.</p>
+</td></tr>
+<tr><td id="hisat-options-no-discordant">
+
+<pre><code>--no-discordant</code></pre>
+</td><td>
+
+<p>By default, <code>hisat</code> looks for discordant alignments if it cannot find any concordant alignments. A discordant alignment is an alignment where both mates align uniquely, but that does not satisfy the paired-end constraints (<a href="#hisat-options-fr"><code>--fr</code>/<code>--rf</code>/<code>--ff</code></a>, <a href="#hisat-options-I"><code>-I</code></a>, <a href="#hisat-options-X"><code>-X</code></a>). This option disables that behavior.</p>
+</td></tr>
+<tr><td id="hisat-options-dovetail">
+
+<pre><code>--dovetail</code></pre>
+</td><td>
+
+<p>If the mates "dovetail", that is if one mate alignment extends past the beginning of the other such that the wrong mate begins upstream, consider that to be concordant. See also: <a href="#mates-can-overlap-contain-or-dovetail-each-other">Mates can overlap, contain or dovetail each other</a>. Default: mates cannot dovetail in a concordant alignment.</p>
+</td></tr>
+<tr><td id="hisat-options-no-contain">
+
+<pre><code>--no-contain</code></pre>
+</td><td>
+
+<p>If one mate alignment contains the other, consider that to be non-concordant. See also: <a href="#mates-can-overlap-contain-or-dovetail-each-other">Mates can overlap, contain or dovetail each other</a>. Default: a mate can contain the other in a concordant alignment.</p>
+</td></tr>
+<tr><td id="hisat-options-no-overlap">
+
+<pre><code>--no-overlap</code></pre>
+</td><td>
+
+<p>If one mate alignment overlaps the other at all, consider that to be non-concordant. See also: <a href="#mates-can-overlap-contain-or-dovetail-each-other">Mates can overlap, contain or dovetail each other</a>. Default: mates can overlap in a concordant alignment.</p>
+</td></tr></table>
+
+<h4 id="output-options">Output options</h4>
+<table>
+
+<tr><td id="hisat-options-t">
+
+<pre><code>-t/--time</code></pre>
+</td><td>
+
+<p>Print the wall-clock time required to load the index files and align the reads. This is printed to the "standard error" ("stderr") filehandle. Default: off.</p>
+</td></tr>
+<tr><td id="hisat-options-un">
+
+<pre><code>--un <path>
+--un-gz <path>
+--un-bz2 <path></code></pre>
+</td><td>
+
+<p>Write unpaired reads that fail to align to file at <code><path></code>. These reads correspond to the SAM records with the FLAGS <code>0x4</code> bit set and neither the <code>0x40</code> nor <code>0x80</code> bits set. If <code>--un-gz</code> is specified, output will be gzip compressed. If <code>--un-bz2</code> is specified, output will be bzip2 compressed. Reads written in this way will appear exactly as they did in the input file, without any modification (same sequence, sam [...]
+</td></tr>
+<tr><td id="hisat-options-al">
+
+<pre><code>--al <path>
+--al-gz <path>
+--al-bz2 <path></code></pre>
+</td><td>
+
+<p>Write unpaired reads that align at least once to file at <code><path></code>. These reads correspond to the SAM records with the FLAGS <code>0x4</code>, <code>0x40</code>, and <code>0x80</code> bits unset. If <code>--al-gz</code> is specified, output will be gzip compressed. If <code>--al-bz2</code> is specified, output will be bzip2 compressed. Reads written in this way will appear exactly as they did in the input file, without any modification (same sequence, same name, same q [...]
+</td></tr>
+<tr><td id="hisat-options-un-conc">
+
+<pre><code>--un-conc <path>
+--un-conc-gz <path>
+--un-conc-bz2 <path></code></pre>
+</td><td>
+
+<p>Write paired-end reads that fail to align concordantly to file(s) at <code><path></code>. These reads correspond to the SAM records with the FLAGS <code>0x4</code> bit set and either the <code>0x40</code> or <code>0x80</code> bit set (depending on whether it's mate #1 or #2). <code>.1</code> and <code>.2</code> strings are added to the filename to distinguish which file contains mate #1 and mate #2. If a percent symbol, <code>%</code>, is used in <code><path></code>, the p [...]
+</td></tr>
+<tr><td id="hisat-options-al-conc">
+
+<pre><code>--al-conc <path>
+--al-conc-gz <path>
+--al-conc-bz2 <path></code></pre>
+</td><td>
+
+<p>Write paired-end reads that align concordantly at least once to file(s) at <code><path></code>. These reads correspond to the SAM records with the FLAGS <code>0x4</code> bit unset and either the <code>0x40</code> or <code>0x80</code> bit set (depending on whether it's mate #1 or #2). <code>.1</code> and <code>.2</code> strings are added to the filename to distinguish which file contains mate #1 and mate #2. If a percent symbol, <code>%</code>, is used in <code><path></code [...]
+</td></tr>
+<tr><td id="hisat-options-quiet">
+
+<pre><code>--quiet</code></pre>
+</td><td>
+
+<p>Print nothing besides alignments and serious errors.</p>
+</td></tr>
+<tr><td id="hisat-options-met-file">
+
+<pre><code>--met-file <path></code></pre>
+</td><td>
+
+<p>Write <code>hisat</code> metrics to file <code><path></code>. Having alignment metric can be useful for debugging certain problems, especially performance issues. See also: <a href="#hisat-options-met"><code>--met</code></a>. Default: metrics disabled.</p>
+</td></tr>
+<tr><td id="hisat-options-met-stderr">
+
+<pre><code>--met-stderr</code></pre>
+</td><td>
+
+<p>Write <code>hisat</code> metrics to the "standard error" ("stderr") filehandle. This is not mutually exclusive with <a href="#hisat-options-met-file"><code>--met-file</code></a>. Having alignment metric can be useful for debugging certain problems, especially performance issues. See also: <a href="#hisat-options-met"><code>--met</code></a>. Default: metrics disabled.</p>
+</td></tr>
+<tr><td id="hisat-options-met">
+
+<pre><code>--met <int></code></pre>
+</td><td>
+
+<p>Write a new <code>hisat</code> metrics record every <code><int></code> seconds. Only matters if either <a href="#hisat-options-met-stderr"><code>--met-stderr</code></a> or <a href="#hisat-options-met-file"><code>--met-file</code></a> are specified. Default: 1.</p>
+</td></tr>
+</table>
+
+<h4 id="sam-options">SAM options</h4>
+<table>
+
+<tr><td id="hisat-options-no-unal">
+
+<pre><code>--no-unal</code></pre>
+</td><td>
+
+<p>Suppress SAM records for reads that failed to align.</p>
+</td></tr>
+<tr><td id="hisat-options-no-hd">
+
+<pre><code>--no-hd</code></pre>
+</td><td>
+
+<p>Suppress SAM header lines (starting with <code>@</code>).</p>
+</td></tr>
+<tr><td id="hisat-options-no-sq">
+
+<pre><code>--no-sq</code></pre>
+</td><td>
+
+<p>Suppress <code>@SQ</code> SAM header lines.</p>
+</td></tr>
+<tr><td id="hisat-options-rg-id">
+
+<pre><code>--rg-id <text></code></pre>
+</td><td>
+
+<p>Set the read group ID to <code><text></code>. This causes the SAM <code>@RG</code> header line to be printed, with <code><text></code> as the value associated with the <code>ID:</code> tag. It also causes the <code>RG:Z:</code> extra field to be attached to each SAM output record, with value set to <code><text></code>.</p>
+</td></tr>
+<tr><td id="hisat-options-rg">
+
+<pre><code>--rg <text></code></pre>
+</td><td>
+
+<p>Add <code><text></code> (usually of the form <code>TAG:VAL</code>, e.g. <code>SM:Pool1</code>) as a field on the <code>@RG</code> header line. Note: in order for the <code>@RG</code> line to appear, <a href="#hisat-options-rg-id"><code>--rg-id</code></a> must also be specified. This is because the <code>ID</code> tag is required by the <a href="http://samtools.sourceforge.net/SAM1.pdf">SAM Spec</a>. Specify <code>--rg</code> multiple times to set multiple fields. See the <a href [...]
+</td></tr>
+<tr><td id="hisat-options-omit-sec-seq">
+
+<pre><code>--omit-sec-seq</code></pre>
+</td><td>
+
+<p>When printing secondary alignments, HISAT by default will write out the <code>SEQ</code> and <code>QUAL</code> strings. Specifying this option causes HISAT to print an asterisk in those fields instead.</p>
+</td></tr>
+
+
+</table>
+
+<h4 id="performance-options">Performance options</h4>
+<table><tr>
+
+<td id="hisat-options-o">
+
+<pre><code>-o/--offrate <int></code></pre>
+</td><td>
+
+<p>Override the offrate of the index with <code><int></code>. If <code><int></code> is greater than the offrate used to build the index, then some row markings are discarded when the index is read into memory. This reduces the memory footprint of the aligner but requires more time to calculate text offsets. <code><int></code> must be greater than the value used to build the index.</p>
+</td></tr>
+<tr><td id="hisat-options-p">
+
+<pre><code>-p/--threads NTHREADS</code></pre>
+</td><td>
+
+<p>Launch <code>NTHREADS</code> parallel search threads (default: 1). Threads will run on separate processors/cores and synchronize when parsing reads and outputting alignments. Searching for alignments is highly parallel, and speedup is close to linear. Increasing <code>-p</code> increases HISAT's memory footprint. E.g. when aligning to a human genome index, increasing <code>-p</code> from 1 to 8 increases the memory footprint by a few hundred megabytes. This option is only available if [...]
+</td></tr>
+<tr><td id="hisat-options-reorder">
+
+<pre><code>--reorder</code></pre>
+</td><td>
+
+<p>Guarantees that output SAM records are printed in an order corresponding to the order of the reads in the original input file, even when <a href="#hisat-options-p"><code>-p</code></a> is set greater than 1. Specifying <code>--reorder</code> and setting <a href="#hisat-options-p"><code>-p</code></a> greater than 1 causes HISAT to run somewhat slower and use somewhat more memory then if <code>--reorder</code> were not specified. Has no effect if <a href="#hisat-options-p"><code>-p</code [...]
+</td></tr>
+<tr><td id="hisat-options-mm">
+
+<pre><code>--mm</code></pre>
+</td><td>
+
+<p>Use memory-mapped I/O to load the index, rather than typical file I/O. Memory-mapping allows many concurrent <code>bowtie</code> processes on the same computer to share the same memory image of the index (i.e. you pay the memory overhead just once). This facilitates memory-efficient parallelization of <code>bowtie</code> in situations where using <a href="#hisat-options-p"><code>-p</code></a> is not possible or not preferable.</p>
+</td></tr></table>
+
+<h4 id="other-options">Other options</h4>
+<table>
+<tr><td id="hisat-options-qc-filter">
+
+<pre><code>--qc-filter</code></pre>
+</td><td>
+
+<p>Filter out reads for which the QSEQ filter field is non-zero. Only has an effect when read format is <a href="#hisat-options-qseq"><code>--qseq</code></a>. Default: off.</p>
+</td></tr>
+<tr><td id="hisat-options-seed">
+
+<pre><code>--seed <int></code></pre>
+</td><td>
+
+<p>Use <code><int></code> as the seed for pseudo-random number generator. Default: 0.</p>
+</td></tr>
+<tr><td id="hisat-options-non-deterministic">
+
+<pre><code>--non-deterministic</code></pre>
+</td><td>
+
+<p>Normally, HISAT re-initializes its pseudo-random generator for each read. It seeds the generator with a number derived from (a) the read name, (b) the nucleotide sequence, (c) the quality sequence, (d) the value of the <a href="#hisat-options-seed"><code>--seed</code></a> option. This means that if two reads are identical (same name, same nucleotides, same qualities) HISAT will find and report the same alignment(s) for both, even if there was ambiguity. When <code>--non-deterministic< [...]
+</td></tr>
+<tr><td id="hisat-options-version">
+
+<pre><code>--version</code></pre>
+</td><td>
+
+<p>Print version information and quit.</p>
+</td></tr>
+<tr><td id="hisat-options-h">
+
+<pre><code>-h/--help</code></pre>
+</td><td>
+
+<p>Print usage information and quit.</p>
+</td></tr></table>
+
+<h2 id="sam-output">SAM output</h2>
+<p>Following is a brief description of the <a href="http://samtools.sourceforge.net/SAM1.pdf">SAM</a> format as output by <code>hisat</code>. For more details, see the <a href="http://samtools.sourceforge.net/SAM1.pdf">SAM format specification</a>.</p>
+<p>By default, <code>hisat</code> prints a SAM header with <code>@HD</code>, <code>@SQ</code> and <code>@PG</code> lines. When one or more <a href="#hisat-options-rg"><code>--rg</code></a> arguments are specified, <code>hisat</code> will also print an <code>@RG</code> line that includes all user-specified <a href="#hisat-options-rg"><code>--rg</code></a> tokens separated by tabs.</p>
+<p>Each subsequent line describes an alignment or, if the read failed to align, a read. Each line is a collection of at least 12 fields separated by tabs; from left to right, the fields are:</p>
+<ol style="list-style-type: decimal">
+<li><p>Name of read that aligned.</p>
+<p>Note that the <a href="http://samtools.sourceforge.net/SAM1.pdf">SAM specification</a> disallows whitespace in the read name. If the read name contains any whitespace characters, HISAT will truncate the name at the first whitespace character. This is similar to the behavior of other tools.</p></li>
+<li><p>Sum of all applicable flags. Flags relevant to HISAT are:</p>
+<table><tr><td>
+
+<pre><code>1</code></pre>
+</td><td>
+
+<p>The read is one of a pair</p>
+</td></tr><tr><td>
+
+<pre><code>2</code></pre>
+</td><td>
+
+<p>The alignment is one end of a proper paired-end alignment</p>
+</td></tr><tr><td>
+
+<pre><code>4</code></pre>
+</td><td>
+
+<p>The read has no reported alignments</p>
+</td></tr><tr><td>
+
+<pre><code>8</code></pre>
+</td><td>
+
+<p>The read is one of a pair and has no reported alignments</p>
+</td></tr><tr><td>
+
+<pre><code>16</code></pre>
+</td><td>
+
+<p>The alignment is to the reverse reference strand</p>
+</td></tr><tr><td>
+
+<pre><code>32</code></pre>
+</td><td>
+
+<p>The other mate in the paired-end alignment is aligned to the reverse reference strand</p>
+</td></tr><tr><td>
+
+<pre><code>64</code></pre>
+</td><td>
+
+<p>The read is mate 1 in a pair</p>
+</td></tr><tr><td>
+
+<pre><code>128</code></pre>
+</td><td>
+
+<p>The read is mate 2 in a pair</p>
+</td></tr></table>
+
+<p>Thus, an unpaired read that aligns to the reverse reference strand will have flag 16. A paired-end read that aligns and is the first mate in the pair will have flag 83 (= 64 + 16 + 2 + 1).</p></li>
+<li><p>Name of reference sequence where alignment occurs</p></li>
+<li><p>1-based offset into the forward reference strand where leftmost character of the alignment occurs</p></li>
+<li><p>Mapping quality</p></li>
+<li><p>CIGAR string representation of alignment</p></li>
+<li><p>Name of reference sequence where mate's alignment occurs. Set to <code>=</code> if the mate's reference sequence is the same as this alignment's, or <code>*</code> if there is no mate.</p></li>
+<li><p>1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate.</p></li>
+<li><p>Inferred fragment length. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if the mates did not align concordantly. However, size is non-0 if the mates aligned discordantly to the same chromosome.</p></li>
+<li><p>Read sequence (reverse-complemented if aligned to the reverse strand)</p></li>
+<li><p>ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the <a href="http://en.wikipedia.org/wiki/Phred_quality_score">Phred quality</a> scale and the encoding is ASCII-offset by 33 (ASCII char <code>!</code>), similarly to a <a href="http://en.wikipedia.org/wiki/FASTQ_format">FASTQ</a> file.</p></li>
+<li><p>Optional fields. Fields are tab-separated. <code>hisat</code> outputs zero or more of these optional fields for each alignment, depending on the type of the alignment:</p>
+<table>
+<tr><td id="hisat-build-opt-fields-as">
+
+<pre><code>AS:i:<N></code></pre>
+</td>
+<td>
+
+<p>Alignment score. Can be negative. Can be greater than 0 in [<code>--local</code>] mode (but not in [<code>--end-to-end</code>] mode). Only present if SAM record is for an aligned read.</p>
+</td></tr>
+<tr><td id="hisat-build-opt-fields-xs">
+
+<pre><code>XS:i:<N></code></pre>
+</td>
+<td>
+
+<p>Alignment score for second-best alignment. Can be negative. Can be greater than 0 in [<code>--local</code>] mode (but not in [<code>--end-to-end</code>] mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read.</p>
+</td></tr>
+<tr><td id="hisat-build-opt-fields-ys">
+
+<pre><code>YS:i:<N></code></pre>
+</td>
+<td>
+
+<p>Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment.</p>
+</td></tr>
+<tr><td id="hisat-build-opt-fields-xn">
+
+<pre><code>XN:i:<N></code></pre>
+</td>
+<td>
+
+<p>The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read.</p>
+</td></tr>
+<tr><td id="hisat-build-opt-fields-xm">
+
+<pre><code>XM:i:<N></code></pre>
+</td>
+<td>
+
+<p>The number of mismatches in the alignment. Only present if SAM record is for an aligned read.</p>
+</td></tr>
+<tr><td id="hisat-build-opt-fields-xo">
+
+<pre><code>XO:i:<N></code></pre>
+</td>
+<td>
+
+<p>The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.</p>
+</td></tr>
+<tr><td id="hisat-build-opt-fields-xg">
+
+<pre><code>XG:i:<N></code></pre>
+</td>
+<td>
+
+<p>The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.</p>
+</td></tr>
+<tr><td id="hisat-build-opt-fields-nm">
+
+<pre><code>NM:i:<N></code></pre>
+</td>
+<td>
+
+<p>The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read.</p>
+</td></tr>
+<tr><td id="hisat-build-opt-fields-yf">
+
+<pre><code>YF:Z:<S></code></pre>
+</td><td>
+
+<p>String indicating reason why the read was filtered out. See also: [Filtering]. Only appears for reads that were filtered out.</p>
+</td></tr>
+<tr><td id="hisat-build-opt-fields-yt">
+
+<pre><code>YT:Z:<S></code></pre>
+</td><td>
+
+<p>Value of <code>UU</code> indicates the read was not part of a pair. Value of <code>CP</code> indicates the read was part of a pair and the pair aligned concordantly. Value of <code>DP</code> indicates the read was part of a pair and the pair aligned discordantly. Value of <code>UP</code> indicates the read was part of a pair but the pair failed to align either concordantly or discordantly.</p>
+</td></tr>
+<tr><td id="hisat-build-opt-fields-md">
+
+<pre><code>MD:Z:<S></code></pre>
+</td><td>
+
+<p>A string representation of the mismatched reference bases in the alignment. See <a href="http://samtools.sourceforge.net/SAM1.pdf">SAM</a> format specification for details. Only present if SAM record is for an aligned read.</p>
+</td></tr>
+</table>
+</li>
+</ol>
+<h1 id="the-hisat-build-indexer">The <code>hisat-build</code> indexer</h1>
+<p><code>hisat-build</code> builds a HISAT index from a set of DNA sequences. <code>hisat-build</code> outputs a set of 6 files with suffixes <code>.1.bt2</code>, <code>.2.bt2</code>, <code>.3.bt2</code>, <code>.4.bt2</code>, <code>.rev.1.bt2</code>, and <code>.rev.2.bt2</code>. In the case of a large index these suffixes will have a <code>bt2l</code> termination. These files together constitute the index: they are all that is needed to align reads to that reference. The original sequenc [...]
+<p>Use of Karkkainen's <a href="http://portal.acm.org/citation.cfm?id=1314852">blockwise algorithm</a> allows <code>hisat-build</code> to trade off between running time and memory usage. <code>hisat-build</code> has three options governing how it makes this trade: <a href="#hisat-build-options-p"><code>-p</code>/<code>--packed</code></a>, <a href="#hisat-build-options-bmax"><code>--bmax</code></a>/<a href="#hisat-build-options-bmaxdivn"><code>--bmaxdivn</code></a>, and <a href="#hisat-bu [...]
+<p>The indexer provides options pertaining to the "shape" of the index, e.g. <a href="#hisat-build-options-o"><code>--offrate</code></a> governs the fraction of <a href="http://en.wikipedia.org/wiki/Burrows-Wheeler_transform">Burrows-Wheeler</a> rows that are "marked" (i.e., the density of the suffix-array sample; see the original <a href="http://en.wikipedia.org/wiki/FM-index">FM Index</a> paper for details). All of these options are potentially profitable trade-offs [...]
+<p><code>hisat-build</code> can generate either <a href="#small-and-large-indexes">small or large indexes</a>. The wrapper will decide which based on the length of the input genome. If the reference does not exceed 4 billion characters but a large index is preferred, the user can specify <a href="#hisat-build-options-large-index"><code>--large-index</code></a> to force <code>hisat-build</code> to build a large index instead.</p>
+<p>The HISAT index is based on the <a href="http://en.wikipedia.org/wiki/FM-index">FM Index</a> of Ferragina and Manzini, which in turn is based on the <a href="http://en.wikipedia.org/wiki/Burrows-Wheeler_transform">Burrows-Wheeler</a> transform. The algorithm used to build the index is based on the <a href="http://portal.acm.org/citation.cfm?id=1314852">blockwise algorithm</a> of Karkkainen.</p>
+<h2 id="command-line-1">Command Line</h2>
+<p>Usage:</p>
+<pre><code>hisat-build [options]* <reference_in> <bt2_base></code></pre>
+<h3 id="main-arguments-1">Main arguments</h3>
+<table><tr><td>
+
+<pre><code><reference_in></code></pre>
+</td><td>
+
+<p>A comma-separated list of FASTA files containing the reference sequences to be aligned to, or, if <a href="#hisat-build-options-c"><code>-c</code></a> is specified, the sequences themselves. E.g., <code><reference_in></code> might be <code>chr1.fa,chr2.fa,chrX.fa,chrY.fa</code>, or, if <a href="#hisat-build-options-c"><code>-c</code></a> is specified, this might be <code>GGTCATCCT,ACGGGTCGT,CCGTTCTATGCGGCTTA</code>.</p>
+</td></tr><tr><td>
+
+<pre><code><bt2_base></code></pre>
+</td><td>
+
+<p>The basename of the index files to write. By default, <code>hisat-build</code> writes files named <code>NAME.1.bt2</code>, <code>NAME.2.bt2</code>, <code>NAME.3.bt2</code>, <code>NAME.4.bt2</code>, <code>NAME.5.bt2</code>, <code>NAME.6.bt2</code>, <code>NAME.rev.1.bt2</code>, <code>NAME.rev.2.bt2</code>, <code>NAME.rev.5.bt2</code>, and <code>NAME.rev.6.bt2</code> where <code>NAME</code> is <code><bt2_base></code>.</p>
+</td></tr></table>
+
+<h3 id="options-1">Options</h3>
+<table><tr><td>
+
+<pre><code>-f</code></pre>
+</td><td>
+
+<p>The reference input files (specified as <code><reference_in></code>) are FASTA files (usually having extension <code>.fa</code>, <code>.mfa</code>, <code>.fna</code> or similar).</p>
+</td></tr><tr><td id="hisat-build-options-c">
+
+<pre><code>-c</code></pre>
+</td><td>
+
+<p>The reference sequences are given on the command line. I.e. <code><reference_in></code> is a comma-separated list of sequences rather than a list of FASTA files.</p>
+</td></tr>
+<tr><td id="hisat-build-options-large-index">
+
+<pre><code>--large-index</code></pre>
+</td><td>
+
+<p>Force <code>hisat-build</code> to build a <a href="#small-and-large-indexes">large index</a>, even if the reference is less than ~ 4 billion nucleotides in length.</p>
+</td></tr>
+<tr><td id="hisat-build-options-a">
+
+<pre><code>-a/--noauto</code></pre>
+</td><td>
+
+<p>Disable the default behavior whereby <code>hisat-build</code> automatically selects values for the <a href="#hisat-build-options-bmax"><code>--bmax</code></a>, <a href="#hisat-build-options-dcv"><code>--dcv</code></a> and <a href="#hisat-build-options-p"><code>--packed</code></a> parameters according to available memory. Instead, user may specify values for those parameters. If memory is exhausted during indexing, an error message will be printed; it is up to the user to try new param [...]
+</td></tr><tr><td id="hisat-build-options-p">
+
+<pre><code>-p/--packed</code></pre>
+</td><td>
+
+<p>Use a packed (2-bits-per-nucleotide) representation for DNA strings. This saves memory but makes indexing 2-3 times slower. Default: off. This is configured automatically by default; use <a href="#hisat-build-options-a"><code>-a</code>/<code>--noauto</code></a> to configure manually.</p>
+</td></tr><tr><td id="hisat-build-options-bmax">
+
+<pre><code>--bmax <int></code></pre>
+</td><td>
+
+<p>The maximum number of suffixes allowed in a block. Allowing more suffixes per block makes indexing faster, but increases peak memory usage. Setting this option overrides any previous setting for <a href="#hisat-build-options-bmax"><code>--bmax</code></a>, or <a href="#hisat-build-options-bmaxdivn"><code>--bmaxdivn</code></a>. Default (in terms of the <a href="#hisat-build-options-bmaxdivn"><code>--bmaxdivn</code></a> parameter) is <a href="#hisat-build-options-bmaxdivn"><code>--bmaxdi [...]
+</td></tr><tr><td id="hisat-build-options-bmaxdivn">
+
+<pre><code>--bmaxdivn <int></code></pre>
+</td><td>
+
+<p>The maximum number of suffixes allowed in a block, expressed as a fraction of the length of the reference. Setting this option overrides any previous setting for <a href="#hisat-build-options-bmax"><code>--bmax</code></a>, or <a href="#hisat-build-options-bmaxdivn"><code>--bmaxdivn</code></a>. Default: <a href="#hisat-build-options-bmaxdivn"><code>--bmaxdivn</code></a> 4. This is configured automatically by default; use <a href="#hisat-build-options-a"><code>-a</code>/<code>--noauto</ [...]
+</td></tr><tr><td id="hisat-build-options-dcv">
+
+<pre><code>--dcv <int></code></pre>
+</td><td>
+
+<p>Use <code><int></code> as the period for the difference-cover sample. A larger period yields less memory overhead, but may make suffix sorting slower, especially if repeats are present. Must be a power of 2 no greater than 4096. Default: 1024. This is configured automatically by default; use <a href="#hisat-build-options-a"><code>-a</code>/<code>--noauto</code></a> to configure manually.</p>
+</td></tr><tr><td id="hisat-build-options-nodc">
+
+<pre><code>--nodc</code></pre>
+</td><td>
+
+<p>Disable use of the difference-cover sample. Suffix sorting becomes quadratic-time in the worst case (where the worst case is an extremely repetitive reference). Default: off.</p>
+</td></tr><tr><td>
+
+<pre><code>-r/--noref</code></pre>
+</td><td>
+
+<p>Do not build the <code>NAME.3.bt2</code> and <code>NAME.4.bt2</code> portions of the index, which contain a bitpacked version of the reference sequences and are used for paired-end alignment.</p>
+</td></tr><tr><td>
+
+<pre><code>-3/--justref</code></pre>
+</td><td>
+
+<p>Build only the <code>NAME.3.bt2</code> and <code>NAME.4.bt2</code> portions of the index, which contain a bitpacked version of the reference sequences and are used for paired-end alignment.</p>
+</td></tr><tr><td id="hisat-build-options-o">
+
+<pre><code>-o/--offrate <int></code></pre>
+</td><td>
+
+<p>To map alignments back to positions on the reference sequences, it's necessary to annotate ("mark") some or all of the <a href="http://en.wikipedia.org/wiki/Burrows-Wheeler_transform">Burrows-Wheeler</a> rows with their corresponding location on the genome. <a href="#hisat-build-options-o"><code>-o</code>/<code>--offrate</code></a> governs how many rows get marked: the indexer will mark every 2^<code><int></code> rows. Marking more rows makes reference-position lookups [...]
+</td></tr><tr><td>
+
+<pre><code>-t/--ftabchars <int></code></pre>
+</td><td>
+
+<p>The ftab is the lookup table used to calculate an initial <a href="http://en.wikipedia.org/wiki/Burrows-Wheeler_transform">Burrows-Wheeler</a> range with respect to the first <code><int></code> characters of the query. A larger <code><int></code> yields a larger lookup table but faster query times. The ftab has size 4^(<code><int></code>+1) bytes. The default setting is 10 (ftab is 4MB).</p>
+</td></tr><tr><td id="hisat-build-options-localoffrate">
+
+<pre><code>--localoffrate <int></code></pre>
+</td><td>
+
+<p>This option governs how many rows get marked in a local index: the indexer will mark every 2^<code><int></code> rows. Marking more rows makes reference-position lookups faster, but requires more memory to hold the annotations at runtime. The default is 3 (every 8th row is marked, this occupies about 16KB per local index).</p>
+</td></tr><tr><td>
+
+<pre><code>--localftabchars <int></code></pre>
+</td><td>
+
+<p>The local ftab is the lookup table in a local index. The default setting is 6 (ftab is 8KB per local index).</p>
+</td></tr><tr><td>
+
+<pre><code>--seed <int></code></pre>
+</td><td>
+
+<p>Use <code><int></code> as the seed for pseudo-random number generator.</p>
+</td></tr><tr><td>
+
+<pre><code>--cutoff <int></code></pre>
+</td><td>
+
+<p>Index only the first <code><int></code> bases of the reference sequences (cumulative across sequences) and ignore the rest.</p>
+</td></tr><tr><td>
+
+<pre><code>-q/--quiet</code></pre>
+</td><td>
+
+<p><code>hisat-build</code> is verbose by default. With this option <code>hisat-build</code> will print only error messages.</p>
+</td></tr><tr><td>
+
+<pre><code>-h/--help</code></pre>
+</td><td>
+
+<p>Print usage information and quit.</p>
+</td></tr><tr><td>
+
+<pre><code>--version</code></pre>
+</td><td>
+
+<p>Print version information and quit.</p>
+</td></tr></table>
+
+<h1 id="the-hisat-inspect-index-inspector">The <code>hisat-inspect</code> index inspector</h1>
+<p><code>hisat-inspect</code> extracts information from a HISAT index about what kind of index it is and what reference sequences were used to build it. When run without any options, the tool will output a FASTA file containing the sequences of the original references (with all non-<code>A</code>/<code>C</code>/<code>G</code>/<code>T</code> characters converted to <code>N</code>s). It can also be used to extract just the reference sequence names using the <a href="#hisat-inspect-options- [...]
+<h2 id="command-line-2">Command Line</h2>
+<p>Usage:</p>
+<pre><code>hisat-inspect [options]* <bt2_base></code></pre>
+<h3 id="main-arguments-2">Main arguments</h3>
+<table><tr><td>
+
+<pre><code><bt2_base></code></pre>
+</td><td>
+
+<p>The basename of the index to be inspected. The basename is name of any of the index files but with the <code>.X.bt2</code> or <code>.rev.X.bt2</code> suffix omitted. <code>hisat-inspect</code> first looks in the current directory for the index files, then in the directory specified in the <code>HISAT_INDEXES</code> environment variable.</p>
+</td></tr></table>
+
+<h3 id="options-2">Options</h3>
+<table><tr><td>
+
+<pre><code>-a/--across <int></code></pre>
+</td><td>
+
+<p>When printing FASTA output, output a newline character every <code><int></code> bases (default: 60).</p>
+</td></tr><tr><td id="hisat-inspect-options-n">
+
+<pre><code>-n/--names</code></pre>
+</td><td>
+
+<p>Print reference sequence names, one per line, and quit.</p>
+</td></tr><tr><td id="hisat-inspect-options-s">
+
+<pre><code>-s/--summary</code></pre>
+</td><td>
+
+<p>Print a summary that includes information about index settings, as well as the names and lengths of the input sequences. The summary has this format:</p>
+<pre><code>Colorspace  <0 or 1>
+SA-Sample   1 in <sample>
+FTab-Chars  <chars>
+Sequence-1  <name>  <len>
+Sequence-2  <name>  <len>
+...
+Sequence-N  <name>  <len></code></pre>
+<p>Fields are separated by tabs. Colorspace is always set to 0 for HISAT.</p>
+</td></tr><tr><td>
+
+<pre><code>-v/--verbose</code></pre>
+</td><td>
+
+<p>Print verbose output (for debugging).</p>
+</td></tr><tr><td>
+
+<pre><code>--version</code></pre>
+</td><td>
+
+<p>Print version information and quit.</p>
+</td></tr><tr><td>
+
+<pre><code>-h/--help</code></pre>
+</td><td>
+
+<p>Print usage information and quit.</p>
+</td></tr></table>
+
+<h1 id="getting-started-with-hisat">Getting started with HISAT</h1>
+<p>HISAT comes with some example files to get you started. The example files are not scientifically significant; these files will simply let you start running HISAT and downstream tools right away.</p>
+<p>First follow the manual instructions to <a href="#obtaining-hisat">obtain HISAT</a>. Set the <code>HISAT_HOME</code> environment variable to point to the new HISAT directory containing the <code>hisat</code>, <code>hisat-build</code> and <code>hisat-inspect</code> binaries. This is important, as the <code>HISAT_HOME</code> variable is used in the commands below to refer to that directory.</p>
+<h2 id="indexing-a-reference-genome">Indexing a reference genome</h2>
+<p>To create an index for the genomic region (1 million bps from the human chromosome 22 between 20,000,000 and 20,999,999) included with HISAT, create a new temporary directory (it doesn't matter where), change into that directory, and run:</p>
+<pre><code>$HISAT_HOME/hisat-build $HISAT_HOME/example/reference/22_20-21M.fa 22_20-21M_hisat</code></pre>
+<p>The command should print many lines of output then quit. When the command completes, the current directory will contain ten new files that all start with <code>22_20-21M_hisat</code> and end with <code>.1.bt2</code>, <code>.2.bt2</code>, <code>.3.bt2</code>, <code>.4.bt2</code>, <code>.5.bt2</code>, <code>.6.bt2</code>, <code>.rev.1.bt2</code>, <code>.rev.2.bt2</code>, <code>.rev.5.bt2</code>, and <code>.rev.6.bt2</code>. These files constitute the index - you're done!</p>
+<p>You can use <code>hisat-build</code> to create an index for a set of FASTA files obtained from any source, including sites such as <a href="http://genome.ucsc.edu/cgi-bin/hgGateway">UCSC</a>, <a href="http://www.ncbi.nlm.nih.gov/sites/genome">NCBI</a>, and <a href="http://www.ensembl.org/">Ensembl</a>. When indexing multiple FASTA files, specify all the files using commas to separate file names. For more details on how to create an index with <code>hisat-build</code>, see the <a href= [...]
+<h2 id="aligning-example-reads">Aligning example reads</h2>
+<p>Stay in the directory created in the previous step, which now contains the <code>22_20-21M_hisat</code> index files. Next, run:</p>
+<pre><code>$HISAT_HOME/hisat -x 22_20-21M_hisat -U $HISAT_HOME/example/reads/reads_1.fq -S eg1.sam</code></pre>
+<p>This runs the HISAT aligner, which aligns a set of unpaired reads to the genome region using the index generated in the previous step. The alignment results in SAM format are written to the file <code>eg1.sam</code>, and a short alignment summary is written to the console. (Actually, the summary is written to the "standard error" or "stderr" filehandle, which is typically printed to the console.)</p>
+<p>To see the first few lines of the SAM output, run:</p>
+<pre><code>head eg1.sam</code></pre>
+<p>You will see something like this:</p>
+<pre><code>@HD VN:1.0   SO:unsorted
+ at SQ SN:22:20000000-20999999 LN:1000000
+ at PG ID:hisat            PN:hisat    VN:0.1.0
+1   0               22:20000000-20999999    4115    255 100M            *   0   0   GGAGCGCAGCGTGGGCGGCCCCGCAGCGCGGCCTCGGACCCCAGAAGGGCTTCCCCGGGTCCGTTGGCGCGCGGGGAGCGGCGTTCCCAGGGCGCGGCGC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU
+2   16              22:20000000-20999999    4197    255 100M            *   0   0   GTTCCCAGGGCGCGGCGCGGTGCGGCGCGGCGCGGGTCGCAGTCCACGCGGCCGCAACTCGGACCGGTGCGGGGGCCGCCCCCTCCCTCCAGGCCCAGCG IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU
+3   0               22:20000000-20999999    4113    255 100M            *   0   0   CTGGAGCGCAGCGTGGGCGGCCCCGCAGCGCGGCCTCGGACCCCAGAAGGGCTTCCCCGGGTCCGTTGGCGCGCGGGGAGCGGCGTTCCCAGGGCGCGGC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU
+4   0               22:20000000-20999999    52358   255 100M            *   0   0   TTCAGGGTCTGCCTTTATGCCAGTGAGGAGCAGCAGAGTCTGATACTAGGTCTAGGACCGGCCGAGGTATACCATGAACATGTGGATACACCTGAGCCCA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU
+5   16              22:20000000-20999999    52680   255 100M            *   0   0   CTTCTGGCCAGTAGGTCTTTGTTCTGGTCCAACGACAGGAGTAGGCTTGTATTTAAAAGCGGCCCCTCCTCTCCTGTGGCCACAGAACACAGGCGTGCTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU
+6   16              22:20000000-20999999    52664   255 100M            *   0   0   TCTCACCTCTCATGTGCTTCTGGCCAGTAGGTCTTTGTTCTGGTCCAACGACAGGAGTAGGCTTGTATTTAAAAGCGGCCCCTCCTCTCCTGTGGCCACA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU
+7   0               22:20000000-20999999    52468   255 100M            *   0   0   TGTACACAGGCACTCACATGGCACACACATACACTCCTGCGTGTGCACAAGCACACACATGCAAGCCATATACATGGACACCGACACAGGCACATGTACG IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU
+8   0               22:20000000-20999999    4538    255 100M            *   0   0   CGGCCCCGCACCTGCCCGAACCTCTGCGGCGGCGGTGGCAGGGTACGCGGGACCGCTCCCTCCCAGCCGACTTACGAGAACATCCCCCGACCATCCAGCC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU
+9   16              22:20000000-20999999    4667    255 50M19567N50M    *   0   0   CTTCCCCGGACTCTGGCCGCGTAGCCTCCGCCACCACTCCCAGTTCACAGACCTCGCGACCTGTGTCAGCAGAGCCGCCCTGCACCACCATGTGCATCAT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:-1 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU XS:A:+
+10  0               22:20000000-20999999    30948   255 20M9021N80M     *   0   0   CAACAACGAGATCCTCAGTGGGCTGGACATGGAGGAAGGCAAGGAAGGAGGCACATGGCTGGGCATCAGCACACGTGGCAAGCTGGCAGCACTCACCAAC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:-1 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU XS:A:+
+11  16              22:20000000-20999999    40044   255 65M8945N35M     *   0   0   TGGCAAGCTGGCAGCACTCACCAACTACCTGCAGCCGCAGCTGGACTGGCAGGCCCGAGGGCGAGGCACCTACGGGCTGAGCAACGCGCTGCTGGAGACT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:-1 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU XS:A:+</code></pre>
+<p>The first few lines (beginning with <code>@</code>) are SAM header lines, and the rest of the lines are SAM alignments, one line per read or mate. See the <a href="#sam-output">HISAT manual section on SAM output</a> and the <a href="http://samtools.sourceforge.net/SAM1.pdf">SAM specification</a> for details about how to interpret the SAM file format.</p>
+<h2 id="paired-end-example">Paired-end example</h2>
+<p>To align paired-end reads included with HISAT, stay in the same directory and run:</p>
+<pre><code>$HISAT_HOME/hisat -x 22_20-21M_hisat -1 $HISAT_HOME/example/reads/reads_1.fq -2 $HISAT_HOME/example/reads/reads_2.fq -S eg2.sam</code></pre>
+<p>This aligns a set of paired-end reads to the reference genome, with results written to the file <code>eg2.sam</code>.</p>
+<h2 id="using-samtoolsbcftools-downstream">Using SAMtools/BCFtools downstream</h2>
+<p><a href="http://samtools.sourceforge.net">SAMtools</a> is a collection of tools for manipulating and analyzing SAM and BAM alignment files. <a href="http://samtools.sourceforge.net/mpileup.shtml">BCFtools</a> is a collection of tools for calling variants and manipulating VCF and BCF files, and it is typically distributed with <a href="http://samtools.sourceforge.net">SAMtools</a>. Using these tools together allows you to get from alignments in SAM format to variant calls in VCF format [...]
+<p>Run the paired-end example:</p>
+<pre><code>$HISAT_HOME/hisat -x $HISAT_HOME/example/index/22_20-21M_hisat -1 $HISAT_HOME/example/reads/reads_1.fq -2 $HISAT_HOME/example/reads/reads_2.fq -S eg2.sam</code></pre>
+<p>Use <code>samtools view</code> to convert the SAM file into a BAM file. BAM is the binary format corresponding to the SAM text format. Run:</p>
+<pre><code>samtools view -bS eg2.sam > eg2.bam</code></pre>
+<p>Use <code>samtools sort</code> to convert the BAM file to a sorted BAM file.</p>
+<pre><code>samtools sort eg2.bam eg2.sorted</code></pre>
+<p>We now have a sorted BAM file called <code>eg2.sorted.bam</code>. Sorted BAM is a useful format because the alignments are (a) compressed, which is convenient for long-term storage, and (b) sorted, which is convenient for variant discovery. To generate variant calls in VCF format, run:</p>
+<pre><code>samtools mpileup -uf $HISAT_HOME/example/reference/22_20-21M.fa eg2.sorted.bam | bcftools view -bvcg - > eg2.raw.bcf</code></pre>
+<p>Then to view the variants, run:</p>
+<pre><code>bcftools view eg2.raw.bcf</code></pre>
+<p>See the official SAMtools guide to <a href="http://samtools.sourceforge.net/mpileup.shtml">Calling SNPs/INDELs with SAMtools/BCFtools</a> for more details and variations on this process.</p>
diff --git a/doc/manual.shtml b/doc/manual.shtml
new file mode 100644
index 0000000..fe7ae95
--- /dev/null
+++ b/doc/manual.shtml
@@ -0,0 +1,37 @@
+<!--#set var="Title" value="Centrifuge" -->
+<!--#set var="NoCrumbs" value="1" -->
+<!--#set var="SubTitle" value="Classifier for metagenomic sequences"-->
+<!--#set var="ExtraCSS" value="/software/centrifuge/add.css"-->
+<!--#include virtual="/iheader_r.shtml"-->
+<div id="mainContent">
+  <div id="main">
+  
+     <div id="rightside">
+
+ <!--  #  set var="BwtIndexes" value="1" -->
+ <!--#include virtual="sidebar.inc.shtml"-->
+          
+	</div> <!-- End of "rightside" -->
+
+	
+  <div id="leftside">
+  <h1>Table of Contents</h1>
+ <!--#include virtual="manual.inc.html"-->
+  </div>
+  </div>
+</div>
+
+<!--#include virtual="footer.inc.html"-->
+
+<!-- Google analytics code -->
+<script type="text/javascript">
+var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
+document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
+</script>
+<script type="text/javascript">
+var pageTracker = _gat._getTracker("UA-6101038-1");
+pageTracker._trackPageview();
+</script>
+
+</body>
+</html>
diff --git a/doc/sidebar.inc.shtml b/doc/sidebar.inc.shtml
new file mode 100644
index 0000000..5942da2
--- /dev/null
+++ b/doc/sidebar.inc.shtml
@@ -0,0 +1,124 @@
+<h2>Site Map</h2>
+<div class="box">
+ <ul>
+   <li><a href="index.shtml">Home</a></li>
+   <li><a href="manual.shtml">Manual</a></li>
+   <li><a href="faq.shtml">FAQ</a></li>
+ </ul>
+</div>
+
+<h2>News and Updates</h2>
+<div class="box">
+ <ul>
+   <table width="100%">
+	 <tbody><tr><td>New releases and related tools will be announced through the Bowtie
+     <a href="https://lists.sourceforge.net/lists/listinfo/bowtie-bio-announce"><b>mailing list</b></a>.</td></tr>
+   </tbody></table>
+ </ul>
+</div>
+
+<h2>Getting Help</h2>
+<div class="box">
+ <ul>
+   <table width="100%">
+     <tbody><tr><td>
+		<!-- Questions and comments about HISAT2 can be posted on the 
+		<a href="https://groups.google.com/forum/#!forum/hisat-tools-users"><b>HISAT Tools Users Google Group</b></a>. -->
+		Please use <a href="mailto:centrifuge.metagenomics at gmail.com">centrifuge.metagenomics at gmail.com</a> for 
+		private communications only. Please do not email technical questions to Centrifuge contributors directly.</td></tr>
+   </tbody></table>
+ </ul>
+</div>
+
+<a href="ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/downloads"><h2><u>Releases</u></h2></a>
+<div class="box">
+ <ul>
+   <table width="100%"><tbody><tr><td>version 1.0.2-beta</td> <td align="right">5/25/2016</td></tr>
+       <tr>
+	 <td><a href="ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/downloads/centrifuge-1.0.2-beta-source.zip" onclick="javascript: pageTracker._trackPageview('/downloads/centrifuge'); ">   Source code</a></td>
+       </tr>
+       <tr>
+	 <td><a href="ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/downloads/centrifuge-1.0.2-beta-Linux_x86_64.zip" onclick="javascript: pageTracker._trackPageview('/downloads/centrifuge'); ">   Linux x86_64 binary</a></td>
+       </tr>
+       <tr>
+	 <td><a href="ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/downloads/centrifuge-1.0.2-beta-OSX_x86_64.zip" onclick="javascript: pageTracker._trackPageview('/downloads/centrifuge'); ">   Mac OS X x86_64 binary</a></td>
+       </tr>
+   </tbody></table>
+ </ul>
+</div>
+
+<a href="ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data"><h2><u>Indexes</u></h2></a>
+  <div class="box">
+    <table width="100%">
+      <tr>
+        <td>
+	  <a href="ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/b_compressed.tar.gz"><i>Bacteria (compressed)</i></a>
+        </td>
+	<td align="right" style="font-size: x-small">
+	  <b>3.4 GB</b>
+        </td>
+      </tr>
+      <tr>
+        <td>
+	  <a href="ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/b_compressed+h+v.tar.gz"><i>Bacteria, Viruses, Human (compressed)</i></a>
+        </td>
+	<td align="right" style="font-size: x-small">
+	  <b>3.9 GB</b>
+        </td>
+      </tr>
+      <tr>
+        <td>
+	  <a href="ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/b+h+v.tar.gz"><i>Bacteria, Viruses, Human </i></a>
+        </td>
+	<td align="right" style="font-size: x-small">
+	  <b>6.3 GB</b>
+        </td>
+      </tr>
+      <tr>
+        <td>
+	  <a href="ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/nt.tar.gz"><i>NCBI nucleotide non-redundant sequences </i></a>
+        </td>
+	<td align="right" style="font-size: x-small">
+	  <b>44.2 GB</b>
+        </td>
+      </tr>
+    </table>
+  </div>
+
+<h2>Related Tools</h2>
+<div class="box">
+ <ul>
+   <!--
+   <li><a href="http://www.ccb.jhu.edu/software/hisat">HISAT</a>: Fast and sensitive spliced alignment</li>
+   <li><a href="http://bowtie-bio.sourceforge.net/bowtie2">Bowtie2</a>: Ultrafast read alignment</li>
+   <li><a href="http://www.ccb.jhu.edu/software/tophat">TopHat2</a>: Spliced read mapper for RNA-Seq</li>
+   <li><a href="http://cufflinks.cbcb.umd.edu">Cufflinks</a>: Isoform assembly and quantitation for RNA-Seq</li>
+   <li><a href="http://www.ccb.jhu.edu/software/stringtie">StringTie</a>: Transcript assembly and quantification for RNA-Seq</li>
+   -->
+ </ul>
+</div>
+
+<h2>Publications</h2>
+<div class="box">
+ <ul>
+   <li><p>Kim D, Song L, Breitwieser F, and Salzberg SL. <a href="http://biorxiv.org/content/early/2016/05/25/054965"><b>Centrifuge: rapid and sensitive classification of metagenomic sequences</b></a>. <i>bioRxiv</i> 2016</p></li>
+ </ul>
+</div>
+
+<h2>Contributors</h2>
+<div class="box">
+ <ul>
+   <li><a href="http://www.ccb.jhu.edu/people/infphilo">Daehwan Kim</a></li>
+   <li><a href="http://ccb.jhu.edu/people/lsong/">Li Song</a></li>
+   <li><a href="http://www.ccb.jhu.edu/people/fbreitwieser">Florian Breitwieser</a></li>
+   <li><a href="http://salzberg-lab.org/about-me/">Steven Salzberg</a></li>
+ </ul>
+</div>
+
+<h2>Links</h2>
+<div class="box">
+ <ul>
+   <li><a href="http://www.ccb.jhu.edu/">Center for Computational Biology at Johns Hopkins University </a></li>
+   <li><a href="http://www.cs.jhu.edu/">Computer Science Department at Johns Hopkins University </a></li>
+ </ul>
+</div>        
diff --git a/doc/strip_markdown.pl b/doc/strip_markdown.pl
new file mode 100644
index 0000000..0ecc595
--- /dev/null
+++ b/doc/strip_markdown.pl
@@ -0,0 +1,45 @@
+#!/usr/bin/perl -w
+
+##
+# strip_markdown.pl
+#
+# Used to convert MANUAL.markdown to MANUAL.  Leaves all manual content, but
+# strips away some of the clutter that makes it hard to read the markdown.
+#
+# Reads markdown from the files named on the command line (or stdin) and
+# writes the stripped plain text to stdout.
+#
+
+use strict;
+use warnings;
+
+# True when the most recently printed line was blank; used below to collapse
+# runs of blank lines left behind by the stripped-out HTML into one blank line.
+my $lastBlank = 0;
+
+while(<>) {
+	# Skip comments (HTML comment open/close markers and lines starting
+	# with '!', e.g. markdown image syntax -- presumably used only inside
+	# comments in MANUAL.markdown; verify against that file)
+	next if /^\s*<!--/;
+	next if /^\s*!/;
+	next if /^\s*-->/;
+	# Skip internal links (markdown reference definitions pointing at
+	# in-page anchors, i.e. "[label]: #anchor")
+	next if /\[.*\]: #/;
+	# Skip HTML: a lone tag indented by up to three spaces
+	next if /^\s?\s?\s?<.*>\s*$/;
+	# Skip HTML: table markup may have trailing content after the tag,
+	# so it is matched separately from the lone-tag pattern above
+	next if /^\s*<table/;
+	next if /^\s*<\/td/;
+	next if /^\s*<.*>\s*$/;
+	# Strip [`...`] down to `...` (drop the link brackets, keep the code span)
+	s/\[`/`/g;
+	s/`\]/`/g;
+	# Strip [#...]
+	#s/\[#[^\]]*\]//g;
+	# Strip (#...) -- the target half of an internal markdown link
+	s/\(#[^\)]*\)//g;
+	# Turn hashes into spaces
+	#s/^####/   /;
+	#s/^###/ /;
+	# Collapse consecutive blank lines: print at most one blank line in a row
+	if(/^\s*$/) {
+		next if $lastBlank;
+		$lastBlank = 1;
+	} else {
+		$lastBlank = 0;
+	}
+	print $_;
+}
diff --git a/doc/style.css b/doc/style.css
new file mode 100644
index 0000000..b4014e1
--- /dev/null
+++ b/doc/style.css
@@ -0,0 +1,306 @@
+/* 
+Stylesheet for the free sNews15_1 template
+from http://www.free-css-templates.com
+*/
+
+/* Reset all margins and paddings for browsers */
+* { 
+	padding: 0;
+	margin: 0;
+}
+
+body { 
+	font: .8em Verdana, Arial, Sans-Serif; 
+	line-height: 1.6em; 
+	margin: 0;
+	/* background-image: url(../images/bg.jpg); */
+	/* background-repeat: repeat */
+}
+
+#wrap {	margin: 0 auto;	width: 95% }
+
+/* TOP HEADER -------- */
+#top {
+	margin: 0 auto;
+	padding: 0;
+	background:#1E6BAC url(../images/ccbstrip.jpg) repeat-x top;
+	height: 141px;
+}
+#top h1 { padding: 10px 0 0 25px; color: #FFF; font-size: 240%; background: transparent;}
+#top h2 { padding: 0px 0 0 25px; color: #bbb; font-size: 100%; background: transparent;}
+#top .padding { padding-top: 5px; }
+/*
+#top .lefts { 
+	background: transparent url(../images/topl.jpg) no-repeat left; 
+	height: 81px; 
+}
+#top .rights {
+	background: transparent url(../images/topr.jpg) no-repeat right;
+	float: right;
+	height: 81px;
+	width: 18px;
+}
+*/
+/* SEARCH BOX AND BUTTON ----------*/
+#search { float: right;  padding: 10px 25px 0 0;  }
+
+#search input.text { 
+	border: 1px solid #eee;
+	display: inline;
+	margin-top: 5px;
+	width: 120px;
+	height: 12px;
+	font-size: 10px;
+ }
+ #search input.searchbutton {
+	border: 0;
+	background: transparent;
+	color: #FFF;
+	cursor: pointer;
+	font: bold 0.8em Arial, Arial, Sans-Serif
+ }
+
+#subheader { 
+	clear: both; 
+	border-top: 1px dotted #888;	
+	border-bottom: 1px dotted #888;
+	background: #eaeaea;
+	color: #505050;
+	padding: 1em;
+	margin: 15px 0px 10px 0px;
+	
+}
+#subheader a { text-decoration: none; /* border-bottom: 1px dashed #0066B3; */ } 
+ 
+ 
+/* TOP MENU ---------- */
+#topmenu {  	margin: 0px 8px 0 8px; 
+			padding: 0;
+			background: url(../images/menu.jpg) repeat-x top;
+			height: 30px;
+			
+}
+#topmenu .lefts { 
+	background: url(../images/menul.jpg) no-repeat left; 
+	height: 30px; 
+	padding-left: 0px;
+}
+#topmenu .rights {
+	background: url(../images/menur.jpg) no-repeat right;
+	float: right;
+	height: 30px;
+	width: 8px;
+}
+#topmenu li a { 
+	color: #FFF;
+	text-align: left;
+	padding-left: 10px;
+	padding-right: 15px;
+	text-decoration: none;
+	background: transparent;
+	font-weight: bold
+} 
+#topmenu li { padding: 0px;
+	float: left;
+	margin: 0;
+	font-size: 11px;
+	line-height: 30px;
+	white-space: nowrap;
+	/* list-style-type: none; */
+	width: auto;
+	background: url(../images/sep.gif) no-repeat top right
+	
+}
+
+#main { background: #FFF; margin: 25px 0 15px 0; color: #666; }
+
+#main #rightside {
+	width: 300px;
+	float: right;
+	background: #FFF;
+	margin-right: 0px;
+	color: #555;
+	
+} 
+
+#main #rightside .box {
+	background: #efefef;
+	margin-bottom: 10px;
+	padding: 5px;
+	color: #555;
+}
+
+#main #rightside h2 {
+	font: bold 1.0em Arial, Arial, Sans-Serif; 
+    background: #CDCDCD url(../images/greyc.gif) no-repeat top right;
+	height: 18px;
+	padding: 3px;
+	color: #666;
+}
+
+/* LEFT SIDE - ARTICLES AREA -------- */
+#leftside {
+	padding-left: 8px;
+	color: #555;
+	background: #FFF;
+	margin-right: 255px;
+	margin-left: 0px;
+	
+}
+
+#manual {
+	margin-right: 305px;
+	margin-left: 0px;
+	width: auto;
+}
+
+#leftside h1 { padding: 15px 0 10px 0 }
+#leftside h2 { padding: 15px 0 10px 0; color: #555; text-indent: 17px; background: #FFF url(../images/head.gif) no-repeat left; }
+#leftside h3 { padding: 15px 0 10px 0; font-size: 100%; margin-left: 5px; text-indent: 17px; background: #FFF url(../images/head.gif) no-repeat left; }
+#leftside ul { margin-left: 24px; padding-left: 24px; list-style-type: circle }
+#leftside li { }
+#leftside p { padding: 0px 0 10px 0 }
+
+#footer {
+	clear: both;
+	background: #FFF url(../images/footer.jpg) repeat-x;
+	height: 46px;
+	margin-left: 0px;
+	margin-right: 0px;
+	font-size: 75%;
+	color: #666;
+}
+#footer p  { padding: 5px }
+#footer .rside { float: right; display: inline; padding: 5px; text-align: right}
+
+#toc ol { list-style: roman }
+
+a { color: #0066B3; background: inherit; text-decoration: none }
+h1 { font: bold 1.9em Arial, Arial, Sans-Serif }
+h2 { font: bold 1.2em Arial, Arial, Sans-Serif; padding: 0; margin: 0 }
+ul { padding: 0; margin: 0; list-style-type: none }
+li {  }
+ol { margin-left: 24px;
+     padding-left: 24px;
+     list-style: decimal }
+/* blockquote { margin-left: 35px; font-family: "Courier New", Courier, monospace; } */
+blockquote { margin-left: 35px; font-family: "Courier New", Courier; }
+tt { font-family: "Courier New", Courier, monospace; }
+.date { border-top: 1px solid #e5e5e5; text-align: right; margin-bottom: 25px; margin-top: 5px;}
+#main #leftside .date a, #main #rightside a { border: 0; text-decoration: none; }
+ 
+.comment .date { text-align: left; border: 0;}	
+
+
+#breadcrumbs { 
+	float: left;
+	padding-left: 8px;
+	padding-top: 0px;
+	font: bold .8em Arial, Arial, Sans-Serif; 
+	color: #666;
+	width: 100%;
+	height: 25px;
+	margin-top: 10px;
+	margin-bottom: 10px;
+	clear: both;
+}
+
+
+
+#leftside #txt {width: 100%; height: 10em; padding: 3px 3px 3px 6px; margin-left:0em;}
+#leftside textarea { border: 1px solid #bbb; width: 100%;  }
+
+
+/* SNEWS */
+#main #leftside fieldset { float: left; width: 100%; border: 1px solid #ccc; padding: 10px 8px; margin: 0 10px 8px 0; background: #FFF; color: #000; }
+#main #leftside fieldset p { width: 100%; }
+#main input { padding: 3px; margin: 0; border: 1px solid #bbb }
+/*p { margin-top: 5px; }*/
+p { margin-top: 10px; }
+/*input.search { border: 1px solid #ccc; padding: 4px; width: 160px; }*/
+.comment { background: #FFF; color: #808080; padding: 10px; margin: 0 0 10px 0; border-top: 1px solid #ccc; }
+.commentsbox { background: #FFF; color: #808080; padding: 10px; margin: 0 0 10px 0; border-top: 1px solid #ccc; }
+
+
+#box-table-a
+{
+	font: .8em Verdana, Arial, Sans-Serif;
+	/*font-size: 12px;*/
+	margin: 45px;
+	width: 600px;
+	text-align: left;
+	border-collapse: collapse;
+}
+#box-table-a th
+{
+	font-size: 13px;
+	font-weight: normal;
+	padding: 8px;
+	background: #b9c9fe;
+	border-top: 4px solid #aabcfe;
+	border-bottom: 1px solid #fff;
+	color: #039;
+}
+#box-table-a td
+{
+	padding: 8px;
+	background: #e8edff; 
+	border-bottom: 2px solid #fff;
+	color: #669;
+	border-top: 2px solid transparent;
+}
+#box-table-a tr:hover td
+{
+	background: #d0dafd;
+	color: #339;
+}
+
+
+#box-table-b
+{
+	font: .8em Verdana, Arial, Sans-Serif;
+	/*font-size: 12px;*/
+	margin: 45px;
+	width: 480px;
+	text-align: center;
+	border-collapse: collapse;
+	border-top: 7px solid #9baff1;
+	border-bottom: 7px solid #9baff1;
+}
+#box-table-b th
+{
+	font-size: 13px;
+	font-weight: normal;
+	padding: 8px;
+	background: #e8edff;
+	border-right: 1px solid #9baff1;
+	border-left: 1px solid #9baff1;
+	color: #039;
+}
+#box-table-b td
+{
+	padding: 8px;
+	background: #e8edff; 
+	border-right: 1px solid #aabcfe;
+	border-left: 1px solid #aabcfe;
+	color: #669;
+}
+
+#manual h1  { margin: 0 15px 10px 15px; padding: 10px 0 10px 0; font: bold 1.9em Arial, Arial, Sans-Serif }
+#manual h2  { margin: 0 15px 10px 15px; padding: 10px 0 10px 0; font: bold 1.2em Arial, Arial, Sans-Serif }
+#manual h3  { margin: 0 15px 10px 20px; padding: 10px 0 10px 0; font: 1.2em Arial, Arial, Sans-Serif }
+#manual h4  { margin: 0 15px 10px 25px; padding: 10px 0 10px 0; font: 1.1em Arial, Arial, Sans-Serif }
+#manual p   { margin: 0 15px 10px 15px; color: #444 }
+#manual table { margin-top: 15px }
+#manual ul  { margin: 0 15px 10px 15px; padding: 0; margin: 0 }
+#manual pre { margin: 0 15px 15px 25px }
+#manual li  { margin: 0 15px 1px 15px; color: #444 }
+#manual ol  { margin-left: 24px; padding-left: 24px; list-style: decimal }
+#manual td  { vertical-align: top; }
+#manual blockquote { margin-left: 35px; font-family: "Courier New", Courier; }
+#manual tt { font-size: .8em; font-family: "Courier New", Courier; }
+#manual code { font-size: .8em; font-family: "Courier New", Courier; }
+#manual .date { border-top: 1px solid #e5e5e5; text-align: right; margin-bottom: 25px; margin-top: 5px;}
+#manual .date a, #main #rightside a { border: 0; text-decoration: none; }
+#manual .date a, #main #rightside a { border: 0; text-decoration: none; }
+#manual td { vertical-align: top; }
diff --git a/dp_framer.cpp b/dp_framer.cpp
new file mode 100644
index 0000000..d7e359a
--- /dev/null
+++ b/dp_framer.cpp
@@ -0,0 +1,910 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "dp_framer.h"
+
+using namespace std;
+
+/**
+ * Set up variables that describe the shape of a dynamic programming matrix to
+ * be filled in.  The matrix is built around the diagonal containing the seed
+ * hit: the "seed diagonal".  The N diagonals to the right of the seed diagonal
+ * are the "RHS gap" diagonals, where N is the maximum number of read or
+ * reference gaps permitted (whichever is larger).  The N diagonals to the left
+ * of the seed diagonal are the "LHS gap" diagonals.
+ *
+ * The way the rectangle is currently formulated, there are another N diagonals
+ * to the left of the "LHS gap" diagonals called the "LHS extra diagonals".  It
+ * might also be possible to split the "extra diagonals" into two subsets and
+ * place them both to the left of the LHS gap diagonals and to the right of the
+ * RHS gap diagonals.
+ *
+ * The purpose of arranging these groupings of diagonals is that a subset
+ * of them, the "core diagonals", can now be considered "covered."  By
+ * "covered" I mean that any alignment that overlaps a cell in any of the core
+ * diagonals cannot possibly overlap another, higher-scoring alignment that
+ * falls partially outside the rectangle.
+ *
+ * Say the read is 5 characters long, the maximum number of read or ref gaps is
+ * 2, and the seed hit puts the main diagonal at offset 10 in the reference.
+ * The larger rectangle explored looks like this:
+ *
+ *  off=10, maxgap=2
+ *
+ * Ref      1
+ * off: 67890123456   0: seed diagonal
+ *      **OO0oo++----   o: "RHS gap" diagonals
+ *      -**OO0oo++---   O: "LHS gap" diagonals
+ *      --**OO0oo++--   *: "LHS extra" diagonals
+ *      ---**OO0oo++-   +: "RHS extra" diagonals
+ *      ----**OO0oo++   -: cells that can't possibly be involved in a valid    
+ *                         alignment that overlaps one of the core diagonals
+ *
+ * The "core diagonals" are marked with 0's, O's or o's.
+ *
+ * A caveat is that, for performance reasons, we place an upper limit on N -
+ * the maximum number of read or reference gaps.  It is constrained to be no
+ * greater than 'maxgap'.  This means that in some situations, we may report an
+ * alignment that spuriously trumps a better alignment that falls partially
+ * outside the rectangle.  Also, we may fail to find a valid alignment with
+ * more than 'maxgap' gaps.
+ *
+ * Another issue is trimming: if the seed hit is sufficiently close to one or
+ * both ends of the reference sequence, and either (a) overhang is not
+ * permitted, or (b) the number of Ns permitted is less than the number of
+ * columns that overhang the reference, then we want to exclude the trimmed
+ * columns from the rectangle.
+ *
+ * We need to return enough information so that downstream routines can fully
+ * understand the shape of the rectangle, which diagonals are which (esp. which
+ * are the "core" diagonals, since we needn't examine any more seed hits from
+ * those columns in the future), and how the rectangle is trimmed.  The
+ * information returned should be compatible with the sort of information
+ * returned by the routines that set up rectangles for mate finding.
+ */
+bool DynProgFramer::frameSeedExtensionRect(
+	int64_t  off,      // ref offset implied by seed hit assuming no gaps
+	size_t   rdlen,    // length of read sequence used in DP table (so len
+	                   // of +1 nucleotide sequence for colorspace reads)
+	int64_t  reflen,   // length of reference sequence aligned to
+	size_t   maxrdgap, // max # of read gaps permitted in opp mate alignment
+	size_t   maxrfgap, // max # of ref gaps permitted in opp mate alignment
+	int64_t  maxns,    // # Ns permitted
+	size_t   maxhalf,  // max width in either direction
+	DPRect&  rect)     // out: DP rectangle
+{
+	assert_gt(rdlen, 0);
+	assert_gt(reflen, 0);
+	// Set N, the maximum number of reference or read gaps permitted, whichever
+	// is larger.  Also, enforce ceiling: can't be larger than 'maxhalf'.
+	size_t maxgap = max(maxrdgap, maxrfgap);
+	maxgap = min(maxgap, maxhalf);
+	// Leave room for "LHS gap" and "LHS extra" diagonals
+	int64_t refl = off - 2 * maxgap;               // inclusive
+	// Leave room for "RHS gap" and "RHS extra" diagonals
+	int64_t refr = off + (rdlen - 1) + 2 * maxgap; // inclusive
+	size_t triml = 0, trimr = 0;
+	// Check if we have to trim to fit the extents of the reference
+	if(trimToRef_) {
+		maxns = 0; // no leeway
+	} else if(maxns == (int64_t)rdlen) {
+		// NOTE(review): presumably an alignment consisting entirely of Ns
+		// can never be valid, so the effective N budget is capped at
+		// rdlen-1 — confirm the rationale.
+		maxns--;
+	}
+	// Trim from RHS of rectangle
+	// Columns at offset reflen + maxns or beyond cannot take part in a
+	// valid alignment; trimr counts how many columns are dropped.
+	if(refr >= reflen + maxns) {
+		trimr = (size_t)(refr - (reflen + maxns - 1));
+	}
+	// Trim from LHS of rectangle
+	// Symmetrically, columns left of offset -maxns are dropped into triml.
+	if(refl < -maxns) {
+		triml = (size_t)(-refl) - (size_t)maxns;
+	}
+	// Record both the pre-trim extents (the rectangle as originally framed)
+	// and the post-trim extents that will actually be filled in.
+	rect.refl_pretrim = refl;
+	rect.refr_pretrim = refr;
+	rect.refl  = refl + triml;
+	rect.refr  = refr - trimr;
+	rect.triml = triml;
+	rect.trimr = trimr;
+	rect.maxgap = maxgap;
+	// Remember which diagonals are "core" as offsets from the LHS of the
+	// untrimmed rectangle
+	rect.corel = maxgap;
+	rect.corer = rect.corel + 2 * maxgap; // inclusive
+	assert(rect.repOk());
+	// False when trimming consumed the entire rectangle (nothing to fill)
+	return !rect.entirelyTrimmed();
+}
+
+/**
+ * Set up variables that describe the shape of a dynamic programming matrix to
+ * be filled in.  The matrix is built around the diagonals that terminate in
+ * the range of columns where the RHS of the opposite mate must fall in order
+ * to satisfy the fragment-length constraint.  These are the "mate" diagonals
+ * and they also happen to be the "core" diagonals in this case.
+ *
+ * The N diagonals to the right of the mate diagonals are the "RHS gap"
+ * diagonals, where N is the maximum number of read or reference gaps permitted
+ * (whichever is larger).  The N diagonals to the left of the mate diagonals
+ * are the "LHS gap" diagonals.
+ *
+ * The purpose of arranging these groupings of diagonals is that a subset
+ * of them, the "core diagonals", can now be considered "covered."  By
+ * "covered" I mean that any alignment that overlaps a cell in any of the core
+ * diagonals cannot possibly overlap another, higher-scoring alignment that
+ * falls partially outside the rectangle.
+ *
+ *   |Anchor| 
+ *   o---------OO0000000000000oo------  0: mate diagonal (also core diags!)
+ *   -o---------OO0000000000000oo-----  o: "RHS gap" diagonals
+ *   --o---------OO0000000000000oo----  O: "LHS gap" diagonals
+ *   ---oo--------OO0000000000000oo---  *: "LHS extra" diagonals
+ *   -----o--------OO0000000000000oo--  -: cells that can't possibly be
+ *   ------o--------OO0000000000000oo-     involved in a valid alignment that
+ *   -------o--------OO0000000000000oo     overlaps one of the core diagonals
+ *                     XXXXXXXXXXXXX
+ *                     | RHS Range |
+ *                     ^           ^
+ *                     rl          rr
+ *
+ * The "core diagonals" are marked with 0s.
+ *
+ * A caveat is that, for performance reasons, we place an upper limit on N -
+ * the maximum number of read or reference gaps.  It is constrained to be no
+ * greater than 'maxgap'.  This means that in some situations, we may report an
+ * alignment that spuriously trumps a better alignment that falls partially
+ * outside the rectangle.  Also, we may fail to find a valid alignment with
+ * more than 'maxgap' gaps.
+ *
+ * Another issue is trimming: if the seed hit is sufficiently close to one or
+ * both ends of the reference sequence, and either (a) overhang is not
+ * permitted, or (b) the number of Ns permitted is less than the number of
+ * columns that overhang the reference, then we want to exclude the trimmed
+ * columns from the rectangle.
+ */
+bool DynProgFramer::frameFindMateAnchorLeftRect(
+	int64_t ll,       // leftmost Watson off for LHS of opp alignment
+	int64_t lr,       // rightmost Watson off for LHS of opp alignment
+	int64_t rl,       // leftmost Watson off for RHS of opp alignment
+	int64_t rr,       // rightmost Watson off for RHS of opp alignment
+	size_t  rdlen,    // length of opposite mate
+	int64_t reflen,   // length of reference sequence aligned to
+	size_t  maxrdgap, // max # of read gaps permitted in opp mate alignment
+	size_t  maxrfgap, // max # of ref gaps permitted in opp mate alignment
+	int64_t maxns,    // max # ns permitted in the alignment
+	size_t  maxhalf,  // max width in either direction
+	DPRect& rect)     // out: DP rectangle
+	const
+{
+	assert_geq(lr, ll);  // LHS rightmost must be >= LHS leftmost
+	assert_geq(rr, rl);  // RHS rightmost must be >= RHS leftmost
+	assert_geq(rr, lr);  // RHS rightmost must be >= LHS rightmost
+	assert_geq(rl, ll);  // RHS leftmost must be >= LHS leftmost
+	assert_gt(rdlen, 0);
+	assert_gt(reflen, 0);
+	size_t triml = 0, trimr = 0;
+	size_t maxgap = max(maxrdgap, maxrfgap);
+	// BUGFIX: 'maxhalf' is documented as "max width in either direction",
+	// i.e. a ceiling on the DP half-width, and the sibling
+	// frameSeedExtensionRect clamps with min().  The previous max() here
+	// made the padding at least 'maxhalf' regardless of the gap limits,
+	// inflating the rectangle instead of capping it.
+	maxgap = min(maxgap, maxhalf);
+	// Amount of padding we have to add to account for the fact that alignments
+	// ending between en_left/en_right might start in various columns in the
+	// first row
+	int64_t pad_left = maxgap;
+	int64_t pad_right = maxgap;
+	// The mate's rightmost char must land in [rl, rr]; its leftmost char
+	// therefore falls rdlen-1 columns further left.
+	int64_t en_left  = rl;
+	int64_t en_right = rr;
+	int64_t st_left  = en_left - (rdlen-1);
+	ASSERT_ONLY(int64_t st_right = en_right - (rdlen-1));
+	int64_t en_right_pad = en_right + pad_right;
+	ASSERT_ONLY(int64_t en_left_pad  = en_left  - pad_left);
+	ASSERT_ONLY(int64_t st_right_pad = st_right + pad_right);
+	int64_t st_left_pad  = st_left  - pad_left;
+	assert_leq(st_left, en_left);
+	assert_geq(en_right, st_right);
+	assert_leq(st_left_pad, en_left_pad);
+	assert_geq(en_right_pad, st_right_pad);
+	// Rectangle extents before any reference trimming
+	int64_t refl = st_left_pad;
+	int64_t refr = en_right_pad;
+	if(trimToRef_) {
+		maxns = 0; // no overhang allowed at all
+	} else if(maxns == (int64_t)rdlen) {
+		maxns--; // cap N budget below the full mate length
+	}
+	// Trim from the RHS of the rectangle?
+	if(refr >= reflen + maxns) {
+		trimr = (size_t)(refr - (reflen + maxns - 1));
+	}
+	// Trim from the LHS of the rectangle?
+	if(refl < -maxns) {
+		triml = (size_t)(-refl) - (size_t)maxns;
+	}
+	size_t width = (size_t)(refr - refl + 1);
+	// Record pre-trim and post-trim extents plus the trim amounts
+	rect.refl_pretrim = refl;
+	rect.refr_pretrim = refr;
+	rect.refl  = refl + triml;
+	rect.refr  = refr - trimr;
+	rect.triml = triml;
+	rect.trimr = trimr;
+	rect.maxgap = maxgap;
+	// Core (covered) diagonals sit between the LHS and RHS pads
+	rect.corel = maxgap;
+	rect.corer = width - maxgap - 1; // inclusive
+	assert(rect.repOk());
+	return !rect.entirelyTrimmed();
+}
+
+/**
+ * Set up variables that describe the shape of a dynamic programming matrix to
+ * be filled in.  The matrix is built around the diagonals that begin in the
+ * range of columns where the LHS of the opposite mate must fall in order to
+ * satisfy the fragment-length constraint.  These are the "mate" diagonals and
+ * they also happen to be the "core" diagonals in this case.
+ *
+ * The N diagonals to the right of the mate diagonals are the "RHS gap"
+ * diagonals, where N is the maximum number of read or reference gaps permitted
+ * (whichever is larger).  The N diagonals to the left of the mate diagonals
+ * are the "LHS gap" diagonals.
+ *
+ * The purpose of arranging these groupings of diagonals is that a subset
+ * of them, the "core diagonals", can now be considered "covered."  By
+ * "covered" I mean that any alignment that overlaps a cell in any of the core
+ * diagonals cannot possibly overlap another, higher-scoring alignment that
+ * falls partially outside the rectangle.
+ *
+ *    ll          lr
+ *    v           v
+ *    | LHS Range |
+ *    XXXXXXXXXXXXX          |Anchor|
+ *  OO0000000000000oo--------o--------  0: mate diagonal (also core diags!)
+ *  -OO0000000000000oo--------o-------  o: "RHS gap" diagonals
+ *  --OO0000000000000oo--------o------  O: "LHS gap" diagonals
+ *  ---OO0000000000000oo--------oo----  *: "LHS extra" diagonals
+ *  ----OO0000000000000oo---------o---  -: cells that can't possibly be
+ *  -----OO0000000000000oo---------o--     involved in a valid alignment that
+ *  ------OO0000000000000oo---------o-     overlaps one of the core diagonals
+ *
+ * The "core diagonals" are marked with 0s.
+ *
+ * A caveat is that, for performance reasons, we place an upper limit on N -
+ * the maximum number of read or reference gaps.  It is constrained to be no
+ * greater than 'maxgap'.  This means that in some situations, we may report an
+ * alignment that spuriously trumps a better alignment that falls partially
+ * outside the rectangle.  Also, we may fail to find a valid alignment with
+ * more than 'maxgap' gaps.
+ *
+ * Another issue is trimming: if the seed hit is sufficiently close to one or
+ * both ends of the reference sequence, and either (a) overhang is not
+ * permitted, or (b) the number of Ns permitted is less than the number of
+ * columns that overhang the reference, then we want to exclude the trimmed
+ * columns from the rectangle.
+ */
+bool DynProgFramer::frameFindMateAnchorRightRect(
+	int64_t ll,       // leftmost Watson off for LHS of opp alignment
+	int64_t lr,       // rightmost Watson off for LHS of opp alignment
+	int64_t rl,       // leftmost Watson off for RHS of opp alignment
+	int64_t rr,       // rightmost Watson off for RHS of opp alignment
+	size_t rdlen,     // length of opposite mate
+	int64_t reflen,   // length of reference sequence aligned to
+	size_t maxrdgap,  // max # of read gaps permitted in opp mate alignment
+	size_t maxrfgap,  // max # of ref gaps permitted in opp mate alignment
+	int64_t maxns,    // max # ns permitted in the alignment
+	size_t maxhalf,   // max width in either direction
+	DPRect& rect)     // out: DP rectangle
+	const
+{
+	assert_geq(lr, ll);
+	assert_geq(rr, rl);
+	assert_geq(rr, lr);
+	assert_geq(rl, ll);
+	assert_gt(rdlen, 0);
+	assert_gt(reflen, 0);
+	size_t triml = 0, trimr = 0;
+	size_t maxgap = max(maxrdgap, maxrfgap);
+	// BUGFIX: 'maxhalf' is documented as "max width in either direction",
+	// i.e. a ceiling on the DP half-width, and the sibling
+	// frameSeedExtensionRect clamps with min().  The previous max() here
+	// made the padding at least 'maxhalf' regardless of the gap limits,
+	// inflating the rectangle instead of capping it.
+	maxgap = min(maxgap, maxhalf);
+	// Padding on either side to admit alignments with up to maxgap gaps
+	int64_t pad_left = maxgap;
+	int64_t pad_right = maxgap;
+	// The mate's leftmost char must land in [ll, lr]; its rightmost char
+	// therefore falls rdlen-1 columns further right.
+	int64_t st_left = ll;
+	int64_t st_right = lr;
+	ASSERT_ONLY(int64_t en_left = st_left + (rdlen-1));
+	int64_t en_right = st_right + (rdlen-1);
+	int64_t en_right_pad = en_right + pad_right;
+	ASSERT_ONLY(int64_t en_left_pad  = en_left  - pad_left);
+	ASSERT_ONLY(int64_t st_right_pad = st_right + pad_right);
+	int64_t st_left_pad  = st_left  - pad_left;
+	assert_leq(st_left, en_left);
+	assert_geq(en_right, st_right);
+	assert_leq(st_left_pad, en_left_pad);
+	assert_geq(en_right_pad, st_right_pad);
+	// We have enough info to deduce where the boundaries of our rectangle
+	// should be.  Finalize the boundaries, ignoring reference trimming for now
+	int64_t refl = st_left_pad;
+	int64_t refr = en_right_pad;
+	if(trimToRef_) {
+		maxns = 0; // no overhang allowed at all
+	} else if(maxns == (int64_t)rdlen) {
+		maxns--; // cap N budget below the full mate length
+	}
+	// Trim from the RHS of the rectangle?
+	if(refr >= reflen + maxns) {
+		trimr = (size_t)(refr - (reflen + maxns - 1));
+	}
+	// Trim from the LHS of the rectangle?
+	if(refl < -maxns) {
+		triml = (size_t)(-refl) - (size_t)maxns;
+	}
+	size_t width = (size_t)(refr - refl + 1);
+	// Record pre-trim and post-trim extents plus the trim amounts
+	rect.refl_pretrim = refl;
+	rect.refr_pretrim = refr;
+	rect.refl  = refl + triml;
+	rect.refr  = refr - trimr;
+	rect.triml = triml;
+	rect.trimr = trimr;
+	rect.maxgap = maxgap;
+	// Core (covered) diagonals sit between the LHS and RHS pads
+	rect.corel = maxgap;
+	rect.corer = width - maxgap - 1; // inclusive
+	assert(rect.repOk());
+	return !rect.entirelyTrimmed();
+}
+
+#ifdef MAIN_DP_FRAMER
+
+#include <iostream>
+
+// Drive one FindMateAnchorLeft test case and compare the framer's outputs
+// against the expected values supplied by the caller.
+static void testCaseFindMateAnchorLeft(
+	const char *testName,
+	bool trimToRef,
+	int64_t ll,
+	int64_t lr,
+	int64_t rl,
+	int64_t rr,
+	size_t rdlen,
+	size_t reflen,
+	size_t maxrdgap,
+	size_t maxrfgap,
+	size_t ex_width,
+	size_t ex_solwidth,
+	size_t ex_trimup,
+	size_t ex_trimdn,
+	int64_t ex_refl,
+	int64_t ex_refr,
+	const char *ex_st,    // string of '0'/'1' chars
+	const char *ex_en)    // string of '0'/'1' chars
+{
+	cerr << testName << "...";
+	DynProgFramer fr(trimToRef);
+	size_t width, solwidth;
+	int64_t refl, refr;
+	EList<bool> st, en;
+	size_t trimup, trimdn;
+	size_t maxhalf = 500;
+	size_t maxgaps = 0;
+	// NOTE(review): this MAIN_DP_FRAMER harness has bit-rotted and will not
+	// compile if the #ifdef is enabled:
+	//   - 'maxns' is passed below but never declared in this function;
+	//   - no frameFindMateAnchorLeft() member with this signature exists in
+	//     this file (the current API is frameFindMateAnchorLeftRect(), which
+	//     fills a DPRect instead of width/st/en out-params);
+	//   - 'solwidth' is asserted against ex_solwidth but never assigned.
+	// TODO: port the harness to the *Rect API or remove it.
+	fr.frameFindMateAnchorLeft(
+		ll,       // leftmost Watson off for LHS of opp alignment
+		lr,       // rightmost Watson off for LHS of opp alignment
+		rl,       // leftmost Watson off for RHS of opp alignment
+		rr,       // rightmost Watson off for RHS of opp alignment
+		rdlen,    // length of opposite mate
+		reflen,   // length of reference sequence aligned to
+		maxrdgap, // max # of read gaps permitted in opp mate alignment
+		maxrfgap, // max # of ref gaps permitted in opp mate alignment
+		maxns,    // max # Ns permitted
+		maxhalf,  // max width in either direction
+		width,    // out: calculated width stored here
+		maxgaps,  // out: max # gaps
+		trimup,   // out: number of bases trimmed from upstream end
+		trimdn,   // out: number of bases trimmed from downstream end
+		refl,     // out: ref pos of upper LHS of parallelogram
+		refr,     // out: ref pos of lower RHS of parallelogram
+		st,       // out: legal starting columns stored here
+		en);      // out: legal ending columns stored here
+	assert_eq(ex_width, width);
+	assert_eq(ex_solwidth, solwidth);
+	assert_eq(ex_trimup, trimup);
+	assert_eq(ex_trimdn, trimdn);
+	assert_eq(ex_refl, refl);
+	assert_eq(ex_refr, refr);
+	// Compare the legal start/end column flags position by position
+	for(size_t i = 0; i < width; i++) {
+		assert_eq((ex_st[i] == '1'), st[i]);
+		assert_eq((ex_en[i] == '1'), en[i]);
+	}
+	cerr << "PASSED" << endl;
+}
+
+// Drive one FindMateAnchorRight test case and compare the framer's outputs
+// against the expected values supplied by the caller.
+static void testCaseFindMateAnchorRight(
+	const char *testName,
+	bool trimToRef,
+	int64_t ll,
+	int64_t lr,
+	int64_t rl,
+	int64_t rr,
+	size_t rdlen,
+	size_t reflen,
+	size_t maxrdgap,
+	size_t maxrfgap,
+	size_t ex_width,
+	size_t ex_solwidth,
+	size_t ex_trimup,
+	size_t ex_trimdn,
+	int64_t ex_refl,
+	int64_t ex_refr,
+	const char *ex_st,    // string of '0'/'1' chars
+	const char *ex_en)    // string of '0'/'1' chars
+{
+	cerr << testName << "...";
+	DynProgFramer fr(trimToRef);
+	size_t width, solwidth;
+	size_t maxgaps;
+	int64_t refl, refr;
+	EList<bool> st, en;
+	size_t trimup, trimdn;
+	size_t maxhalf = 500;
+	// NOTE(review): same bit-rot as testCaseFindMateAnchorLeft — 'maxns' is
+	// never declared, no frameFindMateAnchorRight() member with this
+	// signature exists in this file (current API is
+	// frameFindMateAnchorRightRect() taking a DPRect), and 'solwidth' /
+	// 'ex_solwidth' are declared but never checked here.
+	// TODO: port the harness to the *Rect API or remove it.
+	fr.frameFindMateAnchorRight(
+		ll,       // leftmost Watson off for LHS of opp alignment
+		lr,       // rightmost Watson off for LHS of opp alignment
+		rl,       // leftmost Watson off for RHS of opp alignment
+		rr,       // rightmost Watson off for RHS of opp alignment
+		rdlen,    // length of opposite mate
+		reflen,   // length of reference sequence aligned to
+		maxrdgap, // max # of read gaps permitted in opp mate alignment
+		maxrfgap, // max # of ref gaps permitted in opp mate alignment
+		maxns,    // max # Ns permitted
+		maxhalf,  // max width in either direction
+		width,    // out: calculated width stored here
+		maxgaps,  // out: calculated max # gaps
+		trimup,   // out: number of bases trimmed from upstream end
+		trimdn,   // out: number of bases trimmed from downstream end
+		refl,     // out: ref pos of upper LHS of parallelogram
+		refr,     // out: ref pos of lower RHS of parallelogram
+		st,       // out: legal starting columns stored here
+		en);      // out: legal ending columns stored here
+	assert_eq(ex_width, width);
+	assert_eq(ex_trimup, trimup);
+	assert_eq(ex_trimdn, trimdn);
+	assert_eq(ex_refl, refl);
+	assert_eq(ex_refr, refr);
+	// Compare the legal start/end column flags position by position
+	for(size_t i = 0; i < width; i++) {
+		assert_eq((ex_st[i] == '1'), st[i]);
+		assert_eq((ex_en[i] == '1'), en[i]);
+	}
+	cerr << "PASSED" << endl;
+}
+
+int main(void) {
+	
+	///////////////////////////
+	//
+	// ANCHOR ON THE LEFT
+	//
+	///////////////////////////
+
+	//    -------------
+	//       o     o
+	//        o     o
+	//         o     o
+	//          o     o
+	//        <<<------->>>
+	// 012345678901234567890
+	// 0         1         2
+	testCaseFindMateAnchorLeft(
+		"FindMateAnchorLeft1",
+		false,            // trim to reference
+		3,                // left offset of upper parallelogram extent
+		15,               // right offset of upper parallelogram extent
+		10,               // left offset of lower parallelogram extent
+		16,               // right offset of lower parallelogram extent
+		5,                // length of opposite mate
+		30,               // length of reference sequence aligned to
+		3,                // max # of read gaps permitted in opp mate alignment
+		3,                // max # of ref gaps permitted in opp mate alignment
+		13,               // expected width
+		0,                // expected # bases trimmed from upstream end
+		0,                // expected # bases trimmed from downstream end
+		3,                // ref offset of upstream column
+		19,               // ref offset of downstream column
+		"1111111111111",  // expected starting bools
+		"0001111111000"); // expected ending bools
+
+	//        *******
+	//     <<===-----
+	//       o    o
+	//        o    o
+	//         o    o
+	//          o    o
+	//         <<=----->>
+	//            *******
+	// 012345678901234567890
+	// 0         1         2
+	testCaseFindMateAnchorLeft(
+		"FindMateAnchorLeft2",
+		false,            // trim to reference
+		9,                // left offset of left upper parallelogram extent
+		14,               // right offset of left upper parallelogram extent
+		10,               // left offset of left lower parallelogram extent
+		15,               // right offset of left lower parallelogram extent
+		5,                // length of opposite mate
+		30,               // length of reference sequence aligned to
+		2,                // max # of read gaps permitted in opp mate alignment
+		2,                // max # of ref gaps permitted in opp mate alignment
+		7,                // expected width
+		3,                // expected # bases trimmed from upstream end
+		0,                // expected # bases trimmed from downstream end
+		7,                // ref offset of upstream column
+		17,               // ref offset of downstream column
+		"0011111",        // expected starting bools
+		"1111100");       // expected ending bools
+
+	//        *******
+	//     <<===--->>
+	//       o    o
+	//        o    o
+	//         o    o
+	//          o    o
+	//           o    o
+	//         <<=----->>
+	//            *******
+	// 01234567890123456xxxx
+	// 0         1         2
+	testCaseFindMateAnchorLeft(
+		"FindMateAnchorLeft3",
+		true,             // trim to reference
+		9,                // left offset of left upper parallelogram extent
+		14,               // right offset of left upper parallelogram extent
+		10,               // left offset of left lower parallelogram extent
+		15,               // right offset of left lower parallelogram extent
+		5,                // length of opposite mate
+		17,               // length of reference sequence aligned to
+		2,                // max # of read gaps permitted in opp mate alignment
+		2,                // max # of ref gaps permitted in opp mate alignment
+		7,                // expected width
+		3,                // expected # bases trimmed from upstream end
+		0,                // expected # bases trimmed from downstream end
+		7,                // ref offset of upstream column
+		17,               // ref offset of downstream column
+		"0011111",        // expected starting bools
+		"1111100");       // expected ending bools
+
+	//        ******
+	//     <<===-----
+	//       o    o
+	//        o    o
+	//         o    o
+	//          o    o
+	//         <<=----=>>
+	//            ******
+	// 012345678901234xxxxxx
+	// 0         1         2
+	testCaseFindMateAnchorLeft(
+		"FindMateAnchorLeft4",
+		true,             // trim to reference
+		9,                // left offset of left upper parallelogram extent
+		14,               // right offset of left upper parallelogram extent
+		10,               // left offset of left lower parallelogram extent
+		15,               // right offset of left lower parallelogram extent
+		5,                // length of opposite mate
+		15,               // length of reference sequence aligned to
+		2,                // max # of read gaps permitted in opp mate alignment
+		2,                // max # of ref gaps permitted in opp mate alignment
+		6,                // expected width
+		3,                // expected # bases trimmed from upstream end
+		1,                // expected # bases trimmed from downstream end
+		7,                // ref offset of upstream column
+		16,               // ref offset of downstream column
+		"001111",         // expected starting bools
+		"111100");        // expected ending bools
+
+	// -1         0         2
+	//  xxxxxxxxxx012345678xx
+	//
+	//           *******
+	//        <<===-----
+	//          o    o
+	//           o    o
+	//            o    o
+	//             o    o
+	//              o    o
+	//            <<=----->>
+	//               *******
+	//                
+	//  xxxxxxxxxx012345678xx
+	// -1         0         2
+	testCaseFindMateAnchorLeft(
+		"FindMateAnchorLeft5",
+		true,             // trim to reference
+		1,                // left offset of left upper parallelogram extent
+		7,                // right offset of left upper parallelogram extent
+		2,                // left offset of left lower parallelogram extent
+		7,                // right offset of left lower parallelogram extent
+		5,                // length of opposite mate
+		9,                // length of reference sequence aligned to
+		2,                // max # of read gaps permitted in opp mate alignment
+		2,                // max # of ref gaps permitted in opp mate alignment
+		7,                // expected width
+		3,                // expected # bases trimmed from upstream end
+		0,                // expected # bases trimmed from downstream end
+		-1,               // ref offset of upstream column
+		9,                // ref offset of downstream column
+		"0011111",        // expected starting bools
+		"1111100");       // expected ending bools
+
+	//   <<<<==-===>>
+	//       o    o
+	//        o    o
+	//         o    o
+	//          o    o
+	//       <<<<------>>
+	//           ******
+	// 012345678901234567890
+	// 0         1         2
+	testCaseFindMateAnchorLeft(
+		"FindMateAnchorLeft6",
+		false,            // trim to reference
+		8,                // left offset of left upper parallelogram extent
+		8,                // right offset of left upper parallelogram extent
+		10,               // left offset of left lower parallelogram extent
+		15,               // right offset of left lower parallelogram extent
+		5,                // length of opposite mate
+		30,               // length of reference sequence aligned to
+		4,                // max # of read gaps permitted in opp mate alignment
+		2,                // max # of ref gaps permitted in opp mate alignment
+		6,                // expected width
+		4,                // expected # bases trimmed from upstream end
+		2,                // expected # bases trimmed from downstream end
+		6,                // ref offset of upstream column
+		15,               // ref offset of downstream column
+		"001000",         // expected starting bools
+		"111111");        // expected ending bools
+
+	///////////////////////////
+	//
+	// ANCHOR ON THE RIGHT
+	//
+	///////////////////////////
+
+	//        <<<------->>>
+	//           o     o
+	//            o     o
+	//             o     o
+	//              o     o
+	//            <<<------->>>
+	// 012345678901234567890123456789
+	// 0         1         2
+	testCaseFindMateAnchorRight(
+		"FindMateAnchorRight1",
+		false,            // trim to reference
+		10,               // left offset of left upper parallelogram extent
+		16,               // right offset of left upper parallelogram extent
+		11,               // left offset of left lower parallelogram extent
+		23,               // right offset of left lower parallelogram extent
+		5,                // length of opposite mate
+		30,               // length of reference sequence aligned to
+		3,                // max # of read gaps permitted in opp mate alignment
+		3,                // max # of ref gaps permitted in opp mate alignment
+		13,               // expected width
+		0,                // expected # bases trimmed from upstream end
+		0,                // expected # bases trimmed from downstream end
+		7,                // ref offset of upstream column
+		23,               // ref offset of downstream column
+		"0001111111000",  // expected starting bools
+		"1111111111111"); // expected ending bools
+
+	// 0         1         2
+	// 012345678901234567890
+	//        *******
+	//     <<------>>
+	//        o    o
+	//         o    o
+	//          o    o
+	//           o    o
+	//         <<===--->>
+	//            *******
+	// 012345678901234567890
+	// 0         1         2
+	testCaseFindMateAnchorRight(
+		"FindMateAnchorRight2",
+		false,            // trim to reference
+		6,                // left offset of left upper parallelogram extent
+		11,               // right offset of left upper parallelogram extent
+		13,               // left offset of left lower parallelogram extent
+		18,               // right offset of left lower parallelogram extent
+		5,                // length of opposite mate
+		30,               // length of reference sequence aligned to
+		2,                // max # of read gaps permitted in opp mate alignment
+		2,                // max # of ref gaps permitted in opp mate alignment
+		7,                // expected width
+		3,                // expected # bases trimmed from upstream end
+		0,                // expected # bases trimmed from downstream end
+		7,                // ref offset of upstream column
+		17,               // ref offset of downstream column
+		"1111100",        // expected starting bools
+		"0011111");       // expected ending bools
+
+	// Reference trimming takes off the left_pad of the left mate
+	//
+	//             *******
+	//          <<------>>
+	//            o    o
+	//             o    o
+	//              o    o
+	//               o    o
+	//                o    o
+	//              <<===--->>
+	//                 *******
+	//  0123456789012345678901234567890
+	// -1         0         1         2
+	testCaseFindMateAnchorRight(
+		"FindMateAnchorRight3",
+		true,             // trim to reference
+		0,                // left offset of left upper parallelogram extent
+		5,                // right offset of left upper parallelogram extent
+		7,                // left offset of left lower parallelogram extent
+		11,               // right offset of left lower parallelogram extent
+		5,                // length of opposite mate
+		30,               // length of reference sequence aligned to
+		2,                // max # of read gaps permitted in opp mate alignment
+		2,                // max # of ref gaps permitted in opp mate alignment
+		7,                // expected width
+		3,                // expected # bases trimmed from upstream end
+		0,                // expected # bases trimmed from downstream end
+		1,                // ref offset of upstream column
+		11,               // ref offset of downstream column
+		"1111100",        // expected starting bools
+		"0011111");       // expected ending bools
+
+	// Reference trimming takes off the leftmost 5 positions of the left mate,
+	// and takes 1 from the right mate
+	//
+	//            *****
+	//       <<------>>
+	//         o    o
+	//          o    o
+	//           o    o
+	//            o    o
+	//             o    o
+	//           <<===--->>
+	//                *****
+	//  0987654321012345678901234567890
+	// -1         0         1         2
+	testCaseFindMateAnchorRight(
+		"FindMateAnchorRight4",
+		true,             // trim to reference
+		-3,               // left offset of left upper parallelogram extent
+		2,                // right offset of left upper parallelogram extent
+		4,                // left offset of left lower parallelogram extent
+		10,               // right offset of left lower parallelogram extent
+		5,                // length of opposite mate
+		30,               // length of reference sequence aligned to
+		2,                // max # of read gaps permitted in opp mate alignment
+		2,                // max # of ref gaps permitted in opp mate alignment
+		5,                // expected width
+		5,                // expected # bases trimmed from upstream end
+		0,                // expected # bases trimmed from downstream end
+		0,                // ref offset of upstream column
+		8,                // ref offset of downstream column
+		"11100",          // expected starting bools
+		"11111");         // expected ending bools
+
+	// Reference trimming takes off the leftmost 5 positions of the left mate,
+	// and takes 1 from the left of the right mate.  Also, it takes 2 from the
+	// right of the right mate.
+	//
+	//            ***
+	//       <<------>>
+	//         o    o
+	//          o    o
+	//           o    o
+	//            o    o
+	//             o    o
+	//           <<===--->>
+	//                ***
+	//  0987654321012345678901234567890
+	// -1         0         1         2
+	testCaseFindMateAnchorRight(
+		"FindMateAnchorRight5",
+		true,             // trim to reference
+		-3,               // left offset of left upper parallelogram extent
+		2,                // right offset of left upper parallelogram extent
+		4,                // left offset of left lower parallelogram extent
+		10,               // right offset of left lower parallelogram extent
+		5,                // length of opposite mate
+		7,                // length of reference sequence aligned to
+		2,                // max # of read gaps permitted in opp mate alignment
+		2,                // max # of ref gaps permitted in opp mate alignment
+		3,                // expected width
+		5,                // expected # bases trimmed from upstream end
+		2,                // expected # bases trimmed from downstream end
+		0,                // ref offset of upstream column
+		6,                // ref offset of downstream column
+		"111",            // expected starting bools
+		"111");           // expected ending bools
+
+	//       ******
+	//     <<------>>>>
+	//        o    o
+	//         o    o
+	//          o    o
+	//           o    o
+	//         <<====-=>>>>
+	//           ******
+	// 012345678901234567890
+	// 0         1         2
+	testCaseFindMateAnchorRight(
+		"FindMateAnchorRight6",
+		false,            // trim to reference
+		6,                // left offset of left upper parallelogram extent
+		11,               // right offset of left upper parallelogram extent
+		14,               // left offset of left lower parallelogram extent
+		14,               // right offset of left lower parallelogram extent
+		5,                // length of opposite mate
+		30,               // length of reference sequence aligned to
+		4,                // max # of read gaps permitted in opp mate alignment
+		2,                // max # of ref gaps permitted in opp mate alignment
+		6,                // expected width
+		2,                // expected # bases trimmed from upstream end
+		4,                // expected # bases trimmed from downstream end
+		6,                // ref offset of upstream column
+		15,               // ref offset of downstream column
+		"111111",         // expected starting bools
+		"000010");        // expected ending bools
+
+	//         ****
+	//   <<<<==---->>
+	//       o    o
+	//        o    o
+	//         o    o
+	//          o    o
+	//           o    o
+	//       <<<<====-=>>
+	//             ****
+	// 012345678901234567890
+	// 0         1         2
+	testCaseFindMateAnchorRight(
+		"FindMateAnchorRight7",
+		false,            // trim to reference
+		6,                // left offset of left upper parallelogram extent
+		11,               // right offset of left upper parallelogram extent
+		14,               // left offset of left lower parallelogram extent
+		14,               // right offset of left lower parallelogram extent
+		5,                // length of opposite mate
+		30,               // length of reference sequence aligned to
+		2,                // max # of read gaps permitted in opp mate alignment
+		4,                // max # of ref gaps permitted in opp mate alignment
+		4,                // expected width
+		6,                // expected # bases trimmed from upstream end
+		2,                // expected # bases trimmed from downstream end
+		8,                // ref offset of upstream column
+		15,               // ref offset of downstream column
+		"1111",           // expected starting bools
+		"0010");          // expected ending bools
+	
+	testCaseFindMateAnchorRight(
+		"FindMateAnchorRight8",
+		true,             // trim to reference
+		-37,              // left offset of left upper parallelogram extent
+		13,               // right offset of left upper parallelogram extent
+		-37,              // left offset of left lower parallelogram extent
+		52,               // right offset of left lower parallelogram extent
+		10,               // length of opposite mate
+		53,               // length of reference sequence aligned to
+		0,                // max # of read gaps permitted in opp mate alignment
+		0,                // max # of ref gaps permitted in opp mate alignment
+		14,               // expected width
+		37,               // expected # bases trimmed from upstream end
+		0,                // expected # bases trimmed from downstream end
+		0,                // ref offset of upstream column
+		22,               // ref offset of downstream column
+		"11111111111111", // expected starting bools
+		"11111111111111");// expected ending bools
+}
+
+#endif /*def MAIN_DP_FRAMER*/
diff --git a/dp_framer.h b/dp_framer.h
new file mode 100644
index 0000000..4209f41
--- /dev/null
+++ b/dp_framer.h
@@ -0,0 +1,261 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ *  dp_framer.h
+ *
+ * Classes and routines for framing dynamic programming problems.  There are 2
+ * basic types of dynamic programming problems solved in Bowtie 2:
+ *
+ * 1. Seed extension: we found a seed hit using Burrows-Wheeler techniques and
+ *    now we would like to extend it into a full alignment by doing dynamic
+ *    programming in the vicinity of the seed hit.
+ *
+ * 2. Mate finding: we have a full alignment for one mate in a pair and now we
+ *    would like to find an alignment for the opposite mate by doing dynamic
+ *    programming in the area prescribed by the maximum and minimum fragment
+ *    lengths.
+ *
+ * By "framing" the dynamic programming problem, we mean that all of the
+ * following DP inputs are calculated:
+ *
+ * 1. The width of the parallelogram/rectangle to explore.
+ * 2. The 0-based offset of the reference position associated with the leftmost
+ *    diagonal/column in the parallelogram/rectangle to explore
+ * 3. An EList<bool> of length=width encoding which columns the alignment may
+ *    start in
+ * 4. An EList<bool> of length=width encoding which columns the alignment may
+ *    end in
+ */
+
+#ifndef DP_FRAMER_H_
+#define DP_FRAMER_H_
+
+#include <stdint.h>
+#include "ds.h"
+#include "ref_coord.h"
+
+/**
+ * Describes a dynamic programming rectangle.
+ *
+ * Only knows about reference offsets, not reference sequences.
+ */
+struct DPRect {
+
+	/**
+	 * Initialize all offsets and trims to 0 so a default-constructed
+	 * rectangle is fully defined.  (Previously refl_pretrim, refr_pretrim
+	 * and maxgap were left uninitialized even though entirelyTrimmed() and
+	 * initIval() read them.)
+	 */
+	DPRect(int cat = 0) /*: st(cat), en(cat)*/ {
+		refl = refr = refl_pretrim = refr_pretrim = 0;
+		triml = trimr = corel = corer = maxgap = 0;
+	}
+
+	int64_t refl;         // leftmost ref offset involved post trimming (incl)
+	int64_t refr;         // rightmost ref offset involved post trimming (incl)
+
+	int64_t refl_pretrim; // leftmost ref offset involved pre trimming (incl)
+	int64_t refr_pretrim; // rightmost ref offset involved pre trimming (incl)
+	
+	size_t  triml;        // positions trimmed from LHS
+	size_t  trimr;        // positions trimmed from RHS
+	
+	// If "core" diagonals are specified, then any alignment reported has to
+	// overlap one of the core diagonals.  This is to avoid the situation where
+	// an alignment is reported that overlaps a better-scoring alignment that
+	// falls partially outside the rectangle.  This is used in both seed
+	// extensions and in mate finding.  Filtering based on the core diagonals
+	// should happen in the backtrace routine.  I.e. it should simply never
+	// return an alignment that doesn't overlap a core diagonal, even if there
+	// is such an alignment and it's valid.
+	
+	size_t  corel; // offset of column where leftmost "core" diagonal starts
+	size_t  corer; // offset of column where rightmost "core" diagonal starts
+	// [corel, corer] is an inclusive range and offsets are with respect to the
+	// original, untrimmed rectangle.
+	
+	size_t  maxgap; // max # gaps - width of the gap bands
+	
+	/**
+	 * Return true iff the combined effect of triml and trimr is to trim away
+	 * the entire rectangle.
+	 */
+	bool entirelyTrimmed() const {
+		bool tr = refr < refl;
+		ASSERT_ONLY(size_t width = (size_t)(refr_pretrim - refl_pretrim + 1));
+		assert(tr == (width <= triml + trimr));
+		return tr;
+	}
+	
+#ifndef NDEBUG
+	/**
+	 * Check that the core-diagonal range is internally consistent.
+	 */
+	bool repOk() const {
+		assert_geq(corer, corel);
+		return true;
+	}
+#endif
+	
+	/**
+	 * Set the given interval to the range of diagonals that are "covered" by
+	 * this dynamic programming problem.
+	 */
+	void initIval(Interval& iv) {
+		iv.setOff(refl_pretrim + (int64_t)corel);
+		iv.setLen(corer - corel + 1);
+	}
+};
+
+/**
+ * Encapsulates routines for calculating parameters for the various types of
+ * dynamic programming problems solved in Bowtie2.  The trimToRef_ flag set
+ * at construction controls whether DP windows are clipped so they do not
+ * overhang either end of the reference sequence.
+ */
+class DynProgFramer {
+
+public:
+
+	DynProgFramer(bool trimToRef) : trimToRef_(trimToRef) { }
+
+	/**
+	 * Similar to frameSeedExtensionParallelogram but we're being somewhat more
+	 * inclusive in order to ensure all characters along the "width" in the last
+	 * row are exhaustively scored.
+	 */
+	bool frameSeedExtensionRect(
+		int64_t off,      // ref offset implied by seed hit assuming no gaps
+		size_t rdlen,     // length of read sequence used in DP table (so len
+						  // of +1 nucleotide sequence for colorspace reads)
+		int64_t reflen,   // length of reference sequence aligned to
+		size_t maxrdgap,  // max # of read gaps permitted in opp mate alignment
+		size_t maxrfgap,  // max # of ref gaps permitted in opp mate alignment
+		int64_t maxns,    // # Ns permitted
+		size_t maxhalf,   // max width in either direction
+		DPRect& rect);    // out: DP rectangle
+
+	/**
+	 * Given information about an anchor mate hit, and information deduced by
+	 * PairedEndPolicy about where the opposite mate can begin and end given
+	 * the fragment length range, return parameters for the dynamic programming
+	 * problem to solve.  Simply dispatches on whether the anchor mate lies to
+	 * the left or to the right of the opposite mate's window.
+	 */
+	bool frameFindMateRect(
+		bool anchorLeft,  // true iff anchor alignment is to the left
+		int64_t ll,       // leftmost Watson off for LHS of opp alignment
+		int64_t lr,       // rightmost Watson off for LHS of opp alignment
+		int64_t rl,       // leftmost Watson off for RHS of opp alignment
+		int64_t rr,       // rightmost Watson off for RHS of opp alignment
+		size_t  rdlen,    // length of opposite mate
+		int64_t reflen,   // length of reference sequence aligned to
+		size_t  maxrdgap, // max # of read gaps permitted in opp mate alignment
+		size_t  maxrfgap, // max # of ref gaps permitted in opp mate alignment
+		int64_t maxns,    // max # Ns permitted
+		size_t  maxhalf,  // max width in either direction
+		DPRect& rect)     // out: DP rectangle
+		const
+	{
+		if(anchorLeft) {
+			return frameFindMateAnchorLeftRect(
+				ll,
+				lr,
+				rl,
+				rr,
+				rdlen,
+				reflen,
+				maxrdgap,
+				maxrfgap,
+				maxns,
+				maxhalf,
+				rect);
+		} else {
+			return frameFindMateAnchorRightRect(
+				ll,
+				lr,
+				rl,
+				rr,
+				rdlen,
+				reflen,
+				maxrdgap,
+				maxrfgap,
+				maxns,
+				maxhalf,
+				rect);
+		}
+	}
+
+	/**
+	 * Given information about an anchor mate hit, and information deduced by
+	 * PairedEndPolicy about where the opposite mate can begin and end given
+	 * the fragment length range, return parameters for the dynamic programming
+	 * problem to solve.  Used when the anchor mate is to the left.
+	 */
+	bool frameFindMateAnchorLeftRect(
+		int64_t ll,       // leftmost Watson off for LHS of opp alignment
+		int64_t lr,       // rightmost Watson off for LHS of opp alignment
+		int64_t rl,       // leftmost Watson off for RHS of opp alignment
+		int64_t rr,       // rightmost Watson off for RHS of opp alignment
+		size_t  rdlen,    // length of opposite mate
+		int64_t reflen,   // length of reference sequence aligned to
+		size_t  maxrdgap, // max # of read gaps permitted in opp mate alignment
+		size_t  maxrfgap, // max # of ref gaps permitted in opp mate alignment
+		int64_t maxns,    // max # Ns permitted in alignment
+		size_t  maxhalf,  // max width in either direction
+		DPRect& rect)     // out: DP rectangle
+		const;
+
+	/**
+	 * Given information about an anchor mate hit, and information deduced by
+	 * PairedEndPolicy about where the opposite mate can begin and end given
+	 * the fragment length range, return parameters for the dynamic programming
+	 * problem to solve.  Used when the anchor mate is to the right.
+	 */
+	bool frameFindMateAnchorRightRect(
+		int64_t ll,       // leftmost Watson off for LHS of opp alignment
+		int64_t lr,       // rightmost Watson off for LHS of opp alignment
+		int64_t rl,       // leftmost Watson off for RHS of opp alignment
+		int64_t rr,       // rightmost Watson off for RHS of opp alignment
+		size_t  rdlen,    // length of opposite mate
+		int64_t reflen,   // length of reference sequence aligned to
+		size_t  maxrdgap, // max # of read gaps permitted in opp mate alignment
+		size_t  maxrfgap, // max # of ref gaps permitted in opp mate alignment
+		int64_t maxns,    // max # Ns permitted in alignment
+		size_t  maxhalf,  // max width in either direction
+		DPRect& rect)     // out: DP rectangle
+		const;
+
+protected:
+
+	/**
+	 * Calculate how much must be trimmed from each end of the DP window so
+	 * that it does not overhang the beginning or end of the reference.
+	 * refl/refr are read but deliberately left unmodified (see the
+	 * commented-out assignments); only the trim amounts are reported.
+	 *
+	 * NOTE(review): trimup/trimdn are written only when trimming is actually
+	 * needed, so callers must initialize them to 0 beforehand -- confirm at
+	 * call sites.
+	 */
+	void trimToRef(
+		size_t   reflen,  // in: length of reference sequence aligned to
+		int64_t& refl,    // in/out: ref pos of upper LHS of parallelogram
+		int64_t& refr,    // in/out: ref pos of lower RHS of parallelogram
+		size_t&  trimup,  // out: number of bases trimmed from upstream end
+		size_t&  trimdn)  // out: number of bases trimmed from downstream end
+	{
+		if(refl < 0) {
+			trimup = (size_t)(-refl);
+			//refl = 0;
+		}
+		if(refr >= (int64_t)reflen) {
+			trimdn = (size_t)(refr - reflen + 1);
+			//refr = (int64_t)reflen-1;
+		}
+	}
+
+	bool trimToRef_; // whether to clip DP windows to the reference ends
+};
+
+#endif /*ndef DP_FRAMER_H_*/
diff --git a/ds.cpp b/ds.cpp
new file mode 100644
index 0000000..b98eb95
--- /dev/null
+++ b/ds.cpp
@@ -0,0 +1,155 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "ds.h"
+
+MemoryTally gMemTally; // single program-wide tally of tracked heap memory
+
+/**
+ * Record an allocation of amt bytes against category cat, updating the
+ * per-category and overall running totals along with their peaks.
+ */
+void MemoryTally::add(int cat, uint64_t amt) {
+	ThreadSafe ts(&mutex_m);
+	const uint64_t catTot = (tots_[cat] += amt);
+	const uint64_t allTot = (tot_ += amt);
+	if(peaks_[cat] < catTot) {
+		peaks_[cat] = catTot;
+	}
+	if(peak_ < allTot) {
+		peak_ = allTot;
+	}
+}
+
+/**
+ * Record a deallocation of amt bytes against category cat, reducing the
+ * per-category and overall running totals.  Peaks are left untouched.
+ */
+void MemoryTally::del(int cat, uint64_t amt) {
+	ThreadSafe ts(&mutex_m);
+	uint64_t& catTot = tots_[cat];
+	assert_geq(catTot, amt);
+	assert_geq(tot_, amt);
+	catTot -= amt;
+	tot_ -= amt;
+}
+	
+#ifdef MAIN_DS
+
+#include <limits>
+#include "random_source.h"
+
+using namespace std;
+
+int main(void) {
+	cerr << "Test EHeap 1...";
+	{
+		EHeap<float> h;
+		// Insert in scrambled order; the heap must hand them back sorted
+		const float ins[] = {0.5f, 0.6f, 0.25f, 0.75f, 0.1f, 0.9f, 0.4f};
+		for(size_t i = 0; i < 7; i++) {
+			h.insert(ins[i]);
+		}
+		const float exp[] = {0.1f, 0.25f, 0.4f, 0.5f, 0.6f, 0.75f, 0.9f};
+		for(size_t i = 0; i < 7; i++) {
+			assert_eq(7 - i, h.size());
+			if(h.pop() != exp[i]) {
+				throw 1;
+			}
+		}
+		assert_eq(0, h.size());
+		assert(h.empty());
+	}
+	cerr << "PASSED" << endl;
+
+	cerr << "Test EHeap 2...";
+	{
+		EHeap<size_t> h;
+		RandomSource rnd(12);
+		const size_t lim = 2000;
+		while(h.size() < lim) {
+			h.insert(rnd.nextU32());
+		}
+		// Popped values must come out in non-decreasing order
+		size_t prev = std::numeric_limits<size_t>::max();
+		for(bool first = true; !h.empty(); first = false) {
+			const size_t cur = h.pop();
+			assert(first || cur >= prev);
+			prev = cur;
+		}
+	}
+	cerr << "PASSED" << endl;
+
+	cerr << "Test EBitList 1...";
+	{
+		EBitList<128> l;
+		assert_eq(0, l.size());
+		assert_eq(std::numeric_limits<size_t>::max(), l.max());
+		
+		assert(!l.test(0));
+		assert(!l.test(1));
+		assert(!l.test(10));
+		
+		// Setting the same bit repeatedly is idempotent
+		for(int rep = 0; rep < 3; rep++) {
+			l.set(10);
+			assert(!l.test(0));
+			assert(!l.test(1));
+			assert(!l.test(9));
+			assert(l.test(10));
+			assert(!l.test(11));
+		}
+		
+		assert_eq(10, l.max());
+		l.clear();
+		assert(!l.test(10));
+		assert_eq(std::numeric_limits<size_t>::max(), l.max());
+		
+		// Random set/test round trips
+		RandomSource rnd(12);
+		const size_t lim = 2000;
+		for(size_t i = 0; i < lim; i++) {
+			const uint32_t bit = rnd.nextU32() % 10000;
+			l.set(bit);
+			assert(l.test(bit));
+		}
+	}
+	cerr << "PASSED" << endl;
+}
+
+#endif /*def MAIN_DS*/
diff --git a/ds.h b/ds.h
new file mode 100644
index 0000000..9814935
--- /dev/null
+++ b/ds.h
@@ -0,0 +1,4305 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef DS_H_
+#define DS_H_
+
+#include <algorithm>
+#include <stdexcept>
+#include <utility>
+#include <stdint.h>
+#include <string.h>
+#include <limits>
+#include "assert_helpers.h"
+#include "threading.h"
+#include "random_source.h"
+#include "btypes.h"
+
+/**
+ * Tally how much memory is allocated to certain categories of objects, so
+ * that overall and per-category totals and peaks can be reported.
+ */
+class MemoryTally {
+
+public:
+
+	MemoryTally() : tot_(0), peak_(0) {
+		memset(tots_,  0, 256 * sizeof(uint64_t));
+		memset(peaks_, 0, 256 * sizeof(uint64_t));
+	}
+
+	/**
+	 * Tally a memory allocation of size amt bytes.
+	 */
+	void add(int cat, uint64_t amt);
+
+	/**
+	 * Tally a memory free of size amt bytes.
+	 */
+	void del(int cat, uint64_t amt);
+	
+	// NOTE(review): the accessors below read shared counters without taking
+	// mutex_m; values may be stale under concurrent add/del -- confirm
+	// acceptable for reporting purposes.
+	
+	/**
+	 * Return the total amount of memory allocated.
+	 */
+	uint64_t total() const { return tot_; }
+
+	/**
+	 * Return the total amount of memory allocated in a particular
+	 * category.
+	 */
+	uint64_t total(int cat) const { return tots_[cat]; }
+
+	/**
+	 * Return the peak amount of memory allocated.
+	 */
+	uint64_t peak() const { return peak_; }
+
+	/**
+	 * Return the peak amount of memory allocated in a particular
+	 * category.
+	 */
+	uint64_t peak(int cat) const { return peaks_[cat]; }
+
+#ifndef NDEBUG
+	/**
+	 * Check that memory tallies are internally consistent;
+	 */
+	bool repOk() const {
+		uint64_t tot = 0;
+		for(int i = 0; i < 256; i++) {
+			assert_leq(tots_[i], peaks_[i]);
+			tot += tots_[i];
+		}
+		assert_eq(tot, tot_);
+		return true;
+	}
+#endif
+
+protected:
+
+	MUTEX_T mutex_m;      // guards updates in add()/del()
+	uint64_t tots_[256];  // current bytes per category
+	uint64_t tot_;        // current bytes overall
+	uint64_t peaks_[256]; // peak bytes per category
+	uint64_t peak_;       // peak bytes overall
+};
+
+extern MemoryTally gMemTally;
+
+/**
+ * A simple fixed-length array of type T, automatically freed in the
+ * destructor.  Not copyable: copying would lead to a double delete[].
+ */
+template<typename T>
+class AutoArray {
+public:
+
+	/**
+	 * Allocate and zero-fill an array of sz elements, tallying the
+	 * allocation under memory category 'cat'.
+	 */
+	AutoArray(size_t sz, int cat = 0) : cat_(cat) {
+		t_ = NULL;
+		t_ = new T[sz];
+		// Tally bytes rather than elements, consistent with APtrWrap
+		gMemTally.add(cat_, sz * sizeof(T));
+		// NOTE(review): zero-filling with memset assumes T is trivially
+		// copyable -- confirm for all instantiations
+		memset(t_, 0, sz * sizeof(T));
+		sz_ = sz;
+	}
+	
+	~AutoArray() {
+		if(t_ != NULL) {
+			delete[] t_;
+			gMemTally.del(cat_, sz_ * sizeof(T));
+		}
+	}
+	
+	// Unchecked element access
+	T& operator[](size_t i) {
+		return t_[i];
+	}
+	
+	// Unchecked const element access
+	const T& operator[](size_t i) const {
+		return t_[i];
+	}
+	
+	// Number of elements allocated
+	size_t size() const { return sz_; }
+
+private:
+
+	// Not implemented; copying would double-free t_
+	AutoArray(const AutoArray<T>&);
+	AutoArray<T>& operator=(const AutoArray<T>&);
+
+	int cat_;   // memory category for tallying
+	T *t_;      // the array
+	size_t sz_; // number of elements
+};
+
+/**
+ * A wrapper for a non-array pointer that associates it with a memory
+ * category for tracking purposes and calls delete on it when the
+ * PtrWrap is destroyed.  Not copyable: copying would lead to a double
+ * delete.
+ */
+template<typename T>
+class PtrWrap {
+public:
+
+	/**
+	 * Wrap pointer p.  If 'freeable' is true, p is deleted (and the
+	 * allocation tallied) when this wrapper dies.
+	 */
+	explicit PtrWrap(
+		T* p,
+		bool freeable = true,
+		int cat = 0) :
+		cat_(cat),
+		p_(NULL)
+	{
+		init(p, freeable);
+	}
+
+	/**
+	 * Create an empty wrapper in the given memory category.
+	 */
+	explicit PtrWrap(int cat = 0) :
+		cat_(cat),
+		p_(NULL)
+	{
+		reset();
+	}
+
+	/**
+	 * Delete the wrapped pointer (if owned) and return to the empty state.
+	 */
+	void reset() {
+		free();
+		init(NULL);
+	}
+
+	~PtrWrap() { free(); }
+	
+	/**
+	 * Take ownership of p.  Requires that no pointer is currently held.
+	 */
+	void init(T* p, bool freeable = true) {
+		assert(p_ == NULL);
+		p_ = p;
+		freeable_ = freeable;
+		if(p != NULL && freeable_) {
+			gMemTally.add(cat_, sizeof(T));
+		}
+	}
+	
+	/**
+	 * Delete the wrapped object if present and owned, tallying the free.
+	 */
+	void free() {
+		if(p_ != NULL) {
+			if(freeable_) {
+				delete p_;
+				gMemTally.del(cat_, sizeof(T));
+			}
+			p_ = NULL;
+		}
+	}
+	
+	inline T* get() { return p_; }
+	inline const T* get() const { return p_; }
+
+private:
+
+	// Not implemented; copying would double-free p_
+	PtrWrap(const PtrWrap<T>&);
+	PtrWrap<T>& operator=(const PtrWrap<T>&);
+
+	int cat_;       // memory category for tallying
+	T *p_;          // wrapped pointer
+	bool freeable_; // true iff we own p_ and must delete it
+};
+
+/**
+ * A wrapper for an array pointer that associates it with a memory
+ * category for tracking purposes and calls delete[] on it when the
+ * APtrWrap is destroyed.  Not copyable: copying would lead to a double
+ * delete[].
+ */
+template<typename T>
+class APtrWrap {
+public:
+
+	/**
+	 * Wrap array pointer p of sz elements.  If 'freeable' is true, p is
+	 * deleted (and the allocation tallied) when this wrapper dies.
+	 */
+	explicit APtrWrap(
+		T* p,
+		size_t sz,
+		bool freeable = true,
+		int cat = 0) :
+		cat_(cat),
+		p_(NULL)
+	{
+		init(p, sz, freeable);
+	}
+
+	/**
+	 * Create an empty wrapper in the given memory category.
+	 */
+	explicit APtrWrap(int cat = 0) :
+		cat_(cat),
+		p_(NULL)
+	{
+		reset();
+	}
+	
+	/**
+	 * Delete the wrapped array (if owned) and return to the empty state.
+	 */
+	void reset() {
+		free();
+		init(NULL, 0);
+	}
+
+	~APtrWrap() { free(); }
+	
+	/**
+	 * Take ownership of array p of sz elements.  Requires that no pointer
+	 * is currently held.
+	 */
+	void init(T* p, size_t sz, bool freeable = true) {
+		assert(p_ == NULL);
+		p_ = p;
+		sz_ = sz;
+		freeable_ = freeable;
+		if(p != NULL && freeable_) {
+			gMemTally.add(cat_, sizeof(T) * sz_);
+		}
+	}
+	
+	/**
+	 * Delete the wrapped array if present and owned, tallying the free.
+	 */
+	void free() {
+		if(p_ != NULL) {
+			if(freeable_) {
+				delete[] p_;
+				gMemTally.del(cat_, sizeof(T) * sz_);
+			}
+			p_ = NULL;
+		}
+	}
+	
+	inline T* get() { return p_; }
+	inline const T* get() const { return p_; }
+
+private:
+
+	// Not implemented; copying would double-free p_
+	APtrWrap(const APtrWrap<T>&);
+	APtrWrap<T>& operator=(const APtrWrap<T>&);
+
+	int cat_;       // memory category for tallying
+	T *p_;          // wrapped array pointer
+	bool freeable_; // true iff we own p_ and must delete[] it
+	size_t sz_;     // number of elements in the array
+};
+
+/**
+ * An EList<T> is an expandable list with these features:
+ *
+ *  - Payload type is a template parameter T.
+ *  - Initial size can be specified at construction time, otherwise
+ *    default of 128 is used.
+ *  - When allocated initially or when expanding, the new[] operator is
+ *    used, which in turn calls the default constructor for T.
+ *  - All copies (e.g. assignment of a const T& to an EList<T> element,
+ *    or during expansion) use operator=.
+ *  - When the EList<T> is resized to a smaller size (or cleared, which
+ *    is like resizing to size 0), the underlying container is not
+ *    reshaped.  Thus, EList<T>s never release memory before
+ *    destruction.
+ *
+ * And these requirements:
+ *
+ *  - Payload type T must have a default constructor.
+ *
+ * For efficiency reasons, ELists should not be declared on the stack
+ * in often-called worker functions.  Best practice is to declare
+ * ELists at a relatively stable layer of the stack (such that it
+ * rarely bounces in and out of scope) and let the worker function use
+ * it and *expand* it only as needed.  The effect is that only
+ * relatively few allocations and copies will be incurred, and they'll
+ * occur toward the beginning of the computation before stabilizing at
+ * a "high water mark" for the remainder of the computation.
+ *
+ * A word about multidimensional lists.  One way to achieve a
+ * multidimensional lists is to nest ELists.  This works, but it often
+ * involves a lot more calls to the default constructor and to
+ * operator=, especially when the outermost EList needs expanding, than
+ * some of the alternatives.  One alternative is use a most specialized
+ * container that still uses ELists but knows to use xfer instead of
+ * operator= when T=EList.
+ *
+ * The 'cat_' field encodes a category.  This makes it possible to
+ * distinguish between object subgroups in the global memory tally.
+ *
+ * Memory allocation is lazy.  Allocation is only triggered when the
+ * user calls push_back, expand, resize, or another function that
+ * increases the size of the list.  This saves memory and also makes it
+ * easier to deal with nested ELists, since the default constructor
+ * doesn't set anything in stone.
+ */
+template <typename T, int S = 128>
+class EList {
+
+public:
+
+	/**
+	 * Record a default initial capacity of S elements; actual allocation
+	 * is deferred until the list is first grown.
+	 */
+	explicit EList() :
+		cat_(0), allocCat_(-1), list_(NULL), sz_(S), cur_(0) { }
+
+	/**
+	 * Record a default initial capacity of S elements under the given
+	 * memory category; allocation is deferred.
+	 */
+	explicit EList(int cat) :
+		cat_(cat), allocCat_(-1), list_(NULL), sz_(S), cur_(0)
+	{
+		assert_geq(cat, 0);
+	}
+
+	/**
+	 * Record an initial capacity of isz elements (should be > 0);
+	 * allocation is deferred.
+	 */
+	explicit EList(size_t isz, int cat = 0) :
+		cat_(cat), allocCat_(-1), list_(NULL), sz_(isz), cur_(0)
+	{
+		assert_geq(cat, 0);
+	}
+
+	/**
+	 * Copy from another EList using operator=.
+	 */
+	EList(const EList<T, S>& o) :
+		cat_(0), allocCat_(-1), list_(NULL), sz_(0), cur_(0)
+	{
+		*this = o;
+	}
+
+	/**
+	 * Copy from another EList using operator=, placing the copy in the
+	 * given memory category.
+	 */
+	explicit EList(const EList<T, S>& o, int cat) :
+		cat_(cat), allocCat_(-1), list_(NULL), sz_(0), cur_(0)
+	{
+		*this = o;
+		assert_geq(cat, 0);
+	}
+
+	/**
+	 * Destructor; frees the backing array if one was allocated.
+	 */
+	~EList() { free(); }
+
+	/**
+	 * Make this object into a copy of o by allocating enough memory (if
+	 * needed) and copying each element over with operator=.  Both lists
+	 * must belong to the same memory category.
+	 */
+	EList<T, S>& operator=(const EList<T, S>& o) {
+		assert_eq(cat_, o.cat());
+		if(o.cur_ == 0) {
+			// Nothing to copy
+			cur_ = 0;
+			return *this;
+		}
+		if(list_ == NULL) {
+			// cat_ should already be set
+			lazyInit();
+		}
+		if(sz_ < o.cur_) expandNoCopy(o.cur_ + 1);
+		assert_geq(sz_, o.cur_);
+		cur_ = o.cur_;
+		for(size_t i = 0; i < cur_; i++) {
+			list_[i] = o.list_[i];
+		}
+		return *this;
+	}
+	
+	/**
+	 * Transfer the guts of another EList into this one without using
+	 * operator=, etc.  Any memory this list already held is freed first,
+	 * then EList o's list_ field is set to NULL to prevent o's destructor
+	 * from deleting list_ out from under us.
+	 */
+	void xfer(EList<T, S>& o) {
+		// What does it mean to transfer to a different-category list?
+		assert_eq(cat_, o.cat());
+		// Free our current contents so we start empty
+		free();
+		allocCat_ = cat_;
+		list_ = o.list_;
+		sz_ = o.sz_;
+		cur_ = o.cur_;
+		o.list_ = NULL;
+		o.sz_ = o.cur_ = 0;
+		o.allocCat_ = -1;
+	}
+
+	/**
+	 * Return number of elements.
+	 */
+	inline size_t size() const { return cur_; }
+
+	/**
+	 * Return number of elements allocated.
+	 */
+	inline size_t capacity() const { return sz_; }
+	
+	/**
+	 * Return the total size in bytes occupied by this list: a fixed
+	 * per-object overhead estimate plus the bytes of the elements in use.
+	 */
+	size_t totalSizeBytes() const {
+		return 	2 * sizeof(int) +
+		        2 * sizeof(size_t) +
+				cur_ * sizeof(T);
+	}
+
+	/**
+	 * Return the total capacity in bytes occupied by this list: a fixed
+	 * per-object overhead estimate plus the bytes of all allocated slots.
+	 */
+	size_t totalCapacityBytes() const {
+		return 	2 * sizeof(int) +
+		        2 * sizeof(size_t) +
+				sz_ * sizeof(T);
+	}
+	
+	/**
+	 * Ensure that there is sufficient capacity to hold 'thresh' more
+	 * elements beyond the current size without further growth.
+	 * NOTE(review): relies on expandCopy being a no-op when capacity is
+	 * already sufficient -- expandCopy is defined elsewhere; confirm.
+	 */
+	inline void ensure(size_t thresh) {
+		if(list_ == NULL) lazyInit();
+		expandCopy(cur_ + thresh);
+	}
+
+	/**
+	 * Ensure that there is sufficient capacity to include 'newsz' elements.
+	 * If there isn't enough capacity right now, expand capacity to exactly
+	 * equal 'newsz'.
+	 */
+	inline void reserveExact(size_t newsz) {
+		if(list_ == NULL) lazyInitExact(newsz);
+		expandCopyExact(newsz);
+	}
+
+	/**
+	 * Return true iff there are no elements.
+	 */
+	inline bool empty() const { return cur_ == 0; }
+	
+	/**
+	 * Return true iff list hasn't been initialized yet.
+	 */
+	inline bool null() const { return list_ == NULL; }
+
+	/**
+	 * Add an element to the back and immediately initialize it via
+	 * operator=.
+	 */
+	void push_back(const T& el) {
+		if(list_ == NULL) lazyInit();
+		if(cur_ == sz_) expandCopy(sz_+1);
+		list_[cur_++] = el;
+	}
+
+	/**
+	 * Add an element to the back.  No initialization is done; the new
+	 * slot holds whatever value it last had.
+	 */
+	void expand() {
+		if(list_ == NULL) lazyInit();
+		if(cur_ == sz_) expandCopy(sz_+1);
+		cur_++;
+	}
+
+	/**
+	 * Set all elements in the range [begin, end) to v via operator=.
+	 */
+	void fill(size_t begin, size_t end, const T& v) {
+		assert_leq(begin, end);
+		assert_leq(end, cur_);
+		for(size_t i = begin; i < end; i++) {
+			list_[i] = v;
+		}
+	}
+
+	/**
+	 * Set all elements currently in use to v via operator=.
+	 */
+	void fill(const T& v) {
+		for(size_t i = 0; i < cur_; i++) {
+			list_[i] = v;
+		}
+	}
+
+	/**
+	 * Set all bits in specified range of elements in list array to 0.
+	 * NOTE(review): memset is only appropriate for trivially copyable T --
+	 * confirm for all instantiations.
+	 */
+	void fillZero(size_t begin, size_t end) {
+		assert_leq(begin, end);
+		memset(&list_[begin], 0, sizeof(T) * (end-begin));
+	}
+
+	/**
+	 * Set all bits in the list array to 0.  Same trivially-copyable
+	 * caveat as above.
+	 */
+	void fillZero() {
+		memset(list_, 0, sizeof(T) * cur_);
+	}
+
+	/**
+	 * Set the size to sz.  When growing, capacity is expanded without
+	 * preserving existing element values; when shrinking, only cur_ is
+	 * reduced (no reallocation).
+	 */
+	void resizeNoCopy(size_t sz) {
+		if(sz > 0 && list_ == NULL) lazyInit();
+		if(sz <= cur_) {
+			cur_ = sz;
+			return;
+		}
+		if(sz_ < sz) expandNoCopy(sz);
+		cur_ = sz;
+	}
+
+	/**
+	 * Set the size to sz.  When growing, capacity is expanded and existing
+	 * element values are preserved; when shrinking, only cur_ is reduced.
+	 */
+	void resize(size_t sz) {
+		if(sz > 0 && list_ == NULL) lazyInit();
+		if(sz <= cur_) {
+			cur_ = sz;
+			return;
+		}
+		if(sz_ < sz) {
+			expandCopy(sz);
+		}
+		cur_ = sz;
+	}
+
+	/**
+	 * Set the size to sz.  When growing, capacity is expanded to exactly
+	 * sz (elements preserved); when shrinking, only cur_ is reduced.
+	 */
+	void resizeExact(size_t sz) {
+		if(sz > 0 && list_ == NULL) lazyInitExact(sz);
+		if(sz <= cur_) {
+			cur_ = sz;
+			return;
+		}
+		if(sz_ < sz) expandCopyExact(sz);
+		cur_ = sz;
+	}
+
+	/**
+	 * Erase element at offset idx.
+	 */
+	void erase(size_t idx) {
+		assert_lt(idx, cur_);
+		for(size_t i = idx; i < cur_-1; i++) {
+			list_[i] = list_[i+1];
+		}
+		cur_--;
+	}
+
+	/**
+	 * Erase the 'len' elements beginning at offset 'idx'; shift
+	 * subsequent elements down.
+	 */
+	void erase(size_t idx, size_t len) {
+		if(len == 0) {
+			return;
+		}
+		assert_lt(idx, cur_);
+		// The erased range must lie entirely within the occupied part;
+		// otherwise cur_-len underflows (size_t) and the shift loop
+		// below reads far past the end of the buffer.
+		assert_leq(idx + len, cur_);
+		for(size_t i = idx; i < cur_-len; i++) {
+			list_[i] = list_[i+len];
+		}
+		cur_ -= len;
+	}
+
+	/**
+	 * Insert value 'el' at offset 'idx'; shift subsequent elements up.
+	 * idx == cur_ is allowed (append).
+	 */
+	void insert(const T& el, size_t idx) {
+		if(list_ == NULL) lazyInit();
+		assert_leq(idx, cur_);
+		if(cur_ == sz_) expandCopy(sz_+1);
+		for(size_t i = cur_; i > idx; i--) {
+			list_[i] = list_[i-1];
+		}
+		list_[idx] = el;
+		cur_++;
+	}
+
+	/**
+	 * Insert contents of list 'l' at offset 'idx'.
+	 * NOTE(review): this overload asserts idx < cur_ while the
+	 * single-element insert() permits idx == cur_ (append) — confirm
+	 * whether appending a whole list was meant to be disallowed.
+	 */
+	void insert(const EList<T>& l, size_t idx) {
+		if(list_ == NULL) lazyInit();
+		assert_lt(idx, cur_);
+		if(l.cur_ == 0) return;
+		if(cur_ + l.cur_ > sz_) expandCopy(cur_ + l.cur_);
+		for(size_t i = cur_ + l.cur_ - 1; i > idx + (l.cur_ - 1); i--) {
+			list_[i] = list_[i - l.cur_];
+		}
+		for(size_t i = 0; i < l.cur_; i++) {
+			list_[i+idx] = l.list_[i];
+		}
+		cur_ += l.cur_;
+	}
+
+	/**
+	 * Remove an element from the top of the stack.
+	 */
+	void pop_back() {
+		assert_gt(cur_, 0);
+		cur_--;
+	}
+
+	/**
+	 * Make the stack empty.  Capacity is retained for re-use.
+	 */
+	void clear() {
+		cur_ = 0; // re-use stack memory
+		// Don't clear heap; re-use it
+	}
+
+	/**
+	 * Get the element on the top of the stack.
+	 */
+	inline T& back() {
+		assert_gt(cur_, 0);
+		return list_[cur_-1];
+	}
+
+	/**
+	 * Reverse list elements in place.
+	 */
+	void reverse() {
+		if(cur_ > 1) {
+			size_t n = cur_ >> 1;
+			for(size_t i = 0; i < n; i++) {
+				T tmp = list_[i];
+				list_[i] = list_[cur_ - i - 1];
+				list_[cur_ - i - 1] = tmp;
+			}
+		}
+	}
+
+	/**
+	 * Get the element on the top of the stack, const version.
+	 */
+	inline const T& back() const {
+		assert_gt(cur_, 0);
+		return list_[cur_-1];
+	}
+
+	/**
+	 * Get the frontmost element (bottom of stack).
+	 */
+	inline T& front() {
+		assert_gt(cur_, 0);
+		return list_[0];
+	}
+
+	/**
+	 * Get the element on the bottom of the stack, const version.
+	 * (Previously written as `return front();`, which resolves to this
+	 * same const overload and recurses infinitely if ever called.)
+	 */
+	inline const T& front() const {
+		assert_gt(cur_, 0);
+		return list_[0];
+	}
+
+	/**
+	 * Return true iff this list and list o contain the same elements in the
+	 * same order according to type T's operator==.
+	 */
+	bool operator==(const EList<T, S>& o) const {
+		if(size() != o.size()) {
+			return false;
+		}
+		for(size_t i = 0; i < size(); i++) {
+			if(!(get(i) == o.get(i))) {
+				return false;
+			}
+		}
+		return true;
+	}
+
+	/**
+	 * Return true iff this list contains all of the elements in o according to
+	 * type T's operator==.  Order-insensitive; O(size * o.size) pairwise
+	 * scan, so intended for small lists.
+	 */
+	bool isSuperset(const EList<T, S>& o) const {
+		if(o.size() > size()) {
+			// This can't be a superset if the other set contains more elts
+			return false;
+		}
+		// For each element in o
+		for(size_t i = 0; i < o.size(); i++) {
+			bool inthis = false;
+			// Check if it's in this
+			for(size_t j = 0; j < size(); j++) {
+				if(o[i] == (*this)[j]) {
+					inthis = true;
+					break;
+				}
+			}
+			if(!inthis) {
+				return false;
+			}
+		}
+		return true;
+	}
+
+	/**
+	 * Return a reference to the ith element.
+	 */
+	inline T& operator[](size_t i) {
+		assert_lt(i, cur_);
+		return list_[i];
+	}
+
+	/**
+	 * Return a const reference to the ith element.
+	 */
+	inline const T& operator[](size_t i) const {
+		assert_lt(i, cur_);
+		return list_[i];
+	}
+
+	/**
+	 * Return a reference to the ith element.
+	 */
+	inline T& get(size_t i) {
+		return operator[](i);
+	}
+	
+	/**
+	 * Return a const reference to the ith element.
+	 */
+	inline const T& get(size_t i) const {
+		return operator[](i);
+	}
+	
+	/**
+	 * Return a reference to the ith element.  This version is not
+	 * inlined, which guarantees we can use it from the debugger.
+	 */
+	T& getSlow(size_t i) {
+		return operator[](i);
+	}
+	
+	/**
+	 * Return a const reference to the ith element.  This version is not
+	 * inlined, which guarantees we can use it from the debugger.
+	 */
+	const T& getSlow(size_t i) const {
+		return operator[](i);
+	}
+	
+	/**
+	 * Sort the subrange [begin, begin+num) using T's default ordering.
+	 */
+	void sortPortion(size_t begin, size_t num) {
+		sortPortion(begin, num, std::less<T>());
+	}
+
+	/**
+	 * Sort the subrange [begin, begin+num) using comparator 'comp'.
+	 * Ranges of fewer than two elements are already sorted.
+	 */
+	template<class Compare>
+	void sortPortion(size_t begin, size_t num, Compare comp) {
+		assert_leq(begin + num, cur_);
+		if(num >= 2) {
+			std::sort(list_ + begin, list_ + begin + num, comp);
+		}
+	}
+
+	/**
+	 * Shuffle a portion of the list (Fisher-Yates style sweep).
+	 * NOTE(review): nextU32() % left has slight modulo bias —
+	 * presumably acceptable for this application; confirm if exact
+	 * uniformity matters.
+	 */
+	void shufflePortion(size_t begin, size_t num, RandomSource& rnd) {
+		assert_leq(begin+num, cur_);
+		if(num < 2) return;
+		size_t left = num;
+		for(size_t i = begin; i < begin + num - 1; i++) {
+			uint32_t rndi = rnd.nextU32() % left;
+			if(rndi > 0) {
+				std::swap(list_[i], list_[i + rndi]);
+			}
+			left--;
+		}
+	}
+	
+	/**
+	 * Sort contents using T's default ordering.
+	 */
+	void sort() {
+		sortPortion(0, cur_, std::less<T>());
+	}
+
+	/**
+	 * Sort contents using comparator 'comp'.
+	 */
+	template <class Compare>
+	void sort(Compare comp)  {
+		sortPortion(0, cur_, comp);
+	}
+
+	/**
+	 * Return true iff every element is < its successor.  Only operator< is
+	 * used.  Note: rejects lists with equal adjacent elements.
+	 */
+	bool sorted() const {
+		for(size_t i = 1; i < cur_; i++) {
+			if(!(list_[i-1] < list_[i])) {
+				return false;
+			}
+		}
+		return true;
+	}
+
+	/**
+	 * Delete element at position 'idx'; slide subsequent elements up.
+	 * Kept as a separate name for backward compatibility; the shift
+	 * logic is identical to erase(idx), so delegate to it.
+	 */
+	void remove(size_t idx) {
+		assert_gt(cur_, 0);
+		erase(idx);
+	}
+	
+	/**
+	 * Return a pointer to the beginning of the buffer.
+	 */
+	T *ptr() { return list_; }
+
+	/**
+	 * Return a const pointer to the beginning of the buffer.
+	 */
+	const T *ptr() const { return list_; }
+
+	/**
+	 * Set the memory category for this object.  Must be called before
+	 * the first allocation (cat_ is recorded at alloc time).
+	 */
+	void setCat(int cat) {
+		// What does it mean to set the category after the list_ is
+		// already allocated?
+		assert(null());
+		assert_gt(cat, 0); cat_ = cat;
+	}
+
+	/**
+	 * Return memory category.
+	 */
+	int cat() const { return cat_; }
+
+	/**
+	 * Perform a binary search for the first element that is not less
+	 * than 'el'.  Return cur_ if all elements are less than el.
+	 * Precondition: the list must be sorted ascending by operator<.
+	 */
+	size_t bsearchLoBound(const T& el) const {
+		size_t hi = cur_;
+		size_t lo = 0;
+		while(true) {
+			if(lo == hi) {
+				return lo;
+			}
+			size_t mid = lo + ((hi-lo)>>1);
+			assert_neq(mid, hi);
+			if(list_[mid] < el) {
+				if(lo == mid) {
+					return hi;
+				}
+				lo = mid;
+			} else {
+				hi = mid;
+			}
+		}
+	}
+
+private:
+
+	/**
+	 * Initialize memory for EList at the current default capacity sz_.
+	 */
+	void lazyInit() {
+		assert(list_ == NULL);
+		list_ = alloc(sz_);
+	}
+
+	/**
+	 * Initialize exactly the prescribed number of elements for EList.
+	 */
+	void lazyInitExact(size_t sz) {
+		assert_gt(sz, 0);
+		assert(list_ == NULL);
+		sz_ = sz;
+		list_ = alloc(sz);
+	}
+
+	/**
+	 * Allocate a T array of length sz and return it.  Also, tally into
+	 * the global memory tally and remember the category used.
+	 */
+	T *alloc(size_t sz) {
+		T* tmp = new T[sz];
+		assert(tmp != NULL);
+		gMemTally.add(cat_, sz);
+		allocCat_ = cat_;
+		return tmp;
+	}
+
+	/**
+	 * Free the backing array (if allocated), subtract it from the
+	 * global memory tally, and reset size/occupancy to 0.
+	 */
+	void free() {
+		if(list_ != NULL) {
+			assert_neq(-1, allocCat_);
+			assert_eq(allocCat_, cat_);
+			delete[] list_;
+			gMemTally.del(cat_, sz_);
+			list_ = NULL;
+			sz_ = cur_ = 0;
+		}
+	}
+
+	/**
+	 * Expand the list_ buffer until it has at least 'thresh' elements.  Size
+	 * increases quadratically with number of expansions.  Copy old contents
+	 * into new buffer using operator=.
+	 */
+	void expandCopy(size_t thresh) {
+		if(thresh <= sz_) return;
+		size_t newsz = (sz_ * 2)+1;
+		while(newsz < thresh) newsz *= 2;
+		expandCopyExact(newsz);
+	}
+
+	/**
+	 * Expand the list_ buffer until it has exactly 'newsz' elements.  Copy
+	 * old contents into new buffer using operator=.
+	 */
+	void expandCopyExact(size_t newsz) {
+		if(newsz <= sz_) return;
+		T* tmp = alloc(newsz);
+		assert(tmp != NULL);
+		// free() zeroes cur_, so save it now and restore it afterward
+		size_t cur = cur_;
+		if(list_ != NULL) {
+ 			for(size_t i = 0; i < cur_; i++) {
+				// Note: operator= is used
+				tmp[i] = list_[i];
+			}
+			free();
+		}
+		list_ = tmp;
+		sz_ = newsz;
+		cur_ = cur;
+	}
+
+	/**
+	 * Expand the list_ buffer until it has at least 'thresh' elements.
+	 * Size increases quadratically with number of expansions.  Don't copy old
+	 * contents into the new buffer.
+	 */
+	void expandNoCopy(size_t thresh) {
+		assert(list_ != NULL);
+		if(thresh <= sz_) return;
+		size_t newsz = (sz_ * 2)+1;
+		while(newsz < thresh) newsz *= 2;
+		expandNoCopyExact(newsz);
+	}
+
+	/**
+	 * Expand the list_ buffer until it has exactly 'newsz' elements.  Don't
+	 * copy old contents into the new buffer.
+	 */
+	void expandNoCopyExact(size_t newsz) {
+		assert(list_ != NULL);
+		assert_gt(newsz, 0);
+		free();
+		T* tmp = alloc(newsz);
+		assert(tmp != NULL);
+		list_ = tmp;
+		sz_ = newsz;
+		assert_gt(sz_, 0);
+	}
+
+	int cat_;      // memory category, for accounting purposes
+	int allocCat_; // category at time of allocation
+	T *list_;      // list pointer, returned from new[]
+	size_t sz_;    // capacity
+	size_t cur_;   // occupancy (AKA size)
+};
+
+/**
+ * An ELList<T> is an expandable list of lists with these features:
+ *
+ *  - Payload type of the inner list is a template parameter T.
+ *  - Initial size can be specified at construction time, otherwise
+ *    default of 128 is used.
+ *  - When allocated initially or when expanding, the new[] operator is
+ *    used, which in turn calls the default constructor for EList<T>.
+ *  - Upon expansion, instead of copies, xfer is used.
+ *  - When the ELList<T> is resized to a smaller size (or cleared,
+ *    which is like resizing to size 0), the underlying container is
+ *    not reshaped.  Thus, ELLists<T>s never release memory before
+ *    destruction.
+ *
+ * And these requirements:
+ *
+ *  - Payload type T must have a default constructor.
+ *
+ */
+template <typename T, int S1 = 128, int S2 = 128>
+class ELList {
+
+public:
+
+	/**
+	 * Allocate initial default of S2 (128) outer elements lazily.
+	 */
+	explicit ELList(int cat = 0) :
+		cat_(cat), list_(NULL), sz_(S2), cur_(0)
+	{
+		assert_geq(cat, 0);
+	}
+
+	/**
+	 * Initially allocate given number of elements; should be > 0.
+	 */
+	explicit ELList(size_t isz, int cat = 0) :
+		cat_(cat), list_(NULL), sz_(isz), cur_(0)
+	{
+		assert_gt(isz, 0);
+		assert_geq(cat, 0);
+	}
+
+	/**
+	 * Copy from another ELList using operator=.
+	 */
+	ELList(const ELList<T, S1, S2>& o) :
+		cat_(0), list_(NULL), sz_(0), cur_(0)
+	{
+		*this = o;
+	}
+
+	/**
+	 * Copy from another ELList using operator=, with explicit category.
+	 */
+	explicit ELList(const ELList<T, S1, S2>& o, int cat) :
+		cat_(cat), list_(NULL), sz_(0), cur_(0)
+	{
+		*this = o;
+		assert_geq(cat, 0);
+	}
+
+	/**
+	 * Destructor.
+	 */
+	~ELList() { free(); }
+
+	/**
+	 * Make this object into a copy of o by allocating enough memory to
+	 * fit the number of elements in o (note: the number of elements
+	 * may be substantially less than the memory allocated in o) and
+	 * using operator= to copy them over.
+	 */
+	ELList<T, S1, S2>& operator=(const ELList<T, S1, S2>& o) {
+		assert_eq(cat_, o.cat());
+		if(list_ == NULL) {
+			lazyInit();
+		}
+		if(o.cur_ == 0) {
+			cur_ = 0;
+			return *this;
+		}
+		if(sz_ < o.cur_) expandNoCopy(o.cur_ + 1);
+		assert_geq(sz_, o.cur_);
+		cur_ = o.cur_;
+		for(size_t i = 0; i < cur_; i++) {
+			// Note: using operator=, not xfer
+			assert_eq(list_[i].cat(), o.list_[i].cat());
+			list_[i] = o.list_[i];
+		}
+		return *this;
+	}
+	
+	/**
+	 * Transfer the guts of another ELList into this one without using
+	 * operator=, etc.  We have to set o's list_ field to NULL to
+	 * avoid o's destructor from deleting list_ out from under us.
+	 * NOTE: does not free this object's previous list_ first.
+	 */
+	void xfer(ELList<T, S1, S2>& o) {
+		assert_eq(cat_, o.cat());
+		list_ = o.list_; // list_ is an array of EList<T>s
+		sz_   = o.sz_;
+		cur_  = o.cur_;
+		o.list_ = NULL;
+		o.sz_ = o.cur_ = 0;
+	}
+
+	/**
+	 * Return number of elements.
+	 */
+	inline size_t size() const { return cur_; }
+
+	/**
+	 * Return true iff there are no elements.
+	 */
+	inline bool empty() const { return cur_ == 0; }
+
+	/**
+	 * Return true iff list hasn't been initialized yet.
+	 */
+	inline bool null() const { return list_ == NULL; }
+
+	/**
+	 * Add an element to the back.  No initialization is done.
+	 */
+	void expand() {
+		if(list_ == NULL) lazyInit();
+		if(cur_ == sz_) expandCopy(sz_+1);
+		cur_++;
+	}
+
+	/**
+	 * If size is less than requested size, resize up to at least sz
+	 * and set cur_ to requested sz.
+	 */
+	void resize(size_t sz) {
+		if(sz > 0 && list_ == NULL) lazyInit();
+		if(sz <= cur_) {
+			cur_ = sz;
+			return;
+		}
+		if(sz_ < sz) {
+			expandCopy(sz);
+		}
+		cur_ = sz;
+	}
+
+	/**
+	 * Make the stack empty.  Capacity is retained for re-use.
+	 */
+	void clear() {
+		cur_ = 0; // re-use stack memory
+		// Don't clear heap; re-use it
+	}
+
+	/**
+	 * Get the element on the top of the stack.
+	 */
+	inline EList<T, S1>& back() {
+		assert_gt(cur_, 0);
+		return list_[cur_-1];
+	}
+
+	/**
+	 * Get the element on the top of the stack, const version.
+	 */
+	inline const EList<T, S1>& back() const {
+		assert_gt(cur_, 0);
+		return list_[cur_-1];
+	}
+
+	/**
+	 * Get the frontmost element (bottom of stack).
+	 */
+	inline EList<T, S1>& front() {
+		assert_gt(cur_, 0);
+		return list_[0];
+	}
+
+	/**
+	 * Get the element on the bottom of the stack, const version.
+	 * (Previously written as `return front();`, which resolves to this
+	 * same const overload and recurses infinitely if ever called.)
+	 */
+	inline const EList<T, S1>& front() const {
+		assert_gt(cur_, 0);
+		return list_[0];
+	}
+
+	/**
+	 * Return a reference to the ith inner list.
+	 */
+	inline EList<T, S1>& operator[](size_t i) {
+		assert_lt(i, cur_);
+		return list_[i];
+	}
+
+	/**
+	 * Return a const reference to the ith inner list.
+	 */
+	inline const EList<T, S1>& operator[](size_t i) const {
+		assert_lt(i, cur_);
+		return list_[i];
+	}
+
+	/**
+	 * Return a reference to the ith inner list.
+	 */
+	inline EList<T, S1>& get(size_t i) {
+		return operator[](i);
+	}
+	
+	/**
+	 * Return a const reference to the ith inner list.
+	 */
+	inline const EList<T, S1>& get(size_t i) const {
+		return operator[](i);
+	}
+	
+	/**
+	 * Return a reference to the ith element.  This version is not
+	 * inlined, which guarantees we can use it from the debugger.
+	 */
+	EList<T, S1>& getSlow(size_t i) {
+		return operator[](i);
+	}
+	
+	/**
+	 * Return a const reference to the ith element.  This version is not
+	 * inlined, which guarantees we can use it from the debugger.
+	 */
+	const EList<T, S1>& getSlow(size_t i) const {
+		return operator[](i);
+	}
+	
+	/**
+	 * Return a pointer to the beginning of the buffer of inner lists.
+	 */
+	EList<T, S1> *ptr() { return list_; }
+	
+	/**
+	 * Set the memory category for this object and all children.
+	 */
+	void setCat(int cat) {
+		assert_gt(cat, 0);
+		cat_ = cat;
+		// Only propagate to children once the backing array exists;
+		// before lazyInit() list_ is NULL even though sz_ is already
+		// the (nonzero) default capacity, so the old unguarded loop
+		// dereferenced a NULL pointer.
+		if(cat_ != 0 && list_ != NULL) {
+			for(size_t i = 0; i < sz_; i++) {
+				assert(list_[i].null());
+				list_[i].setCat(cat_);
+			}
+		}
+	}
+
+	/**
+	 * Return memory category.
+	 */
+	int cat() const { return cat_; }
+
+protected:
+
+	/**
+	 * Initialize memory for ELList at the current default capacity sz_.
+	 */
+	void lazyInit() {
+		assert(list_ == NULL);
+		list_ = alloc(sz_);
+	}
+
+	/**
+	 * Allocate an EList<T> array of length sz and return it.  Also,
+	 * tally into the global memory tally and propagate the category.
+	 */
+	EList<T, S1> *alloc(size_t sz) {
+		assert_gt(sz, 0);
+		EList<T, S1> *tmp = new EList<T, S1>[sz];
+		gMemTally.add(cat_, sz);
+		if(cat_ != 0) {
+			for(size_t i = 0; i < sz; i++) {
+				assert(tmp[i].ptr() == NULL);
+				tmp[i].setCat(cat_);
+			}
+		}
+		return tmp;
+	}
+
+	/**
+	 * Free the backing array (if allocated) and subtract it from the
+	 * global memory tally.
+	 */
+	void free() {
+		if(list_ != NULL) {
+			delete[] list_;
+			gMemTally.del(cat_, sz_);
+			list_ = NULL;
+		}
+	}
+
+	/**
+	 * Expand the list_ buffer until it has at least 'thresh' elements.
+	 * Expansions are quadratic.  Move old inner lists into the new
+	 * buffer using xfer (no deep copies).
+	 */
+	void expandCopy(size_t thresh) {
+		assert(list_ != NULL);
+		if(thresh <= sz_) return;
+		size_t newsz = (sz_ * 2)+1;
+		while(newsz < thresh) newsz *= 2;
+		EList<T, S1>* tmp = alloc(newsz);
+		if(list_ != NULL) {
+			for(size_t i = 0; i < cur_; i++) {
+				assert_eq(cat_, tmp[i].cat());
+				tmp[i].xfer(list_[i]);
+				assert_eq(cat_, tmp[i].cat());
+			}
+			free();
+		}
+		list_ = tmp;
+		sz_ = newsz;
+	}
+
+	/**
+	 * Expand the list_ buffer until it has at least 'thresh' elements.
+	 * Expansions are quadratic.  Don't copy old contents over.
+	 */
+	void expandNoCopy(size_t thresh) {
+		assert(list_ != NULL);
+		if(thresh <= sz_) return;
+		free();
+		size_t newsz = (sz_ * 2)+1;
+		while(newsz < thresh) newsz *= 2;
+		EList<T, S1>* tmp = alloc(newsz);
+		list_ = tmp;
+		sz_ = newsz;
+		assert_gt(sz_, 0);
+	}
+
+	int cat_;    // memory category, for accounting purposes
+	EList<T, S1> *list_; // list pointer, returned from new[]
+	size_t sz_;  // capacity
+	size_t cur_; // occupancy (AKA size)
+
+};
+
+/**
+ * An ELLList<T> is an expandable list of expandable lists with these
+ * features:
+ *
+ *  - Payload type of the innermost list is a template parameter T.
+ *  - Initial size can be specified at construction time, otherwise
+ *    default of 128 is used.
+ *  - When allocated initially or when expanding, the new[] operator is
+ *    used, which in turn calls the default constructor for ELList<T>.
+ *  - Upon expansion, instead of copies, xfer is used.
+ *  - When the ELLList<T> is resized to a smaller size (or cleared,
+ *    which is like resizing to size 0), the underlying container is
+ *    not reshaped.  Thus, ELLLists<T>s never release memory before
+ *    destruction.
+ *
+ * And these requirements:
+ *
+ *  - Payload type T must have a default constructor.
+ *
+ */
+template <typename T, int S1 = 128, int S2 = 128, int S3 = 128>
+class ELLList {
+
+public:
+
+	/**
+	 * Allocate initial default of S3 (128) outer elements lazily.
+	 */
+	explicit ELLList(int cat = 0) :
+		cat_(cat), list_(NULL), sz_(S3), cur_(0)
+	{
+		assert_geq(cat, 0);
+	}
+
+	/**
+	 * Initially allocate given number of elements; should be > 0.
+	 */
+	explicit ELLList(size_t isz, int cat = 0) :
+		cat_(cat), list_(NULL), sz_(isz), cur_(0)
+	{
+		assert_geq(cat, 0);
+		assert_gt(isz, 0);
+	}
+
+	/**
+	 * Copy from another ELLList using operator=.
+	 */
+	ELLList(const ELLList<T, S1, S2, S3>& o) :
+		cat_(0), list_(NULL), sz_(0), cur_(0)
+	{
+		*this = o;
+	}
+
+	/**
+	 * Copy from another ELLList using operator=, with explicit category.
+	 */
+	explicit ELLList(const ELLList<T, S1, S2, S3>& o, int cat) :
+		cat_(cat), list_(NULL), sz_(0), cur_(0)
+	{
+		*this = o;
+		assert_geq(cat, 0);
+	}
+
+	/**
+	 * Destructor.
+	 */
+	~ELLList() { free(); }
+
+	/**
+	 * Make this object into a copy of o by allocating enough memory to
+	 * fit the number of elements in o (note: the number of elements
+	 * may be substantially less than the memory allocated in o) and
+	 * using operator= to copy them over.
+	 */
+	ELLList<T, S1, S2, S3>& operator=(const ELLList<T, S1, S2, S3>& o) {
+		assert_eq(cat_, o.cat());
+		if(list_ == NULL) lazyInit();
+		if(o.cur_ == 0) {
+			cur_ = 0;
+			return *this;
+		}
+		if(sz_ < o.cur_) expandNoCopy(o.cur_ + 1);
+		assert_geq(sz_, o.cur_);
+		cur_ = o.cur_;
+		for(size_t i = 0; i < cur_; i++) {
+			// Note: using operator=, not xfer
+			assert_eq(list_[i].cat(), o.list_[i].cat());
+			list_[i] = o.list_[i];
+		}
+		return *this;
+	}
+	
+	/**
+	 * Transfer the guts of another ELLList into this one without using
+	 * operator=, etc.  We have to set o's list_ field to NULL to
+	 * avoid o's destructor from deleting list_ out from under us.
+	 * NOTE: does not free this object's previous list_ first.
+	 */
+	void xfer(ELLList<T, S1, S2, S3>& o) {
+		assert_eq(cat_, o.cat());
+		list_ = o.list_; // list_ is an array of EList<T>s
+		sz_   = o.sz_;
+		cur_  = o.cur_;
+		o.list_ = NULL;
+		o.sz_ = o.cur_ = 0;
+	}
+
+	/**
+	 * Return number of elements.
+	 */
+	inline size_t size() const { return cur_; }
+
+	/**
+	 * Return true iff there are no elements.
+	 */
+	inline bool empty() const { return cur_ == 0; }
+
+	/**
+	 * Return true iff list hasn't been initialized yet.
+	 */
+	inline bool null() const { return list_ == NULL; }
+
+	/**
+	 * Add an element to the back.  No initialization is done.
+	 */
+	void expand() {
+		if(list_ == NULL) lazyInit();
+		if(cur_ == sz_) expandCopy(sz_+1);
+		cur_++;
+	}
+
+	/**
+	 * If size is less than requested size, resize up to at least sz
+	 * and set cur_ to requested sz.
+	 */
+	void resize(size_t sz) {
+		if(sz > 0 && list_ == NULL) lazyInit();
+		if(sz <= cur_) {
+			cur_ = sz;
+			return;
+		}
+		if(sz_ < sz) expandCopy(sz);
+		cur_ = sz;
+	}
+
+	/**
+	 * Make the stack empty.  Capacity is retained for re-use.
+	 */
+	void clear() {
+		cur_ = 0; // re-use stack memory
+		// Don't clear heap; re-use it
+	}
+
+	/**
+	 * Get the element on the top of the stack.
+	 */
+	inline ELList<T, S1, S2>& back() {
+		assert_gt(cur_, 0);
+		return list_[cur_-1];
+	}
+
+	/**
+	 * Get the element on the top of the stack, const version.
+	 */
+	inline const ELList<T, S1, S2>& back() const {
+		assert_gt(cur_, 0);
+		return list_[cur_-1];
+	}
+
+	/**
+	 * Get the frontmost element (bottom of stack).
+	 */
+	inline ELList<T, S1, S2>& front() {
+		assert_gt(cur_, 0);
+		return list_[0];
+	}
+
+	/**
+	 * Get the element on the bottom of the stack, const version.
+	 * (Previously written as `return front();`, which resolves to this
+	 * same const overload and recurses infinitely if ever called.)
+	 */
+	inline const ELList<T, S1, S2>& front() const {
+		assert_gt(cur_, 0);
+		return list_[0];
+	}
+
+	/**
+	 * Return a reference to the ith inner list-of-lists.
+	 */
+	inline ELList<T, S1, S2>& operator[](size_t i) {
+		assert_lt(i, cur_);
+		return list_[i];
+	}
+
+	/**
+	 * Return a const reference to the ith inner list-of-lists.
+	 */
+	inline const ELList<T, S1, S2>& operator[](size_t i) const {
+		assert_lt(i, cur_);
+		return list_[i];
+	}
+
+	/**
+	 * Return a reference to the ith inner list-of-lists.
+	 */
+	inline ELList<T, S1, S2>& get(size_t i) {
+		return operator[](i);
+	}
+	
+	/**
+	 * Return a const reference to the ith inner list-of-lists.
+	 */
+	inline const ELList<T, S1, S2>& get(size_t i) const {
+		return operator[](i);
+	}
+	
+	/**
+	 * Return a reference to the ith element.  This version is not
+	 * inlined, which guarantees we can use it from the debugger.
+	 */
+	ELList<T, S1, S2>& getSlow(size_t i) {
+		return operator[](i);
+	}
+	
+	/**
+	 * Return a const reference to the ith element.  This version is not
+	 * inlined, which guarantees we can use it from the debugger.
+	 */
+	const ELList<T, S1, S2>& getSlow(size_t i) const {
+		return operator[](i);
+	}
+	
+	/**
+	 * Return a pointer to the beginning of the buffer of inner lists.
+	 */
+	ELList<T, S1, S2> *ptr() { return list_; }
+
+	/**
+	 * Set the memory category for this object and all children.
+	 */
+	void setCat(int cat) {
+		assert_gt(cat, 0);
+		cat_ = cat;
+		// Only propagate to children once the backing array exists;
+		// before lazyInit() list_ is NULL even though sz_ is already
+		// the (nonzero) default capacity, so the old unguarded loop
+		// dereferenced a NULL pointer.
+		if(cat_ != 0 && list_ != NULL) {
+			for(size_t i = 0; i < sz_; i++) {
+				assert(list_[i].null());
+				list_[i].setCat(cat_);
+			}
+		}
+	}
+	
+	/**
+	 * Return memory category.
+	 */
+	int cat() const { return cat_; }
+
+protected:
+
+	/**
+	 * Initialize memory for ELLList at the current default capacity sz_.
+	 */
+	void lazyInit() {
+		assert(null());
+		list_ = alloc(sz_);
+	}
+
+	/**
+	 * Allocate an ELList<T> array of length sz and return it.  Also,
+	 * tally into the global memory tally and propagate the category.
+	 */
+	ELList<T, S1, S2> *alloc(size_t sz) {
+		assert_gt(sz, 0);
+		ELList<T, S1, S2> *tmp = new ELList<T, S1, S2>[sz];
+		gMemTally.add(cat_, sz);
+		if(cat_ != 0) {
+			for(size_t i = 0; i < sz; i++) {
+				assert(tmp[i].ptr() == NULL);
+				tmp[i].setCat(cat_);
+			}
+		}
+		return tmp;
+	}
+
+	/**
+	 * Free the backing array (if allocated) and subtract it from the
+	 * global memory tally.
+	 */
+	void free() {
+		if(list_ != NULL) {
+			delete[] list_;
+			gMemTally.del(cat_, sz_);
+			list_ = NULL;
+		}
+	}
+
+	/**
+	 * Expand the list_ buffer until it has at least 'thresh' elements.
+	 * Expansions are quadratic.  Move old inner lists into the new
+	 * buffer using xfer (no deep copies).
+	 */
+	void expandCopy(size_t thresh) {
+		assert(list_ != NULL);
+		if(thresh <= sz_) return;
+		size_t newsz = (sz_ * 2)+1;
+		while(newsz < thresh) newsz *= 2;
+		ELList<T, S1, S2>* tmp = alloc(newsz);
+		if(list_ != NULL) {
+			for(size_t i = 0; i < cur_; i++) {
+				assert_eq(cat_, tmp[i].cat());
+				tmp[i].xfer(list_[i]);
+				assert_eq(cat_, tmp[i].cat());
+			}
+			free();
+		}
+		list_ = tmp;
+		sz_ = newsz;
+	}
+
+	/**
+	 * Expand the list_ buffer until it has at least 'thresh' elements.
+	 * Expansions are quadratic.  Don't copy old contents over.
+	 */
+	void expandNoCopy(size_t thresh) {
+		assert(list_ != NULL);
+		if(thresh <= sz_) return;
+		free();
+		size_t newsz = (sz_ * 2)+1;
+		while(newsz < thresh) newsz *= 2;
+		ELList<T, S1, S2>* tmp = alloc(newsz);
+		list_ = tmp;
+		sz_ = newsz;
+		assert_gt(sz_, 0);
+	}
+
+	int cat_;    // memory category, for accounting purposes
+	ELList<T, S1, S2> *list_; // list pointer, returned from new[]
+	size_t sz_;  // capacity
+	size_t cur_; // occupancy (AKA size)
+
+};
+
+/**
+ * Expandable set using a heap-allocated sorted array.
+ *
+ * Note that the copy constructor and operator= routines perform
+ * shallow copies (w/ memcpy), so T should be trivially copyable.
+ */
+template <typename T>
+class ESet {
+public:
+
+	/**
+	 * Construct an empty, unallocated set.
+	 */
+	ESet(int cat = 0) :
+		cat_(cat),
+		list_(NULL),
+		sz_(0),
+		cur_(0)
+	{
+		// NOTE(review): sz_ is initialized to 0 above, so this branch
+		// never allocates; kept for symmetry with the sizing ctor.
+		if(sz_ > 0) {
+			list_ = alloc(sz_);
+		}
+	}
+
+	/**
+	 * Initially allocate given number of elements; should be > 0.
+	 */
+	ESet(size_t isz, int cat = 0) :
+		cat_(cat),
+		list_(NULL),
+		sz_(isz),
+		cur_(0)
+	{
+		assert_gt(isz, 0);
+		if(sz_ > 0) {
+			list_ = alloc(sz_);
+		}
+	}
+
+	/**
+	 * Copy from another ESet.  sz_ and cur_ must be zero-initialized
+	 * before delegating to operator= (they were previously left
+	 * uninitialized, so operator='s free() path read garbage).
+	 */
+	ESet(const ESet<T>& o, int cat = 0) :
+		cat_(cat), list_(NULL), sz_(0), cur_(0)
+	{
+		assert_eq(cat_, o.cat());
+		*this = o;
+	}
+
+	/**
+	 * Destructor.  Releases the backing array and updates the tally.
+	 */
+	~ESet() { free(); }
+
+	/**
+	 * Copy contents of given ESet into this ESet (shallow, via memcpy).
+	 */
+	ESet& operator=(const ESet<T>& o) {
+		assert_eq(cat_, o.cat());
+		// Self-assignment would free o's own buffer below.
+		if(this == &o) return *this;
+		// Free FIRST: free() reports sz_ to gMemTally, so sz_ must
+		// still hold the OLD capacity when it runs.  (Previously sz_
+		// was overwritten with o.sz_ before free(), corrupting the
+		// global memory tally.)
+		free();
+		sz_ = o.sz_;
+		cur_ = o.cur_;
+		if(sz_ > 0) {
+			list_ = alloc(sz_);
+			memcpy(list_, o.list_, cur_ * sizeof(T));
+		} else {
+			list_ = NULL;
+		}
+		return *this;
+	}
+
+	/**
+	 * Return number of elements.
+	 */
+	size_t size() const { return cur_; }
+
+	/**
+	 * Return the total size in bytes occupied by this set.
+	 */
+	size_t totalSizeBytes() const {
+		return sizeof(int) + cur_ * sizeof(T) + 2 * sizeof(size_t);
+	}
+
+	/**
+	 * Return the total capacity in bytes occupied by this set.
+	 */
+	size_t totalCapacityBytes() const {
+		return sizeof(int) + sz_ * sizeof(T) + 2 * sizeof(size_t);
+	}
+	
+	/**
+	 * Return true iff there are no elements.
+	 */
+	bool empty() const { return cur_ == 0; }
+
+	/**
+	 * Return true iff list isn't initialized yet.
+	 */
+	bool null() const { return list_ == NULL; }
+
+	/**
+	 * Insert a new element into the set in sorted order.  Return true
+	 * iff the element was not already present.
+	 */
+	bool insert(const T& el) {
+		size_t i = 0;
+		if(cur_ == 0) {
+			insert(el, 0);
+			return true;
+		}
+		// Below 16 elements a linear scan beats binary search
+		if(cur_ < 16) {
+			// Linear scan
+			i = scanLoBound(el);
+		} else {
+			// Binary search
+			i = bsearchLoBound(el);
+		}
+		if(i < cur_ && list_[i] == el) return false;
+		insert(el, i);
+		return true;
+	}
+
+	/**
+	 * Return true iff this set contains 'el'.
+	 */
+	bool contains(const T& el) const {
+		if(cur_ == 0) {
+			return false;
+		}
+		else if(cur_ == 1) {
+			return el == list_[0];
+		}
+		size_t i;
+		if(cur_ < 16) {
+			// Linear scan
+			i = scanLoBound(el);
+		} else {
+			// Binary search
+			i = bsearchLoBound(el);
+		}
+		return i != cur_ && list_[i] == el;
+	}
+
+	/**
+	 * Remove element from set.  The element MUST be present: the check
+	 * below is an assert, so in NDEBUG builds removing an absent
+	 * element erases whatever lives at the lower-bound position.
+	 */
+	void remove(const T& el) {
+		size_t i;
+		if(cur_ < 16) {
+			// Linear scan
+			i = scanLoBound(el);
+		} else {
+			// Binary search
+			i = bsearchLoBound(el);
+		}
+		assert(i != cur_ && list_[i] == el);
+		erase(i);
+	}
+
+	/**
+	 * Ensure the backing array can hold at least sz elements.  Unlike
+	 * EList::resize, this does NOT change cur_ (a set's occupancy is
+	 * determined by its contents, not by the caller).
+	 */
+	void resize(size_t sz) {
+		if(sz <= cur_) return;
+		if(sz_ < sz) expandCopy(sz);
+	}
+
+	/**
+	 * Clear set without deallocating (or setting) anything.
+	 */
+	void clear() { cur_ = 0; }
+
+	/**
+	 * Return memory category.
+	 */
+	int cat() const { return cat_; }
+	
+	/**
+	 * Set the memory category for this object.
+	 */
+	void setCat(int cat) {
+		cat_ = cat;
+	}
+
+	/**
+	 * Transfer the guts of another ESet into this one without using
+	 * operator=, etc.  We have to set ESet o's list_ field to NULL to
+	 * avoid o's destructor from deleting list_ out from under us.
+	 */
+	void xfer(ESet<T>& o) {
+		// What does it mean to transfer to a different-category list?
+		assert_eq(cat_, o.cat());
+		// Release any existing buffer before taking over o's
+		free();
+		list_ = o.list_;
+		sz_ = o.sz_;
+		cur_ = o.cur_;
+		o.list_ = NULL;
+		o.sz_ = o.cur_ = 0;
+	}
+
+	/**
+	 * Return a pointer to the beginning of the buffer.
+	 */
+	T *ptr() { return list_; }
+
+	/**
+	 * Return a const pointer to the beginning of the buffer.
+	 */
+	const T *ptr() const { return list_; }
+
+private:
+
+	/**
+	 * Allocate a T array of length sz and return it.  Also, tally into
+	 * the global memory tally.
+	 */
+	T *alloc(size_t sz) {
+		assert_gt(sz, 0);
+		T *tmp = new T[sz];
+		gMemTally.add(cat_, sz);
+		return tmp;
+	}
+
+	/**
+	 * Free the backing array (if allocated) and subtract it from the
+	 * global memory tally.
+	 */
+	void free() {
+		if(list_ != NULL) {
+			delete[] list_;
+			gMemTally.del(cat_, sz_);
+			list_ = NULL;
+		}
+	}
+
+	/**
+	 * Simple linear scan that returns the index of the first element
+	 * of list_ that is not less than el, or cur_ if all elements are
+	 * less than el.
+	 */
+	size_t scanLoBound(const T& el) const {
+		for(size_t i = 0; i < cur_; i++) {
+			if(!(list_[i] < el)) {
+				// Shouldn't be equal
+				return i;
+			}
+		}
+		return cur_;
+	}
+
+	/**
+	 * Perform a binary search for the first element that is not less
+	 * than 'el'.  Return cur_ if all elements are less than el.
+	 * Precondition: list_ is sorted ascending (the class invariant).
+	 * Debug builds randomly cross-check against scanLoBound.
+	 */
+	size_t bsearchLoBound(const T& el) const {
+		size_t hi = cur_;
+		size_t lo = 0;
+		while(true) {
+			if(lo == hi) {
+#ifndef NDEBUG
+				if((rand() % 10) == 0) {
+					assert_eq(lo, scanLoBound(el));
+				}
+#endif
+				return lo;
+			}
+			size_t mid = lo + ((hi-lo)>>1);
+			assert_neq(mid, hi);
+			if(list_[mid] < el) {
+				if(lo == mid) {
+#ifndef NDEBUG
+					if((rand() % 10) == 0) {
+						assert_eq(hi, scanLoBound(el));
+					}
+#endif
+					return hi;
+				}
+				lo = mid;
+			} else {
+				hi = mid;
+			}
+		}
+	}
+
+	/**
+	 * Return true if sorted, assert otherwise.  Randomly sampled in
+	 * debug builds; always true in NDEBUG builds.
+	 */
+	bool sorted() const {
+		if(cur_ <= 1) return true;
+#ifndef NDEBUG
+		if((rand() % 20) == 0) {
+			for(size_t i = 0; i < cur_-1; i++) {
+				assert(list_[i] < list_[i+1]);
+			}
+		}
+#endif
+		return true;
+	}
+
+	/**
+	 * Insert value 'el' at offset 'idx'.  It's OK to insert at cur_,
+	 * which is equivalent to appending.
+	 */
+	void insert(const T& el, size_t idx) {
+		assert_leq(idx, cur_);
+		if(cur_ == sz_) {
+			expandCopy(sz_+1);
+			assert(sorted());
+		}
+		for(size_t i = cur_; i > idx; i--) {
+			list_[i] = list_[i-1];
+		}
+		list_[idx] = el;
+		cur_++;
+		assert(sorted());
+	}
+
+	/**
+	 * Erase element at offset idx; shift subsequent elements down.
+	 */
+	void erase(size_t idx) {
+		assert_lt(idx, cur_);
+		for(size_t i = idx; i < cur_-1; i++) {
+			list_[i] = list_[i+1];
+		}
+		cur_--;
+		assert(sorted());
+	}
+
+	/**
+	 * Expand the list_ buffer until it has at least 'thresh' elements.
+	 * Expansions are quadratic.  Old contents are copied via operator=.
+	 */
+	void expandCopy(size_t thresh) {
+		if(thresh <= sz_) return;
+		size_t newsz = (sz_ * 2)+1;
+		while(newsz < thresh) {
+			newsz *= 2;
+		}
+		T* tmp = alloc(newsz);
+		for(size_t i = 0; i < cur_; i++) {
+			tmp[i] = list_[i];
+		}
+		free();
+		list_ = tmp;
+		sz_ = newsz;
+	}
+
+	int cat_;    // memory category, for accounting purposes
+	T *list_;    // list pointer, returned from new[]
+	size_t sz_;  // capacity
+	size_t cur_; // occupancy (AKA size)
+};
+
+template <typename T, int S = 128>
+class ELSet {
+
+public:
+
+	/**
+	 * Size for an initial default of S elements; allocation is
+	 * deferred until first use (lazyInit).
+	 */
+	explicit ELSet(int cat = 0) :
+		cat_(cat), list_(NULL), sz_(S), cur_(0)
+	{
+		assert_geq(cat, 0);
+	}
+
+	/**
+	 * Initially size for the given number of elements; should be > 0.
+	 * Allocation is deferred until first use.
+	 */
+	explicit ELSet(size_t isz, int cat = 0) :
+		cat_(cat), list_(NULL), sz_(isz), cur_(0)
+	{
+		assert_gt(isz, 0);
+		assert_geq(cat, 0);
+	}
+
+	/**
+	 * Copy from another ELSet using operator=.  NOTE(review): the copy
+	 * gets category 0, and operator= asserts that categories match, so
+	 * copying a non-zero-category ELSet trips that assert in debug
+	 * builds — confirm this is intended at call sites.
+	 */
+	ELSet(const ELSet<T, S>& o) :
+		cat_(0), list_(NULL), sz_(0), cur_(0)
+	{
+		*this = o;
+	}
+
+	/**
+	 * Copy from another ELSet using operator=, tagging this copy with
+	 * the given memory category.
+	 */
+	explicit ELSet(const ELSet<T, S>& o, int cat) :
+		cat_(cat), list_(NULL), sz_(0), cur_(0)
+	{
+		*this = o;
+		assert_geq(cat, 0);
+	}
+
+	/**
+	 * Destructor; releases the backing array.
+	 */
+	~ELSet() { free(); }
+
+	/**
+	 * Make this object into a copy of o by allocating enough memory to
+	 * fit the number of elements in o (note: the number of elements
+	 * may be substantially less than the memory allocated in o) and
+	 * using operator= to copy them over.
+	 */
+	ELSet<T, S>& operator=(const ELSet<T, S>& o) {
+		assert_eq(cat_, o.cat());
+		if(list_ == NULL) {
+			lazyInit();
+		}
+		if(o.cur_ == 0) {
+			cur_ = 0;
+			return *this;
+		}
+		if(sz_ < o.cur_) expandNoCopy(o.cur_ + 1);
+		assert_geq(sz_, o.cur_);
+		cur_ = o.cur_;
+		for(size_t i = 0; i < cur_; i++) {
+			// Note: using operator=, not xfer
+			assert_eq(list_[i].cat(), o.list_[i].cat());
+			list_[i] = o.list_[i];
+		}
+		return *this;
+	}
+	
+	/**
+	 * Transfer the guts of another ELSet into this one without using
+	 * operator=, etc.  We have to set o's list_ field to NULL to
+	 * prevent o's destructor from deleting list_ out from under us.
+	 */
+	void xfer(ELSet<T, S>& o) {
+		assert_eq(cat_, o.cat());
+		list_ = o.list_; // list_ is an array of ESet<T>s
+		sz_   = o.sz_;
+		cur_  = o.cur_;
+		o.list_ = NULL;
+		o.sz_ = o.cur_ = 0;
+	}
+
+	/**
+	 * Return number of elements.
+	 */
+	inline size_t size() const { return cur_; }
+
+	/**
+	 * Return true iff there are no elements.
+	 */
+	inline bool empty() const { return cur_ == 0; }
+
+	/**
+	 * Return true iff list hasn't been initialized yet.
+	 */
+	inline bool null() const { return list_ == NULL; }
+
+	/**
+	 * Add an element to the back.  No initialization is done.
+	 */
+	void expand() {
+		if(list_ == NULL) lazyInit();
+		if(cur_ == sz_) expandCopy(sz_+1);
+		cur_++;
+	}
+
+	/**
+	 * If size is less than requested size, resize up to at least sz
+	 * and set cur_ to requested sz.
+	 */
+	void resize(size_t sz) {
+		if(sz > 0 && list_ == NULL) lazyInit();
+		if(sz <= cur_) {
+			cur_ = sz;
+			return;
+		}
+		if(sz_ < sz) {
+			expandCopy(sz);
+		}
+		cur_ = sz;
+	}
+
+	/**
+	 * Make the stack empty.
+	 */
+	void clear() {
+		cur_ = 0; // re-use stack memory
+		// Don't clear heap; re-use it
+	}
+
+	/**
+	 * Get the element on the top of the stack.
+	 */
+	inline ESet<T>& back() {
+		assert_gt(cur_, 0);
+		return list_[cur_-1];
+	}
+
+	/**
+	 * Get the element on the top of the stack, const version.
+	 */
+	inline const ESet<T>& back() const {
+		assert_gt(cur_, 0);
+		return list_[cur_-1];
+	}
+
+	/**
+	 * Get the frontmost element (bottom of stack).
+	 */
+	inline ESet<T>& front() {
+		assert_gt(cur_, 0);
+		return list_[0];
+	}
+
+	/**
+	 * Get the element on the bottom of the stack, const version.
+	 * (Fixed: this previously returned front() — i.e. called itself —
+	 * causing unbounded recursion on any const access.)
+	 */
+	inline const ESet<T>& front() const {
+		assert_gt(cur_, 0);
+		return list_[0];
+	}
+
+	/**
+	 * Return a reference to the ith element.
+	 */
+	inline ESet<T>& operator[](size_t i) {
+		assert_lt(i, cur_);
+		return list_[i];
+	}
+
+	/**
+	 * Return a reference to the ith element.
+	 */
+	inline const ESet<T>& operator[](size_t i) const {
+		assert_lt(i, cur_);
+		return list_[i];
+	}
+
+	/**
+	 * Return a reference to the ith element.
+	 */
+	inline ESet<T>& get(size_t i) {
+		return operator[](i);
+	}
+	
+	/**
+	 * Return a reference to the ith element.
+	 */
+	inline const ESet<T>& get(size_t i) const {
+		return operator[](i);
+	}
+	
+	/**
+	 * Return a reference to the ith element.  This version is not
+	 * inlined, which guarantees we can use it from the debugger.
+	 */
+	ESet<T>& getSlow(size_t i) {
+		return operator[](i);
+	}
+	
+	/**
+	 * Return a reference to the ith element.  This version is not
+	 * inlined, which guarantees we can use it from the debugger.
+	 */
+	const ESet<T>& getSlow(size_t i) const {
+		return operator[](i);
+	}
+	
+	/**
+	 * Return a pointer to the beginning of the buffer.
+	 */
+	ESet<T> *ptr() { return list_; }
+
+	/**
+	 * Return a const pointer to the beginning of the buffer.
+	 */
+	const ESet<T> *ptr() const { return list_; }
+
+	/**
+	 * Set the memory category for this object and all children.
+	 * Children must not have been initialized yet.
+	 */
+	void setCat(int cat) {
+		assert_gt(cat, 0);
+		cat_ = cat;
+		if(cat_ != 0) {
+			for(size_t i = 0; i < sz_; i++) {
+				assert(list_[i].null());
+				list_[i].setCat(cat_);
+			}
+		}
+	}
+
+	/**
+	 * Return memory category.
+	 */
+	int cat() const { return cat_; }
+
+protected:
+
+	/**
+	 * Initialize memory for ELSet.
+	 */
+	void lazyInit() {
+		assert(list_ == NULL);
+		list_ = alloc(sz_);
+	}
+
+	/**
+	 * Allocate an ESet<T> array of length sz.  Also, tally into the
+	 * global memory tally and propagate the category to children.
+	 */
+	ESet<T> *alloc(size_t sz) {
+		assert_gt(sz, 0);
+		ESet<T> *tmp = new ESet<T>[sz];
+		gMemTally.add(cat_, sz);
+		if(cat_ != 0) {
+			for(size_t i = 0; i < sz; i++) {
+				assert(tmp[i].ptr() == NULL);
+				tmp[i].setCat(cat_);
+			}
+		}
+		return tmp;
+	}
+
+	/**
+	 * Delete the backing array (if allocated) and subtract it from the
+	 * global memory tally.
+	 */
+	void free() {
+		if(list_ != NULL) {
+			delete[] list_;
+			gMemTally.del(cat_, sz_);
+			list_ = NULL;
+		}
+	}
+
+	/**
+	 * Expand the list_ buffer until it has at least 'thresh' elements.
+	 * Expansions are quadratic.  Copy old contents into new buffer
+	 * using xfer (transfer of guts, not element copies).
+	 */
+	void expandCopy(size_t thresh) {
+		assert(list_ != NULL);
+		if(thresh <= sz_) return;
+		size_t newsz = (sz_ * 2)+1;
+		while(newsz < thresh) newsz *= 2;
+		ESet<T>* tmp = alloc(newsz);
+		if(list_ != NULL) {
+			for(size_t i = 0; i < cur_; i++) {
+				assert_eq(cat_, tmp[i].cat());
+				tmp[i].xfer(list_[i]);
+				assert_eq(cat_, tmp[i].cat());
+			}
+			free();
+		}
+		list_ = tmp;
+		sz_ = newsz;
+	}
+
+	/**
+	 * Expand the list_ buffer until it has at least 'thresh' elements.
+	 * Expansions are quadratic.  Don't copy old contents over.
+	 */
+	void expandNoCopy(size_t thresh) {
+		assert(list_ != NULL);
+		if(thresh <= sz_) return;
+		free();
+		size_t newsz = (sz_ * 2)+1;
+		while(newsz < thresh) newsz *= 2;
+		ESet<T>* tmp = alloc(newsz);
+		list_ = tmp;
+		sz_ = newsz;
+		assert_gt(sz_, 0);
+	}
+
+	int cat_;    // memory category, for accounting purposes
+	ESet<T> *list_; // list pointer, returned from new[]
+	size_t sz_;  // capacity
+	size_t cur_; // occupancy (AKA size)
+
+};
+
+/**
+ * Expandable map using a heap-allocated sorted array.
+ *
+ * Note that the copy constructor and operator= routines perform
+ * shallow copies (w/ memcpy).
+ */
+template <typename K, typename V>
+class EMap {
+
+public:
+
+	/**
+	 * Allocate initial default of 128 elements.
+	 */
+	EMap(int cat = 0) :
+		cat_(cat),
+		list_(NULL),
+		sz_(128),
+		cur_(0)
+	{
+		list_ = alloc(sz_);
+	}
+
+	/**
+	 * Initially allocate given number of elements; should be > 0.
+	 */
+	EMap(size_t isz, int cat = 0) :
+		cat_(cat),
+		list_(NULL),
+		sz_(isz),
+		cur_(0)
+	{
+		assert_gt(isz, 0);
+		list_ = alloc(sz_);
+	}
+
+	/**
+	 * Copy from another EMap.  (Fixed: cat_ was previously left
+	 * uninitialized here, so alloc() in operator= tallied against a
+	 * garbage memory category.)
+	 */
+	EMap(const EMap<K, V>& o) :
+		cat_(o.cat_), list_(NULL), sz_(0), cur_(0)
+	{
+		*this = o;
+	}
+
+	/**
+	 * Destructor; releases the backing array.
+	 */
+	~EMap() { free(); }
+
+	/**
+	 * Copy contents of given EMap into this EMap.  Copies are shallow
+	 * (memcpy), per the class contract, so K and V should be
+	 * trivially copyable.  (Fixed: free() now runs before sz_ is
+	 * overwritten, so the memory tally is decremented by the OLD
+	 * capacity; also guards against self-assignment, which previously
+	 * copied out of a just-freed buffer.)
+	 */
+	EMap& operator=(const EMap<K, V>& o) {
+		if(this == &o) return *this;
+		free();
+		sz_ = o.sz_;
+		cur_ = o.cur_;
+		list_ = alloc(sz_);
+		memcpy(list_, o.list_, cur_ * sizeof(std::pair<K, V>));
+		return *this;
+	}
+
+	/**
+	 * Return number of elements.
+	 */
+	size_t size() const { return cur_; }
+	
+	/**
+	 * Return the total size in bytes occupied by this map.
+	 */
+	size_t totalSizeBytes() const {
+		return 	sizeof(int) +
+		        2 * sizeof(size_t) +
+				cur_ * sizeof(std::pair<K, V>);
+	}
+
+	/**
+	 * Return the total capacity in bytes occupied by this map.
+	 */
+	size_t totalCapacityBytes() const {
+		return 	sizeof(int) +
+		        2 * sizeof(size_t) +
+				sz_ * sizeof(std::pair<K, V>);
+	}
+
+	/**
+	 * Return true iff there are no elements.
+	 */
+	bool empty() const { return cur_ == 0; }
+
+	/**
+	 * Insert a new element into the set in sorted order.  Returns
+	 * false (and inserts nothing) if an equal pair is already present.
+	 * (Fixed: when 'el' sorted after every existing element, the old
+	 * code read list_[cur_] — one past the end — before inserting.)
+	 */
+	bool insert(const std::pair<K, V>& el) {
+		size_t i = 0;
+		if(cur_ == 0) {
+			insert(el, 0);
+			return true;
+		}
+		if(cur_ < 16) {
+			// Linear scan
+			i = scanLoBound(el.first);
+		} else {
+			// Binary search
+			i = bsearchLoBound(el.first);
+		}
+		if(i < cur_ && list_[i] == el) return false; // already there
+		insert(el, i);
+		return true; // not already there
+	}
+
+	/**
+	 * Return true iff this map contains key 'el'.
+	 */
+	bool contains(const K& el) const {
+		if(cur_ == 0) return false;
+		else if(cur_ == 1) return el == list_[0].first;
+		size_t i;
+		if(cur_ < 16) {
+			// Linear scan
+			i = scanLoBound(el);
+		} else {
+			// Binary search
+			i = bsearchLoBound(el);
+		}
+		return i != cur_ && list_[i].first == el;
+	}
+
+	/**
+	 * Return true iff this map contains key 'el'; on return, 'i' holds
+	 * the offset where the key is (or would be inserted).
+	 */
+	bool containsEx(const K& el, size_t& i) const {
+		if(cur_ == 0) return false;
+		else if(cur_ == 1) {
+			i = 0;
+			return el == list_[0].first;
+		}
+		if(cur_ < 16) {
+			// Linear scan
+			i = scanLoBound(el);
+		} else {
+			// Binary search
+			i = bsearchLoBound(el);
+		}
+		return i != cur_ && list_[i].first == el;
+	}
+
+	/**
+	 * Remove the element with key 'el'.  Precondition: the key is
+	 * present (asserted in debug builds).
+	 */
+	void remove(const K& el) {
+		size_t i;
+		if(cur_ < 16) {
+			// Linear scan
+			i = scanLoBound(el);
+		} else {
+			// Binary search
+			i = bsearchLoBound(el);
+		}
+		assert(i != cur_ && list_[i].first == el);
+		erase(i);
+	}
+
+	/**
+	 * Ensure the map can hold at least 'sz' elements without
+	 * reallocating.  NOTE(review): despite the old comment, this has
+	 * never changed cur_ — it behaves as a capacity reserve, and
+	 * setting cur_ would expose default-constructed pairs and break
+	 * the sorted invariant, so the existing behavior is kept.
+	 */
+	void resize(size_t sz) {
+		if(sz <= cur_) return;
+		if(sz_ < sz) expandCopy(sz);
+	}
+	
+	/**
+	 * Get the ith key, value pair in the map.
+	 */
+	const std::pair<K, V>& get(size_t i) const {
+		assert_lt(i, cur_);
+		return list_[i];
+	}
+	
+	/**
+	 * Get the ith key, value pair in the map.
+	 */
+	const std::pair<K, V>& operator[](size_t i) const {
+		return get(i);
+	}
+
+	/**
+	 * Clear map without deallocating (or setting) anything.
+	 */
+	void clear() { cur_ = 0; }
+
+private:
+
+	/**
+	 * Allocate a pair array of length sz.  Also, tally into the
+	 * global memory tally.
+	 */
+	std::pair<K, V> *alloc(size_t sz) {
+		assert_gt(sz, 0);
+		std::pair<K, V> *tmp = new std::pair<K, V>[sz];
+		gMemTally.add(cat_, sz);
+		return tmp;
+	}
+
+	/**
+	 * Delete the backing array (if allocated) and subtract it from
+	 * the global memory tally.
+	 */
+	void free() {
+		if(list_ != NULL) {
+			delete[] list_;
+			gMemTally.del(cat_, sz_);
+			list_ = NULL;
+		}
+	}
+
+	/**
+	 * Simple linear scan that returns the index of the first element
+	 * of list_ whose key is not less than el, or cur_ if all keys are
+	 * less than el.
+	 */
+	size_t scanLoBound(const K& el) const {
+		for(size_t i = 0; i < cur_; i++) {
+			if(!(list_[i].first < el)) {
+				// Shouldn't be equal
+				return i;
+			}
+		}
+		return cur_;
+	}
+
+	/**
+	 * Perform a binary search for the first element that is not less
+	 * than 'el'.  Return cur_ if all elements are less than el.
+	 */
+	size_t bsearchLoBound(const K& el) const {
+		size_t hi = cur_;
+		size_t lo = 0;
+		while(true) {
+			if(lo == hi) {
+#ifndef NDEBUG
+				if((rand() % 10) == 0) {
+					assert_eq(lo, scanLoBound(el));
+				}
+#endif
+				return lo;
+			}
+			size_t mid = lo + ((hi-lo)>>1);
+			assert_neq(mid, hi);
+			if(list_[mid].first < el) {
+				if(lo == mid) {
+#ifndef NDEBUG
+					if((rand() % 10) == 0) {
+						assert_eq(hi, scanLoBound(el));
+					}
+#endif
+					return hi;
+				}
+				lo = mid;
+			} else {
+				hi = mid;
+			}
+		}
+	}
+
+	/**
+	 * Return true if sorted, assert otherwise (debug builds only).
+	 */
+	bool sorted() const {
+		if(cur_ <= 1) return true;
+#ifndef NDEBUG
+		for(size_t i = 0; i < cur_-1; i++) {
+			assert(!(list_[i] == list_[i+1]));
+			assert(list_[i] < list_[i+1]);
+		}
+#endif
+		return true;
+	}
+
+	/**
+	 * Insert value 'el' at offset 'idx'.  It's OK to insert at cur_,
+	 * which is equivalent to appending.
+	 */
+	void insert(const std::pair<K, V>& el, size_t idx) {
+		assert_leq(idx, cur_);
+		if(cur_ == sz_) {
+			expandCopy(sz_+1);
+		}
+		for(size_t i = cur_; i > idx; i--) {
+			list_[i] = list_[i-1];
+		}
+		list_[idx] = el;
+		assert(idx == cur_ || list_[idx] < list_[idx+1]);
+		cur_++;
+		assert(sorted());
+	}
+
+	/**
+	 * Erase element at offset idx.
+	 */
+	void erase(size_t idx) {
+		assert_lt(idx, cur_);
+		for(size_t i = idx; i < cur_-1; i++) {
+			list_[i] = list_[i+1];
+		}
+		cur_--;
+		assert(sorted());
+	}
+
+	/**
+	 * Expand the list_ buffer until it has at least 'thresh' elements.
+	 * Expansions are quadratic.
+	 */
+	void expandCopy(size_t thresh) {
+		if(thresh <= sz_) return;
+		size_t newsz = sz_ * 2;
+		while(newsz < thresh) newsz *= 2;
+		std::pair<K, V>* tmp = alloc(newsz);
+		for(size_t i = 0; i < cur_; i++) {
+			tmp[i] = list_[i];
+		}
+		free();
+		list_ = tmp;
+		sz_ = newsz;
+	}
+
+	int cat_;    // memory category, for accounting purposes
+	std::pair<K, V> *list_; // list pointer, returned from new[]
+	size_t sz_;  // capacity
+	size_t cur_; // occupancy (AKA size)
+};
+
+/**
+ * A class that allows callers to create objects that are referred to by ID.
+ * Objects should not be referred to via pointers or references, since they
+ * are stored in an expandable buffer that might be resized and thereby moved
+ * to another address.
+ */
+template <typename T, int S = 128>
+class EFactory {
+
+public:
+
+	explicit EFactory(size_t isz, int cat = 0) : l_(isz, cat) { }
+
+	explicit EFactory(int cat = 0) : l_(cat) { }
+
+	/**
+	 * Remove all items from the list.
+	 */
+	void clear() { l_.clear(); }
+
+	/**
+	 * Allocate a fresh slot at the end of the list and return its ID
+	 * (its offset in the list).
+	 */
+	size_t alloc() {
+		l_.expand();
+		return l_.size() - 1;
+	}
+
+	/**
+	 * Return how many items are currently in the list.
+	 */
+	size_t size() const { return l_.size(); }
+
+	/**
+	 * Return the total size in bytes occupied by this factory.
+	 */
+	size_t totalSizeBytes() const { return l_.totalSizeBytes(); }
+
+	/**
+	 * Return the total capacity in bytes occupied by this factory.
+	 */
+	size_t totalCapacityBytes() const { return l_.totalCapacityBytes(); }
+
+	/**
+	 * Resize the underlying list.
+	 */
+	void resize(size_t sz) { l_.resize(sz); }
+
+	/**
+	 * Return true iff the list holds no items.
+	 */
+	bool empty() const { return l_.empty(); }
+
+	/**
+	 * Discard the topmost (most recently allocated) item.
+	 */
+	void pop() { l_.resize(l_.size() - 1); }
+
+	/**
+	 * Return mutable item with ID 'off'.
+	 */
+	T& operator[](size_t off) { return l_[off]; }
+
+	/**
+	 * Return immutable item with ID 'off'.
+	 */
+	const T& operator[](size_t off) const { return l_[off]; }
+
+protected:
+
+	EList<T, S> l_;
+};
+
+/**
+ * An expandable bit vector based on EList
+ */
+template <int S = 128>
+class EBitList {
+
+public:
+
+	explicit EBitList(size_t isz, int cat = 0) : l_(isz, cat) { reset(); }
+
+	explicit EBitList(int cat = 0) : l_(cat) { reset(); }
+
+	/**
+	 * Reset to empty state (synonym for reset()).
+	 */
+	void clear() { reset(); }
+
+	/**
+	 * Drop all bytes and mark "no bit set yet" using the sentinel
+	 * value numeric_limits<size_t>::max() in max_.
+	 */
+	void reset() {
+		l_.clear();
+		max_ = std::numeric_limits<size_t>::max();
+	}
+
+	/**
+	 * Set the bit at position 'off', growing the byte array as
+	 * needed and tracking the largest position set so far.
+	 */
+	void set(size_t off) {
+		resize(off);
+		l_[off >> 3] |= (1 << (off & 7));
+		if(max_ == std::numeric_limits<size_t>::max() || off > max_) {
+			max_ = off;
+		}
+	}
+
+	/**
+	 * Return true iff the bit at 'off' is set; positions beyond the
+	 * end of the byte array are implicitly 0.
+	 */
+	bool test(size_t off) const {
+		const size_t byteIdx = off >> 3;
+		if(byteIdx >= l_.size()) return false;
+		return ((l_[byteIdx] >> (off & 7)) & 1) != 0;
+	}
+
+	/**
+	 * Return size of the underlying byte array.
+	 */
+	size_t size() const { return l_.size(); }
+
+	/**
+	 * Grow the byte array (zero-filling the new bytes) so that bit
+	 * 'off' falls within it.
+	 */
+	void resize(size_t off) {
+		const size_t byteNeeded = off >> 3;
+		if(byteNeeded < l_.size()) return;
+		const size_t oldsz = l_.size();
+		l_.resize(byteNeeded + 1);
+		for(size_t j = oldsz; j < l_.size(); j++) {
+			l_[j] = 0;
+		}
+	}
+
+	/**
+	 * Return the largest bit position set so far (the sentinel
+	 * numeric_limits<size_t>::max() when no bit has been set).
+	 */
+	size_t max() const { return max_; }
+
+protected:
+
+	EList<uint8_t, S> l_;
+	size_t max_;
+};
+
+/**
+ * Implements a min-heap.
+ */
+template <typename T, int S = 128>
+class EHeap {
+public:
+
+	/**
+	 * Add the element at the next free leaf position, then percolate
+	 * it up until the min-heap property is restored.
+	 */
+	void insert(T o) {
+		size_t node = l_.size();
+		l_.push_back(o);
+		while(node > 0) {
+			size_t par = (node - 1) >> 1;
+			if(!(l_[node] < l_[par])) break;
+			T tmp(l_[node]);
+			l_[node] = l_[par];
+			l_[par] = tmp;
+			node = par;
+		}
+		assert(repOk());
+	}
+	
+	/**
+	 * Return (a copy of) the minimum element without removing it.
+	 */
+	T top() {
+		assert_gt(l_.size(), 0);
+		return l_[0];
+	}
+	
+	/**
+	 * Remove and return the minimum element: move the last leaf to
+	 * the root, then sift it down until the heap property holds.
+	 */
+	T pop() {
+		assert_gt(l_.size(), 0);
+		T ret = l_[0];
+		l_[0] = l_[l_.size()-1];
+		l_.resize(l_.size()-1);
+		size_t node = 0;
+		while(true) {
+			size_t lc = (node << 1) + 1; // left child
+			size_t rc = lc + 1;          // right child
+			size_t dst = node;           // child to swap with, if any
+			if(rc < l_.size()) {
+				// Two children: prefer the left child when it beats
+				// the parent and is <= the right child; otherwise the
+				// right child when it beats the parent.
+				if(l_[lc] < l_[node] && l_[lc] <= l_[rc]) {
+					dst = lc;
+				} else if(l_[rc] < l_[node]) {
+					dst = rc;
+				}
+			} else if(lc < l_.size() && l_[lc] < l_[node]) {
+				dst = lc;
+			}
+			if(dst == node) break;
+			T tmp(l_[dst]);
+			l_[dst] = l_[node];
+			l_[node] = tmp;
+			node = dst;
+		}
+		assert(repOk());
+		return ret;
+	}
+	
+	/**
+	 * Return number of elements in the heap.
+	 */
+	size_t size() const { return l_.size(); }
+
+	/**
+	 * Return the total size in bytes occupied by this heap.
+	 */
+	size_t totalSizeBytes() const { return l_.totalSizeBytes(); }
+
+	/**
+	 * Return the total capacity in bytes occupied by this heap.
+	 */
+	size_t totalCapacityBytes() const { return l_.totalCapacityBytes(); }
+	
+	/**
+	 * Return true when heap is empty.
+	 */
+	bool empty() const { return l_.empty(); }
+	
+	/**
+	 * Return element at offset i.
+	 */
+	const T& operator[](size_t i) const { return l_[i]; }
+	
+#ifndef NDEBUG
+	/**
+	 * Check that the heap property holds for the whole tree.
+	 */
+	bool repOk() const {
+		if(empty()) return true;
+		return repOkNode(0);
+	}
+
+	/**
+	 * Check that the heap property holds at and below node 'cur'.
+	 */
+	bool repOkNode(size_t cur) const {
+		size_t lc = ((cur+1) << 1) - 1;
+		size_t rc = lc + 1;
+		if(lc < l_.size()) {
+			assert_leq(l_[cur], l_[lc]);
+		}
+		if(rc < l_.size()) {
+			assert_leq(l_[cur], l_[rc]);
+		}
+		if(rc < l_.size()) {
+			return repOkNode(lc) && repOkNode(rc);
+		} else if(lc < l_.size()) {
+			return repOkNode(lc);
+		}
+		return true;
+	}
+#endif
+	
+	/**
+	 * Clear the heap so that it's empty.
+	 */
+	void clear() { l_.clear(); }
+
+protected:
+
+	EList<T, S> l_;
+};
+
+/**
+ * Dispenses pages of memory for all the lists in the cache, including
+ * the sequence-to-range map, the range list, the edits list, and the
+ * offsets list.  All lists contend for the same pool of memory.
+ */
+class Pool {
+public:
+	/**
+	 * Pre-allocate all pages up front: ceil(bytes/pagesz) pages of
+	 * pagesz bytes each, tallied under category 'cat'.
+	 */
+	Pool(
+		uint64_t bytes,
+		uint32_t pagesz,
+		int cat = 0) :
+		cat_(cat),
+		cur_(0),
+		bytes_(bytes),
+		pagesz_(pagesz),
+		pages_(cat)
+	{
+		for(size_t i = 0; i < ((bytes+pagesz-1)/pagesz); i++) {
+			pages_.push_back(new uint8_t[pagesz]);
+			gMemTally.add(cat, pagesz);
+			assert(pages_.back() != NULL);
+		}
+		assert(repOk());
+	}
+	
+	/**
+	 * Free each page.
+	 */
+	~Pool() {
+		for(size_t i = 0; i < pages_.size(); i++) {
+			assert(pages_[i] != NULL);
+			delete[] pages_[i];
+			gMemTally.del(cat_, pagesz_);
+		}
+	}
+
+	/**
+	 * Allocate one page, or return NULL if no pages are left.
+	 */
+	uint8_t * alloc() {
+		assert(repOk());
+		if(cur_ == pages_.size()) return NULL;
+		return pages_[cur_++];
+	}
+
+	/**
+	 * Return true iff all pages have been handed out.
+	 */
+	bool full() { return cur_ == pages_.size(); }
+
+	/**
+	 * Clear the pool so that no pages are considered allocated.
+	 */
+	void clear() {
+		cur_ = 0;
+		assert(repOk());
+	}
+
+	/**
+	 * Free an individual page.  Currently a no-op: the only freeing
+	 * scheme supported is clear(), which resets the whole pool.
+	 */
+	void free() { }
+
+#ifndef NDEBUG
+	/**
+	 * Check that pool is internally consistent.
+	 */
+	bool repOk() const {
+		assert_leq(cur_, pages_.size());
+		assert(!pages_.empty());
+		assert_gt(bytes_, 0);
+		assert_gt(pagesz_, 0);
+		return true;
+	}
+#endif
+
+private:
+	// Pool owns raw page buffers; copying one would make the
+	// destructor delete each page twice.  Declared but intentionally
+	// not defined (C++03-compatible) so accidental copies fail to
+	// compile or link.
+	Pool(const Pool&);
+	Pool& operator=(const Pool&);
+
+public:
+	int             cat_;    // memory category, for accounting purposes
+	uint32_t        cur_;    // next page to hand out
+	const uint64_t  bytes_;  // total bytes in the pool
+	const uint32_t  pagesz_; // size of a single page
+	EList<uint8_t*> pages_;  // the pages themselves
+};
+
+/**
+ * An expandable list backed by a pool.
+ */
+template<typename T, int S>
+class PList {
+
+#define PLIST_PER_PAGE (S / sizeof(T))
+
+public:
+	/**
+	 * Initialize the current-element pointer to 0; elements are
+	 * stored PLIST_PER_PAGE per Pool page.
+	 */
+	PList(int cat = 0) :
+		cur_(0),
+		curPage_(0),
+		pages_(cat) { }
+
+	/**
+	 * Add 1 object to the list.  Returns false if the pool has no
+	 * free page for it.
+	 */
+	bool add(Pool& p, const T& o) {
+		assert(repOk());
+		if(!ensure(p, 1)) return false;
+		if(cur_ == PLIST_PER_PAGE) {
+			cur_ = 0;
+			curPage_++;
+		}
+		assert_lt(curPage_, pages_.size());
+		assert(repOk());
+		assert_lt(cur_, PLIST_PER_PAGE);
+		pages_[curPage_][cur_++] = o;
+		return true;
+	}
+
+	/**
+	 * Append every object in 'os'.  Returns false if the pool ran
+	 * out of pages.
+	 */
+	bool add(Pool& p, const EList<T>& os) {
+		if(!ensure(p, os.size())) return false;
+		for(size_t i = 0; i < os.size(); i++) {
+			if(cur_ == PLIST_PER_PAGE) {
+				cur_ = 0;
+				curPage_++;
+			}
+			assert_lt(curPage_, pages_.size());
+			assert(repOk());
+			assert_lt(cur_, PLIST_PER_PAGE);
+			pages_[curPage_][cur_++] = os[i];
+		}
+		return true;
+	}
+
+	/**
+	 * Append the 'len'-element slice of 'src' starting at index 'i'.
+	 * Returns false if the pool ran out of pages.  (Fixed: 'i' and
+	 * 'len' were previously ignored — the loop shadowed 'i' and
+	 * copied all of 'src' — and pool space was reserved for
+	 * src.size() elements instead of 'len'.)
+	 */
+	bool copy(
+		Pool& p,
+		const PList<T, S>& src,
+		size_t i,
+		size_t len)
+	{
+		assert_leq(i + len, src.size());
+		if(!ensure(p, len)) return false;
+		for(size_t j = 0; j < len; j++) {
+			if(cur_ == PLIST_PER_PAGE) {
+				cur_ = 0;
+				curPage_++;
+			}
+			assert_lt(curPage_, pages_.size());
+			assert(repOk());
+			assert_lt(cur_, PLIST_PER_PAGE);
+			pages_[curPage_][cur_++] = src.getConst(i + j);
+		}
+		return true;
+	}
+
+	/**
+	 * Add 'num' objects, all equal to 'o', to the list.  Returns
+	 * false if the pool ran out of pages.
+	 */
+	bool addFill(Pool& p, size_t num, const T& o) {
+		if(!ensure(p, num)) return false;
+		for(size_t i = 0; i < num; i++) {
+			if(cur_ == PLIST_PER_PAGE) {
+				cur_ = 0;
+				curPage_++;
+			}
+			assert_lt(curPage_, pages_.size());
+			assert(repOk());
+			assert_lt(cur_, PLIST_PER_PAGE);
+			pages_[curPage_][cur_++] = o;
+		}
+		return true;
+	}
+
+	/**
+	 * Forget all pages associated with the list.  The pages
+	 * themselves belong to the Pool and are reclaimed when the Pool
+	 * is cleared.
+	 */
+	void clear() {
+		pages_.clear();
+		cur_ = curPage_ = 0;
+	}
+
+#ifndef NDEBUG
+	/**
+	 * Check that list is internally consistent.
+	 */
+	bool repOk() const {
+		assert(pages_.size() == 0 || curPage_ < pages_.size());
+		assert_leq(cur_, PLIST_PER_PAGE);
+		return true;
+	}
+#endif
+
+	/**
+	 * Return the number of elements in the list.
+	 */
+	size_t size() const {
+		return curPage_ * PLIST_PER_PAGE + cur_;
+	}
+	
+	/**
+	 * Return true iff the PList has no elements.
+	 */
+	bool empty() const {
+		return size() == 0;
+	}
+
+	/**
+	 * Get the ith element added to the list (const).
+	 */
+	inline const T& getConst(size_t i) const {
+		assert_lt(i, size());
+		size_t page = i / PLIST_PER_PAGE;
+		size_t elt = i % PLIST_PER_PAGE;
+		return pages_[page][elt];
+	}
+
+	/**
+	 * Get the ith element added to the list.
+	 */
+	inline T& get(size_t i) {
+		assert_lt(i, size());
+		size_t page = i / PLIST_PER_PAGE;
+		size_t elt = i % PLIST_PER_PAGE;
+		assert_lt(page, pages_.size());
+		assert(page < pages_.size()-1 || elt < cur_);
+		return pages_[page][elt];
+	}
+	
+	/**
+	 * Get the most recently added element.
+	 */
+	inline T& back() {
+		size_t page = (size()-1) / PLIST_PER_PAGE;
+		size_t elt = (size()-1) % PLIST_PER_PAGE;
+		assert_lt(page, pages_.size());
+		assert(page < pages_.size()-1 || elt < cur_);
+		return pages_[page][elt];
+	}
+	
+	/**
+	 * Get const version of the most recently added element.
+	 */
+	inline const T& back() const {
+		size_t page = (size()-1) / PLIST_PER_PAGE;
+		size_t elt = (size()-1) % PLIST_PER_PAGE;
+		assert_lt(page, pages_.size());
+		assert(page < pages_.size()-1 || elt < cur_);
+		return pages_[page][elt];
+	}
+
+	/**
+	 * Get the element most recently added to the list.
+	 */
+	T& last() {
+		assert(!pages_.empty());
+		assert_gt(PLIST_PER_PAGE, 0);
+		if(cur_ == 0) {
+			// Current page is untouched; the last element sits at the
+			// end of the previous page
+			assert_gt(pages_.size(), 1);
+			return pages_[pages_.size()-2][PLIST_PER_PAGE-1];
+		} else {
+			return pages_.back()[cur_-1];
+		}
+	}
+
+	/**
+	 * Return true iff 'num' additional objects will fit in the pages
+	 * allocated to the list.  If more pages are needed, they are
+	 * added if possible.
+	 */
+	bool ensure(Pool& p, size_t num) {
+		assert(repOk());
+		if(num == 0) return true;
+		// Allocation of the first page
+		if(pages_.size() == 0) {
+			if(expand(p) == NULL) {
+				return false;
+			}
+			assert_eq(1, pages_.size());
+		}
+		size_t cur = cur_;
+		size_t curPage = curPage_;
+		while(cur + num > PLIST_PER_PAGE) {
+			assert_lt(curPage, pages_.size());
+			if(curPage == pages_.size()-1 && expand(p) == NULL) {
+				return false;
+			}
+			num -= (PLIST_PER_PAGE - cur);
+			cur = 0;
+			curPage++;
+		}
+		return true;
+	}
+
+protected:
+
+	/**
+	 * Expand our page supply by 1; returns NULL when the pool is
+	 * exhausted.
+	 */
+	T* expand(Pool& p) {
+		T* newpage = (T*)p.alloc();
+		if(newpage == NULL) {
+			return NULL;
+		}
+		pages_.push_back(newpage);
+		return pages_.back();
+	}
+
+	size_t       cur_;     // current elt within page
+	size_t       curPage_; // current page
+	EList<T*>    pages_;   // the pages
+};
+
+/**
+ * A slice of an EList.
+ */
+template<typename T, int S>
+class EListSlice {
+
+public:
+	/**
+	 * Construct an empty (invalid) slice.
+	 */
+	EListSlice() :
+		i_(0),
+		len_(0),
+		list_()
+	{ }
+
+	/**
+	 * Construct a slice covering elements [i, i+len) of 'list'.
+	 */
+	EListSlice(
+		EList<T, S>& list,
+		size_t i,
+		size_t len) :
+		i_(i),
+		len_(len),
+		list_(&list)
+	{ }
+	
+	/**
+	 * Initialize from the [first, last) sub-range of another slice.
+	 * The sub-range must be non-empty and within the source slice.
+	 */
+	void init(const EListSlice<T, S>& sl, size_t first, size_t last) {
+		assert_gt(last, first);
+		assert_leq(last - first, sl.len_);
+		i_ = sl.i_ + first;
+		len_ = last - first;
+		list_ = sl.list_;
+	}
+	
+	/**
+	 * Reset state to be empty.
+	 */
+	void reset() {
+		i_ = len_ = 0;
+		list_ = NULL;
+	}
+	
+	/**
+	 * Get the ith element of the slice (const).
+	 */
+	inline const T& get(size_t i) const {
+		assert(valid());
+		assert_lt(i, len_);
+		return list_->get(i + i_);
+	}
+
+	/**
+	 * Get the ith element of the slice.
+	 */
+	inline T& get(size_t i) {
+		assert(valid());
+		assert_lt(i, len_);
+		return list_->get(i + i_);
+	}
+
+	/**
+	 * Return a reference to the ith element.
+	 */
+	inline T& operator[](size_t i) {
+		assert(valid());
+		assert_lt(i, len_);
+		return list_->get(i + i_);
+	}
+
+	/**
+	 * Return a const reference to the ith element.
+	 */
+	inline const T& operator[](size_t i) const {
+		assert(valid());
+		assert_lt(i, len_);
+		return list_->get(i + i_);
+	}
+
+	/**
+	 * Return true iff this slice is initialized.  Note: a zero-length
+	 * slice is indistinguishable from an uninitialized one.
+	 */
+	bool valid() const {
+		return len_ != 0;
+	}
+	
+	/**
+	 * Return number of elements in the slice.
+	 */
+	size_t size() const {
+		return len_;
+	}
+	
+#ifndef NDEBUG
+	/**
+	 * Ensure that the EListSlice is internally consistent and
+	 * consistent with the backing EList.
+	 */
+	bool repOk() const {
+		assert_leq(i_ + len_, list_->size());
+		return true;
+	}
+#endif
+	
+	/**
+	 * Return true iff this slice refers to the same slice of the same
+	 * list as the given slice.
+	 */
+	bool operator==(const EListSlice& sl) const {
+		return i_ == sl.i_ && len_ == sl.len_ && list_ == sl.list_;
+	}
+
+	/**
+	 * Return false iff this slice refers to the same slice of the same
+	 * list as the given slice.
+	 */
+	bool operator!=(const EListSlice& sl) const {
+		return !(*this == sl);
+	}
+	
+	/**
+	 * Set the length.  This could leave things inconsistent (e.g. could
+	 * include elements that fall off the end of list_).  (Fixed: the
+	 * old code narrowed nlen through uint32_t even though len_ is
+	 * size_t, silently truncating lengths >= 2^32.)
+	 */
+	void setLength(size_t nlen) {
+		len_ = nlen;
+	}
+	
+protected:
+	size_t i_;
+	size_t len_;
+	EList<T, S>* list_;
+};
+
+/**
+ * A slice of a PList.
+ */
+template<typename T, int S>
+class PListSlice {
+
+public:
+	/**
+	 * Construct an empty (invalid) slice.
+	 */
+	PListSlice() :
+		i_(0),
+		len_(0),
+		list_()
+	{ }
+
+	/**
+	 * Construct a slice covering elements [i, i+len) of 'list'.
+	 */
+	PListSlice(
+		PList<T, S>& list,
+		TIndexOffU i,
+		TIndexOffU len) :
+		i_(i),
+		len_(len),
+		list_(&list)
+	{ }
+	
+	/**
+	 * Initialize from the [first, last) sub-range of another
+	 * PListSlice.
+	 */
+	void init(const PListSlice<T, S>& sl, size_t first, size_t last) {
+		assert_gt(last, first);
+		assert_leq(last - first, sl.len_);
+		i_ = (uint32_t)(sl.i_ + first);
+		len_ = (uint32_t)(last - first);
+		list_ = sl.list_;
+	}
+	
+	/**
+	 * Return to the empty/invalid state.
+	 */
+	void reset() {
+		i_ = len_ = 0;
+		list_ = NULL;
+	}
+	
+	/**
+	 * Get the ith element of the slice (const).
+	 */
+	inline const T& get(size_t i) const {
+		assert(valid());
+		assert_lt(i, len_);
+		return list_->get(i_ + i);
+	}
+
+	/**
+	 * Get the ith element of the slice.
+	 */
+	inline T& get(size_t i) {
+		assert(valid());
+		assert_lt(i, len_);
+		return list_->get(i_ + i);
+	}
+
+	/**
+	 * Return a reference to the ith element.
+	 */
+	inline T& operator[](size_t i) {
+		assert(valid());
+		assert_lt(i, len_);
+		return list_->get(i_ + i);
+	}
+
+	/**
+	 * Return a const reference to the ith element.
+	 */
+	inline const T& operator[](size_t i) const {
+		assert(valid());
+		assert_lt(i, len_);
+		return list_->get(i_ + i);
+	}
+
+	/**
+	 * Return true iff this slice is initialized; a zero-length slice
+	 * counts as uninitialized.
+	 */
+	bool valid() const {
+		return len_ != 0;
+	}
+	
+	/**
+	 * Return number of elements in the slice.
+	 */
+	size_t size() const {
+		return len_;
+	}
+	
+#ifndef NDEBUG
+	/**
+	 * Ensure that the PListSlice is internally consistent and
+	 * consistent with the backing PList.
+	 */
+	bool repOk() const {
+		assert_leq(i_ + len_, list_->size());
+		return true;
+	}
+#endif
+	
+	/**
+	 * Return true iff this slice refers to the same range of the same
+	 * backing list as 'sl'.
+	 */
+	bool operator==(const PListSlice& sl) const {
+		return list_ == sl.list_ && i_ == sl.i_ && len_ == sl.len_;
+	}
+
+	/**
+	 * Negation of operator==.
+	 */
+	bool operator!=(const PListSlice& sl) const {
+		return !operator==(sl);
+	}
+	
+	/**
+	 * Set the length.  This could leave things inconsistent (e.g. could
+	 * include elements that fall off the end of list_).
+	 */
+	void setLength(size_t nlen) {
+		len_ = (uint32_t)nlen;
+	}
+	
+protected:
+	uint32_t i_;
+	uint32_t len_;
+	PList<T, S>* list_;
+};
+
/**
 * A Red-Black tree node.  Links to parent & left and right children.
 * Key and Payload are of types K and P.  Node total ordering is based
 * on K's total ordering.  K must implement <, == and > operators.
 */
template<typename K, typename P> // K=key, P=payload
class RedBlackNode {

	typedef RedBlackNode<K,P> TNode;

public:
	TNode *parent;  // parent
	TNode *left;    // left child
	TNode *right;   // right child
	bool   red;     // true -> red, false -> black
	K      key;     // key, for ordering
	P      payload; // payload (i.e. value)

	/**
	 * Return the parent of this node's parent, or NULL if none exists.
	 */
	RedBlackNode *grandparent() {
		return parent != NULL ? parent->parent : NULL;
	}

	/**
	 * Return the sibling of this node's parent, or NULL if none exists.
	 */
	RedBlackNode *uncle() {
		if(parent == NULL) return NULL; // no parent
		if(parent->parent == NULL) return NULL; // parent has no siblings
		return (parent->parent->left == parent) ? parent->parent->right : parent->parent->left;
	}
	
	/**
	 * Return true iff this node is its parent's left child.
	 */
	bool isLeftChild() const { assert(parent != NULL); return parent->left == this; }

	/**
	 * Return true iff this node is its parent's right child.
	 */
	bool isRightChild() const { assert(parent != NULL); return parent->right == this; }

	/**
	 * Replace whichever of this node's child pointers currently equals
	 * 'ol' with 'nw'.  'ol' must be one of this node's children.
	 */
	void replaceChild(RedBlackNode* ol, RedBlackNode* nw) {
		if(left == ol) {
			left = nw;
		} else {
			assert(right == ol);
			right = nw;
		}
	}

	/**
	 * Return the number of non-null children this node has.
	 */
	int numChildren() const {
		return ((left != NULL) ? 1 : 0) + ((right != NULL) ? 1 : 0);
	}
	
#ifndef NDEBUG
	/**
	 * Check that node is internally consistent (its parent, if any,
	 * must link back to it).
	 */ 
	bool repOk() const {
		if(parent != NULL) {
			assert(parent->left == this || parent->right == this);
		}
		return true;
	}
#endif

	/**
	 * True -> my key is less than the given node's key.
	 */
	bool operator<(const TNode& o) const { return key < o.key; }

	/**
	 * True -> my key is greater than the given node's key.
	 */
	bool operator>(const TNode& o) const { return key > o.key; }

	/**
	 * True -> my key equals the given node's key.
	 */
	bool operator==(const TNode& o) const { return key == o.key; }

	/**
	 * True -> my key is less than the given key.
	 */
	bool operator<(const K& okey) const { return key < okey; }

	/**
	 * True -> my key is greater than the given key.
	 */
	bool operator>(const K& okey) const { return key > okey; }

	/**
	 * True -> my key is equal to the given key.
	 */
	bool operator==(const K& okey) const { return key == okey; }
};
+
/**
 * A Red-Black tree that associates keys (of type K) with payloads (of
 * type P).  Red-Black trees are self-balancing and guarantee that the
 * tree is always "balanced" to a factor of 2, i.e., the longest
 * root-to-leaf path is never more than twice as long as the shortest
 * root-to-leaf path.  Nodes are carved out of fixed-size pages
 * allocated from a Pool; the tree never frees individual nodes.
 */
template<typename K, typename P> // K=key, P=payload
class RedBlack {

	typedef RedBlackNode<K,P> TNode;

public:
    /**
	 * Initialize the current-edit pointer to 0 and set the number of
	 * edits per memory page.
	 */
	RedBlack(uint32_t pageSz, int cat = 0) :
		perPage_(pageSz/sizeof(TNode)), pages_(cat) { clear(); }

	/**
	 * Given a key, find the red-black node corresponding to it,
	 * if one exists; otherwise return NULL.
	 */
	inline TNode* lookup(const K& key) const {
		TNode* cur = root_;
		while(cur != NULL) {
			if((*cur) == key) return cur;
			if((*cur) < key) {
				cur = cur->right;
			} else {
				cur = cur->left;
			}
		}
		return NULL;
	}

	/**
	 * Add a new key as a node in the red-black tree.  Returns the
	 * newly-added node, the pre-existing node with the same key, or
	 * NULL if memory was exhausted.
	 */
	TNode* add(
		Pool& p,      // in: pool for memory pages
		const K& key, // in: key to insert
		bool* added)  // out: if non-NULL, set to true iff key was newly inserted
	{
		// Look for key; if it's not there, get its parent
		TNode* cur = root_;
		assert(root_ == NULL || !root_->red);
		TNode* parent = NULL;
		bool leftChild = true;
		while(cur != NULL) {
			if((*cur) == key) {
				// Found it; break out of loop with cur != NULL
				break;
			}
			parent = cur;
			if((*cur) < key) {
				if((cur = cur->right) == NULL) {
					// Fell off the bottom of the tree as the right
					// child of parent 'lastCur'
					leftChild = false;
				}
			} else {
				if((cur = cur->left) == NULL) {
					// Fell off the bottom of the tree as the left
					// child of parent 'lastCur'
					leftChild = true;
				}
			}
		}
		if(cur != NULL) {
			// Found an existing entry with the same key
			if(added != NULL) *added = false;
		} else {
			assert(root_ == NULL || !root_->red);
			// Carve a fresh node out of the page supply
			if(!addNode(p, cur)) {
				// Exhausted memory
				return NULL;
			}
			assert(cur != NULL);
			assert(cur != root_);
			assert(cur != parent);
			// Initialize new node
			cur->key = key;
			cur->left = cur->right = NULL;
			cur->red = true; // red until proven black
			keys_++;
			if(added != NULL) *added = true;
			// Attach it at the point recorded by parent/leftChild and
			// restore the red-black invariants
			addNode(cur, parent, leftChild);
		}
		return cur; // return the added or found node
	}

#ifndef NDEBUG
	/**
	 * Check that list is internally consistent.
	 */
	bool repOk() const {
		assert(curPage_ == 0 || curPage_ < pages_.size());
		assert_leq(cur_, perPage_);
		assert(root_ == NULL || !root_->red);
		return true;
	}
#endif
	
	/**
	 * Clear all state.  NOTE(review): this only drops the page
	 * pointers; the page memory itself presumably remains owned by the
	 * Pool passed to addNode() -- confirm the Pool reclaims it.
	 */
	void clear() {
		cur_ = curPage_ = 0;
		root_ = NULL;
		keys_ = 0;
		intenseRepOkCnt_ = 0;
		pages_.clear();
	}
	
	/**
	 * Return number of keys added.
	 */
	size_t size() const {
		return keys_;
	}
	
	/**
	 * Return true iff there are no keys in the map.
	 */
	bool empty() const {
		return keys_ == 0;
	}

	/**
	 * Add another node and return a pointer to it in 'node'.  A new
	 * page is allocated if necessary.  If the allocation fails, false
	 * is returned.
	 */
	bool addNode(Pool& p, TNode*& node) {
		assert_leq(cur_, perPage_);
		assert(repOk());
		// NOTE(review): 'this != NULL' is always true in well-formed
		// programs (UB otherwise); modern compilers may elide it.
		assert(this != NULL);
		// Allocation of the first page
		if(pages_.size() == 0) {
			if(addPage(p) == NULL) {
				node = NULL;
				return false;
			}
			assert_eq(1, pages_.size());
		}
		// Advance to the next page when the current one is full
		if(cur_ == perPage_) {
			assert_lt(curPage_, pages_.size());
			if(curPage_ == pages_.size()-1 && addPage(p) == NULL) {
				return false;
			}
			cur_ = 0;
			curPage_++;
		}
		assert_lt(cur_, perPage_);
		assert_lt(curPage_, pages_.size());
		node = &pages_[curPage_][cur_];
		assert(node != NULL);
		cur_++;
		return true;
	}
    
    const TNode* root() const { return root_; }

protected:

#ifndef NDEBUG
	/**
	 * Check specifically that the red-black invariants are satisfied.
	 * Expensive, so it only actually runs once every 500 calls.
	 */
	bool redBlackRepOk(TNode* n) {
		if(n == NULL) return true;
		if(++intenseRepOkCnt_ < 500) return true;
		intenseRepOkCnt_ = 0;
		int minNodes = -1; // min # nodes along any n->leaf path
		int maxNodes = -1; // max # nodes along any n->leaf path
		// The number of black nodes along paths from n to leaf
		// (must be same for all paths)
		int blackConst = -1;
		size_t nodesTot = 0;
		redBlackRepOk(
			n,
			1, /* 1 node so far */
			n->red ? 0 : 1, /* black nodes so far */
			blackConst,
			minNodes,
			maxNodes,
			nodesTot);
		if(n == root_) {
			assert_eq(nodesTot, keys_);
		}
		assert_gt(minNodes, 0);
		assert_gt(maxNodes, 0);
		assert_leq(maxNodes, 2*minNodes);
		return true;
	}

	/**
	 * Recursive helper: check the red-black invariants along every
	 * path below 'n', accumulating path statistics.
	 */
	bool redBlackRepOk(
		TNode* n,
		int nodes,
		int black,
		int& blackConst,
		int& minNodes,
		int& maxNodes,
		size_t& nodesTot) const
	{
		assert_gt(black, 0);
		nodesTot++; // account for leaf node
		if(n->left == NULL) {
			if(blackConst == -1) blackConst = black;
			assert_eq(black, blackConst);
			if(nodes+1 > maxNodes) maxNodes = nodes+1;
			if(nodes+1 < minNodes || minNodes == -1) minNodes = nodes+1;
		} else {
			if(n->red) assert(!n->left->red); // Red can't be child of a red
			redBlackRepOk(
				n->left,                         // next node
				nodes + 1,                       // # nodes so far on path
				black + (n->left->red ? 0 : 1),  // # black so far on path
				blackConst,                      // invariant # black nodes on root->leaf path
				minNodes,                        // min root->leaf len so far
				maxNodes,                        // max root->leaf len so far
				nodesTot);                       // tot nodes so far
		}
		if(n->right == NULL) {
			if(blackConst == -1) blackConst = black;
			assert_eq(black, blackConst);
			if(nodes+1 > maxNodes) maxNodes = nodes+1;
			if(nodes+1 < minNodes || minNodes == -1) minNodes = nodes+1;
		} else {
			if(n->red) assert(!n->right->red); // Red can't be child of a red
			redBlackRepOk(
				n->right,                        // next node
				nodes + 1,                       // # nodes so far on path
				black + (n->right->red ? 0 : 1), // # black so far on path
				blackConst,                      // invariant # black nodes on root->leaf path
				minNodes,                        // min root->leaf len so far
				maxNodes,                        // max root->leaf len so far
				nodesTot);                       // tot nodes so far
		}
		return true;
	}
#endif

	/**
	 * Rotate to the left such that n is replaced by its right child
	 * w/r/t n's current parent.
	 */
	void leftRotate(TNode* n) {
		TNode* r = n->right;
		assert(n->repOk());
		assert(r->repOk());
		n->right = r->left;
		if(n->right != NULL) {
			n->right->parent = n;
			assert(n->right->repOk());
		}
		r->parent = n->parent;
		n->parent = r;
		r->left = n;
		if(r->parent != NULL) {
			r->parent->replaceChild(n, r);
		}
		if(root_ == n) root_ = r;
		assert(!root_->red);
		assert(n->repOk());
		assert(r->repOk());
	}

	/**
	 * Rotate to the right such that n is replaced by its left child
	 * w/r/t n's current parent.  n moves down to the right and loses
	 * its left child, while its former left child moves up and gains a
	 * right child.
	 */
	void rightRotate(TNode* n) {
		TNode* r = n->left;
		assert(n->repOk());
		assert(r->repOk());
		n->left = r->right;
		if(n->left != NULL) {
			n->left->parent = n;
			assert(n->left->repOk());
		}
		r->parent = n->parent;
		n->parent = r;
		r->right = n;
		if(r->parent != NULL) {
			r->parent->replaceChild(n, r);
		}
		if(root_ == n) root_ = r;
		assert(!root_->red);
		assert(n->repOk());
		assert(r->repOk());
	}

	/**
	 * Add a node to the red-black tree, maintaining the red-black
	 * invariants via the standard insertion fix-up: recolor when the
	 * uncle is red, rotate when it is black.
	 */
	void addNode(TNode* n, TNode* parent, bool leftChild) {
		assert(n != NULL);
		if(parent == NULL) {
			// Case 1: inserted at root
			root_ = n;
			root_->red = false; // root must be black
			n->parent = NULL;
			assert(redBlackRepOk(root_));
			assert(n->repOk());
		} else {
			assert(!root_->red);
			// Add new node to tree
			if(leftChild) {
				assert(parent->left == NULL);
				parent->left = n;
			} else {
				assert(parent->right == NULL);
				parent->right = n;
			}
			n->parent = parent;
			int thru = 0;
			while(true) {
				thru++;
				parent = n->parent;
				if(parent != NULL) assert(parent->repOk());
				if(parent == NULL && n->red) {
					n->red = false;
				}
				if(parent == NULL || !parent->red) {
					assert(redBlackRepOk(root_));
					break;
				}
				TNode* uncle = n->uncle();
				TNode* gparent = n->grandparent();
				assert(gparent != NULL); // if parent is red, grandparent must exist
				bool uncleRed = (uncle != NULL ? uncle->red : false);
				if(uncleRed) {
					// Parent is red, uncle is red; recursive case
					assert(uncle != NULL);
					parent->red = uncle->red = false;
					gparent->red = true;
					n = gparent;
					continue;
				} else {
					if(parent->isLeftChild()) {
						// Parent is red, uncle is black, parent is
						// left child
						if(!n->isLeftChild()) {
							n = parent;
							leftRotate(n);
						}
						n = n->parent;
						n->red = false;
						n->parent->red = true;
						rightRotate(n->parent);
						assert(redBlackRepOk(n));
						assert(redBlackRepOk(root_));
					} else {
						// Parent is red, uncle is black, parent is
						// right child.
						if(!n->isRightChild()) {
							n = parent;
							rightRotate(n);
						}
						n = n->parent;
						n->red = false;
						n->parent->red = true;
						leftRotate(n->parent);
						assert(redBlackRepOk(n));
						assert(redBlackRepOk(root_));
					}
				}
				break;
			}
		}
		assert(redBlackRepOk(root_));
	}

	/**
	 * Expand our page supply by 1
	 */
	TNode* addPage(Pool& p) {
		TNode *n = (TNode *)p.alloc();
		if(n != NULL) {
			pages_.push_back(n);
		}
		return n;
	}

	size_t        keys_;    // number of keys so far
	size_t        cur_;     // current elt within page
	size_t        curPage_; // current page
	const size_t  perPage_; // # edits fitting in a page
	TNode*        root_;    // root node
	EList<TNode*> pages_;   // the pages
	int intenseRepOkCnt_;   // counter for the computationally intensive repOk function
};
+
+/**
+ * For assembling doubly-linked lists of Edits.
+ */
+template <typename T>
+struct DoublyLinkedList {
+	
+	DoublyLinkedList() : payload(), prev(NULL), next(NULL) { }
+	
+	/**
+	 * Add all elements in the doubly-linked list to the provided EList.
+	 */
+	void toList(EList<T>& l) {
+		// Add this and all subsequent elements
+		DoublyLinkedList<T> *cur = this;
+		while(cur != NULL) {
+			l.push_back(cur->payload);
+			cur = cur->next;
+		}
+		// Add all previous elements
+		cur = prev;
+		while(cur != NULL) {
+			l.push_back(cur->payload);
+			cur = cur->prev;
+		}
+	}
+	
+	T                    payload;
+	DoublyLinkedList<T> *prev;
+	DoublyLinkedList<T> *next;
+};
+
/**
 * Simple ordered pair.  Equality and ordering are lexicographic on
 * (a, b); T1 and T2 must support ==, < and >.
 */
template <typename T1, typename T2>
struct Pair {
	T1 a;
	T2 b;

	Pair() : a(), b() { }
	
	// Use a member-initializer list rather than assignment in the
	// body: avoids default-construct-then-assign and works for member
	// types that are expensive to default-construct.
	Pair(
		const T1& a_,
		const T2& b_) : a(a_), b(b_) { }

	bool operator==(const Pair& o) const {
		return a == o.a && b == o.b;
	}
	
	/**
	 * Lexicographic comparison: a first, then b.
	 */
	bool operator<(const Pair& o) const {
		if(a < o.a) return true;
		if(a > o.a) return false;
		return b < o.b;
	}
};
+
/**
 * Simple ordered triple.  Equality and ordering are lexicographic on
 * (a, b, c); the element types must support ==, < and >.
 */
template <typename T1, typename T2, typename T3>
struct Triple {
	T1 a;
	T2 b;
	T3 c;

	Triple() : a(), b(), c() { }

	// Member-initializer list instead of default-construct-then-assign.
	Triple(
		const T1& a_,
		const T2& b_,
		const T3& c_) : a(a_), b(b_), c(c_) { }

	bool operator==(const Triple& o) const {
		return a == o.a && b == o.b && c == o.c;
	}
	
	/**
	 * Lexicographic comparison: a first, then b, then c.
	 */
	bool operator<(const Triple& o) const {
		if(a < o.a) return true;
		if(a > o.a) return false;
		if(b < o.b) return true;
		if(b > o.b) return false;
		return c < o.c;
	}
};
+
/**
 * Simple ordered 4-tuple.  Equality and ordering are lexicographic on
 * (a, b, c, d); the element types must support ==, < and >.
 */
template <typename T1, typename T2, typename T3, typename T4>
struct Quad {

	Quad() : a(), b(), c(), d() { }

	// Member-initializer list instead of default-construct-then-assign.
	Quad(
		const T1& a_,
		const T2& b_,
		const T3& c_,
		const T4& d_) : a(a_), b(b_), c(c_), d(d_) { }

	// NOTE(review): this overload (and init()) takes all four
	// arguments as T1.  If Quad is ever instantiated with
	// T1==T2==T3==T4 it collides with the constructor above and the
	// class fails to compile -- confirm no such instantiation exists.
	Quad(
		const T1& a_,
		const T1& b_,
		const T1& c_,
		const T1& d_)
	{
		init(a_, b_, c_, d_);
	}
	
	void init(
		const T1& a_,
		const T1& b_,
		const T1& c_,
		const T1& d_)
	{
		a = a_; b = b_; c = c_; d = d_;
	}

	bool operator==(const Quad& o) const {
		return a == o.a && b == o.b && c == o.c && d == o.d;
	}
	
	/**
	 * Lexicographic comparison: a, then b, then c, then d.
	 */
	bool operator<(const Quad& o) const {
		if(a < o.a) return true;
		if(a > o.a) return false;
		if(b < o.b) return true;
		if(b > o.b) return false;
		if(c < o.c) return true;
		if(c > o.c) return false;
		return d < o.d;
	}

	T1 a;
	T2 b;
	T3 c;
	T4 d;
};
+
/**
 * A node in a singly-linked free list of T; see LinkedEList below.
 */
template <typename T>
struct LinkedEListNode {
	
	LinkedEListNode() : payload(), next(NULL) { }
		
	T                  payload; // value stored at this node
	LinkedEListNode<T> *next;   // next node on the list, or NULL
};
+
/**
 * A free list of LinkedEListNode<T>s.  new_node() pops a node off the
 * list, heap-allocating one if the list is empty; delete_node() pushes
 * a node back onto the list.  Nodes are only actually freed by the
 * destructor, and only those currently on the list: nodes still
 * checked out when the LinkedEList dies are leaked unless returned
 * via delete_node() first.
 */
template <typename T>
struct LinkedEList {
	
	LinkedEList() : head(NULL) {
        ASSERT_ONLY(num_allocated = 0);
        ASSERT_ONLY(num_new_node = 0);
        ASSERT_ONLY(num_delete_node = 0);
    }
    
    ~LinkedEList() {
        // Free every node currently on the free list
        ASSERT_ONLY(size_t num_deallocated = 0);
        while(head != NULL) {
            LinkedEListNode<T>* next = head->next;
            delete head;
            ASSERT_ONLY(num_deallocated++);
            head = next;
        }
        // daehwan - for debugging purposes
        // assert_eq(num_allocated, num_deallocated);
    }
    
    /**
     * Return a node for the caller's use: the head of the free list
     * if one is available, otherwise a freshly allocated node.
     */
    LinkedEListNode<T>* new_node() {
        ASSERT_ONLY(num_new_node++);
        LinkedEListNode<T> *result = NULL;
        if(head == NULL) {
            head = new LinkedEListNode<T>();
            head-> next = NULL;
            ASSERT_ONLY(num_allocated++);
        }
        assert(head != NULL);
        result = head;
        head = head->next;
        assert(result != NULL);
        return result;
    }
    
    /**
     * Return a node to the free list by pushing it onto the head.
     * In debug mode, verify the node isn't already on the list.
     */
    void delete_node(LinkedEListNode<T> *node) {
        ASSERT_ONLY(num_delete_node++);
        assert(node != NULL);
        // check if this is already deleted.
#ifndef NDEBUG
        LinkedEListNode<T> *temp = head;
        while(temp != NULL) {
            assert(temp != node);
            temp = temp->next;
        }
#endif
        node->next = head;
        head = node;
    }
    
	LinkedEListNode<T> *head; // head of the free list, or NULL if empty
    
    ASSERT_ONLY(size_t num_allocated);   // debug: total nodes heap-allocated
    ASSERT_ONLY(size_t num_new_node);    // debug: calls to new_node()
    ASSERT_ONLY(size_t num_delete_node); // debug: calls to delete_node()
};
+
+
+#endif /* DS_H_ */
diff --git a/edit.cpp b/edit.cpp
new file mode 100644
index 0000000..2f79745
--- /dev/null
+++ b/edit.cpp
@@ -0,0 +1,486 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <iostream>
+#include "edit.h"
+
+using namespace std;
+
+/**
+ * Print a single edit to a std::ostream.  Format is
+ * (pos):(ref chr)>(read chr).  Where 'pos' is an offset from the 5'
+ * end of the read, and the ref and read chrs are expressed w/r/t the
+ * Watson strand.
+ */
+ostream& operator<< (ostream& os, const Edit& e) {
+    if(e.type != EDIT_TYPE_SPL) {
+        os << e.pos << ":" << (char)e.chr << ">" << (char)e.qchr;
+    } else {
+        os << e.pos << ":" << e.splLen;
+    }
+
+	return os;
+}
+
+/**
+ * Print a list of edits to a std::ostream, separated by commas.
+ */
+void Edit::print(ostream& os, const EList<Edit>& edits, char delim) {
+	for(size_t i = 0; i < edits.size(); i++) {
+		os << edits[i];
+		if(i < edits.size()-1) os << delim;
+	}
+}
+
/**
 * Flip all the edits.pos fields so that they're with respect to
 * the other end of the read (of length 'sz').  Only the 'en' edits
 * beginning at index 'ei' are affected.  If 'sort' is true, the
 * affected range is re-sorted afterward, since inversion can change
 * the relative order of edits sharing a position.
 */
void Edit::invertPoss(
	EList<Edit>& edits,
	size_t sz,
	size_t ei,
	size_t en,
	bool sort)
{
	// Reverse the order of the edits in the range [ei, ei+en)
	size_t ii = 0;
	for(size_t i = ei; i < ei + en/2; i++) {
		Edit tmp = edits[i];
		edits[i] = edits[ei + en - ii - 1];
		edits[ei + en - ii - 1] = tmp;
		ii++;
	}
	for(size_t i = ei; i < ei + en; i++) {
		assert(edits[i].pos < sz ||
			   (edits[i].isReadGap() && edits[i].pos == sz));
		// Adjust pos: read gaps and splices fall between read
		// characters, so they map to (sz - pos); edits on a character
		// map to (sz - pos - 1)
        if(edits[i].isReadGap() || edits[i].isSpliced()) {
            edits[i].pos = (uint32_t)(sz - edits[i].pos);
        } else {
            edits[i].pos = (uint32_t)(sz - edits[i].pos - 1);
        }
		// Adjust pos2: for read gaps, mirror pos2 about the encoding's
		// center value (UINT32_MAX >> 1)
		if(edits[i].isReadGap()) {
			int64_t pos2diff = (int64_t)(uint64_t)edits[i].pos2 - (int64_t)((uint64_t)std::numeric_limits<uint32_t>::max() >> 1);
			int64_t pos2new = (int64_t)(uint64_t)edits[i].pos2 - 2*pos2diff;
			assert(pos2diff == 0 || (uint32_t)pos2new != (std::numeric_limits<uint32_t>::max() >> 1));
			edits[i].pos2 = (uint32_t)pos2new;
		}
	}
	if(sort) {
		// Edits might not necessarily be in same order after inversion
		edits.sortPortion(ei, en);
#ifndef NDEBUG
		for(size_t i = ei + 1; i < ei + en; i++) {
			assert_geq(edits[i].pos, edits[i-1].pos);
		}
#endif
	}
}
+
/**
 * Print a 3-row text rendering of the alignment (read, match bars,
 * reference).  Convenience overload: delegates to the prefix-taking
 * version with an empty line prefix.  The alignment is treated as
 * forward-oriented with edits listed left to right.
 */
void Edit::printQAlign(
	std::ostream& os,
	const BTDnaString& read,
	const EList<Edit>& edits)
{
	printQAlign(os, "", read, edits);
}
+
/**
 * Like printQAlign above, but via the NoCheck variant, which skips
 * the debug-mode consistency asserts.  Delegates to the prefix-taking
 * overload with an empty line prefix.
 */
void Edit::printQAlignNoCheck(
	std::ostream& os,
	const BTDnaString& read,
	const EList<Edit>& edits)
{
	printQAlignNoCheck(os, "", read, edits);
}
+
/**
 * Print a 3-row text rendering of the alignment: the read sequence
 * (with '-' at read gaps), a row of '|' match bars, and the reference
 * sequence (with '-' at ref gaps).  Each row starts with 'prefix'.
 * The alignment is treated as forward-oriented with edits listed from
 * left- to right-hand side; edits must be sorted by pos.  Debug-mode
 * asserts verify each edit's qchr matches the read character.
 */
void Edit::printQAlign(
	std::ostream& os,
	const char *prefix,
	const BTDnaString& read,
	const EList<Edit>& edits)
{
	size_t eidx = 0;
	os << prefix;
	// Print read
	for(size_t i = 0; i < read.length(); i++) {
		bool del = false, mm = false;
		// Consume every edit at read position i
		while(eidx < edits.size() && edits[eidx].pos == i) {
			if(edits[eidx].isReadGap()) {
				os << '-';
			} else if(edits[eidx].isRefGap()) {
				del = true;
				assert_eq((int)edits[eidx].qchr, read.toChar(i));
				os << read.toChar(i);
			} else {
				mm = true;
				assert(edits[eidx].isMismatch());
				assert_eq((int)edits[eidx].qchr, read.toChar(i));
				os << (char)edits[eidx].qchr;
			}
			eidx++;
		}
		if(!del && !mm) os << read.toChar(i);
	}
	os << endl;
	os << prefix;
	eidx = 0;
	// Print match bars ('|' where read and ref agree, ' ' at any edit)
	for(size_t i = 0; i < read.length(); i++) {
		bool del = false, mm = false;
		while(eidx < edits.size() && edits[eidx].pos == i) {
			if(edits[eidx].isReadGap()) {
				os << ' ';
			} else if(edits[eidx].isRefGap()) {
				del = true;
				os << ' ';
			} else {
				mm = true;
				assert(edits[eidx].isMismatch());
				os << ' ';
			}
			eidx++;
		}
		if(!del && !mm) os << '|';
	}
	os << endl;
	os << prefix;
	eidx = 0;
	// Print reference
	for(size_t i = 0; i < read.length(); i++) {
		bool del = false, mm = false;
		while(eidx < edits.size() && edits[eidx].pos == i) {
			if(edits[eidx].isReadGap()) {
				os << (char)edits[eidx].chr;
			} else if(edits[eidx].isRefGap()) {
				del = true;
				os << '-';
			} else {
				mm = true;
				assert(edits[eidx].isMismatch());
				os << (char)edits[eidx].chr;
			}
			eidx++;
		}
		if(!del && !mm) os << read.toChar(i);
	}
	os << endl;
}
+
/**
 * Identical to printQAlign above, except that no debug-mode asserts
 * are performed on the edits (useful when the edits are known to be
 * in a transient or unchecked state).  Prints the read row, a row of
 * '|' match bars, and the reference row, each prefixed by 'prefix'.
 */
void Edit::printQAlignNoCheck(
	std::ostream& os,
	const char *prefix,
	const BTDnaString& read,
	const EList<Edit>& edits)
{
	size_t eidx = 0;
	os << prefix;
	// Print read
	for(size_t i = 0; i < read.length(); i++) {
		bool del = false, mm = false;
		// Consume every edit at read position i
		while(eidx < edits.size() && edits[eidx].pos == i) {
			if(edits[eidx].isReadGap()) {
				os << '-';
			} else if(edits[eidx].isRefGap()) {
				del = true;
				os << read.toChar(i);
			} else {
				mm = true;
				os << (char)edits[eidx].qchr;
			}
			eidx++;
		}
		if(!del && !mm) os << read.toChar(i);
	}
	os << endl;
	os << prefix;
	eidx = 0;
	// Print match bars
	for(size_t i = 0; i < read.length(); i++) {
		bool del = false, mm = false;
		while(eidx < edits.size() && edits[eidx].pos == i) {
			if(edits[eidx].isReadGap()) {
				os << ' ';
			} else if(edits[eidx].isRefGap()) {
				del = true;
				os << ' ';
			} else {
				mm = true;
				os << ' ';
			}
			eidx++;
		}
		if(!del && !mm) os << '|';
	}
	os << endl;
	os << prefix;
	eidx = 0;
	// Print reference
	for(size_t i = 0; i < read.length(); i++) {
		bool del = false, mm = false;
		while(eidx < edits.size() && edits[eidx].pos == i) {
			if(edits[eidx].isReadGap()) {
				os << (char)edits[eidx].chr;
			} else if(edits[eidx].isRefGap()) {
				del = true;
				os << '-';
			} else {
				mm = true;
				os << (char)edits[eidx].chr;
			}
			eidx++;
		}
		if(!del && !mm) os << read.toChar(i);
	}
	os << endl;
}
+
/**
 * Sort the edits in the provided list, using Edit's own ordering.
 */
void Edit::sort(EList<Edit>& edits) {
	edits.sort(); // simple!
}
+
/**
 * Given a read string and some edits, generate and append the corresponding
 * reference string to 'ref'.  If read aligned to the Watson strand, the caller
 * should pass the original read sequence and original edits.  If a read
 * aligned to the Crick strand, the caller should pass the reverse complement
 * of the read and a version of the edits list that has had Edit:invertPoss
 * called on it to cause edits to be listed in 3'-to-5' order.
 *
 * trim5/trim3 give how many characters were trimmed from the 5'/3' ends;
 * trimmed positions contribute no reference characters.  For !fw
 * alignments, the edit positions are temporarily inverted (and restored
 * before returning), so 'edits' is unchanged on exit despite the
 * const_cast.
 */
void Edit::toRef(
	const BTDnaString& read,
	const EList<Edit>& edits,
	BTDnaString& ref,
	bool fw,
	size_t trim5,
	size_t trim3)
{
	// edits should be sorted
	size_t eidx = 0;
	// Print reference
	const size_t rdlen = read.length();
	// Map the 5'/3' trims onto the left/right ends of this orientation
	size_t trimBeg = fw ? trim5 : trim3;
	size_t trimEnd = fw ? trim3 : trim5;
	assert(Edit::repOk(edits, read, fw, trim5, trim3));
	if(!fw) {
		invertPoss(const_cast<EList<Edit>&>(edits), read.length()-trimBeg-trimEnd, false);
	}
	for(size_t i = 0; i < rdlen; i++) {
		ASSERT_ONLY(int c = read[i]);
		assert_range(0, 4, c);
		bool del = false, mm = false;
		// 'append': position i is inside the untrimmed interior;
		// 'appendIns': read gaps just before position i also count
		bool append = i >= trimBeg && rdlen - i - 1 >= trimEnd;
		bool appendIns = i >= trimBeg && rdlen - i >= trimEnd;
		while(eidx < edits.size() && edits[eidx].pos+trimBeg == i) {
			if(edits[eidx].isReadGap()) {
				// Inserted characters come before the position's
				// character
				if(appendIns) {
					ref.appendChar((char)edits[eidx].chr);
				}
			} else if(edits[eidx].isRefGap()) {
				assert_eq("ACGTN"[c], edits[eidx].qchr);
				del = true;
			} else if(edits[eidx].isMismatch()){
				mm = true;
				assert(edits[eidx].qchr != edits[eidx].chr || edits[eidx].qchr == 'N');
				assert_eq("ACGTN"[c], edits[eidx].qchr);
				if(append) {
					ref.appendChar((char)edits[eidx].chr);
				}
			}
			eidx++;
		}
		if(!del && !mm) {
			// Position matched the reference; copy the read character
			if(append) {
				ref.append(read[i]);
			}
		}
	}
	if(trimEnd == 0) {
		// Flush read gaps that fall after the last read character
		while(eidx < edits.size()) {
			assert_gt(rdlen, edits[eidx].pos);
			if(edits[eidx].isReadGap()) {
				ref.appendChar((char)edits[eidx].chr);
			}
			eidx++;
		}
	}
	if(!fw) {
		// Restore the original edit positions
		invertPoss(const_cast<EList<Edit>&>(edits), read.length()-trimBeg-trimEnd, false);
	}
}
+
+#ifndef NDEBUG
/**
 * Check that the edit is internally consistent: the edit type must
 * agree with which of chr/qchr are gap characters, and splice edits
 * must have a positive splice length.
 */
bool Edit::repOk() const {
    assert(inited());
	// Ref and read characters cannot be the same unless they're both Ns
    if(type != EDIT_TYPE_SPL) {
        assert(qchr != chr || qchr == 'N');
        // Type must match characters
        assert(isRefGap() ||  chr != '-');
        assert(isReadGap() || qchr != '-');
        assert(!isMismatch() || (qchr != '-' && chr != '-'));
    } else {
        assert_gt(splLen, 0);
    }
	return true;
}
+
/**
 * Given a list of edits and a DNA string representing the query
 * sequence, check that the edits are sorted and consistent with
 * respect to the query.  For !fw alignments the positions are
 * temporarily inverted and restored before returning, so 'edits' is
 * unchanged on exit despite the const_cast.
 */
bool Edit::repOk(
	const EList<Edit>& edits,
	const BTDnaString& s,
	bool fw,
	size_t trimBeg,
	size_t trimEnd)
{
	if(!fw) {
		invertPoss(const_cast<EList<Edit>&>(edits), s.length()-trimBeg-trimEnd, false);
		swap(trimBeg, trimEnd);
	}
	for(size_t i = 0; i < edits.size(); i++) {
		const Edit& e = edits[i];
		size_t pos = e.pos;
		if(i > 0) {
			assert_geq(pos, edits[i-1].pos);
		}
		bool del = false, mm = false;
		// Check every edit sharing this position as a group.  Note
		// that 'i' advances here AND in the outer for-loop, so the
		// first edit after the group is not itself order-checked.
		while(i < edits.size() && edits[i].pos == pos) {
			const Edit& ee = edits[i];
			assert_lt(ee.pos, s.length());
            if(ee.type != EDIT_TYPE_SPL) {
                if(ee.qchr != '-') {
                    assert(ee.isRefGap() || ee.isMismatch());
                    assert_eq((int)ee.qchr, s.toChar(ee.pos+trimBeg));
                }
            }
			if(ee.isMismatch()) {
				// At most one mismatch per position, and not after a ref gap
				assert(!mm);
				mm = true;
				assert(!del);
			} else if(ee.isReadGap()) {
				assert(!mm);
			} else if(ee.isRefGap()) {
				assert(!mm);
				assert(!del);
				del = true;
			} else if(ee.isSpliced()) {
                
            }
			i++;
		}
	}
	if(!fw) {
		// Restore original positions and trims
		invertPoss(const_cast<EList<Edit>&>(edits), s.length()-trimBeg-trimEnd, false);
	}
	return true;
}
+#endif
+
+/**
+ * Merge second argument into the first.  Assume both are sorted to
+ * begin with.
+ */
+void Edit::merge(EList<Edit>& dst, const EList<Edit>& src) {
+	size_t di = 0, si = 0;
+	while(di < dst.size()) {
+		if(src[si].pos < dst[di].pos) {
+			dst.insert(src[si], di);
+			si++; di++;
+		} else if(src[si].pos == dst[di].pos) {
+			// There can be two inserts at a given position, but we
+			// can't merge them because there's no way to know their
+			// order
+			assert(src[si].isReadGap() != dst[di].isReadGap());
+			if(src[si].isReadGap()) {
+				dst.insert(src[si], di);
+				si++; di++;
+			} else if(dst[di].isReadGap()) {
+				di++;
+			}
+		}
+	}
+	while(si < src.size()) dst.push_back(src[si++]);
+}
+
+/**
+ * Clip off some of the low-numbered positions.
+ */
+void Edit::clipLo(EList<Edit>& ed, size_t len, size_t amt) {
+	size_t nrm = 0;
+	for(size_t i = 0; i < ed.size(); i++) {
+		assert_lt(ed[i].pos, len);
+		if(ed[i].pos < amt) {
+			nrm++;
+		} else {
+			// Shift everyone else up
+			ed[i].pos -= (uint32_t)amt;
+		}
+	}
+	ed.erase(0, nrm);
+}
+
/**
 * Clip off some of the high-numbered positions: remove trailing edits
 * whose position falls within the last 'amt' characters of a
 * length-'len' alignment.  A read gap exactly at the boundary
 * (pos == max) survives, since it lies before the clipped characters.
 */
void Edit::clipHi(EList<Edit>& ed, size_t len, size_t amt) {
	assert_leq(amt, len);
	size_t max = len - amt;
	size_t nrm = 0;
	// Scan from the high end; edits are sorted, so stop at the first
	// edit that survives the clip
	for(size_t i = 0; i < ed.size(); i++) {
		size_t ii = ed.size() - i - 1;
		assert_lt(ed[ii].pos, len);
		if(ed[ii].pos > max) {
			nrm++;
		} else if(ed[ii].pos == max && !ed[ii].isReadGap()) {
			nrm++;
		} else {
			break;
		}
	}
	// Drop the nrm highest-position edits
	ed.resize(ed.size() - nrm);
}
diff --git a/edit.h b/edit.h
new file mode 100644
index 0000000..31975e0
--- /dev/null
+++ b/edit.h
@@ -0,0 +1,394 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef EDIT_H_
+#define EDIT_H_
+
+#include <iostream>
+#include <stdint.h>
+#include <limits>
+#include "assert_helpers.h"
+#include "filebuf.h"
+#include "sstring.h"
+#include "ds.h"
+
/**
 * 3 types of edits; mismatch (substitution), insertion in the
 * reference, deletion in the reference.
 */
enum {
	EDIT_TYPE_READ_GAP = 1, // gap in the read w/r/t the reference
	EDIT_TYPE_REF_GAP,      // gap in the reference w/r/t the read
	EDIT_TYPE_MM,           // mismatch (substitution)
	EDIT_TYPE_SNP,          // SNP-type edit -- semantics set by aligner code, TODO confirm
    EDIT_TYPE_SPL, // splicing of pre-messenger RNAs into messenger RNAs
};
+
// Splice-site orientation codes used by Edit::splDir (values start at 1,
// matching EDIT_SPL_UNKNOWN as the reset() default).
enum {
    EDIT_SPL_UNKNOWN = 1, // orientation not determined
    EDIT_SPL_FW,          // presumably forward strand -- name only, TODO confirm
    EDIT_SPL_RC           // presumably reverse-complement strand -- name only, TODO confirm
};
+
+/**
+ * Encapsulates an edit between the read sequence and the reference sequence.
+ * We obey a few conventions when populating its fields.  The fields are:
+ *
+ * 	uint8_t  chr;  // reference character involved (for subst and ins)
+ *  uint8_t  qchr; // read character involved (for subst and del)
+ *  uint8_t  type; // 1 -> mm, 2 -> SNP, 3 -> ins, 4 -> del
+ *  uint32_t pos;  // position w/r/t search root
+ *
+ * One convention is that pos is always an offset w/r/t the 5' end of the read.
+ *
+ * Another is that chr and qchr are expressed in terms of the nucleotides on
+ * the forward version of the read.  So if we're aligning the reverse
+ * complement of the read, and an A in the reverse complement mismatches a C in
+ * the reference, chr should be G and qchr should be T.
+ */
+struct Edit {
+
+	Edit() { reset(); }
+
+	Edit(
+		uint32_t po,
+		int ch,
+		int qc,
+		int ty,
+		bool chrs = true)
+	{
+		init(po, ch, qc, ty, chrs);
+	}
+    
+    Edit(
+         uint32_t po,
+         int ch,
+         int qc,
+         int ty,
+         uint32_t sl,
+         uint8_t sdir,
+         bool knowns,
+         bool chrs = true)
+	{
+		init(po, ch, qc, ty, sl, sdir, knowns, chrs);
+	}
+	
+    /**
+     * Reset Edit to uninitialized state.
+     */
+	void reset() {
+		pos = pos2 = std::numeric_limits<uint32_t>::max();
+		chr = qchr = type = 0;
+        splLen = 0;
+        splDir = EDIT_SPL_UNKNOWN;
+        knownSpl = false;
+	}
+	
+    /**
+     * Return true iff the Edit is initialized.
+     */
+	bool inited() const {
+		return pos != std::numeric_limits<uint32_t>::max();
+	}
+	
+    /**
+     * Initialize a new Edit.
+     */
+	void init(
+		uint32_t po,
+		int ch,
+		int qc,
+		int ty,
+		bool chrs = true)
+	{
+		chr = ch;
+		qchr = qc;
+		type = ty;
+        splLen = 0;
+        splDir = EDIT_SPL_UNKNOWN;
+		pos = po;
+		if(qc == '-') {
+			// Read gap
+			pos2 = std::numeric_limits<uint32_t>::max() >> 1;
+		} else {
+			pos2 = std::numeric_limits<uint32_t>::max();
+		}
+		if(!chrs) {
+			assert_range(0, 4, (int)chr);
+			assert_range(0, 4, (int)qchr);
+			chr = "ACGTN"[chr];
+			qchr = "ACGTN"[qchr];
+		}
+#ifndef NDEBUG
+        if(type != EDIT_TYPE_SPL) {
+            assert_in(chr, "ACMGRSVTWYHKDBN-");
+            assert_in(qchr, "ACGTN-");
+            assert(chr != qchr || chr == 'N');
+        }
+#endif
+		assert(inited());
+	}
+    
+    /**
+     * Initialize a new Edit.
+     */
+	void init(
+              uint32_t po,
+              int ch,
+              int qc,
+              int ty,
+              uint32_t sl,
+              uint32_t sdir,
+              bool knowns,
+              bool chrs = true)
+	{
+        assert_eq(ty, EDIT_TYPE_SPL);
+        init(po, ch, qc, ty, chrs);
+        splLen = sl;
+        splDir = sdir;
+        knownSpl = knowns;
+	}
+	
+	/**
+	 * Return true iff one part of the edit or the other has an 'N'.
+	 */
+	bool hasN() const {
+		assert(inited());
+		return chr == 'N' || qchr == 'N';
+	}
+
+	/**
+	 * Edit less-than overload.
+	 */
+	int operator< (const Edit &rhs) const {
+		assert(inited());
+		if(pos  < rhs.pos) return 1;
+		if(pos  > rhs.pos) return 0;
+		if(pos2 < rhs.pos2) return 1;
+		if(pos2 > rhs.pos2) return 0;
+		if(type < rhs.type) return 1;
+		if(type > rhs.type) return 0;
+		if(chr  < rhs.chr) return 1;
+		if(chr  > rhs.chr) return 0;
+		return (qchr < rhs.qchr)? 1 : 0;
+	}
+
+	/**
+	 * Edit equals overload.
+	 */
+	int operator== (const Edit &rhs) const {
+		assert(inited());
+		return(pos  == rhs.pos &&
+			   pos2 == rhs.pos2 &&
+			   chr  == rhs.chr &&
+			   qchr == rhs.qchr &&
+			   type == rhs.type &&
+               splLen == rhs.splLen &&
+               splDir == rhs.splDir /* &&
+               knownSpl == rhs.knownSpl */);
+	}
+
+	/**
+	 * Return true iff this Edit is an initialized insertion.
+	 */
+	bool isReadGap() const {
+		assert(inited());
+		return type == EDIT_TYPE_READ_GAP;
+	}
+
+	/**
+	 * Return true iff this Edit is an initialized deletion.
+	 */
+	bool isRefGap() const {
+		assert(inited());
+		return type == EDIT_TYPE_REF_GAP;
+	}
+
+	/**
+	 * Return true if this Edit is either an initialized deletion or an
+	 * initialized insertion.
+	 */
+	bool isGap() const {
+		assert(inited());
+		return (type == EDIT_TYPE_REF_GAP || type == EDIT_TYPE_READ_GAP);
+	}
+    
+    bool isSpliced() const {
+        assert(inited());
+        return type == EDIT_TYPE_SPL;
+    }
+	
+	/**
+	 * Return the number of gaps in the given edit list.
+	 */
+	static size_t numGaps(const EList<Edit>& es) {
+		size_t gaps = 0;
+		for(size_t i = 0; i < es.size(); i++) {
+			if(es[i].isGap()) gaps++;
+		}
+		return gaps;
+	}
+
+	/**
+	 * Return true iff this Edit is an initialized mismatch.
+	 */
+	bool isMismatch() const {
+		assert(inited());
+		return type == EDIT_TYPE_MM;
+	}
+
+	/**
+	 * Sort the edits in the provided list.
+	 */
+	static void sort(EList<Edit>& edits);
+
+	/**
+	 * Flip all the edits.pos fields so that they're with respect to
+	 * the other end of the read (of length 'sz').
+	 */
+	static void invertPoss(
+		EList<Edit>& edits,
+		size_t sz,
+		size_t ei,
+		size_t en,
+		bool sort = false);
+
+	/**
+	 * Flip all the edits.pos fields so that they're with respect to
+	 * the other end of the read (of length 'sz').
+	 */
+	static void invertPoss(EList<Edit>& edits, size_t sz, bool sort = false) {
+		invertPoss(edits, sz, 0, edits.size(), sort);
+	}
+	
+	/**
+	 * Clip off some of the low-numbered positions.
+	 */
+	static void clipLo(EList<Edit>& edits, size_t len, size_t amt);
+
+	/**
+	 * Clip off some of the high-numbered positions.
+	 */
+	static void clipHi(EList<Edit>& edits, size_t len, size_t amt);
+
+	/**
+	 * Given a read string and some edits, generate and append the
+	 * corresponding reference string to 'ref'.
+	 */
+	static void toRef(
+		const BTDnaString& read,
+		const EList<Edit>& edits,
+		BTDnaString& ref,
+		bool fw = true,
+		size_t trim5 = 0,
+		size_t trim3 = 0);
+
+	/**
+	 * Given a string and its edits with respect to some other string,
+	 * print the alignment between the strings with the strings stacked
+	 * vertically, with vertical bars denoting matches.
+	 */
+	static void printQAlign(
+		std::ostream& os,
+		const BTDnaString& read,
+		const EList<Edit>& edits);
+
+	/**
+	 * Given a string and its edits with respect to some other string,
+	 * print the alignment between the strings with the strings stacked
+	 * vertically, with vertical bars denoting matches.  Add 'prefix'
+	 * before each line of output.
+	 */
+	static void printQAlign(
+		std::ostream& os,
+		const char *prefix,
+		const BTDnaString& read,
+		const EList<Edit>& edits);
+
+	/**
+	 * Given a string and its edits with respect to some other string,
+	 * print the alignment between the strings with the strings stacked
+	 * vertically, with vertical bars denoting matches.
+	 */
+	static void printQAlignNoCheck(
+		std::ostream& os,
+		const BTDnaString& read,
+		const EList<Edit>& edits);
+
+	/**
+	 * Given a string and its edits with respect to some other string,
+	 * print the alignment between the strings with the strings stacked
+	 * vertically, with vertical bars denoting matches.  Add 'prefix'
+	 * before each line of output.
+	 */
+	static void printQAlignNoCheck(
+		std::ostream& os,
+		const char *prefix,
+		const BTDnaString& read,
+		const EList<Edit>& edits);
+
+#ifndef NDEBUG
+	bool repOk() const;
+
+	/**
+	 * Given a list of edits and a DNA string representing the query
+	 * sequence, check that the edits are consistent with respect to the
+	 * query.
+	 */
+	static bool repOk(
+		const EList<Edit>& edits,
+		const BTDnaString& s,
+		bool fw = true,
+		size_t trim5 = 0,
+		size_t trim3 = 0);
+#endif
+
+	uint8_t  chr;  // reference character involved (for subst and ins)
+	uint8_t  qchr; // read character involved (for subst and del)
+	uint8_t  type; // 1 -> mm, 2 -> SNP, 3 -> ins, 4 -> del
+	uint32_t pos;  // position w/r/t search root
+	uint32_t pos2; // Second int to take into account when sorting.  Useful for
+	               // sorting read gap edits that are all part of the same long
+				   // gap.
+    
+    uint32_t splLen; // skip over the genome due to an intron
+    uint8_t  splDir;
+    bool     knownSpl;
+    
+    int64_t  donor_seq;
+    int64_t  acceptor_seq;
+
+	friend std::ostream& operator<< (std::ostream& os, const Edit& e);
+
+	/**
+	 * Print a comma-separated list of Edits to given output stream.
+	 */
+	static void print(
+		std::ostream& os,
+		const EList<Edit>& edits,
+		char delim = '\t');
+
+	/**
+	 * Merge second argument into the first.  Assume both are sorted to
+	 * begin with.
+	 */
+	static void merge(EList<Edit>& dst, const EList<Edit>& src);
+};
+
+#endif /* EDIT_H_ */
diff --git a/endian_swap.h b/endian_swap.h
new file mode 100644
index 0000000..762f274
--- /dev/null
+++ b/endian_swap.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef ENDIAN_SWAP_H
+#define ENDIAN_SWAP_H
+
+#include <stdint.h>
+#include <inttypes.h>
+
/**
 * Return true iff the machine running this program is big-endian.
 */
static inline bool currentlyBigEndian() {
	// Probe the lowest-addressed byte of a known 32-bit value.  Reading
	// through an unsigned char pointer is always aliasing-safe; the
	// previous version read a uint8_t[4] through a uint32_t*, which
	// violates strict aliasing.
	static const uint32_t one = 1;
	return *(const unsigned char*)&one != 1;
}
+
/**
 * Return copy of uint16_t argument with its two bytes exchanged.
 */
static inline uint16_t endianSwapU16(uint16_t u) {
	// Rotate by 8 bits: high byte and low byte trade places
	return (uint16_t)((u << 8) | (u >> 8));
}
+
/**
 * Return copy of uint32_t argument with byte order reversed.
 */
static inline uint32_t endianSwapU32(uint32_t u) {
	// Move each of the four bytes to its mirror position and OR them together
	return (u >> 24)
	     | ((u >> 8) & 0x0000ff00u)
	     | ((u << 8) & 0x00ff0000u)
	     | (u << 24);
}
+
/**
 * Return copy of uint64_t argument with byte order reversed.
 */
static inline uint64_t endianSwapU64(uint64_t u) {
	// Peel bytes off the low end of u and push them onto the low end of
	// the result; after 8 rounds the byte order is fully reversed.
	uint64_t r = 0;
	for(int b = 0; b < 8; b++) {
		r = (r << 8) | (u & 0xffull);
		u >>= 8;
	}
	return r;
}
+
+/**
+ * Return copy of uint_t argument with byte order reversed.
+ */
+template <typename index_t>
+static inline index_t endianSwapIndex(index_t u) {
+	if(sizeof(index_t) == 8) {
+		return (index_t)endianSwapU64(u);
+	} else if(sizeof(index_t) == 4) {
+		return endianSwapU32((uint32_t)u);
+	} else {
+		return endianSwapU16(u);
+	}
+}
+
/**
 * Return copy of int16_t argument with byte order reversed.
 */
static inline int16_t endianSwapI16(int16_t i) {
	// Do the swap in the unsigned domain to avoid sign-extension surprises,
	// then reinterpret the bit pattern as signed.
	const uint16_t v = (uint16_t)i;
	return (int16_t)(uint16_t)((v << 8) | (v >> 8));
}
+
+/**
+ * Convert uint16_t argument to the specified endianness.  It's assumed
+ * that u currently has the endianness of the current machine.
+ */
+static inline uint16_t endianizeU16(uint16_t u, bool toBig) {
+	if(toBig == currentlyBigEndian()) {
+		return u;
+	}
+	return endianSwapU16(u);
+}
+
+/**
+ * Convert int16_t argument to the specified endianness.  It's assumed
+ * that u currently has the endianness of the current machine.
+ */
+static inline int16_t endianizeI16(int16_t i, bool toBig) {
+	if(toBig == currentlyBigEndian()) {
+		return i;
+	}
+	return endianSwapI16(i);
+}
+
/**
 * Return copy of int32_t argument with byte order reversed.
 */
static inline int32_t endianSwapI32(int32_t i) {
	// Swap in the unsigned domain (well-defined shifts), then reinterpret
	const uint32_t v = (uint32_t)i;
	const uint32_t swapped = (v >> 24)
	                       | ((v >> 8) & 0x0000ff00u)
	                       | ((v << 8) & 0x00ff0000u)
	                       | (v << 24);
	return (int32_t)swapped;
}
+
+/**
+ * Convert uint32_t argument to the specified endianness.  It's assumed
+ * that u currently has the endianness of the current machine.
+ */
+static inline uint32_t endianizeU32(uint32_t u, bool toBig) {
+	if(toBig == currentlyBigEndian()) {
+		return u;
+	}
+	return endianSwapU32(u);
+}
+
+/**
+ * Convert int32_t argument to the specified endianness.  It's assumed
+ * that u currently has the endianness of the current machine.
+ */
+static inline int32_t endianizeI32(int32_t i, bool toBig) {
+	if(toBig == currentlyBigEndian()) {
+		return i;
+	}
+	return endianSwapI32(i);
+}
+
+template <typename index_t>
+index_t endianizeIndex(index_t u, bool toBig) {
+	if(toBig == currentlyBigEndian()) {
+		return u;
+	}
+	return endianSwapIndex(u);
+}
+
+#endif
diff --git a/evaluation/centrifuge_evaluate.py b/evaluation/centrifuge_evaluate.py
new file mode 100755
index 0000000..212d2d8
--- /dev/null
+++ b/evaluation/centrifuge_evaluate.py
@@ -0,0 +1,614 @@
+#!/usr/bin/env python
+
+import sys, os, subprocess, inspect
+import platform, multiprocessing
+import string, re
+from datetime import datetime, date, time
+import copy
+from argparse import ArgumentParser, FileType
+
+
def read_taxonomy_tree(tax_file):
    """Parse a taxonomy dump (one node per line, tab-separated, 5 columns:
    tax_id, '|', parent_tax_id, '|', rank) into a dict mapping
    tax_id -> [parent_tax_id, rank]."""
    taxonomy_tree = {}
    for row in tax_file:
        cols = row.strip().split('\t')
        assert len(cols) == 5
        node, parent, node_rank = cols[0], cols[2], cols[4]
        # Each node must appear exactly once in the dump
        assert node not in taxonomy_tree
        taxonomy_tree[node] = [parent, node_rank]
    return taxonomy_tree
+
+
"""
"""
def compare_scm(centrifuge_out, true_out, taxonomy_tree, rank):
    """Compare Centrifuge's per-read assignments (file centrifuge_out,
    first line a header) against the truth (file true_out) at the given
    taxonomic rank.

    taxonomy_tree maps tax_id -> [parent_tax_id, rank] (see
    read_taxonomy_tree).  Returns a 5-tuple:
    (classified, unique_classified, unclassified,
     number of reads Centrifuge classified, raw_unique_classified).
    """
    # Collect tax IDs that are ancestors of some node; used at strain rank
    # to skip assignments to internal (non-leaf) nodes.
    # NOTE(review): a node is added only when reached as a parent, so leaf
    # nodes never enter this set -- presumably intentional.
    ancestors = set()
    for tax_id in taxonomy_tree.keys():
        if tax_id in ancestors:
            continue
        while True:
            parent_tax_id, cur_rank = taxonomy_tree[tax_id]
            if parent_tax_id in ancestors:
                break
            if tax_id == parent_tax_id:
                # Root points to itself
                break
            tax_id = parent_tax_id
            ancestors.add(tax_id)

    # Map read name -> set of tax IDs (promoted to the requested rank)
    # that Centrifuge reported for that read
    db_dic = {}
    first = True
    for line in open(centrifuge_out):
        if first:
            # Skip the header line
            first = False
            continue
        read_name, seq_id, tax_id, score, _, _, _, _ = line.strip().split('\t')
        # Traverse up taxonomy tree to match the given rank parameter
        rank_tax_id = tax_id
        if rank != "strain":
            while True:
                if tax_id not in taxonomy_tree:
                    rank_tax_id = ""
                    break
                parent_tax_id, cur_rank = taxonomy_tree[tax_id]
                if cur_rank == rank:
                    rank_tax_id = tax_id
                    break
                if tax_id == parent_tax_id:
                    rank_tax_id = ""
                    break
                tax_id = parent_tax_id
        else:
            assert rank == "strain"
            if tax_id in ancestors:
                # Internal node: not a strain-level assignment
                continue

        if rank_tax_id == "":
            # No ancestor at the requested rank: ignore this assignment
            continue            
        if read_name not in db_dic:
            db_dic[read_name] = set()
        db_dic[read_name].add(rank_tax_id)

    classified, unclassified, unique_classified = 0, 0, 0
    for line in open(true_out):
        if line.startswith('@'):
            continue
        
        read_name, tax_id = line.strip().split('\t')[:2]
        # Traverse up taxonomy tree to match the given rank parameter
        rank_tax_id = tax_id
        if rank != "strain":
            while True:
                if tax_id not in taxonomy_tree:
                    rank_tax_id = ""
                    break
                parent_tax_id, cur_rank = taxonomy_tree[tax_id]
                if cur_rank == rank:
                    rank_tax_id = tax_id
                    break
                if tax_id == parent_tax_id:
                    rank_tax_id = ""
                    break
                tax_id = parent_tax_id
        if rank_tax_id == "":
            continue
        if read_name not in db_dic:
            # Centrifuge produced no assignment for this read at this rank
            unclassified += 1
            continue

        maps = db_dic[read_name]
        if rank_tax_id in maps:
            classified += 1
            if len(maps) == 1:
                # Correct AND the only assignment Centrifuge made
                unique_classified += 1
        else:
            unclassified += 1

    # Reads for which Centrifuge reported exactly one taxon at this rank,
    # correct or not (denominator for unique-precision)
    raw_unique_classified = 0
    for value in db_dic.values():
        if len(value) == 1:
            raw_unique_classified += 1
    return classified, unique_classified, unclassified, len(db_dic), raw_unique_classified
+
+
"""
"""
def compare_abundance(centrifuge_out, true_out, taxonomy_tree, debug):
    """Return the sum of squared residuals (SSR) between the abundances
    reported by Centrifuge (file centrifuge_out, i.e. centrifuge_report.tsv,
    first line a header) and the true abundances (file true_out, also with
    a header line).

    taxonomy_tree is only used by the commented-out debugging code below.
    When debug is true, per-taxon residuals are printed to stderr
    (Python 2 print-statement syntax).
    """
    # Map tax_id -> abundance reported by Centrifuge
    db_dic = {}
    first = True
    for line in open(centrifuge_out):
        if first:
            # Skip the header line
            first = False
            continue
        genome_name, tax_id, tax_rank, genome_len, num_reads, num_unique_reads, abundance = line.strip().split('\t')
        db_dic[tax_id] = float(abundance)

    SSR = 0.0 # Sum of squared residuals
    first = True
    for line in open(true_out):
        if first:
            first = False
            continue
        
        tax_id, genome_len, num_reads, abundance, genome_name = line.strip().split('\t')

        # daehwan - for debugging purposes
        """
        cur_tax_id = tax_id
        while True:
            if cur_tax_id not in taxonomy_tree:
                break
            parent_tax_id, rank = taxonomy_tree[cur_tax_id]
            print "%s: %s" % (cur_tax_id, rank)
            if cur_tax_id == parent_tax_id:
                break
            cur_tax_id = parent_tax_id
        print
        print
        """
        
        abundance = float(abundance)
        if tax_id in db_dic:
            SSR += (abundance - db_dic[tax_id]) ** 2;
            if debug:
                print >> sys.stderr, "\t\t\t\t{:<10}: {:.6} vs. {:.6} (truth vs. centrifuge)".format(tax_id, abundance, db_dic[tax_id])
        else:
            # Taxon absent from Centrifuge's report: residual is the full
            # true abundance
            SSR += (abundance) ** 2

    return SSR
+
+
"""
e.g.
     sqlite3 analysis.db --header --separator $'\t' "select * from Classification;"
"""
def sql_execute(sql_db, sql_query):
    """Run sql_query against the SQLite database file sql_db via the
    sqlite3 command-line tool; return its stdout minus the trailing
    newline."""
    command = [
        "sqlite3", sql_db,
        "-separator", "\t",
        "%s;" % sql_query
        ]
    proc = subprocess.Popen(command, stdout=subprocess.PIPE)
    raw = proc.communicate()[0]
    return raw[:-1]
+
+
"""
"""
def create_sql_db(sql_db):
    """Create the SQLite database file sql_db containing a single
    'Classification' table that stores per-run accuracy numbers at each
    taxonomic rank.  If the file already exists, print a warning to
    stderr and do nothing (Python 2 print-statement syntax)."""
    if os.path.exists(sql_db):
        print >> sys.stderr, sql_db, "already exists!"
        return
    
    # (column name, SQL type) pairs for the Classification table.
    # NOTE(review): "centrifutgeIndex" looks like a typo of
    # "centrifugeIndex", but renaming the column would break existing
    # databases -- left as is.
    columns = [
        ["id", "integer primary key autoincrement"],
        ["centrifutgeIndex", "text"],
        ["readBase", "text"],
        ["readType", "text"],
        ["program", "text"],
        ["version", "text"],
        ["numFragments", "integer"],
        ["strain_classified", "integer"],
        ["strain_uniqueclassified", "integer"],
        ["strain_unclassified", "integer"],
        ["species_classified", "integer"],
        ["species_uniqueclassified", "integer"],
        ["species_unclassified", "integer"],
        ["genus_classified", "integer"],
        ["genus_uniqueclassified", "integer"],
        ["genus_unclassified", "integer"],
        ["family_classified", "integer"],
        ["family_uniqueclassified", "integer"],
        ["family_unclassified", "integer"],
        ["order_classified", "integer"],
        ["order_uniqueclassified", "integer"],
        ["order_unclassified", "integer"],
        ["class_classified", "integer"],
        ["class_uniqueclassified", "integer"],
        ["class_unclassified", "integer"],
        ["phylum_classified", "integer"],
        ["phylum_uniqueclassified", "integer"],
        ["phylum_unclassified", "integer"],
        ["time", "real"],
        ["host", "text"],
        ["created", "text"],
        ["cmd", "text"]
        ]
    
    # Assemble "CREATE TABLE Classification (name type, ...)" and run it
    sql_create_table = "CREATE TABLE Classification ("
    for i in range(len(columns)):
        name, type = columns[i]
        if i != 0:
            sql_create_table += ", "
        sql_create_table += ("%s %s" % (name, type))
    sql_create_table += ");"
    sql_execute(sql_db, sql_create_table)
+
+
"""
"""
def write_analysis_data(sql_db, genome_name, database_name):
    """Write per-database analysis files from the SQL database.

    NOTE(review): everything after the existence check is a triple-quoted
    string literal -- commented-out code carried over from a
    HISAT/Bowtie-style evaluation script -- so this function currently
    performs no work beyond the early return.
    """
    if not os.path.exists(sql_db):
        return

    """
    programs = []
    sql_aligners = "SELECT aligner FROM ReadCosts GROUP BY aligner"
    output = sql_execute(sql_db, sql_aligners)
    aligners = output.split()

    can_read_types = ["all", "M", "2M_gt_15", "2M_8_15", "2M_1_7", "gt_2M"]    
    tmp_read_types = []
    sql_types = "SELECT type FROM ReadCosts GROUP BY type"
    output = sql_execute(sql_db, sql_types)
    tmp_read_types = output.split()

    read_types = []
    for read_type in can_read_types:
        if read_type in tmp_read_types:
            read_types.append(read_type)

    for paired in [False, True]:
        database_fname = genome_name + "_" + database_name
        if paired:
            end_type = "paired"
            database_fname += "_paired"
        else:
            end_type = "single"
            database_fname += "_single"
        database_fname += ".analysis"
        database_file = open(database_fname, "w")
        print >> database_file, "end_type\ttype\taligner\tnum_reads\ttime\tmapped_reads\tunique_mapped_reads\tunmapped_reads\tmapping_point\ttrue_gtf_junctions\ttemp_junctions\ttemp_gtf_junctions"
        for aligner in aligners:
            for read_type in read_types:
                sql_row = "SELECT end_type, type, aligner, num_reads, time, mapped_reads, unique_mapped_reads, unmapped_reads, mapping_point, true_gtf_junctions, temp_junctions, temp_gtf_junctions FROM ReadCosts"
                sql_row += " WHERE genome = '%s' and head = '%s' and aligner = '%s' and type = '%s' and end_type = '%s' ORDER BY created DESC LIMIT 1" % (genome_name, database_name, aligner, read_type, end_type)
                output = sql_execute(sql_db, sql_row)
                if output:
                    print >> database_file, output

        database_file.close()
    """
+
+
+"""
+"""
+def evaluate(index_base,
+             index_base_for_read,
+             num_fragment,
+             paired,
+             error_rate,
+             ranks,
+             programs,
+             runtime_only,
+             sql,
+             verbose,
+             debug):
+    # Current script directory
+    curr_script = os.path.realpath(inspect.getsourcefile(evaluate))
+    path_base = os.path.dirname(curr_script)
+
+    sql_db_name = "analysis.db"
+    if not os.path.exists(sql_db_name):
+        create_sql_db(sql_db_name)
+
+    num_cpus = multiprocessing.cpu_count()
+    if num_cpus > 8:
+        num_threads = min(8, num_cpus)
+        desktop = False
+    else:
+        num_threads = min(3, num_cpus)
+        desktop = True
+
+    def check_files(fnames):
+        for fname in fnames:
+            if not os.path.exists(fname):
+                return False
+        return True
+
+    # Check if indexes exists, otherwise create indexes
+    index_path = "%s/indexes/Centrifuge" % path_base
+    if not os.path.exists(path_base + "/indexes"):
+        os.mkdir(path_base + "/indexes")
+    if not os.path.exists(index_path):
+        os.mkdir(index_path)
+    index_fnames = ["%s/%s.%d.cf" % (index_path, index_base, i+1) for i in range(3)]
+    if not check_files(index_fnames):
+        print >> sys.stderr, "Downloading indexes: %s" % ("index")
+        os.system("cd %s; wget ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/%s.tar.gz; tar xvzf %s.tar.gz; rm %s.tar.gz; ln -s %s/%s* .; cd -" % \
+                      (index_path, index_base, index_base, index_base, index_base, index_base))
+        assert check_files(index_fnames)        
+
+    # Read taxonomic IDs
+    centrifuge_inspect = os.path.join(path_base, "../centrifuge-inspect")
+    tax_ids = set()
+    tax_cmd = [centrifuge_inspect,
+               "--conversion-table",
+               "%s/%s" % (index_path, index_base_for_read)]
+    tax_proc = subprocess.Popen(tax_cmd, stdout=subprocess.PIPE)
+    for line in tax_proc.stdout:
+        _, tax_id = line.strip().split()
+        tax_ids.add(tax_id)
+    tax_ids = list(tax_ids)
+
+    # Read taxonomic tree
+    tax_tree_cmd = [centrifuge_inspect,
+                    "--taxonomy-tree",
+                    "%s/%s" % (index_path, index_base_for_read)]    
+    tax_tree_proc = subprocess.Popen(tax_tree_cmd, stdout=subprocess.PIPE, stderr=open("/dev/null", 'w'))
+    taxonomy_tree = read_taxonomy_tree(tax_tree_proc.stdout)
+
+    compressed = (index_base.find("compressed") != -1) or (index_base_for_read.find("compressed") != -1)
+
+    # Check if simulated reads exist, otherwise simulate reads
+    read_path = "%s/reads" % path_base
+    if not os.path.exists(read_path):
+        os.mkdir(read_path)
+    read_base = "%s_%dM" % (index_base_for_read, num_fragment / 1000000)
+    if error_rate > 0.0:
+        read_base += "%.2fe" % error_rate
+
+    read1_fname = "%s/%s_1.fa" % (read_path, read_base)
+    read2_fname = "%s/%s_2.fa" % (read_path, read_base)
+    truth_fname = "%s/%s.truth" % (read_path, read_base)
+    scm_fname = "%s/%s.scm" % (read_path, read_base)
+    read_fnames = [read1_fname, read2_fname, truth_fname, scm_fname]
+    if not check_files(read_fnames):
+        print >> sys.stderr, "Simulating reads %s_1.fq %s_2.fq ..." % (read_base, read_base)
+        centrifuge_simulate = os.path.join(path_base, "centrifuge_simulate_reads.py")
+        simulate_cmd = [centrifuge_simulate,
+                        "--num-fragment", str(num_fragment)]
+        if error_rate > 0.0:
+            simulate_cmd += ["--error-rate", str(error_rate)]
+        simulate_cmd += ["%s/%s" % (index_path, index_base_for_read),
+                         "%s/%s" % (read_path, read_base)]
+        
+        simulate_proc = subprocess.Popen(simulate_cmd, stdout=open("/dev/null", 'w'))
+        simulate_proc.communicate()
+        assert check_files(read_fnames)
+
+    if runtime_only:
+        verbose = True
+
+    if paired:
+        base_fname = read_base + "_paired"
+    else:
+        base_fname = read_base + "_single"
+
+    print >> sys.stderr, "Database: %s" % (index_base)
+    if paired:
+        print >> sys.stderr, "\t%d million pairs" % (num_fragment / 1000000)
+    else:
+        print >> sys.stderr, "\t%d million reads" % (num_fragment / 1000000)
+
+    program_bin_base = "%s/.." % path_base
+    def get_program_version(program, version):
+        version = ""
+        if program == "centrifuge":
+            if version:
+                cmd = ["%s/%s_%s/%s" % (program_bin_base, program, version, program)]
+            else:
+                cmd = ["%s/%s" % (program_bin_base, program)]
+            cmd += ["--version"]                    
+            cmd_process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
+            version = cmd_process.communicate()[0][:-1].split("\n")[0]
+            version = version.split()[-1]
+        else:
+            assert False
+
+        return version
+
+    def get_program_cmd(program, version, read1_fname, read2_fname, out_fname):
+        cmd = []
+        if program == "centrifuge":
+            if version:
+                cmd = ["%s/centrifuge_%s/centrifuge" % (program_bin_base, version)]
+            else:
+                cmd = ["%s/centrifuge" % (program_bin_base)]
+            cmd += ["-f",
+                    "-p", str(num_threads),
+                    "%s/%s" % (index_path, index_base)]
+            # cmd += ["-k", "5"]
+            # cmd += ["--no-traverse"]
+            if paired:
+                cmd += ["-1", read1_fname,
+                        "-2", read2_fname]
+            else:
+                cmd += ["-U", read1_fname]                        
+        else:
+            assert False
+
+        return cmd
+
+    init_time = {"centrifuge" : 0.0}
+    for program, version in programs:
+        program_name = program
+        if version:
+            program_name += ("_%s" % version)
+
+        print >> sys.stderr, "\t%s\t%s" % (program_name, str(datetime.now()))
+        if paired:
+            program_dir = program_name + "_paired"
+        else:
+            program_dir = program_name + "_single"
+            
+        if not os.path.exists(program_dir):
+            os.mkdir(program_dir)
+        os.chdir(program_dir)
+
+        out_fname = "centrifuge.output"
+        if runtime_only:
+            out_fname = "/dev/null"
+
+        if os.path.exists(out_fname):
+            continue
+
+        # Classify all reads
+        program_cmd = get_program_cmd(program, version, read1_fname, read2_fname, out_fname)
+        start_time = datetime.now()
+        if verbose:
+            print >> sys.stderr, "\t", start_time, " ".join(program_cmd)
+        if program in ["centrifuge"]:
+            proc = subprocess.Popen(program_cmd, stdout=open(out_fname, "w"), stderr=subprocess.PIPE)
+        else:
+            proc = subprocess.Popen(program_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        proc.communicate()
+        finish_time = datetime.now()
+        duration = finish_time - start_time
+        assert program in init_time
+        duration = duration.total_seconds() - init_time[program]
+        if duration < 0.1:
+            duration = 0.1
+        if verbose:
+            print >> sys.stderr, "\t", finish_time, "finished:", duration            
+
+        results = {"strain"  : [0, 0, 0],
+                   "species" : [0, 0, 0],
+                   "genus"   : [0, 0, 0],
+                   "family"  : [0, 0, 0],
+                   "order"   : [0, 0, 0],
+                   "class"   : [0, 0, 0],
+                   "phylum"  : [0, 0, 0]}
+        for rank in ranks:
+            if runtime_only:
+                break
+            if compressed and rank == "strain":
+                continue
+
+            classified, unique_classified, unclassified, raw_classified, raw_unique_classified = \
+                compare_scm(out_fname, scm_fname, taxonomy_tree, rank)
+            results[rank] = [classified, unique_classified, unclassified]
+            num_cases = classified + unclassified
+            # if rank == "strain":
+            #    assert num_cases == num_fragment
+
+            print >> sys.stderr, "\t\t%s" % rank
+            print >> sys.stderr, "\t\t\tsensitivity: {:,} / {:,} ({:.2%})".format(classified, num_cases, float(classified) / num_cases)
+            print >> sys.stderr, "\t\t\tprecision  : {:,} / {:,} ({:.2%})".format(classified, raw_classified, float(classified) / raw_classified)
+            print >> sys.stderr, "\n\t\t\tfor uniquely classified ",
+            if paired:
+                print >> sys.stderr, "pairs"
+            else:
+                print >> sys.stderr, "reads"
+            print >> sys.stderr, "\t\t\t\t\tsensitivity: {:,} / {:,} ({:.2%})".format(unique_classified, num_cases, float(unique_classified) / num_cases)
+            print >> sys.stderr, "\t\t\t\t\tprecision  : {:,} / {:,} ({:.2%})".format(unique_classified, raw_unique_classified, float(unique_classified) / raw_unique_classified)
+
+            # Calculate sum of squared residuals in abundance
+            if rank == "strain":
+                abundance_SSR = compare_abundance("centrifuge_report.tsv", truth_fname, taxonomy_tree, debug)
+                print >> sys.stderr, "\t\t\tsum of squared residuals in abundance: {}".format(abundance_SSR)
+
+        if runtime_only:
+            os.chdir("..")
+            continue
+
+        if sql and os.path.exists("../" + sql_db_name):
+            if paired:
+                end_type = "paired"
+            else:
+                end_type = "single"
+            sql_insert = "INSERT INTO \"Classification\" VALUES(NULL, '%s', '%s', '%s', '%s', '%s', %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %f, '%s', datetime('now', 'localtime'), '%s');" % \
+                (index_base, read_base, end_type, program_name, get_program_version(program, version), num_fragment, \
+                     results["strain"][0],  results["strain"][1],  results["strain"][2], \
+                     results["species"][0], results["species"][1], results["species"][2], \
+                     results["genus"][0],   results["genus"][1],   results["genus"][2], \
+                     results["family"][0],  results["family"][1],  results["family"][2], \
+                     results["order"][0],   results["order"][1],   results["order"][2], \
+                     results["class"][0],   results["class"][1],   results["class"][2], \
+                     results["phylum"][0],  results["phylum"][1],  results["phylum"][2], \
+                     duration, platform.node(), " ".join(program_cmd))
+            sql_execute("../" + sql_db_name, sql_insert)     
+
+ 
+        os.system("touch done")
+        os.chdir("..")
+
+        """
+        if os.path.exists(sql_db_name):
+            write_analysis_data(sql_db_name, genome, data_base)
+        """
+        
+
+if __name__ == "__main__":
+    # Command-line driver: parse options and launch evaluate() defined above.
+    parser = ArgumentParser(
+        description='Centrifuge evaluation')
+    parser.add_argument('index_base',
+                        nargs='?',
+                        type=str,
+                        help='Centrifuge index')
+    parser.add_argument('--index-base-for-read',
+                        dest="index_base_for_read",
+                        type=str,
+                        default="",
+                        help='index base for read (default same as index base)')    
+    parser.add_argument('--num-fragment',
+                        dest="num_fragment",
+                        action='store',
+                        type=int,
+                        default=1,
+                        help='Number of fragments in millions (default: 1)')
+    parser.add_argument('--paired',
+                        dest='paired',
+                        action='store_true',
+                        help='Paired-end reads')
+    parser.add_argument('--error-rate',
+                        dest='error_rate',
+                        action='store',
+                        type=float,
+                        default=0.0,
+                        help='per-base sequencing error rate (%%) (default: 0.0)')
+    rank_list_default = "strain,species,genus,family,order,class,phylum"
+    parser.add_argument("--rank-list",
+                        dest="ranks",
+                        type=str,
+                        default=rank_list_default,
+                        help="A comma-separated list of ranks (default: %s)" % rank_list_default)
+    parser.add_argument("--program-list",
+                        dest="programs",
+                        type=str,
+                        default="centrifuge",
+                        help="A comma-separated list of aligners (default: centrifuge)")
+    parser.add_argument('--runtime-only',
+                        dest='runtime_only',
+                        action='store_true',
+                        help='Just check runtime without evaluation')    
+    parser.add_argument('--no-sql',
+                        dest='sql',
+                        action='store_false',
+                        help='Do not write results into a sqlite database')
+    parser.add_argument('-v', '--verbose',
+                        dest='verbose',
+                        action='store_true',
+                        help='also print some statistics to stderr')
+    parser.add_argument('--debug',
+                        dest='debug',
+                        action='store_true',
+                        help='Debug')
+
+    args = parser.parse_args()
+    # index_base is declared with nargs='?', so argparse does not enforce it;
+    # enforce it manually and show usage if it is missing.
+    if not args.index_base:
+        parser.print_help()
+        exit(1)
+    if args.index_base_for_read == "":
+        args.index_base_for_read = args.index_base
+    ranks = args.ranks.split(',')
+    # Each program spec is either "name" or "name_version"; normalize to
+    # [name, version] pairs (an empty version means the default binary).
+    # NOTE(review): split('_') yields more than two fields for names that
+    # themselves contain underscores -- verify against expected program names.
+    programs = []
+    for program in args.programs.split(','):
+        if '_' in program:
+            programs.append(program.split('_'))
+        else:
+            programs.append([program, ""])
+            
+    evaluate(args.index_base,
+             args.index_base_for_read,
+             args.num_fragment * 1000000,
+             args.paired,
+             args.error_rate,
+             ranks,
+             programs,
+             args.runtime_only,
+             args.sql,
+             args.verbose,
+             args.debug)
diff --git a/evaluation/centrifuge_simulate_reads.py b/evaluation/centrifuge_simulate_reads.py
new file mode 100755
index 0000000..6002717
--- /dev/null
+++ b/evaluation/centrifuge_simulate_reads.py
@@ -0,0 +1,875 @@
+#!/usr/bin/env python
+
+#
+# Copyright 2015, Daehwan Kim <infphilo at gmail.com>
+#
+# This file is part of HISAT 2.
+#
+# HISAT 2 is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# HISAT 2 is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with HISAT 2.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+import sys, os, subprocess, inspect
+import math, random, re
+from collections import defaultdict, Counter
+from argparse import ArgumentParser, FileType
+
+
+"""
+"""
+def reverse_complement(seq):
+    """Return the reverse complement of a nucleotide string.
+
+    Upper- and lower-case A/C/G/T are complemented with case preserved;
+    any other character (e.g. 'N') is passed through unchanged.
+    Prepending each base to `result` reverses the string (O(n^2) string
+    building -- acceptable for read-length inputs).
+    """
+    result = ""
+    for nt in seq:
+        base = nt
+        if nt == 'A':
+            base = 'T'
+        elif nt == 'a':
+            base = 't'
+        elif nt == 'C':
+            base = 'G'
+        elif nt == 'c':
+            base = 'g'
+        elif nt == 'G':
+            base = 'C'
+        elif nt == 'g':
+            base = 'c'
+        elif nt == 'T':
+            base = 'A'
+        elif nt == 't':
+            base = 'a'
+        
+        result = base + result
+    
+    return result
+
+
+"""
+"""
+def get_genome_seq_id(genome_name):
+    """Extract a sequence ID from a FASTA header (without the '>').
+
+    Takes the first whitespace-separated token; for NCBI-style headers
+    such as 'gi|12345|ref|...' only the first two '|'-separated fields
+    are kept.
+    """
+    genome_seq_id = genome_name.split()[0]
+    if len(genome_seq_id.split('|')) >= 2:
+        genome_seq_id = '|'.join(genome_seq_id.split('|')[:2])
+    return genome_seq_id
+    
+
+"""
+Random source for sequencing errors
+"""
+class ErrRandomSource:
+    def __init__(self, prob = 0.0, size = 1 << 20):
+        # Pre-draw `size` Bernoulli(prob) outcomes (1 = introduce an error)
+        # so later per-base error decisions are cheap list lookups.
+        self.size = size
+        self.rands = []
+        for i in range(self.size):
+            if random.random() < prob:
+                self.rands.append(1)
+            else:
+                self.rands.append(0)
+        self.cur = 0
+        
+    def getRand(self):
+        # Return the next pre-drawn outcome, cycling through the buffer
+        # so the source never runs out.
+        assert self.cur < len(self.rands)
+        rand = self.rands[self.cur]
+        self.cur = (self.cur + 1) % len(self.rands)
+        return rand
+
+
+"""
+"""
+def read_genomes(genomes_file, seq2taxID):
+    """Read a multi-FASTA file, concatenating sequences per taxonomic ID.
+
+    genomes_file -- open FASTA file handle
+    seq2taxID    -- dict mapping sequence IDs (see get_genome_seq_id)
+                    to taxonomic IDs; every header must be present in it
+    Returns {tax_id: concatenated sequence}.
+    """
+    genome_dic = {}    
+    tax_id, sequence = "", ""
+    for line in genomes_file:
+        if line[0] == ">":
+            if tax_id and sequence:
+                # NOTE(review): this membership test uses genome_seq_id while
+                # the dict is keyed by tax_id (the post-loop flush below tests
+                # tax_id) -- looks like a bug; verify intended key.
+                if genome_seq_id in genome_dic:
+                    genome_dic[tax_id] += sequence
+                else:
+                    genome_dic[tax_id] = sequence
+            
+            genome_name = line[1:-1]
+            genome_seq_id = get_genome_seq_id(genome_name)
+            assert genome_seq_id in seq2taxID
+            tax_id = seq2taxID[genome_seq_id]
+            sequence = ""
+        else:
+            sequence += line[:-1]
+
+    # Flush the final record after the loop ends.
+    if tax_id and sequence:
+        if tax_id in genome_dic:
+            genome_dic[tax_id] += sequence
+        else:
+            genome_dic[tax_id] = sequence
+    
+    return genome_dic
+
+
+"""
+"""
+def read_transcript(genomes_seq, gtf_file, frag_len):
+    """Parse a GTF file into gene and transcript tables.
+
+    genomes_seq -- dict of reference sequences (used to filter chromosomes)
+    gtf_file    -- open GTF file handle
+    frag_len    -- minimum transcript length to keep
+    Returns (genes, transcripts): genes maps gene_id -> [transcript_id],
+    transcripts maps transcript_id -> [chrom, strand, length, exons].
+    """
+    genes = defaultdict(list)
+    transcripts = {}
+
+    # Parse valid exon lines from the GTF file into a dict by transcript_id
+    for line in gtf_file:
+        line = line.strip()
+        if not line or line.startswith('#'):
+            continue
+        if '#' in line:
+            line = line.split('#')[0].strip()
+        try:
+            chrom, source, feature, left, right, score, \
+                strand, frame, values = line.split('\t')
+        except ValueError:
+            continue
+        # NOTE(review): the parameter is named genomes_seq but genome_seq is
+        # referenced here -- NameError unless a global genome_seq exists; verify.
+        if not chrom in genome_seq:
+            continue
+        
+        # Zero-based offset
+        left, right = int(left) - 1, int(right) - 1
+        if feature != 'exon' or left >= right:
+            continue
+
+        # Attributes are 'key "value"; key "value";' -- drop the empty tail.
+        values_dict = {}
+        for attr in values.split(';')[:-1]:
+            attr, _, val = attr.strip().partition(' ')
+            values_dict[attr] = val.strip('"')
+
+        if 'gene_id' not in values_dict or \
+                'transcript_id' not in values_dict:
+            continue
+
+        transcript_id = values_dict['transcript_id']
+        if transcript_id not in transcripts:
+            transcripts[transcript_id] = [chrom, strand, [[left, right]]]
+            genes[values_dict['gene_id']].append(transcript_id)
+        else:
+            transcripts[transcript_id][2].append([left, right])
+
+    # Sort exons and merge where separating introns are <=5 bps
+    for tran, [chr, strand, exons] in transcripts.items():
+            exons.sort()
+            tmp_exons = [exons[0]]
+            for i in range(1, len(exons)):
+                if exons[i][0] - tmp_exons[-1][1] <= 5:
+                    tmp_exons[-1][1] = exons[i][1]
+                else:
+                    tmp_exons.append(exons[i])
+            transcripts[tran] = [chr, strand, tmp_exons]
+
+    # Keep only transcripts long enough to yield a fragment of frag_len.
+    tmp_transcripts = {}
+    for tran, [chr, strand, exons] in transcripts.items():
+        exon_lens = [e[1] - e[0] + 1 for e in exons]
+        transcript_len = sum(exon_lens)
+        if transcript_len >= frag_len:
+            tmp_transcripts[tran] = [chr, strand, transcript_len, exons]
+
+    transcripts = tmp_transcripts
+
+    return genes, transcripts
+    
+
+"""
+"""
+def generate_rna_expr_profile(expr_profile_type, num_transcripts = 10000):
+    """Build a normalized expression profile over num_transcripts entries.
+
+    "flux" follows the Flux Simulator's rank-based expression function
+    (see reference below); "constant" assigns uniform expression.
+    Returns a list of floats summing to ~1.0.
+    """
+    # Modelling and simulating generic RNA-Seq experiments with the flux simulator
+    # http://nar.oxfordjournals.org/content/suppl/2012/06/29/gks666.DC1/nar-02667-n-2011-File002.pdf
+    def calc_expr(x, a):
+        # NOTE(review): parameter a is immediately overwritten below, so the
+        # second argument passed by the caller has no effect.
+        x, a, b = float(x), 9500.0, 9500.0
+        k = -0.6
+        return (x**k) * math.exp(x/a * (x/b)**2)
+    
+    expr_profile = [0.0] * num_transcripts
+    for i in range(len(expr_profile)):
+        if expr_profile_type == "flux":
+            expr_profile[i] = calc_expr(i + 1, num_transcripts)
+        elif expr_profile_type == "constant":
+            expr_profile[i] = 1.0
+        else:
+            assert False
+
+    # Normalize to a probability distribution.
+    expr_sum = sum(expr_profile)
+    expr_profile = [expr_profile[i] / expr_sum for i in range(len(expr_profile))]
+    assert abs(sum(expr_profile) - 1.0) < 0.001
+    return expr_profile
+
+
+"""
+"""
+def generate_dna_expr_profile(expr_profile_type, num_genomes):
+    """Build a normalized abundance profile over num_genomes genomes.
+
+    Same logic as generate_rna_expr_profile but sized by genome count:
+    "flux" uses the Flux Simulator rank-based function, "constant" is
+    uniform.  Returns a list of floats summing to ~1.0.
+    """
+    # Modelling and simulating generic RNA-Seq experiments with the flux simulator
+    # http://nar.oxfordjournals.org/content/suppl/2012/06/29/gks666.DC1/nar-02667-n-2011-File002.pdf
+    def calc_expr(x, a):
+        # NOTE(review): parameter a is immediately overwritten below, so the
+        # second argument passed by the caller has no effect.
+        x, a, b = float(x), 9500.0, 9500.0
+        k = -0.6
+        return (x**k) * math.exp(x/a * (x/b)**2)
+    
+    expr_profile = [0.0] * num_genomes
+    for i in range(len(expr_profile)):
+        if expr_profile_type == "flux":
+            expr_profile[i] = calc_expr(i + 1, num_genomes)
+        elif expr_profile_type == "constant":
+            expr_profile[i] = 1.0
+        else:
+            assert False
+
+    # Normalize to a probability distribution.
+    expr_sum = sum(expr_profile)
+    expr_profile = [expr_profile[i] / expr_sum for i in range(len(expr_profile))]
+    assert abs(sum(expr_profile) - 1.0) < 0.001
+    return expr_profile
+
+
+"""
+"""
+def getSamAlignment(dna, exons, genome_seq, trans_seq, frag_pos, read_len, err_rand_src, max_mismatch):
+    """Simulate one read and build its SAM-style alignment fields.
+
+    Maps the transcript offset frag_pos onto the genome via the exon list,
+    injects sequencing errors drawn from err_rand_src (mismatches capped at
+    max_mismatch), and returns
+    (pos, cigars, cigar_descs, MD, XM, NM, Zs, read_seq).
+    cigar_descs holds, per CIGAR op, [offset-from-previous-event, alt data,
+    variant id] triples used below to build MD/Zs and the read sequence.
+    """
+    # Find the genomic position for frag_pos and exon number
+    tmp_frag_pos, tmp_read_len = frag_pos, read_len
+    pos, cigars, cigar_descs = exons[0][0], [], []
+    e_pos = 0
+    prev_e = None
+    for e_i in range(len(exons)):
+        e = exons[e_i]
+        if prev_e:
+            i_len = e[0] - prev_e[1] - 1
+            pos += i_len
+        e_len = e[1] - e[0] + 1
+        if e_len <= tmp_frag_pos:
+            tmp_frag_pos -= e_len
+            pos += e_len
+        else:
+            pos += tmp_frag_pos
+            e_pos = tmp_frag_pos
+            break                        
+        prev_e = e
+
+    # Define Cigar and its descriptions
+    assert e_i < len(exons)
+    e_len = exons[e_i][1] - exons[e_i][0] + 1
+    assert e_pos < e_len
+    cur_pos = pos
+    match_len = 0
+    prev_e = None
+    mismatch, remain_trans_len = 0, len(trans_seq) - (frag_pos + read_len)
+    assert remain_trans_len >= 0
+    for e_i in range(e_i, len(exons)):
+        e = exons[e_i]
+        if prev_e:
+            # Gap between exons becomes an N (skipped region) CIGAR op.
+            i_len = e[0] - prev_e[1] - 1
+            cur_pos += i_len
+            cigars.append(("{}N".format(i_len)))
+            cigar_descs.append([])
+        tmp_e_left = e_left = e[0] + e_pos
+        e_pos = 0
+
+        # Simulate mismatches due to sequencing errors
+        mms = []
+        for i in range(e_left, min(e[1], e_left + tmp_read_len - 1)):
+            if err_rand_src.getRand() == 1:
+                assert i < len(genome_seq)
+                err_base = "A"
+                rand = random.randint(0, 2)
+                if genome_seq[i] == "A":
+                    err_base = "GCT"[rand]
+                elif genome_seq[i] == "C":
+                    err_base = "AGT"[rand]
+                elif genome_seq[i] == "G":
+                    err_base = "ACT"[rand]
+                else:
+                    err_base = "ACG"[rand]                    
+                mms.append(["", "single", i, err_base])
+
+        tmp_diffs = mms
+        def diff_sort(a , b):
+            return a[2] - b[2]
+
+        # NOTE(review): sorted(..., cmp=...) is Python-2-only; Python 3 would
+        # need functools.cmp_to_key or key=lambda d: d[2].
+        tmp_diffs = sorted(tmp_diffs, cmp=diff_sort)
+        # Drop differences that overlap the previous kept difference.
+        diffs = []
+        if len(tmp_diffs) > 0:
+            diffs = tmp_diffs[:1]
+            for diff in tmp_diffs[1:]:
+                _, tmp_type, tmp_pos, tmp_data = diff
+                _, prev_type, prev_pos, prev_data = diffs[-1]
+                if prev_type == "deletion":
+                    prev_pos += prev_data
+                if tmp_pos <= prev_pos:
+                    continue
+                diffs.append(diff)
+
+        # Convert the surviving differences into CIGAR ops + descriptions.
+        cigar_descs.append([])
+        prev_diff = None
+        for diff in diffs:
+            diff_id, diff_type, diff_pos, diff_data = diff
+            if prev_diff:
+                prev_diff_id, prev_diff_type, prev_diff_pos, prev_diff_data = prev_diff
+                if prev_diff_type == "deletion":
+                    prev_diff_pos += prev_diff_data
+                assert prev_diff_pos < diff_pos
+            diff_pos2 = diff_pos
+            if diff_type == "deletion":
+                diff_pos2 += diff_data
+            if e_left + tmp_read_len - 1 < diff_pos2 or e[1] < diff_pos2:
+                break            
+            if diff_type == "single":
+                # Sequencing-error mismatches (empty id) count toward the cap.
+                if diff_id == "" and mismatch >= max_mismatch:
+                    continue                
+                cigar_descs[-1].append([diff_pos - tmp_e_left, diff_data, diff_id])
+                tmp_e_left = diff_pos + 1
+                if diff_id == "":
+                    mismatch += 1
+            elif diff_type == "deletion":
+                if len(cigars) <= 0:
+                    continue
+                del_len = diff_data
+                if remain_trans_len < del_len:
+                    continue
+                remain_trans_len -= del_len
+                if diff_pos - e_left > 0:
+                    cigars.append("{}M".format(diff_pos - e_left))
+                    cigar_descs[-1].append([diff_pos - tmp_e_left, "", ""])
+                    cigar_descs.append([])
+                cigars.append("{}D".format(del_len))
+                cigar_descs[-1].append([0, del_len, diff_id])
+                cigar_descs.append([])
+                tmp_read_len -= (diff_pos - e_left)
+                e_left = tmp_e_left = diff_pos + del_len
+            elif diff_type == "insertion":
+                if len(cigars) > 0:
+                    ins_len = len(diff_data)
+                    if e_left + tmp_read_len - 1 < diff_pos + ins_len:
+                        break
+                    if diff_pos - e_left > 0:
+                        cigars.append("{}M".format(diff_pos - e_left))
+                        cigar_descs[-1].append([diff_pos - tmp_e_left, "", ""])
+                        cigar_descs.append([])
+                    cigars.append("{}I".format(ins_len))
+                    cigar_descs[-1].append([0, diff_data, diff_id])
+                    cigar_descs.append([])
+                    tmp_read_len -= (diff_pos - e_left)
+                    tmp_read_len -= ins_len
+                    e_left = tmp_e_left = diff_pos
+            else:
+                assert False
+            prev_diff = diff
+
+        # Close out the match run for this exon.
+        e_right = min(e[1], e_left + tmp_read_len - 1)
+        e_len = e_right - e_left + 1
+        remain_e_len = e_right - tmp_e_left + 1
+        if remain_e_len > 0:
+            cigar_descs[-1].append([remain_e_len, "", ""])
+        if e_len < tmp_read_len:
+            tmp_read_len -= e_len
+            cigars.append(("{}M".format(e_len)))
+        else:
+            assert e_len == tmp_read_len
+            cigars.append(("{}M".format(tmp_read_len)))
+            tmp_read_len = 0
+            break
+        prev_e = e
+
+    # Define MD, XM, NM, Zs, read_seq
+    MD, XM, NM, Zs, read_seq = "", 0, 0, "", ""
+    assert len(cigars) == len(cigar_descs)
+    MD_match_len, Zs_match_len = 0, 0
+    cur_trans_pos = frag_pos
+    for c in range(len(cigars)):
+        cigar = cigars[c]
+        cigar_len, cigar_op = int(cigar[:-1]), cigar[-1]
+        cigar_desc = cigar_descs[c]
+        if cigar_op == 'N':
+            continue
+        if cigar_op == 'M':
+            for add_match_len, alt_base, snp_id in cigar_desc:
+                MD_match_len += add_match_len
+                Zs_match_len += add_match_len
+                assert cur_trans_pos + add_match_len <= len(trans_seq)
+                read_seq += trans_seq[cur_trans_pos:cur_trans_pos+add_match_len]
+                cur_trans_pos += add_match_len
+                if alt_base != "":
+                    if MD_match_len > 0:
+                        MD += ("{}".format(MD_match_len))
+                        MD_match_len = 0
+                    MD += trans_seq[cur_trans_pos]
+                    if snp_id != "":
+                        if Zs != "":
+                            Zs += ","
+                        Zs += ("{}|S|{}".format(Zs_match_len, snp_id))
+                        Zs_match_len = 0
+                    else:
+                        Zs_match_len += 1
+                    # Only sequencing errors (no snp id) count in XM/NM.
+                    if snp_id == "":
+                        XM += 1
+                        NM += 1
+                    read_seq += alt_base
+                    cur_trans_pos += 1
+        elif cigar_op == 'D':
+            assert len(cigar_desc) == 1
+            add_match_len, del_len, snp_id = cigar_desc[0]
+            MD_match_len += add_match_len
+            Zs_match_len += add_match_len
+            if MD_match_len > 0:
+                MD += ("{}".format(MD_match_len))
+                MD_match_len = 0
+            MD += ("^{}".format(trans_seq[cur_trans_pos:cur_trans_pos+cigar_len]))
+            read_seq += trans_seq[cur_trans_pos:cur_trans_pos+add_match_len]
+            if Zs != "":
+                Zs += ","
+            Zs += ("{}|D|{}".format(Zs_match_len, cigar_desc[0][-1]))
+            Zs_match_len = 0
+            cur_trans_pos += cigar_len
+        elif cigar_op == 'I':
+            assert len(cigar_desc) == 1
+            add_match_len, ins_seq, snp_id = cigar_desc[0]
+            ins_len = len(ins_seq)
+            MD_match_len += add_match_len
+            Zs_match_len += add_match_len
+            read_seq += trans_seq[cur_trans_pos:cur_trans_pos+add_match_len]
+            read_seq += ins_seq
+            if Zs != "":
+                Zs += ","
+            Zs += ("{}|I|{}".format(Zs_match_len, cigar_desc[0][-1]))
+            Zs_match_len = 0
+        else:
+            assert False
+
+    if MD_match_len > 0:
+        MD += ("{}".format(MD_match_len))
+
+    # Internal consistency check: simulated read must have the asked length.
+    if len(read_seq) != read_len:
+        print >> sys.stderr, "read length differs:", len(read_seq), "vs.", read_len
+        print >> sys.stderr, pos, "".join(cigars), cigar_descs, MD, XM, NM, Zs
+        assert False
+
+    return pos, cigars, cigar_descs, MD, XM, NM, Zs, read_seq
+
+
+"""
+"""
+cigar_re = re.compile('\d+\w')
+def samRepOk(genome_seq, read_seq, chr, pos, cigar, XM, NM, MD, Zs, max_mismatch):
+    """Sanity-check one simulated alignment.
+
+    Re-derives MD, XM and NM from the reference, the read, the CIGAR string
+    and the Zs (known-variant) annotations, then asserts they match the
+    values produced by getSamAlignment.  Aborts via assert on mismatch.
+    """
+    assert chr in genome_seq
+    chr_seq = genome_seq[chr]
+    assert pos < len(chr_seq)
+
+    # Calculate XM and NM based on Cigar and Zs
+    cigars = cigar_re.findall(cigar)
+    cigars = [[int(cigars[i][:-1]), cigars[i][-1]] for i in range(len(cigars))]
+    ref_pos, read_pos = pos, 0
+    # Build aligned ref/read sequences plus per-column relation markers
+    # ('=' match, 'X' mismatch, '-' gap).
+    ann_ref_seq, ann_ref_rel, ann_read_seq, ann_read_rel = [], [], [], []
+    for i in range(len(cigars)):
+        cigar_len, cigar_op = cigars[i]
+        if cigar_op == "M":
+            partial_ref_seq = chr_seq[ref_pos:ref_pos+cigar_len]
+            partial_read_seq = read_seq[read_pos:read_pos+cigar_len]
+            assert len(partial_ref_seq) == len(partial_read_seq)
+            ann_ref_seq += list(partial_ref_seq)
+            ann_read_seq += list(partial_read_seq)
+            for j in range(len(partial_ref_seq)):
+                if partial_ref_seq[j] == partial_read_seq[j]:
+                    ann_ref_rel.append("=")
+                    ann_read_rel.append("=")
+                else:
+                    ann_ref_rel.append("X")
+                    ann_read_rel.append("X")
+            ref_pos += cigar_len
+            read_pos += cigar_len
+        elif cigar_op == "D":
+            partial_ref_seq = chr_seq[ref_pos:ref_pos+cigar_len]
+            ann_ref_rel += list(partial_ref_seq)
+            ann_ref_seq += list(partial_ref_seq)
+            ann_read_rel += (["-"] * cigar_len)
+            ann_read_seq += (["-"] * cigar_len)
+            ref_pos += cigar_len
+        elif cigar_op == "I":
+            partial_read_seq = read_seq[read_pos:read_pos+cigar_len]
+            ann_ref_rel += (["-"] * cigar_len)
+            ann_ref_seq += (["-"] * cigar_len)
+            ann_read_rel += list(partial_read_seq)
+            ann_read_seq += list(partial_read_seq) 
+            read_pos += cigar_len
+        elif cigar_op == "N":
+            ref_pos += cigar_len
+        else:
+            assert False
+    
+    assert len(ann_ref_seq) == len(ann_read_seq)
+    assert len(ann_ref_seq) == len(ann_ref_rel)
+    assert len(ann_ref_seq) == len(ann_read_rel)
+    # Mark columns explained by a known variant ("1") so they are excluded
+    # from the recomputed XM/NM counts below.
+    ann_Zs_seq = ["0" for i in range(len(ann_ref_seq))]
+
+    Zss, Zs_i, snp_pos_add = [], 0, 0
+    if Zs != "":
+        Zss = Zs.split(',')
+        Zss = [zs.split('|') for zs in Zss]
+
+    ann_read_pos = 0
+    for zs in Zss:
+        zs_pos, zs_type, zs_id = zs
+        zs_pos = int(zs_pos)
+        # Advance zs_pos read bases (skipping deletion gap columns).
+        for i in range(zs_pos):
+            while ann_read_rel[ann_read_pos] == '-':
+                ann_read_pos += 1
+            ann_read_pos += 1
+        if zs_type == "S":
+            ann_Zs_seq[ann_read_pos] = "1"
+            ann_read_pos += 1
+        elif zs_type == "D":
+            while ann_read_rel[ann_read_pos] == '-':
+                ann_Zs_seq[ann_read_pos] = "1"
+                ann_read_pos += 1
+        elif zs_type == "I":
+            while ann_ref_rel[ann_read_pos] == '-':
+                ann_Zs_seq[ann_read_pos] = "1"
+                ann_read_pos += 1
+        else:
+            assert False
+
+    # Recompute MD/XM/NM from the annotated alignment columns.
+    tMD, tXM, tNM = "", 0, 0
+    match_len = 0
+    i = 0
+    while i < len(ann_ref_seq):
+        if ann_ref_rel[i] == "=":
+            assert ann_read_rel[i] == "="
+            match_len += 1
+            i += 1
+            continue
+        assert ann_read_rel[i] != "="
+        if ann_ref_rel[i] == "X" and ann_read_rel[i] == "X":
+            if match_len > 0:
+                tMD += ("{}".format(match_len))
+                match_len = 0
+            tMD += ann_ref_seq[i]
+            if ann_Zs_seq[i] == "0":
+                tXM += 1
+                tNM += 1
+            i += 1
+        else:
+            assert ann_ref_rel[i] == "-" or ann_read_rel[i] == "-"
+            if ann_ref_rel[i] == '-':
+                # Insertion relative to the reference.
+                while ann_ref_rel[i] == '-':
+                    if ann_Zs_seq[i] == "0":
+                        tNM += 1
+                    i += 1
+            else:
+                assert ann_read_rel[i] == '-'
+                del_seq = ""
+                while  ann_read_rel[i] == '-':
+                    del_seq += ann_ref_seq[i]
+                    if ann_Zs_seq[i] == "0":
+                        tNM += 1
+                    i += 1
+                if match_len > 0:
+                    tMD += ("{}".format(match_len))
+                    match_len = 0
+                tMD += ("^{}".format(del_seq))
+
+    if match_len > 0:
+        tMD += ("{}".format(match_len))
+
+    # Any disagreement with the values from getSamAlignment is fatal.
+    if tMD != MD or tXM != XM or tNM != NM or XM > max_mismatch or XM != NM:
+        print >> sys.stderr, chr, pos, cigar, MD, XM, NM, Zs
+        print >> sys.stderr, tMD, tXM, tNM
+        assert False
+        
+        
+"""
+"""
+def simulate_reads(index_fname, base_fname, \
+                       dna, paired_end, read_len, frag_len, \
+                       num_frag, expr_profile_type, error_rate, max_mismatch, \
+                       random_seed, sanity_check, verbose):
+    random.seed(random_seed)
+    
+    # Current script directory
+    curr_script = os.path.realpath(inspect.getsourcefile(simulate_reads))
+    ex_path = os.path.dirname(curr_script)
+    centrifuge_inspect = os.path.join(ex_path, "../centrifuge-inspect")
+
+    err_rand_src = ErrRandomSource(error_rate / 100.0)
+    
+    if read_len > frag_len:
+        frag_len = read_len
+
+    # Read taxonomic IDs
+    seq2texID = {}
+    tax_cmd = [centrifuge_inspect,
+               "--conversion-table",
+               index_fname]
+    tax_proc = subprocess.Popen(tax_cmd, stdout=subprocess.PIPE)
+    for line in tax_proc.stdout:
+        seq_id, tax_id = line.strip().split()
+        seq2texID[seq_id] = tax_id
+
+    # Read names
+    names = {}
+    name_cmd = [centrifuge_inspect,
+                "--name-table",
+                index_fname]
+    name_proc = subprocess.Popen(name_cmd, stdout=subprocess.PIPE)
+    for line in name_proc.stdout:
+        tax_id, name = line.strip().split('\t')
+        names[tax_id] = name
+
+    # Genome sizes
+    sizes = {}
+    size_cmd = [centrifuge_inspect,
+                "--size-table",
+                index_fname]
+    size_proc = subprocess.Popen(size_cmd, stdout=subprocess.PIPE)
+    for line in size_proc.stdout:
+        tax_id, size = line.strip().split('\t')
+        sizes[tax_id] = int(size)
+
+    # Read genome sequences into memory
+    genomes_fname = index_fname + ".fa"
+    if not os.path.exists(genomes_fname):
+        print >> sys.stderr, "Extracting genomes from Centrifuge index to %s, which may take a few hours ..."  % (genomes_fname)
+        extract_cmd = [centrifuge_inspect,
+                       index_fname]
+        extract_proc = subprocess.Popen(extract_cmd, stdout=open(genomes_fname, 'w'))
+        extract_proc.communicate()
+    genome_seqs = read_genomes(open(genomes_fname), seq2texID)
+
+    if dna:
+        genes, transcripts = {}, {}
+    else:
+        genes, transcripts = read_transcript(genome_seqs, gtf_file, frag_len)
+        
+    if sanity_check:
+        sanity_check_input(genomes_seq, genes, transcripts, frag_len)
+
+    if dna:
+        expr_profile = generate_dna_expr_profile(expr_profile_type, min(len(genome_seqs), 100))
+    else:
+        num_transcripts = min(len(transcripts), 10000)
+        expr_profile = generate_rna_expr_profile(expr_profile_type, num_transcripts)
+
+    expr_profile = [int(expr_profile[i] * num_frag) for i in range(len(expr_profile))]
+    assert num_frag >= sum(expr_profile)
+    while sum(expr_profile) < num_frag:
+        for i in range(min(num_frag - sum(expr_profile), len(expr_profile))):
+            expr_profile[i] += 1
+    assert num_frag == sum(expr_profile)
+
+    if dna:
+        genome_ids = genome_seqs.keys()
+    else:
+        transcript_ids = transcripts.keys()
+        random.shuffle(transcript_ids)
+        assert len(transcript_ids) >= len(expr_profile)
+
+    # Truth table
+    truth_file = open(base_fname + ".truth", "w")
+    print >> truth_file, "taxID\tgenomeLen\tnumReads\tabundance\tname"
+    truth_list = []
+    normalized_sum = 0.0
+    debug_num_frag = 0
+    for t in range(len(expr_profile)):
+        t_num_frags = expr_profile[t]
+        if dna:
+            tax_id = genome_ids[t]
+        else:
+            transcript_id = transcript_ids[t]
+            chr, strand, transcript_len, exons = transcripts[transcript_id]
+        assert tax_id in genome_seqs and tax_id in sizes
+        genome_len = sizes[tax_id]
+        raw_abundance = float(t_num_frags)/num_frag
+        normalized_sum += (raw_abundance / genome_len)
+        truth_list.append([tax_id, genome_len, t_num_frags, raw_abundance])
+        debug_num_frag += t_num_frags
+    assert debug_num_frag == num_frag
+    for truth in truth_list:
+        tax_id, genome_len, t_num_frags, raw_abundance = truth
+        can_tax_id = tax_id
+        if '.' in can_tax_id:
+            can_tax_id = can_tax_id.split('.')[0]
+        name = "N/A"        
+        if can_tax_id in names:
+            name = names[can_tax_id]
+        abundance = raw_abundance / genome_len / normalized_sum
+        print >> truth_file, "{}\t{}\t{}\t{:.6}\t{}".format(tax_id, genome_len, t_num_frags, abundance, name)
+    truth_file.close()
+
+    # Sequence Classification Map (SCM) - something I made up ;-)
+    scm_file = open(base_fname + ".scm", "w")
+
+    # Write SCM header
+    print >> scm_file, "@HD\tVN:1.0\tSO:unsorted"
+    for tax_id in genome_seqs.keys():
+        name = ""
+        if tax_id in names:
+            name = names[tax_id]
+        print >> scm_file, "@SQ\tTID:%s\tSN:%s\tLN:%d" % (tax_id, name, len(genome_seqs[tax_id]))
+
+    read_file = open(base_fname + "_1.fa", "w")
+    if paired_end:
+        read2_file = open(base_fname + "_2.fa", "w")
+
+    cur_read_id = 1
+    for t in range(len(expr_profile)):
+        t_num_frags = expr_profile[t]
+        if dna:
+            tax_id = genome_ids[t]
+            print >> sys.stderr, "TaxID: %s, num fragments: %d" % (tax_id, t_num_frags)
+        else:
+            transcript_id = transcript_ids[t]
+            chr, strand, transcript_len, exons = transcripts[transcript_id]
+            print >> sys.stderr, transcript_id, t_num_frags
+
+        genome_seq = genome_seqs[tax_id]
+        genome_len = len(genome_seq)
+        if dna:
+            t_seq = genome_seq
+            exons = [[0, genome_len - 1]]
+        else:            
+            t_seq = ""
+            for e in exons:
+                assert e[0] < e[1]
+                t_seq += genome_seq[e[0]:e[1]+1]
+            assert len(t_seq) == transcript_len
+            
+        for f in range(t_num_frags):
+            if dna:
+                while True:
+                    frag_pos = random.randint(0, genome_len - frag_len)
+                    if 'N' not in genome_seq[frag_pos:frag_pos + frag_len]:
+                        break
+            else:
+                frag_pos = random.randint(0, transcript_len - frag_len)
+
+            pos, cigars, cigar_descs, MD, XM, NM, Zs, read_seq = getSamAlignment(dna, exons, genome_seq, t_seq, frag_pos, read_len, err_rand_src, max_mismatch)
+            pos2, cigars2, cigar2_descs, MD2, XM2, NM2, Zs2, read2_seq = getSamAlignment(dna, exons, genome_seq, t_seq, frag_pos+frag_len-read_len, read_len, err_rand_src, max_mismatch)
+            cigar_str, cigar2_str = "".join(cigars), "".join(cigars2)
+            if sanity_check:
+                samRepOk(genome_seq, read_seq, chr, pos, cigar_str, XM, NM, MD, Zs, max_mismatch)
+                samRepOk(genome_seq, read2_seq, chr, pos2, cigar2_str, XM2, NM2, MD2, Zs2, max_mismatch)
+
+            if Zs != "":
+                Zs = ("\tZs:Z:{}".format(Zs))
+            if Zs2 != "":
+                Zs2 = ("\tZs:Z:{}".format(Zs2))
+            
+            if dna:
+                XS, TI = "", ""                
+            else:
+                XS = "\tXS:A:{}".format(strand)
+                TI = "\tTI:Z:{}".format(transcript_id)                
+
+            print >> read_file, ">{}".format(cur_read_id)
+            print >> read_file, read_seq
+            output = "{}\t{}\t{}\t{}\tNM:i:{}\tMD:Z:{}".format(cur_read_id, tax_id, pos + 1, cigar_str, NM, MD)
+            if paired_end:
+                print >> read2_file, ">{}".format(cur_read_id)
+                print >> read2_file, reverse_complement(read2_seq)
+                output += "\t{}\t{}\tNM2:i:{}\tMD2:Z:{}".format(pos2 + 1, cigar2_str, NM2, MD2)
+            print >> scm_file, output
+                
+            cur_read_id += 1
+            
+    scm_file.close()
+    read_file.close()
+    if paired_end:
+        read2_file.close()
+
+
+if __name__ == '__main__':
+    # Command-line entry point: build the option parser, validate the
+    # arguments by hand, and delegate all work to simulate_reads().
+    parser = ArgumentParser(
+        description='Simulate reads from Centrifuge index')
+    parser.add_argument('index_fname',
+                        nargs='?',
+                        type=str,
+                        help='Centrifuge index')
+    # The GTF-file argument below is disabled; it is kept as an inert
+    # string literal rather than deleted.
+    """
+    parser.add_argument('gtf_file',
+                        nargs='?',
+                        type=FileType('r'),
+                        help='input GTF file')
+    """
+    parser.add_argument('base_fname',
+                        nargs='?',
+                        type=str,
+                        help='output base filename')
+    parser.add_argument('--rna',
+                        dest='dna',
+                        action='store_false',
+                        default=True,
+                        help='RNA-seq reads (default: DNA-seq reads)')
+    parser.add_argument('--single-end',
+                        dest='paired_end',
+                        action='store_false',
+                        default=True,
+                        help='single-end reads (default: paired-end reads)')
+    parser.add_argument('-r', '--read-length',
+                        dest='read_len',
+                        action='store',
+                        type=int,
+                        default=100,
+                        help='read length (default: 100)')
+    parser.add_argument('-f', '--fragment-length',
+                        dest='frag_len',
+                        action='store',
+                        type=int,
+                        default=250,
+                        help='fragment length (default: 250)')
+    parser.add_argument('-n', '--num-fragment',
+                        dest='num_frag',
+                        action='store',
+                        type=int,
+                        default=1000000,
+                        help='number of fragments (default: 1000000)')
+    parser.add_argument('-e', '--expr-profile',
+                        dest='expr_profile',
+                        action='store',
+                        type=str,
+                        default='flux',
+                        help='expression profile: flux or constant (default: flux)')
+    parser.add_argument('--error-rate',
+                        dest='error_rate',
+                        action='store',
+                        type=float,
+                        default=0.0,
+                        help='per-base sequencing error rate (%%) (default: 0.0)')
+    parser.add_argument('--max-mismatch',
+                        dest='max_mismatch',
+                        action='store',
+                        type=int,
+                        default=3,
+                        help='max mismatches due to sequencing errors (default: 3)')
+    parser.add_argument('--random-seed',
+                        dest='random_seed',
+                        action='store',
+                        type=int,
+                        default=0,
+                        help='random seeding value (default: 0)')
+    parser.add_argument('--sanity-check',
+                        dest='sanity_check',
+                        action='store_true',
+                        help='sanity check')
+    parser.add_argument('-v', '--verbose',
+                        dest='verbose',
+                        action='store_true',
+                        help='also print some statistics to stderr')
+    parser.add_argument('--version', 
+                        action='version',
+                        version='%(prog)s 2.0.0-alpha')
+    args = parser.parse_args()
+    # index_fname uses nargs='?', so argparse accepts its absence; check here.
+    if not args.index_fname:
+        parser.print_help()
+        exit(1)
+    # --rna is accepted by the parser, but the RNA simulation path is
+    # not implemented yet, so it is rejected explicitly.
+    if not args.dna:
+        print >> sys.stderr, "Error: --rna is not implemented."
+        exit(1)
+    # if args.dna:
+    #    args.expr_profile = "constant"
+    simulate_reads(args.index_fname, args.base_fname, \
+                       args.dna, args.paired_end, args.read_len, args.frag_len, \
+                       args.num_frag, args.expr_profile, args.error_rate, args.max_mismatch, \
+                       args.random_seed, args.sanity_check, args.verbose)
diff --git a/evaluation/test/centrifuge_evaluate_mason.py b/evaluation/test/centrifuge_evaluate_mason.py
new file mode 100755
index 0000000..2e63799
--- /dev/null
+++ b/evaluation/test/centrifuge_evaluate_mason.py
@@ -0,0 +1,187 @@
+#!/usr/bin/env python
+
+import sys, os, subprocess, inspect
+import platform, multiprocessing
+import string, re
+from datetime import datetime, date, time
+import copy
+from argparse import ArgumentParser, FileType
+
+
+"""
+"""
def read_taxonomy_tree(tax_file):
    """Parse a Centrifuge taxonomy-tree dump into a dictionary.

    tax_file: iterable of tab-delimited lines shaped like
        tax_id <tab> | <tab> parent_tax_id <tab> | <tab> rank
    Returns: dict mapping tax_id -> [parent_tax_id, rank].
    """
    tree = {}
    for raw_line in tax_file:
        cols = raw_line.strip().split('\t')
        assert len(cols) == 5
        node_id, parent_id, node_rank = cols[0], cols[2], cols[4]
        # Each taxonomic node may appear at most once in the dump.
        assert node_id not in tree
        tree[node_id] = [parent_id, node_rank]
    return tree
+
+
+"""
+"""
+def compare_scm(centrifuge_out, true_out, taxonomy_tree, rank):
+    """Compare Centrifuge classifications against the truth at one rank.
+
+    centrifuge_out: path to Centrifuge's tabular output (first line = header).
+    true_out: path to the truth file ('@'-prefixed lines are headers).
+    taxonomy_tree: dict tax_id -> [parent_tax_id, rank] (read_taxonomy_tree).
+    rank: rank to evaluate at (e.g. "genus"), or "strain" for leaf level.
+
+    Returns (classified, unique_classified, unclassified,
+             number of reads Centrifuge classified, raw_unique_classified).
+    """
+    # NOTE(review): higher_ranked is never populated below (the only
+    # assignment is commented out), so the "not in higher_ranked" checks
+    # are currently no-ops.
+    higher_ranked = {}
+        
+    # Collect every tax id that appears as a parent on some path to the
+    # root; i.e. the internal (non-leaf) nodes of the taxonomy.
+    ancestors = set()
+    for tax_id in taxonomy_tree.keys():
+        if tax_id in ancestors:
+            continue
+        while True:
+            parent_tax_id, cur_rank = taxonomy_tree[tax_id]
+            if parent_tax_id in ancestors:
+                break
+            if tax_id == parent_tax_id:
+                # Root points at itself; stop the climb.
+                break
+            tax_id = parent_tax_id
+            ancestors.add(tax_id)
+
+    # db_dic: read name -> set of tax ids assigned at the requested rank.
+    db_dic = {}
+    first = True
+    for line in open(centrifuge_out):
+        if first:
+            # Skip the single header line of the Centrifuge report.
+            first = False
+            continue
+        read_name, seq_id, tax_id, score, _, _, _ = line.strip().split('\t')
+
+        # Traverse up taxonomy tree to match the given rank parameter
+        rank_tax_id = tax_id
+        if rank != "strain":
+            while True:
+                if tax_id not in taxonomy_tree:
+                    rank_tax_id = ""
+                    break
+                parent_tax_id, cur_rank = taxonomy_tree[tax_id]
+                if cur_rank == rank:
+                    rank_tax_id = tax_id
+                    break
+                if tax_id == parent_tax_id:
+                    # Reached the root without finding the rank.
+                    rank_tax_id = ""
+                    break
+                tax_id = parent_tax_id
+        else:
+            assert rank == "strain"
+            # At strain level, ignore reads assigned to internal nodes.
+            if tax_id in ancestors:
+                continue
+
+        if rank_tax_id == "":
+            # No node of the requested rank on this lineage; drop it.
+            # higher_ranked[read_name] = True            
+            continue
+        
+        # A read may receive several assignments; keep them as a set.
+        if read_name not in db_dic:
+            db_dic[read_name] = set()
+        db_dic[read_name].add(rank_tax_id)
+
+    # Walk the truth file and score each read against db_dic.
+    classified, unclassified, unique_classified = 0, 0, 0
+    for line in open(true_out):
+        if line.startswith('@'):
+            continue
+
+        fields = line.strip().split('\t')
+        if len(fields) != 3:
+            print >> sys.stderr, "Warning: %s missing" % (line.strip())
+            continue
+        read_name, tax_id = fields[1:3] 
+        # Traverse up taxonomy tree to match the given rank parameter
+        # (the truth ids are used as-is here; no climb is performed).
+        rank_tax_id = tax_id
+        if read_name not in db_dic:
+            unclassified += 1
+            continue
+
+        maps = db_dic[read_name]
+        if rank_tax_id in maps:
+            classified += 1
+            if len(maps) == 1 and read_name not in higher_ranked:
+                unique_classified += 1
+        else:
+            unclassified += 1
+            # daehwan - for debugging purposes
+            # print read_name
+
+    # Reads that Centrifuge assigned to exactly one taxon, regardless of
+    # whether that assignment matches the truth (precision denominator).
+    raw_unique_classified = 0
+    for read_name, maps in db_dic.items():
+        if len(maps) == 1 and read_name not in higher_ranked:
+            raw_unique_classified += 1
+    return classified, unique_classified, unclassified, len(db_dic), raw_unique_classified
+
+
+"""
+"""
+def evaluate():
+    """Run Centrifuge on simulated reads and report genus-level accuracy.
+
+    Expects a prebuilt 3-part index under <script_dir>/../indexes/Centrifuge
+    and the files bacteria_sim10K.fa / bacteria_sim10K.truth in the current
+    working directory.  Prints sensitivity/precision to stderr.
+    """
+    # Current script directory
+    curr_script = os.path.realpath(inspect.getsourcefile(evaluate))
+    path_base = os.path.dirname(curr_script) + "/.."
+
+    # index_base = "b_compressed"
+    index_base = "b+h+v"
+    # index_base = "centrifuge_Dec_Bonly"
+
+    # True iff every path in fnames exists on disk.
+    def check_files(fnames):
+        for fname in fnames:
+            if not os.path.exists(fname):
+                return False
+        return True
+
+    # Check if indexes exists, otherwise create indexes
+    index_path = "%s/indexes/Centrifuge" % path_base
+    # index_path = "."
+    if not os.path.exists(path_base + "/indexes"):
+        os.mkdir(path_base + "/indexes")
+    if not os.path.exists(index_path):
+        os.mkdir(index_path)
+    # Centrifuge indexes are split into three .cf files: <base>.1.cf .. .3.cf
+    index_fnames = ["%s/%s.%d.cf" % (index_path, index_base, i+1) for i in range(3)]
+    assert check_files(index_fnames)
+
+    # Read taxonomic IDs
+    # (via centrifuge-inspect's sequence-name -> tax-id conversion table)
+    centrifuge_inspect = os.path.join(path_base, "../centrifuge-inspect")
+    tax_ids = set()
+    tax_cmd = [centrifuge_inspect,
+               "--conversion-table",
+               "%s/%s" % (index_path, index_base)]
+    tax_proc = subprocess.Popen(tax_cmd, stdout=subprocess.PIPE)
+    for line in tax_proc.stdout:
+        _, tax_id = line.strip().split()
+        tax_ids.add(tax_id)
+    tax_ids = list(tax_ids)
+
+    # Read taxonomic tree
+    tax_tree_cmd = [centrifuge_inspect,
+                    "--taxonomy-tree",
+                    "%s/%s" % (index_path, index_base)]    
+    tax_tree_proc = subprocess.Popen(tax_tree_cmd, stdout=subprocess.PIPE, stderr=open("/dev/null", 'w'))
+    taxonomy_tree = read_taxonomy_tree(tax_tree_proc.stdout)
+
+    read_fname = "bacteria_sim10K.fa"
+    scm_fname = "bacteria_sim10K.truth"
+    read_fnames = [read_fname, scm_fname]
+
+    # Classify the simulated reads with a single-threaded Centrifuge run.
+    program_bin_base = "%s/.." % path_base
+    centrifuge_cmd = ["%s/centrifuge" % program_bin_base,
+                      # "-k", "20",
+                      # "--min-hitlen", "15",
+                      "-f",
+                      "-p", "1",
+                      "%s/%s" % (index_path, index_base),
+                      read_fname]
+
+    print >> sys.stderr, '\t'.join(centrifuge_cmd)
+
+    out_fname = "centrifuge.output"
+    proc = subprocess.Popen(centrifuge_cmd, stdout=open(out_fname, "w"), stderr=subprocess.PIPE)
+    proc.communicate()
+
+    # Score the run against the truth file at genus rank; see compare_scm.
+    classified, unique_classified, unclassified, raw_classified, raw_unique_classified = \
+        compare_scm(out_fname, scm_fname, taxonomy_tree, "genus")
+    num_cases = classified + unclassified
+
+    print >> sys.stderr, "\t\t\tsensitivity: {:,} / {:,} ({:.2%})".format(classified, num_cases, float(classified) / num_cases)
+    print >> sys.stderr, "\t\t\tprecision  : {:,} / {:,} ({:.2%})".format(classified, raw_classified, float(classified) / raw_classified)
+    print >> sys.stderr, "\t\t\t\t\tsensitivity: {:,} / {:,} ({:.2%})".format(unique_classified, num_cases, float(unique_classified) / num_cases)
+    print >> sys.stderr, "\t\t\t\t\tprecision  : {:,} / {:,} ({:.2%})".format(unique_classified, raw_unique_classified, float(unique_classified) / raw_unique_classified)
+
+
+if __name__ == "__main__":
+    evaluate()
diff --git a/example/index/test.1.cf b/example/index/test.1.cf
new file mode 100644
index 0000000..b913024
Binary files /dev/null and b/example/index/test.1.cf differ
diff --git a/example/index/test.2.cf b/example/index/test.2.cf
new file mode 100644
index 0000000..127ca47
Binary files /dev/null and b/example/index/test.2.cf differ
diff --git a/example/index/test.3.cf b/example/index/test.3.cf
new file mode 100644
index 0000000..0ecb193
Binary files /dev/null and b/example/index/test.3.cf differ
diff --git a/example/reads/input.fa b/example/reads/input.fa
new file mode 100644
index 0000000..6e93cb5
--- /dev/null
+++ b/example/reads/input.fa
@@ -0,0 +1,24 @@
+>C_1
+GATCCTCCCCAGGCCCCTACACCCAATGTGGAACCGGGGTCCCGAATGAAAATGCTGCTGTTCCCTGGAGGTGTTTTCCT
+>C_2
+GATCCTCCCCAGGCCCCTACACCCAATGTGGAACCGGGGTCCCGAATGAAAATGCTGCTGTTCCCTGGAGGTGTTTTCCT
+>C_3
+GATCCTCCCCAGGCCCCTACACCCAATGTGGAACCGGGGTCCCGAATGAAAATGCTGCTGTTCCCTGGAGGTGTTTTCCT
+>C_4
+GATCCTCCCCAGGCCCCTACACCCAATGTGGAACCGGGGTCCCGAATGAAAATGCTGCTGTTCCCTGGAGGTGTTTTCCT
+>1_1
+GGACGCTCTGCTTTGTTACCAATGAGAAGGGCGCTGAATCCTCGAAAATCCTGACCCTTTTAATTCATGCTCCCTTACTC
+>1_2
+ACGAGAGATGATGATCGTTGATATTTCCCTGGACTGTGTGGGGTCTCAGAGACCACTATGGGGCACTCTCGTCAGGCTTC
+>2_1
+TGGCCGGGCAGATGCAAAGCCTGGTGATGCAGAGTCGGGCAAAGGCGCAGCCTTCGTGTCCAAGCAGGAGGGCAGCGAGG
+>2_2
+TGGCCGGGCAGATGCAAAGCCTGGTGATGCAGAGTCGGGCAAAGGCGCAGCCTTCGTGTCCAAGCAGGAGGGCAGCGAGG
+>2_3
+TGGCCGGGCAGATGCAAAGCCTGGTGATGCAGAGTCGGGCAAAGGCGCAGCCTTCGTGTCCAAGCAGGAGGGCAGCGAGG
+>2_4
+TGGCCGGGCAGATGCAAAGCCTGGTGATGCAGAGTCGGGCAAAGGCGCAGCCTTCGTGTCCAAGCAGGAGGGCAGCGAGG
+>2_5
+TGGCCGGGCAGATGCAAAGCCTGGTGATGCAGAGTCGGGCAAAGGCGCAGCCTTCGTGTCCAAGCAGGAGGGCAGCGAGG
+>2_6
+TGGCCGGGCAGATGCAAAGCCTGGTGATGCAGAGTCGGGCAAAGGCGCAGCCTTCGTGTCCAAGCAGGAGGGCAGCGAGG
diff --git a/example/reference/gi_to_tid.dmp b/example/reference/gi_to_tid.dmp
new file mode 100644
index 0000000..a8a2c04
--- /dev/null
+++ b/example/reference/gi_to_tid.dmp
@@ -0,0 +1,2 @@
+gi|4	9646
+gi|7	9913
diff --git a/example/reference/names.dmp b/example/reference/names.dmp
new file mode 100644
index 0000000..58701ba
--- /dev/null
+++ b/example/reference/names.dmp
@@ -0,0 +1,90 @@
+1	|	all	|		|	synonym	|
+1	|	root	|		|	scientific name	|
+2759	|	Eucarya	|		|	synonym	|
+2759	|	Eucaryotae	|		|	synonym	|
+2759	|	Eukarya	|		|	synonym	|
+2759	|	Eukaryota	|		|	scientific name	|
+2759	|	Eukaryotae	|		|	synonym	|
+2759	|	eucaryotes	|		|	genbank common name	|
+2759	|	eukaryotes	|		|	common name	|
+2759	|	eukaryotes	|	eukaryotes<blast2759>	|	blast name	|
+6072	|	Eumetazoa	|		|	scientific name	|
+7711	|	Chordata	|		|	scientific name	|
+7711	|	chordates	|		|	genbank common name	|
+7711	|	chordates	|	chordates<blast7711>	|	blast name	|
+7742	|	Vertebrata	|	Vertebrata <Metazoa>	|	scientific name	|
+7742	|	Vertebrata Cuvier, 1812	|		|	authority	|
+7742	|	vertebrates	|		|	genbank common name	|
+7742	|	vertebrates	|	vertebrates<blast7742>	|	blast name	|
+7776	|	Gnathostomata	|	Gnathostomata <vertebrate>	|	scientific name	|
+7776	|	jawed vertebrates	|		|	genbank common name	|
+8287	|	Sarcopterygii	|		|	scientific name	|
+9347	|	Eutheria	|		|	scientific name	|
+9347	|	Placentalia	|		|	synonym	|
+9347	|	eutherian mammals	|		|	common name	|
+9347	|	placental mammals	|		|	common name	|
+9347	|	placentals	|		|	genbank common name	|
+9347	|	placentals	|	placentals <blast9347>	|	blast name	|
+9632	|	Ursidae	|		|	scientific name	|
+9632	|	bears	|		|	genbank common name	|
+9645	|	Ailuropoda	|		|	scientific name	|
+9646	|	Ailuropoda melanoleuca	|		|	scientific name	|
+9646	|	Ailuropoda melanoleuca (David, 1869)	|		|	authority	|
+9646	|	Ailuropoda melanoleura	|		|	misspelling	|
+9646	|	giant panda	|		|	genbank common name	|
+9845	|	Artiodactyla	|	Artiodactyla <Ruminantia>	|	in-part	|
+9845	|	Ruminantia	|		|	scientific name	|
+9895	|	Bovidae	|		|	scientific name	|
+9903	|	Bos	|		|	scientific name	|
+9903	|	oxen, cattle	|		|	genbank common name	|
+9913	|	Bos Tauurus	|		|	misspelling	|
+9913	|	Bos bovis	|		|	synonym	|
+9913	|	Bos primigenius taurus	|		|	synonym	|
+9913	|	Bos taurus	|		|	scientific name	|
+9913	|	Bos taurus Linnaeus, 1758	|		|	authority	|
+9913	|	Bovidae sp. Adi Nefas	|		|	includes	|
+9913	|	bovine	|		|	common name	|
+9913	|	cattle	|		|	genbank common name	|
+9913	|	cow	|		|	common name	|
+9913	|	domestic cattle	|		|	common name	|
+9913	|	domestic cow	|		|	common name	|
+27592	|	Bovinae	|		|	scientific name	|
+32523	|	Tetrapoda	|		|	scientific name	|
+32523	|	tetrapods	|		|	genbank common name	|
+32524	|	Amniota	|		|	scientific name	|
+32524	|	amniotes	|		|	genbank common name	|
+32525	|	Theria	|	Theria <Mammalia>	|	scientific name	|
+32525	|	Theria Parker & Haswell, 1897	|		|	authority	|
+33154	|	Fungi/Metazoa group	|		|	synonym	|
+33154	|	Opisthokonta	|		|	scientific name	|
+33154	|	Opisthokonta Cavalier-Smith 1987	|		|	authority	|
+33154	|	opisthokonts	|		|	synonym	|
+33208	|	Animalia	|		|	synonym	|
+33208	|	Metazoa	|		|	scientific name	|
+33208	|	animals	|		|	blast name	|
+33208	|	metazoans	|		|	genbank common name	|
+33208	|	multicellular animals	|		|	common name	|
+33213	|	Bilateria	|		|	scientific name	|
+33511	|	Deuterostomia	|		|	scientific name	|
+33511	|	deuterostomes	|		|	common name	|
+33554	|	Carnivora	|		|	scientific name	|
+33554	|	carnivores	|		|	genbank common name	|
+33554	|	carnivores	|	carnivores <blast33554>	|	blast name	|
+35500	|	Pecora	|		|	scientific name	|
+40674	|	Mammalia	|		|	scientific name	|
+40674	|	mammals	|		|	genbank common name	|
+40674	|	mammals	|	mammals<blast40674>	|	blast name	|
+89593	|	Craniata	|	Craniata <chordata>	|	scientific name	|
+91561	|	Cetartiodactyla	|		|	scientific name	|
+91561	|	even-toed ungulates	|		|	blast name	|
+91561	|	whales, hippos, ruminants, pigs, camels etc.	|		|	genbank common name	|
+117570	|	Teleostomi	|		|	scientific name	|
+117571	|	Euteleostomi	|		|	scientific name	|
+117571	|	bony vertebrates	|		|	genbank common name	|
+131567	|	biota	|		|	synonym	|
+131567	|	cellular organisms	|		|	scientific name	|
+314145	|	Laurasiatheria	|		|	scientific name	|
+379584	|	Caniformia	|		|	scientific name	|
+1338369	|	Dipnotetrapodomorpha	|		|	scientific name	|
+1437010	|	Boreoeutheria	|		|	scientific name	|
+1437010	|	Boreotheria	|		|	synonym	|
diff --git a/example/reference/nodes.dmp b/example/reference/nodes.dmp
new file mode 100644
index 0000000..42ddf17
--- /dev/null
+++ b/example/reference/nodes.dmp
@@ -0,0 +1,35 @@
+1	|	1	|	no rank
+2759	|	131567	|	superkingdom
+6072	|	33208	|	no rank
+7711	|	33511	|	phylum
+7742	|	89593	|	no rank
+7776	|	7742	|	no rank
+8287	|	117571	|	no rank
+9347	|	32525	|	no rank
+9632	|	379584	|	family
+9645	|	9632	|	genus
+9646	|	9645	|	species
+9845	|	91561	|	suborder
+9895	|	35500	|	family
+9903	|	27592	|	genus
+9913	|	9903	|	species
+27592	|	9895	|	subfamily
+32523	|	1338369	|	no rank
+32524	|	32523	|	no rank
+32525	|	40674	|	no rank
+33154	|	2759	|	no rank
+33208	|	33154	|	kingdom
+33213	|	6072	|	no rank
+33511	|	33213	|	no rank
+33554	|	314145	|	order
+35500	|	9845	|	infraorder
+40674	|	32524	|	class
+89593	|	7711	|	subphylum
+91561	|	314145	|	no rank
+117570	|	7776	|	no rank
+117571	|	117570	|	no rank
+131567	|	1	|	no rank
+314145	|	1437010	|	superorder
+379584	|	33554	|	suborder
+1338369	|	8287	|	no rank
+1437010	|	9347	|	no rank
diff --git a/example/reference/test.fa b/example/reference/test.fa
new file mode 100644
index 0000000..53d6f82
--- /dev/null
+++ b/example/reference/test.fa
@@ -0,0 +1,16 @@
+>gi|4|emb|X17276.1| Giant Panda satellite 1 DNA
+GGACGCTCTGCTTTGTTACCAATGAGAAGGGCGCTGAATCCTCGAAAATCCTGACCCTTTTAATTCATGCTCCCTTACTC
+ACGAGAGATGATGATCGTTGATATTTCCCTGGACTGTGTGGGGTCTCAGAGACCACTATGGGGCACTCTCGTCAGGCTTC
+GATCCTCCCCAGGCCCCTACACCCAATGTGGAACCGGGGTCCCGAATGAAAATGCTGCTGTTCCCTGGAGGTGTTTTCCT
+CGCGACCACGTTCCCTCATGTTTCCCTATTAACGAAGGGTGATGATAGTGCTAAGACGGTCCCTGTACGGTGTTGTTTCT
+GACAGACGTGTTTTGGGCCTTTTCGTTCCATTGCCGCCAGCAGTTTTGACAGGATTTCCCCAGGGAGCAAACTTTTCGAT
+GGAAACGGGTTTTGGCCGAATTGTCTTTCTCAGTGCTGTGTTCGTCGTGTTTCACTCACGGTACCAAAACACCTTGATTA
+TTGTTCCACCCTCCATAAGGCCGTCGTGACTTCAAGGGCTTTCCCCTCAAACTTTGTTTCTTGGTTCTACGGGCTG
+>gi|7|emb|X51700.1| Bos taurus mRNA for bone Gla protein
+GTCCACGCAGCCGCTGACAGACACACCATGAGAACCCCCATGCTGCTCGCCCTGCTGGCCCTGGCCACACTCTGCCTCGC
+TGGCCGGGCAGATGCAAAGCCTGGTGATGCAGAGTCGGGCAAAGGCGCAGCCTTCGTGTCCAAGCAGGAGGGCAGCGAGG
+GATCCTCCCCAGGCCCCTACACCCAATGTGGAACCGGGGTCCCGAATGAAAATGCTGCTGTTCCCTGGAGGTGTTTTCCT
+TGGTGAAGAGACTCAGGCGCTACCTGGACCACTGGCTGGGAGCCCCAGCCCCCTACCCAGATCCGCTGGAGCCCAAGAGG
+GAGGTGTGTGAGCTCAACCCTGACTGTGACGAGCTAGCTGACCACATCGGCTTCCAGGAAGCCTATCGGCGCTTCTACGG
+CCCAGTCTAGAGCTTGCAGCCCTGCCCACCTGGCTGGCAGCCCCCAGCTCTGGCTTCTCTCCAGGACCCCTCCCCTCCCC
+GTCATCCCCGCTGCTCTAGAATAAACTCCAGAAGAGG
diff --git a/fast_mutex.h b/fast_mutex.h
new file mode 100755
index 0000000..4d4b7cc
--- /dev/null
+++ b/fast_mutex.h
@@ -0,0 +1,248 @@
+/* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; -*-
+Copyright (c) 2010-2012 Marcus Geelnard
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+    1. The origin of this software must not be misrepresented; you must not
+    claim that you wrote the original software. If you use this software
+    in a product, an acknowledgment in the product documentation would be
+    appreciated but is not required.
+
+    2. Altered source versions must be plainly marked as such, and must not be
+    misrepresented as being the original software.
+
+    3. This notice may not be removed or altered from any source
+    distribution.
+*/
+
+#ifndef _FAST_MUTEX_H_
+#define _FAST_MUTEX_H_
+
+/// @file
+
+// Which platform are we on?
+#if !defined(_TTHREAD_PLATFORM_DEFINED_)
+  #if defined(_WIN32) || defined(__WIN32__) || defined(__WINDOWS__)
+    #define _TTHREAD_WIN32_
+  #else
+    #define _TTHREAD_POSIX_
+  #endif
+  #define _TTHREAD_PLATFORM_DEFINED_
+#endif
+
+// Check if we can support the assembly language level implementation (otherwise
+// revert to the system API)
+#if (defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) || \
+    (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))) || \
+    (defined(__GNUC__) && (defined(__ppc__)))
+  #define _FAST_MUTEX_ASM_
+#else
+  #define _FAST_MUTEX_SYS_
+#endif
+
+#if defined(_TTHREAD_WIN32_)
+  #ifndef WIN32_LEAN_AND_MEAN
+    #define WIN32_LEAN_AND_MEAN
+    #define __UNDEF_LEAN_AND_MEAN
+  #endif
+  #include <windows.h>
+  #ifdef __UNDEF_LEAN_AND_MEAN
+    #undef WIN32_LEAN_AND_MEAN
+    #undef __UNDEF_LEAN_AND_MEAN
+  #endif
+#else
+  #ifdef _FAST_MUTEX_ASM_
+    #include <sched.h>
+  #else
+    #include <pthread.h>
+  #endif
+#endif
+
+namespace tthread {
+
+/// Fast mutex class.
+/// This is a mutual exclusion object for synchronizing access to shared
+/// memory areas for several threads. It is similar to the tthread::mutex class,
+/// but instead of using system level functions, it is implemented as an atomic
+/// spin lock with very low CPU overhead.
+///
+/// The \c fast_mutex class is NOT compatible with the \c condition_variable
+/// class (however, it IS compatible with the \c lock_guard class). It should
+/// also be noted that the \c fast_mutex class typically does not provide
+/// as accurate thread scheduling as a the standard \c mutex class does.
+///
+/// Because of the limitations of the class, it should only be used in
+/// situations where the mutex needs to be locked/unlocked very frequently.
+///
+/// @note The "fast" version of this class relies on inline assembler language,
+/// which is currently only supported for 32/64-bit Intel x86/AMD64 and
+/// PowerPC architectures on a limited number of compilers (GNU g++ and MS
+/// Visual C++).
+/// For other architectures/compilers, system functions are used instead.
+class fast_mutex {
+  public:
+    /// Constructor.
+#if defined(_FAST_MUTEX_ASM_)
+    // Spin-lock flavor: the lock word starts out free (0).
+    fast_mutex() : mLock(0) {}
+#else
+    // System-API flavor: wrap the platform mutex primitive.
+    fast_mutex()
+    {
+  #if defined(_TTHREAD_WIN32_)
+      InitializeCriticalSection(&mHandle);
+  #elif defined(_TTHREAD_POSIX_)
+      pthread_mutex_init(&mHandle, NULL);
+  #endif
+    }
+#endif
+
+#if !defined(_FAST_MUTEX_ASM_)
+    /// Destructor.
+    // Only the system-API flavor owns an OS resource to release; the
+    // spin-lock flavor keeps the trivial default destructor.
+    ~fast_mutex()
+    {
+  #if defined(_TTHREAD_WIN32_)
+      DeleteCriticalSection(&mHandle);
+  #elif defined(_TTHREAD_POSIX_)
+      pthread_mutex_destroy(&mHandle);
+  #endif
+    }
+#endif
+
+    /// Lock the mutex.
+    /// The method will block the calling thread until a lock on the mutex can
+    /// be obtained. The mutex remains locked until \c unlock() is called.
+    /// @see lock_guard
+    inline void lock()
+    {
+#if defined(_FAST_MUTEX_ASM_)
+      // Spin on try_lock(), yielding the rest of the time slice between
+      // attempts so the lock holder gets a chance to run.
+      bool gotLock;
+      do {
+        gotLock = try_lock();
+        if(!gotLock)
+        {
+  #if defined(_TTHREAD_WIN32_)
+          Sleep(0);
+  #elif defined(_TTHREAD_POSIX_)
+          sched_yield();
+  #endif
+        }
+      } while(!gotLock);
+#else
+  #if defined(_TTHREAD_WIN32_)
+      EnterCriticalSection(&mHandle);
+  #elif defined(_TTHREAD_POSIX_)
+      pthread_mutex_lock(&mHandle);
+  #endif
+#endif
+    }
+
+    /// Try to lock the mutex.
+    /// The method will try to lock the mutex. If it fails, the function will
+    /// return immediately (non-blocking).
+    /// @return \c true if the lock was acquired, or \c false if the lock could
+    /// not be acquired.
+    inline bool try_lock()
+    {
+#if defined(_FAST_MUTEX_ASM_)
+      int oldLock;
+  #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+      // x86 xchg with a memory operand is implicitly locked: atomically
+      // swap 1 into mLock and capture the previous value in oldLock.
+      asm volatile (
+        "movl $1,%%eax\n\t"
+        "xchg %%eax,%0\n\t"
+        "movl %%eax,%1\n\t"
+        : "=m" (mLock), "=m" (oldLock)
+        :
+        : "%eax", "memory"
+      );
+  #elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
+      // Same atomic-exchange idea, written in MSVC inline-assembly syntax.
+      int *ptrLock = &mLock;
+      __asm {
+        mov eax,1
+        mov ecx,ptrLock
+        xchg eax,[ecx]
+        mov oldLock,eax
+      }
+  #elif defined(__GNUC__) && (defined(__ppc__))
+      // PowerPC load-reserved/store-conditional (lwarx/stwcx.) loop;
+      // isync keeps later accesses from moving before the acquisition.
+      int newLock = 1;
+      asm volatile (
+        "\n1:\n\t"
+        "lwarx  %0,0,%1\n\t"
+        "cmpwi  0,%0,0\n\t"
+        "bne-   2f\n\t"
+        "stwcx. %2,0,%1\n\t"
+        "bne-   1b\n\t"
+        "isync\n"
+        "2:\n\t"
+        : "=&r" (oldLock)
+        : "r" (&mLock), "r" (newLock)
+        : "cr0", "memory"
+      );
+  #endif
+      // We own the lock only if it was free (0) before the swap.
+      return (oldLock == 0);
+#else
+  #if defined(_TTHREAD_WIN32_)
+      return TryEnterCriticalSection(&mHandle) ? true : false;
+  #elif defined(_TTHREAD_POSIX_)
+      return (pthread_mutex_trylock(&mHandle) == 0) ? true : false;
+  #endif
+#endif
+    }
+
+    /// Unlock the mutex.
+    /// If any threads are waiting for the lock on this mutex, one of them will
+    /// be unblocked.
+    inline void unlock()
+    {
+#if defined(_FAST_MUTEX_ASM_)
+  #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+      // Atomically write 0 back into the lock word (xchg, value discarded).
+      asm volatile (
+        "movl $0,%%eax\n\t"
+        "xchg %%eax,%0\n\t"
+        : "=m" (mLock)
+        :
+        : "%eax", "memory"
+      );
+  #elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
+      int *ptrLock = &mLock;
+      __asm {
+        mov eax,0
+        mov ecx,ptrLock
+        xchg eax,[ecx]
+      }
+  #elif defined(__GNUC__) && (defined(__ppc__))
+      // sync makes the critical section's writes visible before the
+      // lock word is cleared below.
+      asm volatile (
+        "sync\n\t"  // Replace with lwsync where possible?
+        : : : "memory"
+      );
+      mLock = 0;
+  #endif
+#else
+  #if defined(_TTHREAD_WIN32_)
+      LeaveCriticalSection(&mHandle);
+  #elif defined(_TTHREAD_POSIX_)
+      pthread_mutex_unlock(&mHandle);
+  #endif
+#endif
+    }
+
+  private:
+#if defined(_FAST_MUTEX_ASM_)
+    // Lock word for the spin-lock flavor: 0 = free, 1 = held.
+    int mLock;
+#else
+  #if defined(_TTHREAD_WIN32_)
+    CRITICAL_SECTION mHandle;
+  #elif defined(_TTHREAD_POSIX_)
+    pthread_mutex_t mHandle;
+  #endif
+#endif
+};
+
+}
+
+#endif // _FAST_MUTEX_H_
+
diff --git a/filebuf.h b/filebuf.h
new file mode 100644
index 0000000..66dffb4
--- /dev/null
+++ b/filebuf.h
@@ -0,0 +1,718 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef FILEBUF_H_
+#define FILEBUF_H_
+
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdexcept>
+#include "assert_helpers.h"
+
+/**
+ * Simple, fast helper for determining if a character is a newline.
+ */
+static inline bool isnewline(int c) {
+	return c == '\r' || c == '\n';
+}
+
+/**
+ * Simple, fast helper for determining if a character is a non-newline
+ * whitespace character.
+ */
+static inline bool isspace_notnl(int c) {
+	return isspace(c) && !isnewline(c);
+}
+
+/**
+ * Simple wrapper for a FILE*, istream or ifstream that reads it in chunks
+ * using fread and keeps those chunks in a buffer.  It also services calls to
+ * get(), peek() and gets() from the buffer, reading in additional chunks when
+ * necessary.
+ *
+ * Helper functions do things like parse strings, numbers, and FASTA records.
+ *
+ *
+ */
class FileBuf {
public:
	FileBuf() {
		init();
	}

	// Wrap an already-open C-style FILE stream
	FileBuf(FILE *in) {
		init();
		_in = in;
		assert(_in != NULL);
	}

	// Wrap an already-open ifstream
	FileBuf(std::ifstream *inf) {
		init();
		_inf = inf;
		assert(_inf != NULL);
	}

	// Wrap a generic istream
	FileBuf(std::istream *ins) {
		init();
		_ins = ins;
		assert(_ins != NULL);
	}

	/**
	 * Return true iff there is a stream ready to read.
	 */
	bool isOpen() {
		return _in != NULL || _inf != NULL || _ins != NULL;
	}

	/**
	 * Close the input stream (if that's possible).  stdin and plain
	 * istreams are left untouched.
	 */
	void close() {
		if(_in != NULL && _in != stdin) {
			fclose(_in);
		} else if(_inf != NULL) {
			_inf->close();
		} else {
			// can't close _ins
		}
	}

	/**
	 * Get the next character of input and advance.
	 */
	int get() {
		assert(_in != NULL || _inf != NULL || _ins != NULL);
		int c = peek();
		if(c != -1) {
			_cur++;
			// Record the dispensed character in the last-N buffer (see
			// copyLastN()); recording silently stops once the buffer fills
			if(_lastn_cur < LASTN_BUF_SZ) _lastn_buf[_lastn_cur++] = c;
		}
		return c;
	}

	/**
	 * Return true iff all input is exhausted.
	 */
	bool eof() {
		return (_cur == _buf_sz) && _done;
	}

	/**
	 * Initialize the buffer with a new C-style file.
	 */
	void newFile(FILE *in) {
		_in = in;
		_inf = NULL;
		_ins = NULL;
		_cur = BUF_SZ;
		_buf_sz = BUF_SZ;
		_done = false;
	}

	/**
	 * Initialize the buffer with a new ifstream.
	 */
	void newFile(std::ifstream *__inf) {
		_in = NULL;
		_inf = __inf;
		_ins = NULL;
		_cur = BUF_SZ;
		_buf_sz = BUF_SZ;
		_done = false;
	}

	/**
	 * Initialize the buffer with a new istream.
	 */
	void newFile(std::istream *__ins) {
		_in = NULL;
		_inf = NULL;
		_ins = __ins;
		_cur = BUF_SZ;
		_buf_sz = BUF_SZ;
		_done = false;
	}

	/**
	 * Restore state as though we just started reading the input
	 * stream.  Assumes one of the three sources is set (falls through to
	 * rewinding _in when neither stream pointer is set).
	 */
	void reset() {
		if(_inf != NULL) {
			_inf->clear();
			_inf->seekg(0, std::ios::beg);
		} else if(_ins != NULL) {
			_ins->clear();
			_ins->seekg(0, std::ios::beg);
		} else {
			rewind(_in);
		}
		_cur = BUF_SZ;
		_buf_sz = BUF_SZ;
		_done = false;
	}

	/**
	 * Peek at the next character of the input stream without
	 * advancing.  Typically we can simply read it from the buffer.
	 * Occasionally we'll need to read in a new buffer's worth of data.
	 * Returns -1 at end of input.
	 */
	int peek() {
		assert(_in != NULL || _inf != NULL || _ins != NULL);
		assert_leq(_cur, _buf_sz);
		if(_cur == _buf_sz) {
			if(_done) {
				// We already exhausted the input stream
				return -1;
			}
			// Read a new buffer's worth of data
			else {
				// Get the next chunk
				if(_inf != NULL) {
					_inf->read((char*)_buf, BUF_SZ);
					_buf_sz = _inf->gcount();
				} else if(_ins != NULL) {
					_ins->read((char*)_buf, BUF_SZ);
					_buf_sz = _ins->gcount();
				} else {
					assert(_in != NULL);
					_buf_sz = fread(_buf, 1, BUF_SZ, _in);
				}
				_cur = 0;
				if(_buf_sz == 0) {
					// Exhausted, and we have nothing to return to the
					// caller
					_done = true;
					return -1;
				} else if(_buf_sz < BUF_SZ) {
					// Exhausted
					_done = true;
				}
			}
		}
		return (int)_buf[_cur];
	}

	/**
	 * Store a string of characters from the input file into 'buf',
	 * until we see a newline, EOF, or until 'len' characters have been
	 * read.  Trailing newline characters are consumed but not stored;
	 * 'buf' is always NUL-terminated.  Returns the number of characters
	 * stored (excluding the terminator).
	 * NOTE(review): assumes len >= 1; with len == 0 the 'len-1' test
	 * wraps around (size_t) and buf[0] is still written at EOF.
	 */
	size_t gets(char *buf, size_t len) {
		size_t stored = 0;
		while(true) {
			int c = get();
			if(c == -1) {
				// End-of-file
				buf[stored] = '\0';
				return stored;
			}
			if(stored == len-1 || isnewline(c)) {
				// End of string
				buf[stored] = '\0';
				// Skip over all end-of-line characters
				int pc = peek();
				while(isnewline(pc)) {
					get(); // discard
					pc = peek();
				}
				// Next get() will be after all newline characters
				return stored;
			}
			buf[stored++] = (char)c;
		}
	}

	/**
	 * Store up to 'len' characters from the input file into 'buf' and
	 * return the number actually read (less than 'len' only at EOF).
	 * No NUL terminator is appended.
	 */
	size_t get(char *buf, size_t len) {
		size_t stored = 0;
		for(size_t i = 0; i < len; i++) {
			int c = get();
			if(c == -1) return i;
			buf[stored++] = (char)c;
		}
		return len;
	}

	// Capacity of the last-N-chars buffer filled by get()
	static const size_t LASTN_BUF_SZ = 8 * 1024;

	/**
	 * Keep get()ing characters until a non-whitespace character (or
	 * -1) is reached, and return it.
	 */
	int getPastWhitespace() {
		int c;
		while(isspace(c = get()) && c != -1);
		return c;
	}

	/**
	 * Keep get()ing characters until a we've passed over the next
	 * string of newline characters (\r's and \n's) or -1 is reached,
	 * and return it.
	 */
	int getPastNewline() {
		int c = get();
		while(!isnewline(c) && c != -1) c = get();
		while(isnewline(c)) c = get();
		assert_neq(c, '\r');
		assert_neq(c, '\n');
		return c;
	}

	/**
	 * Keep get()ing characters until a we've passed over the next
	 * string of newline characters (\r's and \n's) or -1 is reached,
	 * and return it.
	 * NOTE(review): despite the name, this behaves exactly like
	 * getPastNewline() — the returned character has been consumed, not
	 * peeked.  Compare peekUptoNewline(), which leaves it pending.
	 */
	int peekPastNewline() {
		int c = peek();
		while(!isnewline(c) && c != -1) c = get();
		while(isnewline(c)) c = get();
		assert_neq(c, '\r');
		assert_neq(c, '\n');
		return c;
	}

	/**
	 * Keep peek()ing then get()ing characters until the next return
	 * from peek() is just after the last newline of the line.
	 */
	int peekUptoNewline() {
		int c = peek();
		while(!isnewline(c) && c != -1) {
			get(); c = peek();
		}
		while(isnewline(c)) {
			get();
			c = peek();
		}
		assert_neq(c, '\r');
		assert_neq(c, '\n');
		return c;
	}
	
	/**
	 * Parse a FASTA record.  Append name characters to 'name' and and append
	 * all sequence characters to 'seq'.  If gotCaret is true, assuming the
	 * file cursor has already moved just past the starting '>' character.
	 * NOTE(review): 'name' and 'seq' are written by index with no bounds
	 * checks — they must be pre-sized by the caller (see
	 * parseFastaRecordLength for obtaining the sizes).
	 */
	template <typename TNameStr, typename TSeqStr>
	void parseFastaRecord(
		TNameStr& name,
		TSeqStr&  seq,
		bool      gotCaret = false)
	{
		int c;
		if(!gotCaret) {
			// Skip over caret and non-newline whitespace
			c = peek();
			while(isspace_notnl(c) || c == '>') { get(); c = peek(); }
		} else {
			// Skip over non-newline whitespace
			c = peek();
			while(isspace_notnl(c)) { get(); c = peek(); }
		}
		size_t namecur = 0, seqcur = 0;
		// c is the first character of the fasta name record, or is the first
		// newline character if the name record is empty
		while(!isnewline(c) && c != -1) {
			name[namecur++] = c; get(); c = peek();
		}
		// sequence consists of all the non-whitespace characters between here
		// and the next caret
		while(true) {
			// skip over whitespace
			while(isspace(c)) { get(); c = peek(); }
			// if we see caret or EOF, break
			if(c == '>' || c == -1) break;
			// append and continue
			seq[seqcur++] = c;
			get(); c = peek();
		}
	}

	/**
	 * Parse a FASTA record and return its length.  If gotCaret is true,
	 * assuming the file cursor has already moved just past the starting '>'
	 * character.  Mirrors parseFastaRecord but only counts characters.
	 */
	void parseFastaRecordLength(
		size_t&   nameLen,
		size_t&   seqLen,
		bool      gotCaret = false)
	{
		int c;
		nameLen = seqLen = 0;
		if(!gotCaret) {
			// Skip over caret and non-newline whitespace
			c = peek();
			while(isspace_notnl(c) || c == '>') { get(); c = peek(); }
		} else {
			// Skip over non-newline whitespace
			c = peek();
			while(isspace_notnl(c)) { get(); c = peek(); }
		}
		// c is the first character of the fasta name record, or is the first
		// newline character if the name record is empty
		while(!isnewline(c) && c != -1) {
			nameLen++; get(); c = peek();
		}
		// sequence consists of all the non-whitespace characters between here
		// and the next caret
		while(true) {
			// skip over whitespace
			while(isspace(c)) { get(); c = peek(); }
			// if we see caret or EOF, break
			if(c == '>' || c == -1) break;
			// append and continue
			seqLen++;
			get(); c = peek();
		}
	}

	/**
	 * Reset to the beginning of the last-N-chars buffer.
	 */
	void resetLastN() {
		_lastn_cur = 0;
	}

	/**
	 * Copy the last several characters in the last-N-chars buffer
	 * (since the last reset) into the provided buffer.  'buf' must have
	 * room for at least lastNLen() bytes.
	 */
	size_t copyLastN(char *buf) {
		memcpy(buf, _lastn_buf, _lastn_cur);
		return _lastn_cur;
	}

	/**
	 * Get const pointer to the last-N-chars buffer.
	 */
	const char *lastN() const {
		return _lastn_buf;
	}

	/**
	 * Get current size of the last-N-chars buffer.
	 */
	size_t lastNLen() const {
		return _lastn_cur;
	}

private:

	// Shared constructor logic: no source attached, buffer "empty"
	// (_cur == _buf_sz forces a refill on the first peek())
	void init() {
		_in = NULL;
		_inf = NULL;
		_ins = NULL;
		_cur = _buf_sz = BUF_SZ;
		_done = false;
		_lastn_cur = 0;
		// no need to clear _buf[]
	}

	static const size_t BUF_SZ = 256 * 1024;
	FILE     *_in;          // active FILE* source, or NULL
	std::ifstream *_inf;    // active ifstream source, or NULL
	std::istream  *_ins;    // active istream source, or NULL
	size_t    _cur;         // offset of next character within _buf
	size_t    _buf_sz;      // # of valid bytes in _buf
	bool      _done;        // true once the underlying source is exhausted
	uint8_t   _buf[BUF_SZ]; // (large) input buffer
	size_t    _lastn_cur;   // # of chars recorded since last resetLastN()
	char      _lastn_buf[LASTN_BUF_SZ]; // buffer of the last N chars dispensed
};
+
+/**
+ * Wrapper for a buffered output stream that writes bitpairs.
+ */
class BitpairOutFileBuf {
public:
	/**
	 * Open a new output stream to a file with given name.
	 * Throws int 1 if the file cannot be opened.
	 */
	BitpairOutFileBuf(const char *in) : bpPtr_(0), cur_(0) {
		assert(in != NULL);
		out_ = fopen(in, "wb");
		if(out_ == NULL) {
			std::cerr << "Error: Could not open bitpair-output file " << in << std::endl;
			throw 1;
		}
		memset(buf_, 0, BUF_SZ);
	}

	/**
	 * Write a single bitpair into the buf.  Flush the buffer if it's
	 * full.  Bitpairs are packed four per byte, least-significant bits
	 * first: bpPtr_ is the bit position (0/2/4/6) within the byte at
	 * buf_[cur_] currently being filled.
	 */
	void write(int bp) {
		assert_lt(bp, 4);
		assert_geq(bp, 0);
		buf_[cur_] |= (bp << bpPtr_);
		if(bpPtr_ == 6) {
			// Current byte is now full; move on to the next one
			bpPtr_ = 0;
			cur_++;
			if(cur_ == BUF_SZ) {
				// Flush the buffer
				if(!fwrite((const void *)buf_, BUF_SZ, 1, out_)) {
					std::cerr << "Error writing to the reference index file (.4.ebwt)" << std::endl;
					throw 1;
				}
				// Reset to beginning of the buffer
				cur_ = 0;
			}
			// Initialize next octet to 0
			buf_[cur_] = 0;
		} else {
			bpPtr_ += 2;
		}
	}

	/**
	 * Write any remaining bitpairs and then close the output.
	 * NOTE(review): there is no destructor; callers must invoke close()
	 * themselves or the FILE* (and any buffered bitpairs) are leaked.
	 */
	void close() {
		if(cur_ > 0 || bpPtr_ > 0) {
			// If bpPtr_ == 0 the byte at cur_ was never started, so the
			// last occupied byte is cur_ - 1 (the outer 'if' guarantees
			// cur_ > 0 in that case)
			if(bpPtr_ == 0) cur_--;
			if(!fwrite((const void *)buf_, cur_ + 1, 1, out_)) {
				std::cerr << "Error writing to the reference index file (.4.ebwt)" << std::endl;
				throw 1;
			}
		}
		fclose(out_);
	}
private:
	static const size_t BUF_SZ = 128 * 1024;
	FILE    *out_;         // destination file
	int      bpPtr_;       // bit position (0/2/4/6) within the current byte
	size_t   cur_;         // index of the byte currently being filled
	char     buf_[BUF_SZ]; // (large) output buffer
};
+
+/**
+ * Wrapper for a buffered output stream that writes characters and
+ * other data types.  This class is *not* synchronized; the caller is
+ * responsible for synchronization.
+ */
+class OutFileBuf {
+
+public:
+
+	/**
+	 * Open a new output stream to a file with given name.
+	 */
+	OutFileBuf(const std::string& out, bool binary = false) :
+		name_(out.c_str()), cur_(0), closed_(false)
+	{
+		out_ = fopen(out.c_str(), binary ? "wb" : "w");
+		if(out_ == NULL) {
+			std::cerr << "Error: Could not open alignment output file " << out.c_str() << std::endl;
+			throw 1;
+		}
+		if(setvbuf(out_, NULL, _IOFBF, 10* 1024* 1024)) 
+			std::cerr << "Warning: Could not allocate the proper buffer size for output file stream. " << std::endl;
+	}
+
+	/**
+	 * Open a new output stream to a file with given name.
+	 */
+	OutFileBuf(const char *out, bool binary = false) :
+		name_(out), cur_(0), closed_(false)
+	{
+		assert(out != NULL);
+		out_ = fopen(out, binary ? "wb" : "w");
+		if(out_ == NULL) {
+			std::cerr << "Error: Could not open alignment output file " << out << std::endl;
+			throw 1;
+		}
+	}
+
+	/**
+	 * Open a new output stream to standard out.
+	 */
+	OutFileBuf() : name_("cout"), cur_(0), closed_(false) {
+		out_ = stdout;
+	}
+	
+	/**
+	 * Close buffer when object is destroyed.
+	 */
+	~OutFileBuf() { close(); }
+
+	/**
+	 * Open a new output stream to a file with given name.
+	 */
+	void setFile(const char *out, bool binary = false) {
+		assert(out != NULL);
+		out_ = fopen(out, binary ? "wb" : "w");
+		if(out_ == NULL) {
+			std::cerr << "Error: Could not open alignment output file " << out << std::endl;
+			throw 1;
+		}
+		reset();
+	}
+
+	/**
+	 * Write a single character into the write buffer and, if
+	 * necessary, flush.
+	 */
+	void write(char c) {
+		assert(!closed_);
+		if(cur_ == BUF_SZ) flush();
+		buf_[cur_++] = c;
+	}
+
+	/**
+	 * Write a c++ string to the write buffer and, if necessary, flush.
+	 */
+	void writeString(const std::string& s) {
+		assert(!closed_);
+		size_t slen = s.length();
+		if(cur_ + slen > BUF_SZ) {
+			if(cur_ > 0) flush();
+			if(slen >= BUF_SZ) {
+				fwrite(s.c_str(), slen, 1, out_);
+			} else {
+				memcpy(&buf_[cur_], s.data(), slen);
+				assert_eq(0, cur_);
+				cur_ = slen;
+			}
+		} else {
+			memcpy(&buf_[cur_], s.data(), slen);
+			cur_ += slen;
+		}
+		assert_leq(cur_, BUF_SZ);
+	}
+
+	/**
+	 * Write a c++ string to the write buffer and, if necessary, flush.
+	 */
+	template<typename T>
+	void writeString(const T& s) {
+		assert(!closed_);
+		size_t slen = s.length();
+		if(cur_ + slen > BUF_SZ) {
+			if(cur_ > 0) flush();
+			if(slen >= BUF_SZ) {
+				fwrite(s.toZBuf(), slen, 1, out_);
+			} else {
+				memcpy(&buf_[cur_], s.toZBuf(), slen);
+				assert_eq(0, cur_);
+				cur_ = slen;
+			}
+		} else {
+			memcpy(&buf_[cur_], s.toZBuf(), slen);
+			cur_ += slen;
+		}
+		assert_leq(cur_, BUF_SZ);
+	}
+
+	/**
+	 * Write a c++ string to the write buffer and, if necessary, flush.
+	 */
+	void writeChars(const char * s, size_t len) {
+		assert(!closed_);
+		if(cur_ + len > BUF_SZ) {
+			if(cur_ > 0) flush();
+			if(len >= BUF_SZ) {
+				fwrite(s, len, 1, out_);
+			} else {
+				memcpy(&buf_[cur_], s, len);
+				assert_eq(0, cur_);
+				cur_ = len;
+			}
+		} else {
+			memcpy(&buf_[cur_], s, len);
+			cur_ += len;
+		}
+		assert_leq(cur_, BUF_SZ);
+	}
+
+	/**
+	 * Write a 0-terminated C string to the output stream.
+	 */
+	void writeChars(const char * s) {
+		writeChars(s, strlen(s));
+	}
+
+	/**
+	 * Write any remaining bitpairs and then close the input
+	 */
+	void close() {
+		if(closed_) return;
+		if(cur_ > 0) flush();
+		closed_ = true;
+		if(out_ != stdout) {
+			fclose(out_);
+		}
+	}
+
+	/**
+	 * Reset so that the next write is as though it's the first.
+	 */
+	void reset() {
+		cur_ = 0;
+		closed_ = false;
+	}
+
+	void flush() {
+		if(!fwrite((const void *)buf_, cur_, 1, out_)) {
+			std::cerr << "Error while flushing and closing output" << std::endl;
+			throw 1;
+		}
+		cur_ = 0;
+	}
+
+	/**
+	 * Return true iff this stream is closed.
+	 */
+	bool closed() const {
+		return closed_;
+	}
+
+	/**
+	 * Return the filename.
+	 */
+	const char *name() {
+		return name_;
+	}
+
+private:
+
+	static const size_t BUF_SZ = 16 * 1024;
+
+	const char *name_;
+	FILE       *out_;
+	size_t      cur_;
+	char        buf_[BUF_SZ]; // (large) input buffer
+	bool        closed_;
+};
+
+#endif /*ndef FILEBUF_H_*/
diff --git a/formats.h b/formats.h
new file mode 100644
index 0000000..05ee679
--- /dev/null
+++ b/formats.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef FORMATS_H_
+#define FORMATS_H_
+
+#include <iostream>
+
+/**
+ * File-format constants and names
+ */
+
/**
 * Input file formats recognized by the read parsers.  Values start at 1
 * so that 0 can serve as "invalid".
 */
enum file_format {
	FASTA = 1,
	FASTA_CONT,
	FASTQ,
	TAB_MATE5,
	TAB_MATE6,
	RAW,
	CMDLINE,
	QSEQ,
	SRA_FASTA,
	SRA_FASTQ
};

/**
 * Human-readable name for each file_format value, indexed directly by the
 * enum (index 0 is the "invalid" placeholder).  The previous table still
 * contained two entries from an older format list ("Chain file",
 * "Random") and lacked a TAB_MATE6 entry, so every name from TAB_MATE6
 * onward was misaligned (e.g. RAW printed as "Command line").
 */
static const std::string file_format_names[] = {
	"Invalid!",                 // 0 (no format)
	"FASTA",                    // FASTA
	"FASTA sampling",           // FASTA_CONT
	"FASTQ",                    // FASTQ
	"Tabbed mated (5 fields)",  // TAB_MATE5
	"Tabbed mated (6 fields)",  // TAB_MATE6
	"Raw",                      // RAW
	"Command line",             // CMDLINE
	"Qseq",                     // QSEQ
	"SRA_FASTA",                // SRA_FASTA
	"SRA_FASTQ"                 // SRA_FASTQ
};
+
+#endif /*FORMATS_H_*/
diff --git a/functions.sh b/functions.sh
new file mode 100644
index 0000000..082c44d
--- /dev/null
+++ b/functions.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+
function check_or_mkdir {
    # Create directory $1 (with parents).
    # Returns 0 on success; returns 1 without touching the directory when
    # it already exists and is non-empty.
    echo -n "Creating $1 ... " >&2
    # `find DIR -prune -empty -type d` prints DIR only when DIR is an empty
    # directory, so an empty result here means "exists and has contents".
    if [[ -d "$1" && ! -n $(find "$1" -prune -empty -type d) ]]; then
        echo "Directory exists - skipping it!" >&2
        # was: return `false` — that only worked because a bare `return`
        # falls back to $? of the substituted command
        return 1
    else
        echo "Done" >&2
        mkdir -p "$1"
        return 0
    fi
}
+
function check_or_mkdir_no_fail {
    # Create directory $1 (with parents).  Always returns 0; an existing
    # non-empty directory is merely reported, not treated as an error.
    echo -n "Creating $1 ... " >&2
    # See check_or_mkdir: find prints "$1" only when it is an empty dir.
    if [[ -d "$1" && ! -n $(find "$1" -prune -empty -type d) ]]; then
        echo "Directory exists already! Continuing" >&2
        return 0
    else
        echo "Done" >&2
        mkdir -p "$1"
        return 0
    fi
}
+
+
+
+## Functions
function validate_url(){
  # Echo "true" (and nothing otherwise) iff URL $1 looks reachable, judged
  # by a `wget --spider` probe (HTTP "200 OK" or FTP "File ... exists.").
  # NOTE(review): only matches the literal "HTTP/1.1 200 OK" status line;
  # servers answering with another protocol version are reported as absent.
  if [[ $(wget --reject="index.html*" -S --spider "$1" 2>&1 | egrep 'HTTP/1.1 200 OK|File .* exists.') ]]; then echo "true"; fi
}
export -f validate_url
+
function c_echo() {
        # Print all arguments in blue (ANSI escape) followed by a newline.
        # Use a fixed format string so that '%' and '\' sequences in the
        # message are printed verbatim instead of being interpreted by
        # printf (the original interpolated "$*" into the format string).
        printf '\033[34m%s\033[0m\n' "$*"
}
+
progressfilt () {
    # Filter for wget's progress output: swallow everything on stdin up to
    # (and including) the first run of two consecutive CR/LF characters,
    # then pass the remaining bytes through unchanged, one at a time.
    # from http://stackoverflow.com/a/4687912/299878
    local flag=false c count cr=$'\r' nl=$'\n'
    while IFS='' read -d '' -rn 1 c
    do
        if $flag
        then
            printf '%c' "$c"
        else
            # Not yet echoing: count consecutive CR/NL characters; any other
            # character resets the count.  Start echoing after the second.
            if [[ $c != $cr && $c != $nl ]]
            then
                count=0
            else
                ((count++))
                if ((count > 1))
                then
                    flag=true
                fi
            fi
        fi
    done
}
diff --git a/group_walk.cpp b/group_walk.cpp
new file mode 100644
index 0000000..4abb1de
--- /dev/null
+++ b/group_walk.cpp
@@ -0,0 +1,20 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "group_walk.h"
diff --git a/group_walk.h b/group_walk.h
new file mode 100644
index 0000000..df0b8b0
--- /dev/null
+++ b/group_walk.h
@@ -0,0 +1,1285 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * group_walk.h
+ *
+ * Classes and routines for walking a set of BW ranges backwards from the edge
+ * of a seed hit with the goal of resolving the offset of each row in each
+ * range.  Here "offset" means offset into the concatenated string of all
+ * references.  The main class is 'GroupWalk' and an important helper is
+ * 'GWState'.
+ *
+ * For each combination of seed offset and orientation, there is an associated
+ * QVal.  Each QVal describes a (possibly empty) set of suffix array ranges.
+ * Call these "seed range sets."  Each range in the set is "backed" by a range
+ * of the salist, represented as a PListSlice. Such a range is the origin of a
+ * walk.
+ *
+ * When an offset is resolved, it is entered into the salist via the
+ * PListSlice.  Note that other routines in this same thread might also be
+ * setting elements of the salist, so routines here should expect that elements
+ * can go from unresolved to resolved at any time.
+ *
+ * What bookkeeping do we have to do as we walk?  Before the first step, we
+ * convert the initial QVal into a list of SATuples; the SATuples are our link
+ * to the corresponding ranges in the suffix array.  The list of SATuples is
+ * then converted to a list of GWState objects; these keep track of where we
+ * are in our walk (e.g. what 'top' and 'bot' are, how many steps have we gone,
+ * etc) as well as how the elements in the current range correspond to elements
+ * from the original range.
+ *
+ * The user asks the GroupWalk to resolve another offset by calling advance().
+ * advance() can be called in various ways:
+ *
+ * (a) The user can request that the GroupWalk proceed until a
+ *     *particular* element is resolved, then return that resolved
+ *     element.  Other elements may be resolved along the way, but
+ *     those results are buffered and may be dispensed in future calls
+ *     to advance().
+ *
+ * (b) The user can request that the GroupWalk select an as-yet-
+ *     unreported element at random and and proceed until that element
+ *     is resolved and report it.  Again, other elements may be
+ *     resolved along the way but they are buffered.
+ *
+ * (c) The user can request that the GroupWalk resolve elements in a
+ *     particular BW range (with a particular offset and orientation)
+ *     in an order of its choosing.  The GroupWalk in this case
+ *     attempts to resolve as many offsets as possible as quickly as
+ *     possible, and returns them as soon as they're found.  The res_
+ *     buffer is used in this case.
+ *
+ * (d) Like (c) but resolving elements at a particular offset and
+ *     orientation instead of at a specific BW range.  The res_ buffer
+ *     is used in this case, since there's a chance that the 
+ *
+ * There are simple ways to heuristically reduce the problem size while
+ * maintaining randomness.  For instance, the user can put a ceiling on the
+ * number of elements that we walk from any given seed offset or range.
+ * We can then trim away random subranges to reduce the size of the
+ * problem.  There is no need for the caller to do this for us.
+ */
+
+#ifndef GROUP_WALK_H_
+#define GROUP_WALK_H_
+
+#include <stdint.h>
+#include <limits>
+#include "ds.h"
+#include "bt2_idx.h"
+#include "read.h"
+#include "reference.h"
+#include "mem_ids.h"
+
+/**
+ * Encapsulate an SA range and an associated list of slots where the resolved
+ * offsets can be placed.
+ */
+template<typename T>
+class SARangeWithOffs {
+
+public:
+
+	SARangeWithOffs() { reset(); };
+
+	SARangeWithOffs(TIndexOffU tf, size_t len, const T& o) {
+		init(tf, len, o);
+	}
+	
+	void init(TIndexOffU tf, size_t len_, const T& o) {
+		topf = tf; len = len_, offs = o;
+	}
+
+	/**
+	 * Reset to uninitialized state.
+	 */
+	void reset() { topf = std::numeric_limits<TIndexOffU>::max(); }
+	
+	/**
+	 * Return true if this is initialized.
+	 */
+	bool inited() const {
+		return topf != std::numeric_limits<TIndexOffU>::max();
+	}
+	
+	/**
+	 * Return the number of times this reference substring occurs in the
+	 * reference, which is also the size of the 'offs' TSlice.
+	 */
+	size_t size() const { return offs.size(); }
+
+	TIndexOffU topf; // top in BWT index
+	size_t    len;  // length of the reference sequence involved
+	T         offs; // offsets
+};
+
+/**
+ * A group of per-thread state that can be shared between all the GroupWalks
+ * used in that thread.
+ */
template <typename index_t>
struct GroupWalkState {

	// 'cat' is the memory-category tag propagated to every per-thread
	// list so their allocations are accounted for together
	GroupWalkState(int cat) : map(cat) {
		masks[0].setCat(cat);
		masks[1].setCat(cat);
		masks[2].setCat(cat);
		masks[3].setCat(cat);
	}

	EList<bool> masks[4];      // temporary list for masks; used in GWState
	EList<index_t, 16> map;   // temporary list of GWState maps
};
+
+/**
+ * Encapsulates counters that encode how much work the walk-left logic
+ * has done.
+ */
struct WalkMetrics {

	WalkMetrics() {
	    reset();
	}

	/**
	 * Sum each across this object and 'm'.  This is the only safe way
	 * to update a WalkMetrics shared by many threads.
	 *
	 * @param m       metrics to fold into this object
	 * @param getLock if true, hold mutex_m for the duration of the merge
	 */
	void merge(const WalkMetrics& m, bool getLock = false) {
		ThreadSafe ts(&mutex_m, getLock);
		bwops += m.bwops;
		branches += m.branches;
		resolves += m.resolves;
		refresolves += m.refresolves;
		reports += m.reports;
	}
	
	/**
	 * Set all to 0.
	 */
	void reset() {
		bwops = branches = resolves = refresolves = reports = 0;
	}

	uint64_t bwops;       // Burrows-Wheeler operations
	uint64_t branches;    // BW range branch-offs
	uint64_t resolves;    // # offs resolved with BW walk-left
	uint64_t refresolves; // # resolutions caused by reference scanning
	uint64_t reports;     // # offs reported (1 can be reported many times)
	MUTEX_T mutex_m;      // guards concurrent merge() calls
};
+
+/**
+ * Coordinates for a BW element that the GroupWalk might resolve.
+ */
template <typename index_t>
struct GWElt {

	GWElt() { reset(); }

	/**
	 * Put this element back into its uninitialized state: clear the
	 * strand flag and set every coordinate to the all-ones sentinel.
	 */
	void reset() {
		fw = false;
		offidx = (index_t)0xffffffff;
		range  = (index_t)0xffffffff;
		elt    = (index_t)0xffffffff;
		len    = (index_t)0xffffffff;
	}

	/**
	 * Set all coordinates of this element.
	 *
	 * @param oi seed offset index
	 * @param f  true iff the hit is on the forward strand
	 * @param r  range index
	 * @param e  element index within the range
	 * @param l  length of the hit
	 */
	void init(
		index_t oi,
		bool f,
		index_t r,
		index_t e,
		index_t l)
	{
		fw = f;
		offidx = oi;
		range = r;
		elt = e;
		len = l;
	}

	/**
	 * Equality: two GWElts refer to the same element iff every field
	 * agrees.
	 */
	bool operator==(const GWElt& o) const {
		if(offidx != o.offidx) return false;
		if(fw != o.fw) return false;
		if(range != o.range) return false;
		if(elt != o.elt) return false;
		return len == o.len;
	}
	
	/**
	 * Inequality: negation of operator==.
	 */
	bool operator!=(const GWElt& o) const {
		return !(*this == o);
	}

	index_t offidx; // seed offset index
	bool    fw;     // strand
	index_t range;  // range
	index_t elt;    // element
	index_t len;    // length
};
+
+/**
+ * A record encapsulating the result of looking up one BW element in
+ * the Bowtie index.
+ */
template <typename index_t>
struct WalkResult {

	WalkResult() { reset(); }
	
	/**
	 * Reset to uninitialized state: clear the contained element and put
	 * the OFF_MASK sentinel in the SA-row and text-offset fields.
	 */
	void reset() {
		elt.reset();
		bwrow = toff = (index_t)OFF_MASK;
	}

	/**
	 * Initialize this WalkResult.
	 */
	void init(
		index_t oi,  // seed offset index
		bool f,       // strand
		index_t r,   // range
		index_t e,   // element
		index_t bwr, // BW row
		index_t len, // length
		index_t to)  // text offset
	{
		elt.init(oi, f, r, e, len);
		bwrow = bwr;
		toff = to;
	}

	GWElt<index_t> elt;   // element resolved
	index_t        bwrow; // SA row resolved
	index_t        toff;  // resolved offset from SA sample
};
+
+/**
+ * A GW hit encapsulates an SATuple describing a reference substring
+ * in the cache, along with a bool indicating whether each element of
+ * the hit has been reported yet.
+ */
template<typename index_t, typename T>
class GWHit {

public:
	// Construct an empty hit: coordinates at the OFF_MASK sentinel, no
	// elements reported yet
	GWHit() :
		fmap(0, GW_CAT),
		offidx((index_t)OFF_MASK),
		fw(false),
		range((index_t)OFF_MASK),
		len((index_t)OFF_MASK),
		reported_(0, GW_CAT),
		nrep_(0)
	{
		assert(repOkBasic());
	}

	/**
	 * Initialize with a new SA range.  Resolve the done vector so that
	 * there's one bool per suffix array element.
	 */
	void init(
		SARangeWithOffs<T>& sa,
		index_t oi,
		bool f,
		index_t r)
	{
		nrep_ = 0;
		offidx = oi;
		fw = f;
		range = r;
		len = (index_t)sa.len;
		// One reported-flag and one forward-map slot per SA element
		reported_.resize(sa.offs.size());
		reported_.fill(false);
		fmap.resize(sa.offs.size());
		fmap.fill(make_pair((index_t)OFF_MASK, (index_t)OFF_MASK));
	}
	
	/**
	 * Clear contents of sat and done.
	 */
	void reset() {
		reported_.clear();
		fmap.clear();
		nrep_ = 0;
		offidx = (index_t)OFF_MASK;
		fw = false;
		range = (index_t)OFF_MASK;
		len = (index_t)OFF_MASK;
	}
	
#ifndef NDEBUG
	/**
	 * Check that GWHit is internally consistent.  If a pointer to an
	 * EList of GWStates is given, we assume that it is the EList
	 * corresponding to this GWHit and check whether the forward and
	 * reverse mappings match up for the as-yet-unresolved elements.
	 */
	bool repOk(const SARangeWithOffs<T>& sa) const {
		assert_eq(reported_.size(), sa.offs.size());
		assert_eq(fmap.size(), sa.offs.size());
		// Shouldn't be any repeats among as-yet-unresolveds
		size_t nrep = 0;
		for(size_t i = 0; i < fmap.size(); i++) {
			if(reported_[i]) nrep++;
			if(sa.offs[i] != (index_t)OFF_MASK) {
				continue;
			}
			for(size_t j = i+1; j < fmap.size(); j++) {
				if(sa.offs[j] != (index_t)OFF_MASK) {
					continue;
				}
				assert(fmap[i] != fmap[j]);
			}
		}
		assert_eq(nrep_, nrep);
		return true;
	}

	/**
	 * Return true iff this GWHit is not obviously corrupt.
	 */
	bool repOkBasic() {
		return true;
	}
#endif
	
	/**
	 * Set the ith element to be reported.
	 */
	void setReported(index_t i) {
		assert(!reported_[i]);
		assert_lt(i, reported_.size());
		reported_[i] = true;
		nrep_++;
	}
	
	/**
	 * Return true iff element i has been reported.
	 */
	bool reported(index_t i) const {
		assert_lt(i, reported_.size());
		return reported_[i];
	}
	
	/**
	 * Return true iff all elements have been reported.
	 */
	bool done() const {
		assert_leq(nrep_, reported_.size());
		return nrep_ == reported_.size();
	}

	EList<std::pair<index_t, index_t>, 16> fmap; // forward map; to GWState & elt
	index_t offidx; // offset idx
	bool fw;         // orientation
	index_t range;  // original range index
	index_t len;    // length of hit

protected:

	EList<bool, 16> reported_; // per-elt bool indicating whether it's been reported
	index_t nrep_;             // running count of true entries in reported_
};
+
+/**
+ * Encapsulates the progress made along a particular path from the original
+ * range.
+ */
+template<typename index_t, typename T>
+class GWState {
+	
+public:
+
+	// Construct an empty GWState; map_ is allocated under the GW_CAT
+	// memory category.  State is immediately reset to "uninitialized".
+	GWState() : map_(0, GW_CAT) {
+		reset(); assert(repOkBasic());
+	}
+	
+	/**
+	 * Initialize this GWState with new top, bot and step, then delegate
+	 * to the main init() overload to commit resolutions and prepare the
+	 * SideLocus objects.
+	 *
+	 * We assume map is already set up.
+	 *
+	 * Returns a pair: (# offsets resolved but not yet reported during
+	 * this call, # offsets still unresolved).
+	 */
+	template<int S>
+	pair<int, int> init(
+		const Ebwt<index_t>& ebwt,    // index to walk left in
+		const BitPairReference& ref,  // bitpair-encoded reference
+		SARangeWithOffs<T>& sa,       // SA range with offsets
+		EList<GWState, S>& sts,       // EList of GWStates for range being advanced
+		GWHit<index_t, T>& hit,       // Corresponding hit structure
+		index_t range,                // which range is this?
+		bool reportList,              // if true, "report" resolved offsets immediately by adding them to 'res' list
+		EList<WalkResult<index_t>, 16>* res,   // EList where resolved offsets should be appended
+		index_t tp,                   // top of range at this step
+		index_t bt,                   // bot of range at this step
+		index_t st,                   // # steps taken to get to this step
+		WalkMetrics& met)
+	{
+		assert_gt(bt, tp);            // range must be non-empty
+		assert_lt(range, sts.size());
+		top = tp;
+		bot = bt;
+		step = (int)st;
+		assert(!inited_);             // must not be initialized twice
+		ASSERT_ONLY(inited_ = true);
+		ASSERT_ONLY(lastStep_ = step-1);
+		return init(ebwt, ref, sa, sts, hit, range, reportList, res, met);
+	}
+
+	/**
+	 * Initialize this GWState.
+	 *
+	 * We assume map is already set up, and that 'step' is equal to the
+	 * number of steps taken to get to the new top/bot pair *currently*
+	 * in the top and bot fields.
+	 *
+	 * Commits any newly-resolvable offsets, trims resolved elements off
+	 * both ends of the range, splits the range around the BWT '$' row
+	 * if necessary (appending a new GWState to 'st'), and prepares the
+	 * SideLocus objects for the next step.
+	 *
+	 * Returns a pair of numbers, the first being the number of
+	 * resolved but unreported offsets found during this advance, the
+	 * second being the number of as-yet-unresolved offsets.
+	 */
+	template<int S>
+	pair<int, int> init(
+		const Ebwt<index_t>& ebwt,    // forward Bowtie index
+		const BitPairReference& ref,  // bitpair-encoded reference
+		SARangeWithOffs<T>& sa,       // SA range with offsets
+		EList<GWState, S>& st,        // EList of GWStates for advancing range
+		GWHit<index_t, T>& hit,       // Corresponding hit structure
+		index_t range,                // range being inited
+		bool reportList,              // report resolutions, adding to 'res' list?
+		EList<WalkResult<index_t>, 16>* res,   // EList to append resolutions
+		WalkMetrics& met)             // update these metrics
+	{
+		assert(inited_);
+		assert_eq(step, lastStep_+1);
+		ASSERT_ONLY(lastStep_++);
+		assert_leq((index_t)step, ebwt.eh().len());
+		assert_lt(range, st.size());
+		pair<int, int> ret = make_pair(0, 0);
+		index_t trimBegin = 0, trimEnd = 0;
+		bool empty = true; // assume all resolved until proven otherwise
+		// Commit new information, if any, to the PListSlide.  Also,
+		// trim and check if we're done.
+		for(size_t i = mapi_; i < map_.size(); i++) {
+			bool resolved = (off(i, sa) != (index_t)OFF_MASK);
+			if(!resolved) {
+				// Elt not resolved yet; try to resolve it now
+				index_t bwrow = (index_t)(top - mapi_ + i);
+				index_t toff = ebwt.tryOffset(bwrow);
+				ASSERT_ONLY(index_t origBwRow = sa.topf + map(i));
+				assert_eq(bwrow, ebwt.walkLeft(origBwRow, step));
+				if(toff != (index_t)OFF_MASK) {
+					// Yes, toff was resolvable
+					assert_eq(toff, ebwt.getOffset(bwrow));
+					met.resolves++;
+#ifdef CENTRIFUGE
+#else
+					toff += step;
+                    assert_eq(toff, ebwt.getOffset(origBwRow));
+#endif
+					setOff(i, toff, sa, met);
+					if(!reportList) ret.first++;
+#if 0
+// used to be #ifndef NDEBUG, but since we no longer require that the reference
+// string info be included, this is no longer relevant.
+
+					// Sanity check that the reference characters under this
+					// hit match the seed characters in hit.satup->key.seq.
+					// This is NOT a check that we associated the exact right
+					// text offset with the BW row.  This is an important
+					// distinction because when resolved offsets are filled in
+					// via reference scanning, they are not necessarily the
+					// exact right text offsets to associate with the
+					// respective BW rows but they WILL all be correct w/r/t
+					// the reference sequence underneath, which is what really
+					// matters here.
+					index_t tidx = (index_t)OFF_MASK, tof, tlen;
+					bool straddled = false;
+					ebwt.joinedToTextOff(
+						hit.len, // length of seed
+						toff,    // offset in joined reference string
+						tidx,    // reference sequence id
+						tof,     // offset in reference coordinates
+						tlen,    // length of reference sequence
+						true,    // don't reject straddlers
+						straddled);
+					if(tidx != (index_t)OFF_MASK &&
+					   hit.satup->key.seq != std::numeric_limits<uint64_t>::max())
+					{
+						// key: 2-bit characters packed into a 64-bit word with
+						// the least significant bitpair corresponding to the
+						// rightmost character on the Watson reference strand.
+						uint64_t key = hit.satup->key.seq;
+						for(int64_t j = tof + hit.len-1; j >= tof; j--) {
+							// Get next reference base to the left
+							int c = ref.getBase(tidx, j);
+							assert_range(0, 3, c);
+							// Must equal least significant bitpair of key
+							if(c != (int)(key & 3)) {
+								// Oops; when we jump to the piece of the
+								// reference where the seed hit is, it doesn't
+								// match the seed hit.  Before dying, check
+								// whether we have the right spot in the joined
+								// reference string
+								SString<char> jref;
+								ebwt.restore(jref);
+								uint64_t key2 = hit.satup->key.seq;
+								for(int64_t k = toff + hit.len-1; k >= toff; k--) {
+									int c = jref[k];
+									assert_range(0, 3, c);
+									assert_eq(c, (int)(key2 & 3));
+									key2 >>= 2;
+								}
+								assert(false);
+							}
+							key >>= 2;
+						}
+					}
+#endif
+				}
+			}
+			// Is the element resolved?  We ask this regardless of how it was
+			// resolved (whether this function did it just now, whether it did
+			// it a while ago, or whether some other function outside GroupWalk
+			// did it).
+			if(off(i, sa) != (index_t)OFF_MASK) {
+				if(reportList && !hit.reported(map(i))) {
+					// Report it
+					index_t toff = off(i, sa);
+					assert(res != NULL);
+					res->expand();
+					index_t origBwRow = sa.topf + map(i);
+					res->back().init(
+						hit.offidx, // offset idx
+						hit.fw,     // orientation
+						hit.range,  // original range index
+						map(i),     // original element offset
+						origBwRow,  // BW row resolved
+						hit.len,    // hit length
+						toff);      // text offset
+					hit.setReported(map(i));
+					met.reports++;
+				}
+				// Offset resolved
+				if(empty) {
+					// Haven't seen a non-empty entry yet, so we
+					// can trim this from the beginning.
+					trimBegin++;
+				} else {
+					trimEnd++;
+				}
+			} else {
+				// Offset not yet resolved
+				ret.second++;
+				trimEnd = 0;
+				empty = false;
+				// Set the forward map in the corresponding GWHit
+				// object to point to the appropriate element of our
+				// range
+				assert_geq(i, mapi_);
+				index_t bmap = map(i);
+				hit.fmap[bmap].first = range;
+				hit.fmap[bmap].second = (index_t)i;
+#ifndef NDEBUG
+				// No other unresolved element of this range may map to
+				// the same slot i
+				for(size_t j = 0; j < bmap; j++) {
+					if(sa.offs[j] == (index_t)OFF_MASK &&
+					   hit.fmap[j].first == range)
+					{
+						assert_neq(i, hit.fmap[j].second);
+					}
+				}
+#endif
+			}
+		}
+		// Trim from beginning
+		assert_geq(trimBegin, 0);
+		mapi_ += trimBegin;
+		top += trimBegin;
+		if(trimEnd > 0) {
+			// Trim from end
+			map_.resize(map_.size() - trimEnd);
+			bot -= trimEnd;
+		}
+		if(empty) {
+			assert(done());
+#ifndef NDEBUG
+			// If range is done, all elements from map should be
+			// resolved
+			for(size_t i = mapi_; i < map_.size(); i++) {
+				assert_neq((index_t)OFF_MASK, off(i, sa));
+			}
+			// If this range is done, then it should be the case that
+			// all elements in the corresponding GWHit that point to
+			// this range are resolved.
+			for(size_t i = 0; i < hit.fmap.size(); i++) {
+				if(sa.offs[i] == (index_t)OFF_MASK) {
+					assert_neq(range, hit.fmap[i].first);
+				}
+			}
+#endif
+			return ret;
+		} else {
+			assert(!done());
+		}
+		// Is there a dollar sign in the middle of the range?
+		assert_neq(top, ebwt._zOff);
+		assert_neq(bot-1, ebwt._zOff);
+		if(ebwt._zOff > top && ebwt._zOff < bot-1) {
+			// Yes, the dollar sign is in the middle of this range.  We
+			// must split it into the two ranges on either side of the
+			// dollar.  Let 'bot' and 'top' delimit the portion of the
+			// range prior to the dollar.
+			index_t oldbot = bot;
+			bot = ebwt._zOff;
+			// Note: might be able to do additional trimming off the
+			// end.
+			// Create a new range for the portion after the dollar.
+			st.expand();
+			st.back().reset();
+			index_t ztop = ebwt._zOff+1;
+			st.back().initMap(oldbot - ztop);
+			assert_eq((index_t)map_.size(), oldbot-top+mapi_);
+			for(index_t i = ztop; i < oldbot; i++) {
+				st.back().map_[i - ztop] = map(i-top+mapi_);
+			}
+			map_.resize(bot - top + mapi_);
+			st.back().init(
+				ebwt,
+				ref,
+				sa,
+				st,
+				hit,
+				(index_t)st.size()-1,
+				reportList,
+				res,
+				ztop,
+				oldbot,
+				step,
+				met);
+		}
+		assert_gt(bot, top);
+		// Prepare SideLocus's for next step
+		if(bot-top > 1) {
+			SideLocus<index_t>::initFromTopBot(top, bot, ebwt.eh(), ebwt.ebwt(), tloc, bloc);
+			assert(tloc.valid()); assert(tloc.repOk(ebwt.eh()));
+			assert(bloc.valid()); assert(bloc.repOk(ebwt.eh()));
+		} else {
+			tloc.initFromRow(top, ebwt.eh(), ebwt.ebwt());
+			assert(tloc.valid()); assert(tloc.repOk(ebwt.eh()));
+			bloc.invalidate();
+		}
+		return ret;
+	}
+	
+#ifndef NDEBUG
+	/**
+	 * Check if this GWP is internally consistent.
+	 *
+	 * NOTE(review): this check (and repOkMapInclusive below) still
+	 * reads hit.satup, which GWHit in this version no longer carries
+	 * (the SA range now travels separately as 'sa'); the off(i, hit)
+	 * call below also passes the wrong argument type.  As members of a
+	 * class template these only compile if instantiated -- this debug
+	 * code appears stale; confirm before enabling it.
+	 */
+	bool repOk(
+		const Ebwt<index_t>& ebwt,
+		GWHit<index_t, T>& hit,
+		index_t range) const
+	{
+		assert(done() || bot > top);
+		assert(doneResolving(hit) || (tloc.valid() && tloc.repOk(ebwt.eh())));
+		assert(doneResolving(hit) || bot == top+1 || (bloc.valid() && bloc.repOk(ebwt.eh())));
+		assert_eq(map_.size()-mapi_, bot-top);
+		// Make sure that 'done' is compatible with whether we have >=
+		// 1 elements left to resolve.
+		int left = 0;
+		for(size_t i = mapi_; i < map_.size(); i++) {
+			ASSERT_ONLY(index_t row = (index_t)(top + i - mapi_));
+			ASSERT_ONLY(index_t origRow = hit.satup->topf + map(i));
+			assert(step == 0 || row != origRow);
+			assert_eq(row, ebwt.walkLeft(origRow, step));
+			assert_lt(map_[i], hit.satup->offs.size());
+			if(off(i, hit) == (index_t)OFF_MASK) left++;
+		}
+		assert(repOkMapRepeats());
+		assert(repOkMapInclusive(hit, range));
+		return true;
+	}
+	
+	/**
+	 * Return true iff this GWState is not obviously corrupt (the range
+	 * must at least be non-inverted).
+	 */
+	bool repOkBasic() {
+		assert_geq(bot, top);
+		return true;
+	}
+
+	/**
+	 * Check that the fmap elements pointed to by our map_ include all
+	 * of the fmap elements that point to this range.
+	 */
+	bool repOkMapInclusive(GWHit<index_t, T>& hit, index_t range) const {
+		for(size_t i = 0; i < hit.fmap.size(); i++) {
+			if(hit.satup->offs[i] == (index_t)OFF_MASK) {
+				if(range == hit.fmap[i].first) {
+					ASSERT_ONLY(bool found = false);
+					for(size_t j = mapi_; j < map_.size(); j++) {
+						if(map(j) == i) {
+							ASSERT_ONLY(found = true);
+							break;
+						}
+					}
+					assert(found);
+				}
+			}
+		}
+		return true;
+	}
+	
+	/**
+	 * Check that no two elements in map_ are the same.
+	 */
+	bool repOkMapRepeats() const {
+		for(size_t i = mapi_; i < map_.size(); i++) {
+			for(size_t j = i+1; j < map_.size(); j++) {
+				assert_neq(map_[i], map_[j]);
+			}
+		}
+		return true;
+	}
+#endif
+	
+	/**
+	 * Return the offset currently assigned to the ith element.  If it
+	 * has not yet been resolved, return (index_t)OFF_MASK.
+	 */
+	index_t off(
+				index_t i,
+				const SARangeWithOffs<T>& sa)
+	{
+		assert_geq(i, mapi_);
+		assert_lt(i, map_.size());
+		assert_lt(map_[i], sa.offs.size());
+		return sa.offs.get(map_[i]);
+	}
+
+	/**
+	 * Translate the ith element of this range into its element index
+	 * within the original range's PListSlice.
+	 */
+	index_t map(index_t i) const {
+		// i must refer to an untrimmed, in-bounds slot
+		assert_geq(i, mapi_);
+		assert_lt(i, map_.size());
+		const index_t origElt = map_[i];
+		return origElt;
+	}
+
+	/**
+	 * Accessor for the index of the first untrimmed entry of the map.
+	 */
+	index_t mapi() const {
+		return mapi_;
+	}
+
+	/**
+	 * Number of elements still actively tracked by this GWState: all
+	 * map entries minus those trimmed off the front.
+	 */
+	index_t size() const {
+		return (index_t)(map_.size() - mapi_);
+	}
+	
+	/**
+	 * True when no elements remain unresolved in this leaf range.
+	 */
+	bool done() const {
+		return 0 == size();
+	}
+
+	/**
+	 * Set the PListSlice element that corresponds to the ith element
+	 * of 'map' to the specified offset.
+	 *
+	 * NOTE(review): this indexes map_ at (i + mapi_), whereas off() and
+	 * map() index map_[i] directly and the caller in init() passes i
+	 * values that already start at mapi_.  This is benign while mapi_
+	 * is 0 at call time -- confirm the intended indexing convention.
+	 */
+	void setOff(
+		index_t i,
+		index_t off,
+		SARangeWithOffs<T>& sa,
+		WalkMetrics& met) // currently unused here
+	{
+		assert_lt(i + mapi_, map_.size());
+		assert_lt(map_[i + mapi_], sa.offs.size());
+		size_t saoff = map_[i + mapi_];
+		sa.offs[saoff] = off;
+		// Paranoia: confirm the store round-trips
+		assert_eq(off, sa.offs[saoff]);
+	}
+
+	/**
+	 * Advance this GWState by one step (i.e. one BW operation).  In
+	 * the event of a "split", more elements are added to the EList
+	 * 'st', which must have room for at least 3 more elements without
+	 * needing another expansion.  If an expansion of 'st' is
+	 * triggered, this GWState object becomes invalid.
+	 *
+	 * Returns a pair of numbers, the first being the number of
+	 * resolved but unreported offsets found during this advance, the
+	 * second being the number of as-yet-unresolved offsets.
+	 */
+	template <int S>
+	pair<int, int> advance(
+		const Ebwt<index_t>& ebwt,   // the forward Bowtie index, for stepping left
+		const BitPairReference& ref, // bitpair-encoded reference
+		SARangeWithOffs<T>& sa,      // SA range with offsets
+		GWHit<index_t, T>& hit,      // the associated GWHit object
+		index_t range,               // which range is this?
+		bool reportList,             // if true, "report" resolved offsets immediately by adding them to 'res' list
+		EList<WalkResult<index_t>, 16>* res,  // EList where resolved offsets should be appended
+		EList<GWState, S>& st,       // EList of GWStates for range being advanced
+		GroupWalkState<index_t>& gws,         // temporary storage for masks
+		WalkMetrics& met,
+		PerReadMetrics& prm)
+	{
+		ASSERT_ONLY(index_t origTop = top);
+		ASSERT_ONLY(index_t origBot = bot);
+		assert_geq(step, 0);
+		assert_eq(step, lastStep_);
+		assert_geq(st.capacity(), st.size() + 4);
+		assert(tloc.valid()); assert(tloc.repOk(ebwt.eh()));
+		assert_eq(bot-top, (index_t)(map_.size()-mapi_));
+		pair<int, int> ret = make_pair(0, 0);
+		assert_eq(top, tloc.toBWRow());
+		if(bloc.valid()) {
+			// Still multiple elements being tracked
+			assert_lt(top+1, bot);
+			index_t upto[4], in[4];
+			upto[0] = in[0] = upto[1] = in[1] =
+			upto[2] = in[2] = upto[3] = in[3] = 0;
+			assert_eq(bot, bloc.toBWRow());
+			met.bwops++;
+			prm.nExFmops++;
+			// Assert that there's not a dollar sign in the middle of
+			// this range
+			assert(bot <= ebwt._zOff || top > ebwt._zOff);
+			ebwt.mapLFRange(tloc, bloc, bot-top, upto, in, gws.masks);
+#ifndef NDEBUG
+			for(int i = 0; i < 4; i++) {
+			  assert_eq(bot-top, (index_t)(gws.masks[i].size()));
+			}
+#endif
+			bool first = true;
+			ASSERT_ONLY(index_t sum = 0);
+			index_t newtop = 0, newbot = 0;
+			gws.map.clear();
+			for(int i = 0; i < 4; i++) {
+				if(in[i] > 0) {
+					// Non-empty range resulted
+					if(first) {
+						// For the first one, 
+						first = false;
+						newtop = upto[i];
+						newbot = newtop + in[i];
+						assert_leq(newbot-newtop, bot-top);
+						// Range narrowed so we have to look at the masks
+						for(size_t j = 0; j < gws.masks[i].size(); j++) {
+							assert_lt(j+mapi_, map_.size());
+							if(gws.masks[i][j]) {
+								gws.map.push_back(map_[j+mapi_]);
+								assert(gws.map.size() <= 1 || gws.map.back() != gws.map[gws.map.size()-2]);
+#ifndef NDEBUG
+								// If this element is not yet resolved,
+								// then check that it really is the
+								// expected number of steps to the left
+								// of the corresponding element in the
+								// root range
+								assert_lt(gws.map.back(), sa.size());
+								if(sa.offs[gws.map.back()] == (index_t)OFF_MASK) {
+									assert_eq(newtop + gws.map.size() - 1,
+											  ebwt.walkLeft(sa.topf + gws.map.back(), step+1));
+								}
+#endif
+							}
+						}
+ 						assert_eq(newbot-newtop, (index_t)(gws.map.size()));
+					} else {
+						// For each beyond the first, create a new
+						// GWState and add it to the GWState list. 
+						// NOTE: this can cause the underlying list to
+						// be expanded which in turn might leave 'st'
+						// pointing to bad memory.
+						st.expand();
+						st.back().reset();
+						index_t ntop = upto[i];
+						index_t nbot = ntop + in[i];
+						assert_lt(nbot-ntop, bot-top);
+						st.back().mapi_ = 0;
+						st.back().map_.clear();
+						met.branches++;
+						// Range narrowed so we have to look at the masks
+						for(size_t j = 0; j < gws.masks[i].size(); j++) {
+							if(gws.masks[i][j]) st.back().map_.push_back(map_[j+mapi_]);
+						}
+						pair<int, int> rret =
+						st.back().init(
+							ebwt,        // forward Bowtie index
+							ref,         // bitpair-encoded reference
+							sa,          // SA range with offsets
+							st,          // EList of all GWStates associated with original range
+							hit,         // associated GWHit object
+							(index_t)st.size()-1, // range offset
+							reportList,  // if true, report hits to 'res' list
+							res,         // report hits here if reportList is true
+							ntop,        // BW top of new range
+							nbot,        // BW bot of new range
+							step+1,      // # steps taken to get to this new range
+							met);        // update these metrics
+						ret.first += rret.first;
+						ret.second += rret.second;
+					}
+					ASSERT_ONLY(sum += in[i]);
+				}
+			}
+			mapi_ = 0;
+			assert_eq(bot-top, sum);
+			assert_gt(newbot, newtop);
+			assert_leq(newbot-newtop, bot-top);
+			assert(top != newtop || bot != newbot);
+			//assert(!(newtop < top && newbot > top));
+			top = newtop;
+			bot = newbot;
+			if(!gws.map.empty()) {
+				map_ = gws.map;
+			}
+			//assert(repOkMapRepeats());
+			//assert(repOkMapInclusive(hit, range));
+			assert_eq(bot-top, (index_t)map_.size());
+		} else {
+			// Down to one element
+			assert_eq(bot, top+1);
+			assert_eq(1, map_.size()-mapi_);
+			// Sets top, returns char walked through (which we ignore)
+			ASSERT_ONLY(index_t oldtop = top);
+			met.bwops++;
+			prm.nExFmops++;
+			ebwt.mapLF1(top, tloc);
+			assert_neq(top, oldtop);
+			bot = top+1;
+			if(mapi_ > 0) {
+				map_[0] = map_[mapi_];
+				mapi_ = 0;
+			}
+			map_.resize(1);
+		}
+		assert(top != origTop || bot != origBot);
+		step++;
+		assert_gt(step, 0);
+		assert_leq((index_t)step, ebwt.eh().len());
+		// Commit resolutions for the new range and prepare loci for the
+		// next step
+		pair<int, int> rret =
+		init<S>(
+			ebwt,       // forward Bowtie index
+			ref,        // bitpair-encoded reference
+			sa,         // SA range with offsets
+			st,         // EList of all GWStates associated with original range
+			hit,        // associated GWHit object
+			range,      // range offset
+			reportList, // if true, report hits to 'res' list
+			res,        // report hits here if reportList is true
+			met);       // update these metrics
+		ret.first += rret.first;
+		ret.second += rret.second;
+		return ret;
+	}
+
+	/**
+	 * Return this GWState to a pristine state before the next walk.
+	 */
+	void reset() {
+		map_.clear();
+		mapi_ = 0;
+		top = 0;
+		bot = 0;
+		step = 0;
+		tloc.invalidate();
+		bloc.invalidate();
+		ASSERT_ONLY(inited_ = false);
+		ASSERT_ONLY(lastStep_ = -1);
+	}
+	
+	/**
+	 * Size map_ to newsz and fill it with the identity mapping, also
+	 * clearing any trimming off the front.
+	 */
+	void initMap(size_t newsz) {
+		mapi_ = 0;
+		map_.resize(newsz);
+		for(size_t k = 0; k < newsz; k++) {
+			map_[k] = (index_t)k;
+		}
+	}
+
+	/**
+	 * True iff every row tracked by this GWState has been both resolved
+	 * and reported via the given GWHit.
+	 */
+	bool doneReporting(const GWHit<index_t, T>& hit) const {
+		bool all = true;
+		for(size_t k = mapi_; all && k < map_.size(); k++) {
+			all = hit.reported(map(k));
+		}
+		return all;
+	}
+
+	/**
+	 * True iff every row tracked by this GWState has had its text
+	 * offset resolved (reporting may still be pending).
+	 */
+	bool doneResolving(const SARangeWithOffs<T>& sa) const {
+		for(size_t k = mapi_; k < map_.size(); k++) {
+			const bool unresolved = (sa.offs[map(k)] == (index_t)OFF_MASK);
+			if(unresolved) {
+				return false;
+			}
+		}
+		return true;
+	}
+
+	SideLocus<index_t> tloc;      // SideLocus for top
+	SideLocus<index_t> bloc;      // SideLocus for bottom
+	index_t            top;       // top elt of range in BWT
+	index_t            bot;       // bot elt of range in BWT
+	int                step;      // how many steps have we walked to the left so far
+
+protected:
+	
+	ASSERT_ONLY(bool inited_);
+	ASSERT_ONLY(int lastStep_);
+	EList<index_t, 16> map_; // which elts in range 'range' we're tracking
+	index_t mapi_;           // first untrimmed element of map
+};
+
+template<typename index_t, typename T, int S>
+class GroupWalk2S {
+public:
+	typedef EList<GWState<index_t, T>, S> TStateV;
+
+	// Construct with an initial capacity of 8 GWStates (GW_CAT memory
+	// category) and reset to the uninitialized state.
+	GroupWalk2S() : st_(8, GW_CAT) {
+		reset();
+	}
+	
+	/**
+	 * Reset the GroupWalk in preparation for the next SeedResults.
+	 */
+	void reset() {
+		elt_ = rep_ = 0;
+		ASSERT_ONLY(inited_ = false);
+	}
+
+	/**
+	 * Initialize a new group walk w/r/t a QVal object.
+	 */
+	void init(
+		const Ebwt<index_t>& ebwtFw, // forward Bowtie index for walking left
+		const BitPairReference& ref, // bitpair-encoded reference
+		SARangeWithOffs<T>& sa,      // SA range with offsets
+		RandomSource& rnd,           // pseudo-random generator for sampling rows
+		WalkMetrics& met)            // update metrics here
+	{
+		reset();
+#ifndef NDEBUG
+		inited_ = true;
+#endif
+		// Init GWHit
+		hit_.init(sa, 0, false, 0);
+		// Init corresponding GWState
+		st_.resize(1);
+		st_.back().reset();
+		assert(st_.back().repOkBasic());
+		index_t top = sa.topf;
+		index_t bot = (index_t)(top + sa.size());
+		st_.back().initMap(bot-top);
+		// Reserve room for splits so init() can't trigger a realloc
+		// that would invalidate the GWState it is running on
+		st_.ensure(4);
+		st_.back().init(
+			ebwtFw,             // Bowtie index
+			ref,                // bitpair-encoded reference
+			sa,                 // SA range with offsets
+			st_,                // EList<GWState>
+			hit_,               // GWHit
+			0,                  // range 0
+			false,              // put resolved elements into res_?
+			NULL,               // put resolved elements here
+			top,                // BW row at top
+			bot,                // BW row at bot
+			0,                  // # steps taken
+			met);               // update metrics here
+		elt_ += sa.size();
+		assert(hit_.repOk(sa));
+	}
+
+	//
+	// ELEMENT-BASED
+	//
+
+	/**
+	 * Advance the GroupWalk until all elements have been resolved.
+	 * FIXME FB: Commented as the types of advanceElements do not correlate with the types of the function definition.
+	 */
+//	void resolveAll(WalkMetrics& met, PerReadMetrics& prm) {
+//		WalkResult<index_t> res; // ignore results for now
+//		for(size_t i = 0; i < elt_; i++) {
+//			advanceElement((index_t)i, res, met, prm);
+//		}
+//	}
+
+	/**
+	 * Advance the GroupWalk until the specified element has been
+	 * resolved.  Fills 'res' with the resolved offset and returns true.
+	 */
+	bool advanceElement(
+		index_t elt,                  // element within the range
+		const Ebwt<index_t>& ebwtFw,  // forward Bowtie index for walking left
+		const BitPairReference& ref,  // bitpair-encoded reference
+		SARangeWithOffs<T>& sa,       // SA range with offsets
+		GroupWalkState<index_t>& gws, // GroupWalk state; scratch space
+		WalkResult<index_t>& res,     // put the result here
+		WalkMetrics& met,             // metrics
+		PerReadMetrics& prm)          // per-read metrics
+	{
+		assert(inited_);
+		assert(!done());
+		assert(hit_.repOk(sa));
+		assert_lt(elt, sa.size()); // elt must fall within range
+		// Until we've resolved our element of interest...
+		while(sa.offs[elt] == (index_t)OFF_MASK) {
+			// Get the GWState that contains our element of interest
+			size_t range = hit_.fmap[elt].first;
+			// Room for splits; an expansion mid-advance would leave the
+			// reference 'st' dangling
+			st_.ensure(4);
+			GWState<index_t, T>& st = st_[range];
+			assert(!st.doneResolving(sa));
+			// Returns a pair of numbers, the first being the number of
+			// resolved but unreported offsets found during this advance, the
+			// second being the number of as-yet-unresolved offsets.
+			st.advance(
+				ebwtFw,
+				ref,
+				sa,
+				hit_,
+				(index_t)range,
+				false,
+				NULL,
+				st_,
+				gws,
+				met,
+				prm);
+			assert(sa.offs[elt] != (index_t)OFF_MASK ||
+			       !st_[hit_.fmap[elt].first].doneResolving(sa));
+		}
+		assert_neq((index_t)OFF_MASK, sa.offs[elt]);
+		// Report it!
+		if(!hit_.reported(elt)) {
+			hit_.setReported(elt);
+		}
+		// NOTE(review): 'reports' is bumped even when the element had
+		// already been reported above -- confirm the double-count is
+		// intended.
+		met.reports++;
+		res.init(
+			0,              // seed offset
+			false,          // orientation
+			0,              // range
+			elt,            // element
+			sa.topf + elt,  // bw row
+			(index_t)sa.len, // length of hit
+			sa.offs[elt]);  // resolved text offset
+		rep_++;
+		return true;
+	}
+
+	/**
+	 * Return true iff all elements have been resolved and reported.
+	 */
+	bool done() const { return rep_ == elt_; }
+	
+#ifndef NDEBUG
+	/**
+	 * Check that GroupWalk is internally consistent: resolved/reported
+	 * counts tallied from scratch must agree with the cached counters.
+	 */
+	bool repOk(const SARangeWithOffs<T>& sa) const {
+		assert(hit_.repOk(sa));
+		assert_leq(rep_, elt_);
+		// This is a lot of work
+		size_t resolved = 0, reported = 0;
+		// For each element
+		const size_t sz = sa.size();
+		for(size_t m = 0; m < sz; m++) {
+			// Is it resolved?
+			if(sa.offs[m] != (index_t)OFF_MASK) {
+				resolved++;
+			} else {
+				assert(!hit_.reported(m));
+			}
+			// Is it reported?
+			if(hit_.reported(m)) {
+				reported++;
+			}
+			assert_geq(resolved, reported);
+		}
+		assert_geq(resolved, reported);
+		assert_eq(rep_, reported);
+		assert_eq(elt_, sz);
+		return true;
+	}
+#endif
+
+	/**
+	 * Return the number of BW elements that we can resolve.
+	 */
+	index_t numElts() const { return elt_; }
+	
+	/**
+	 * Return the size occupied by this GroupWalk and all its constituent
+	 * objects.
+	 */
+	size_t totalSizeBytes() const {
+		return 2 * sizeof(size_t) + st_.totalSizeBytes() + sizeof(GWHit<index_t, T>);
+	}
+	/**
+	 * Return the capacity of this GroupWalk and all its constituent objects.
+	 */
+	size_t totalCapacityBytes() const {
+		return 2 * sizeof(size_t) + st_.totalCapacityBytes() + sizeof(GWHit<index_t, T>);
+	}
+	
+#ifndef NDEBUG
+	bool initialized() const { return inited_; }
+#endif
+	
+protected:
+
+	ASSERT_ONLY(bool inited_);    // initialized?
+	
+	index_t elt_;    // # BW elements under the control of the GroupWalk
+	index_t rep_;    // # BW elements reported
+
+	// For each orientation and seed offset, keep a GWState object that
+	// holds the state of the walk so far.
+	TStateV st_;
+
+	// For each orientation and seed offset, keep an EList of GWHit.
+	GWHit<index_t, T> hit_;
+};
+
+#endif /*GROUP_WALK_H_*/
diff --git a/hi_aligner.h b/hi_aligner.h
new file mode 100644
index 0000000..27cc395
--- /dev/null
+++ b/hi_aligner.h
@@ -0,0 +1,1033 @@
+/*
+ * Copyright 2014, Daehwan Kim <infphilo at gmail.com>
+ *
+ * This file is part of HISAT.
+ *
+ * HISAT is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * HISAT is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with HISAT.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef HI_ALIGNER_H_
+#define HI_ALIGNER_H_
+
+#include <iostream>
+#include <utility>
+#include <limits>
+#include "qual.h"
+#include "ds.h"
+#include "sstring.h"
+#include "alphabet.h"
+#include "edit.h"
+#include "read.h"
+// Threading is necessary to synchronize the classes that dump
+// intermediate alignment results to files.  Otherwise, all data herein
+// is constant and shared, or per-thread.
+#include "threading.h"
+#include "aligner_result.h"
+#include "scoring.h"
+#include "mem_ids.h"
+#include "simple_func.h"
+#include "group_walk.h"
+
+/**
+ * Hit types for the BWTHit class below.
+ * Three hit types used to anchor a read on the genome:
+ * CANDIDATE_HIT (= 1, the default), PSEUDOGENE_HIT (= 2), ANCHOR_HIT (= 3).
+ */
+enum {
+    CANDIDATE_HIT = 1,
+    PSEUDOGENE_HIT,
+    ANCHOR_HIT,
+};
+
+/**
+ * Simple struct for holding a partial alignment for the read.
+ * The alignment locations are represented by FM offsets [top, bot),
+ * and genomic offsets are calculated later, when necessary.
+ */
+template <typename index_t>
+struct BWTHit {
+	
+	BWTHit() { reset(); }
+	
+	void reset() {
+		_top = _bot = 0;
+		_fw = true;
+		_bwoff = (index_t)OFF_MASK;
+		_len = 0;
+		_coords.clear();
+        _anchor_examined = false;
+        _hit_type = CANDIDATE_HIT;
+	}
+	
+	void init(
+			  index_t top,
+			  index_t bot,
+  			  bool fw,
+			  uint32_t bwoff,
+			  uint32_t len,
+              index_t hit_type = CANDIDATE_HIT)
+	{
+		_top = top;
+        _bot = bot;
+		_fw = fw;
+		_bwoff = bwoff;
+		_len = len;
+        _coords.clear();
+        _anchor_examined = false;
+        _hit_type = hit_type;
+	}
+    
+    bool hasGenomeCoords() const { return !_coords.empty(); }
+	
+	/**
+	 * Return true iff there is no hit.
+	 */
+	bool empty() const {
+		return _bot <= _top;
+	}
+	
+	/**
+	 * Higher score = higher priority.  Longer hits sort first.
+	 */
+	bool operator<(const BWTHit& o) const {
+		return _len > o._len;
+	}
+	
+	/**
+	 * Return the width of this hit's SA range, i.e. _bot - _top.
+	 */
+	index_t size() const {
+        assert_leq(_top, _bot);
+        return _bot - _top;
+    }
+    
+    index_t len() const {
+        // assert_gt(_len, 0);
+        return _len;
+    }
+	
+#ifndef NDEBUG
+	/**
+	 * Check that hit is sane w/r/t read.
+	 */
+	bool repOk(const Read& rd) const {
+		assert_gt(_bot, _top);
+		assert_neq(_bwoff, (index_t)OFF_MASK);
+		assert_gt(_len, 0);
+		return true;
+	}
+#endif
+	
+	index_t         _top;               // start of the range in the FM index
+	index_t         _bot;               // end of the range in the FM index
+	bool            _fw;                // whether read is forward or reverse complemented
+	index_t         _bwoff;             // current base of a read to search from the right end
+	index_t         _len;               // length of the matched read segment (not the whole read)
+	
+    EList<Coord>    _coords;            // genomic offsets corresponding to [_top, _bot)
+    
+    bool            _anchor_examined;   // whether or not this hit is examined
+    index_t         _hit_type;          // hit type (anchor hit, pseudogene hit, or candidate hit)
+};
+
+
+/**
+ * Simple struct for holding all partial alignments for one read.
+ * The alignments are represented by a left-to-right chain of BWTHits.
+ */
+template <typename index_t>
+struct ReadBWTHit {
+	
+	ReadBWTHit() { reset(); }
+	
+	void reset() {
+        _fw = true;
+		_len = 0;
+        _cur = 0;
+        _done = false;
+        _numPartialSearch = 0;
+        _numUniqueSearch = 0;
+        _partialHits.clear();
+	}
+
+	void init(
+			  bool fw,
+              index_t len)
+	{
+        _fw = fw;
+        assert_gt(len, 0);
+        _len = len;
+        _cur = 0;
+        _done = false;
+        _numPartialSearch = 0;
+        _numUniqueSearch = 0;
+        _partialHits.clear();
+	}
+    
+    bool done() {
+#ifndef NDEBUG
+        assert_gt(_len, 0);
+        if(_cur >= _len) {
+            assert(_done);
+        }
+#endif
+        return _done;
+    }
+    
+    void done(bool done) {
+        // assert(!_done);
+        assert(done);
+        _done = done;
+    }
+    
+    index_t len() const { return _len; }
+    index_t cur() const { return _cur; }
+    
+    size_t  offsetSize()             { return _partialHits.size(); }
+    size_t  numPartialSearch()       { return _numPartialSearch; }
+    size_t  numActualPartialSearch()
+    {
+        assert_leq(_numUniqueSearch, _numPartialSearch);
+        return _numPartialSearch - _numUniqueSearch;
+    }
+    
+    // Width (SA-range size) of the partial hit at offset_.
+    // Fix: was declared 'bool', which truncated the width to 0/1.
+    index_t width(index_t offset_) {
+        assert_lt(offset_, _partialHits.size());
+        return _partialHits[offset_].size();
+    }
+    
+    // True if the partial hit at offset_ is empty or already has its
+    // genomic coordinates resolved.
+    bool hasGenomeCoords(index_t offset_) {
+        assert_lt(offset_, _partialHits.size());
+        index_t width_ = width(offset_);
+        if(width_ == 0) {
+            return true;
+        } else {
+            return _partialHits[offset_].hasGenomeCoords();
+        }
+    }
+    
+    // True if the whole read is covered and every partial hit has
+    // genomic coordinates.
+    bool hasAllGenomeCoords() {
+        if(_cur < _len) return false;
+        if(_partialHits.size() <= 0) return false;
+        for(size_t oi = 0; oi < _partialHits.size(); oi++) {
+            if(!_partialHits[oi].hasGenomeCoords())
+                return false;
+        }
+        return true;
+    }
+    
+    /**
+     * Return the smallest non-zero SA-range width among the partial hits,
+     * preferring the longer hit on ties; 'offset' receives its index.
+     * Returns OFF_MASK if every hit is empty.
+     */
+    index_t minWidth(index_t& offset) const {
+        index_t minWidth_ = (index_t)OFF_MASK;
+        index_t minWidthLen_ = 0;
+        for(size_t oi = 0; oi < _partialHits.size(); oi++) {
+            const BWTHit<index_t>& hit = _partialHits[oi];
+            if(hit.empty()) continue;
+            // if(!hit.hasGenomeCoords()) continue;
+            assert_gt(hit.size(), 0);
+            if((minWidth_ > hit.size()) ||
+               (minWidth_ == hit.size() && minWidthLen_ < hit.len())) {
+                minWidth_ = hit.size();
+                minWidthLen_ = hit.len();
+                offset = (index_t)oi;
+            }
+        }
+        return minWidth_;
+    }
+    
+    // add policy for calculating a search score:
+    // rewards long partial hits quadratically; penalizes extra searches.
+    int64_t searchScore(index_t minK) {
+        int64_t score = 0;
+        const int64_t penaltyPerOffset = minK * minK;
+        for(size_t i = 0; i < _partialHits.size(); i++) {
+            index_t len = _partialHits[i]._len;
+            score += (len * len);
+        }
+        
+        assert_geq(_numPartialSearch, _partialHits.size());
+        index_t actualPartialSearch = numActualPartialSearch();
+        score -= (actualPartialSearch * penaltyPerOffset);
+        // Fix: use a 64-bit shift; '1 << (n << 1)' overflows int (UB) once
+        // actualPartialSearch reaches 16.
+        score -= ((int64_t)1 << (actualPartialSearch << 1));
+        return score;
+    }
+    
+    BWTHit<index_t>& getPartialHit(index_t offset_) {
+        assert_lt(offset_, _partialHits.size());
+        return _partialHits[offset_];
+    }
+    
+    // Drop the last (short) partial hit and rewind _cur slightly so the
+    // next search restarts from an adjusted offset.  Returns false when
+    // the last hit is long enough (>= minK + 3) to keep.
+    bool adjustOffset(index_t minK) {
+        assert_gt(_partialHits.size(), 0);
+        const BWTHit<index_t>& hit = _partialHits.back();
+        if(hit.len() >= minK + 3) {
+            return false;
+        }
+        assert_geq(_cur, hit.len());
+        index_t origCur = _cur - hit.len();
+        _cur = origCur + max(hit.len(), minK + 1) - minK;
+        _partialHits.pop_back();
+        return true;
+    }
+    
+    void setOffset(index_t offset) {
+        //assert_lt(offset, _len); //FIXME: assertion fails as offset == _len
+        _cur = offset;
+    }
+    
+#ifndef NDEBUG
+	/**
+	 * Check that the chain of partial hits is contiguous and ends at _cur.
+	 */
+	bool repOk() const {
+        for(size_t i = 0; i < _partialHits.size(); i++) {
+            if(i == 0) {
+                assert_geq(_partialHits[i]._bwoff, 0);
+            }
+            
+            if(i + 1 < _partialHits.size()) {
+                assert_leq(_partialHits[i]._bwoff + _partialHits[i]._len, _partialHits[i+1]._bwoff);
+            } else {
+                assert_eq(i+1, _partialHits.size());
+                assert_eq(_partialHits[i]._bwoff + _partialHits[i]._len, _cur);
+            }
+        }
+		return true;
+	}
+#endif
+	
+	bool     _fw;                // orientation this chain was searched in
+	index_t  _len;               // full read length
+    index_t  _cur;               // next read offset to search from
+    bool     _done;              // true once the whole read is covered
+    index_t  _numPartialSearch;  // # of partial searches performed
+    index_t  _numUniqueSearch;   // # of searches that finished the read as a candidate
+    index_t  _cur_local;         // NOTE(review): not referenced anywhere in this header
+    
+    EList<BWTHit<index_t> >  _partialHits;  // chain of partial hits, left to right
+};
+
+
+/**
+ * Per-thread scratch data shared by GenomeHit instances.
+ * The main purpose of this struct is to avoid extensive use of memory-related
+ * functions such as new and delete - those are really slow and lock based.
+ */
+template <typename index_t>
+struct SharedTempVars {
+    SStringExpandable<char> raw_refbuf;   // scratch for fetched reference bases
+    SStringExpandable<char> raw_refbuf2;
+    EList<int64_t> temp_scores;
+    EList<int64_t> temp_scores2;
+    ASSERT_ONLY(SStringExpandable<uint32_t> destU32);
+    
+    ASSERT_ONLY(BTDnaString editstr);
+    ASSERT_ONLY(BTDnaString partialseq);
+    ASSERT_ONLY(BTDnaString refstr);
+    ASSERT_ONLY(EList<index_t> reflens);
+    ASSERT_ONLY(EList<index_t> refoffs);
+    
+    LinkedEList<EList<Edit> > raw_edits;  // pooled edit lists handed out to GenomeHits
+};
+
+/**
+ * GenomeHit represents a read alignment or an alignment of part of a read.
+ * Two GenomeHits representing alignments of different parts of a read can be
+ * combined together; a GenomeHit can also be extended in both directions.
+ */
+template <typename index_t>
+struct GenomeHit {
+	
+	GenomeHit() :
+    _fw(false),
+    _rdoff((index_t)OFF_MASK),
+    _len((index_t)OFF_MASK),
+    _trim5(0),
+    _trim3(0),
+    _tidx((index_t)OFF_MASK),
+    _toff((index_t)OFF_MASK),
+    _edits(NULL),
+    _score(MIN_I64),
+    _hitcount(1),
+    _edits_node(NULL),
+    _sharedVars(NULL)
+    {
+    }
+    
+    GenomeHit(const GenomeHit& otherHit) :
+    _fw(false),
+    _rdoff((index_t)OFF_MASK),
+    _len((index_t)OFF_MASK),
+    _trim5(0),
+    _trim3(0),
+    _tidx((index_t)OFF_MASK),
+    _toff((index_t)OFF_MASK),
+    _edits(NULL),
+    _score(MIN_I64),
+    _hitcount(1),
+    _edits_node(NULL),
+    _sharedVars(NULL)
+    {
+        init(otherHit._fw,
+             otherHit._rdoff,
+             otherHit._len,
+             otherHit._trim5,
+             otherHit._trim3,
+             otherHit._tidx,
+             otherHit._toff,
+             *(otherHit._sharedVars), // NOTE(review): null deref if otherHit is default-constructed — confirm callers copy only initialized hits
+             otherHit._edits,
+             otherHit._score,
+             otherHit._splicescore);
+    }
+    
+    GenomeHit<index_t>& operator=(const GenomeHit<index_t>& otherHit) {
+        if(this == &otherHit) return *this;
+        init(otherHit._fw,
+             otherHit._rdoff,
+             otherHit._len,
+             otherHit._trim5,
+             otherHit._trim3,
+             otherHit._tidx,
+             otherHit._toff,
+             *(otherHit._sharedVars), // NOTE(review): same null-deref risk as the copy constructor
+             otherHit._edits,
+             otherHit._score,
+             otherHit._splicescore);
+        
+        return *this;
+    }
+    
+    ~GenomeHit() {
+        if(_edits_node != NULL) {
+            assert(_edits != NULL);
+            assert(_sharedVars != NULL);
+            _sharedVars->raw_edits.delete_node(_edits_node); // return the edit list to the shared pool
+            _edits = NULL;
+            _edits_node = NULL;
+            _sharedVars = NULL;
+        }
+    }
+	
+	void init(
+              bool                      fw,
+			  index_t                   rdoff,
+			  index_t                   len,
+              index_t                   trim5,
+              index_t                   trim3,
+              index_t                   tidx,
+              index_t                   toff,
+              SharedTempVars<index_t>&  sharedVars,
+              EList<Edit>*              edits = NULL,
+              int64_t                   score = 0,
+              double                    splicescore = 0.0)
+	{
+		_fw = fw;
+		_rdoff = rdoff;
+		_len = len;
+        _trim5 = trim5;
+        _trim3 = trim3;
+        _tidx = tidx;
+        _toff = toff;
+		_score = score;
+        _splicescore = splicescore;
+        
+        // A GenomeHit is bound to one SharedTempVars for its lifetime
+        assert(_sharedVars == NULL || _sharedVars == &sharedVars);
+        _sharedVars = &sharedVars;
+        if(_edits == NULL) {
+            assert(_edits_node == NULL);
+            _edits_node = _sharedVars->raw_edits.new_node();
+            assert(_edits_node != NULL);
+            _edits = &(_edits_node->payload);
+        }
+        assert(_edits != NULL);
+        _edits->clear();
+        
+        if(edits != NULL) *_edits = *edits;
+        _hitcount = 1;
+	}
+    
+    bool inited() const {
+        return _len >= 0 && _len < (index_t)OFF_MASK; // first test is vacuous when index_t is unsigned
+    }
+    
+    index_t rdoff() const { return _rdoff; }
+    index_t len()   const { return _len; }
+    index_t trim5() const { return _trim5; }
+    index_t trim3() const { return _trim3; }
+    
+    void trim5(index_t trim5) { _trim5 = trim5; }
+    void trim3(index_t trim3) { _trim3 = trim3; }
+    
+    index_t ref()    const { return _tidx; }
+    index_t refoff() const { return _toff; }
+    index_t fw()     const { return _fw; } // NOTE(review): returns bool as index_t
+    
+    index_t hitcount() const { return _hitcount; }
+    
+    /**
+     * Leftmost coordinate
+     */
+    Coord coord() const {
+        return Coord(_tidx, _toff, _fw);
+    }
+    
+    const EList<Edit>& edits() const { return *_edits; }
+    
+    bool operator== (const GenomeHit<index_t>& other) const {
+        if(_fw != other._fw ||
+           _rdoff != other._rdoff ||
+           _len != other._len ||
+           _tidx != other._tidx ||
+           _toff != other._toff ||
+           _trim5 != other._trim5 ||
+           _trim3 != other._trim3) {
+            return false;
+        }
+        
+        if(_edits->size() != other._edits->size()) return false;
+        for(index_t i = 0; i < _edits->size(); i++) {
+            if(!((*_edits)[i] == (*other._edits)[i])) return false;
+        }
+        // daehwan - this may not be true when some splice sites are provided from outside
+        // assert_eq(_score, other._score);
+        return true;
+    }
+    
+    bool contains(const GenomeHit<index_t>& other) const {
+        return (*this) == other; // equality only; no partial-containment check
+    }
+
+
+#ifndef NDEBUG
+	/**
+	 * Check that hit is sane w/r/t read.
+	 */
+	bool repOk(const Read& rd, const BitPairReference& ref);
+#endif
+    
+public:
+	bool            _fw;        // orientation of the aligned read segment
+	index_t         _rdoff;     // offset of this segment within the read
+	index_t         _len;       // length of the aligned segment
+    index_t         _trim5;
+    index_t         _trim3;
+    
+    index_t         _tidx;      // reference (contig) index
+    index_t         _toff;      // offset within the reference
+	EList<Edit>*    _edits;     // owned via _edits_node; see destructor
+    int64_t         _score;
+    double          _splicescore;
+    
+    index_t         _hitcount;  // for selection purposes
+    
+    LinkedEListNode<EList<Edit> >*  _edits_node;
+    SharedTempVars<index_t>* _sharedVars;
+};
+
+
+#ifndef NDEBUG
+/**
+ * Check that hit is sane w/r/t read: decode the read segment through its
+ * edits and compare against the actual reference sequence.
+ */
+template <typename index_t>
+bool GenomeHit<index_t>::repOk(const Read& rd, const BitPairReference& ref)
+{
+    assert(_sharedVars != NULL);
+    SStringExpandable<char>& raw_refbuf = _sharedVars->raw_refbuf;
+    SStringExpandable<uint32_t>& destU32 = _sharedVars->destU32;
+    
+    BTDnaString& editstr = _sharedVars->editstr;
+    BTDnaString& partialseq = _sharedVars->partialseq;
+    BTDnaString& refstr = _sharedVars->refstr;
+    EList<index_t>& reflens = _sharedVars->reflens;
+    EList<index_t>& refoffs = _sharedVars->refoffs;
+    
+    editstr.clear(); partialseq.clear(); refstr.clear();
+    reflens.clear(); refoffs.clear();
+    
+    const BTDnaString& seq = _fw ? rd.patFw : rd.patRc;
+    partialseq.install(seq.buf() + this->_rdoff, (size_t)this->_len);
+    Edit::toRef(partialseq, *_edits, editstr); // apply edits to get the reference-side string
+    
+    // Walk the segment, splitting it into one (offset, length) stretch per
+    // spliced gap and adjusting reference length for read/ref gaps.
+    index_t refallen = 0;
+    int64_t reflen = 0;
+    int64_t refoff = this->_toff;
+    refoffs.push_back(refoff);
+    size_t eidx = 0;
+    for(size_t i = 0; i < _len; i++, reflen++, refoff++) {
+        while(eidx < _edits->size() && (*_edits)[eidx].pos == i) {
+            const Edit& edit = (*_edits)[eidx];
+            if(edit.isReadGap()) {
+                reflen++;
+                refoff++;
+            } else if(edit.isRefGap()) {
+                reflen--;
+                refoff--;
+            }
+            if(edit.isSpliced()) {
+                assert_gt(reflen, 0);
+                refallen += reflen;
+                reflens.push_back((index_t)reflen);
+                reflen = 0;
+                refoff += edit.splLen; // skip the intron
+                assert_gt(refoff, 0);
+                refoffs.push_back((index_t)refoff);
+            }
+            eidx++;
+        }
+    }
+    assert_gt(reflen, 0);
+    refallen += (index_t)reflen;
+    reflens.push_back(reflen);
+    assert_gt(reflens.size(), 0);
+    assert_gt(refoffs.size(), 0);
+    assert_eq(reflens.size(), refoffs.size());
+    refstr.clear();
+    // Fetch each reference stretch and concatenate into refstr
+    for(index_t i = 0; i < reflens.size(); i++) {
+        assert_gt(reflens[i], 0);
+        if(i > 0) {
+            assert_gt(refoffs[i], refoffs[i-1]);
+        }
+        raw_refbuf.resize(reflens[i] + 16);
+        raw_refbuf.clear();
+        int off = ref.getStretch(
+                                 reinterpret_cast<uint32_t*>(raw_refbuf.wbuf()),
+                                 (size_t)this->_tidx,
+                                 (size_t)max<TRefOff>(refoffs[i], 0),
+                                 reflens[i],
+                                 destU32);
+        assert_leq(off, 16);
+        for(index_t j = 0; j < reflens[i]; j++) {
+            char rfc = *(raw_refbuf.buf()+off+j);
+            refstr.append(rfc);
+        }
+    }
+    if(refstr != editstr) {
+        cerr << "Decoded nucleotides and edits don't match reference:" << endl;
+        //cerr << "           score: " << score.score()
+        //<< " (" << gaps << " gaps)" << endl;
+        cerr << "           edits: ";
+        Edit::print(cerr, *_edits);
+        cerr << endl;
+        cerr << "    decoded nucs: " << partialseq << endl;
+        cerr << "     edited nucs: " << editstr << endl;
+        cerr << "  reference nucs: " << refstr << endl;
+        assert(0);
+    }
+
+    return true;
+}
+#endif
+
+
+/**
+ * Encapsulates counters that measure how much work has been done by
+ * hierarchical indexing.
+ */
+struct HIMetrics {
+    
+	HIMetrics() : mutex_m() {
+	    reset();
+	}
+    
+	void reset() {
+		anchoratts = 0;
+        localatts = 0;
+        localindexatts = 0;
+        localextatts = 0;
+        localsearchrecur = 0;
+        globalgenomecoords = 0;
+        localgenomecoords = 0;
+	}
+	
+	void init(
+              uint64_t localatts_,
+              uint64_t anchoratts_,
+              uint64_t localindexatts_,
+              uint64_t localextatts_,
+              uint64_t localsearchrecur_,
+              uint64_t globalgenomecoords_,
+              uint64_t localgenomecoords_)
+	{
+        // NOTE(review): unsynchronized, unlike merge() — callers must own this object
+        localatts = localatts_;
+        anchoratts = anchoratts_;
+        localindexatts = localindexatts_;
+        localextatts = localextatts_;
+        localsearchrecur = localsearchrecur_;
+        globalgenomecoords = globalgenomecoords_;
+        localgenomecoords = localgenomecoords_;
+    }
+	
+	/**
+	 * Merge (add) the counters in the given HIMetrics object into this
+	 * object.  This is the only safe way to update a HIMetrics shared
+	 * by multiple threads.
+	 */
+	void merge(const HIMetrics& r, bool getLock = false) {
+        ThreadSafe ts(&mutex_m, getLock);
+        localatts += r.localatts;
+        anchoratts += r.anchoratts;
+        localindexatts += r.localindexatts;
+        localextatts += r.localextatts;
+        localsearchrecur += r.localsearchrecur;
+        globalgenomecoords += r.globalgenomecoords;
+        localgenomecoords += r.localgenomecoords;
+    }
+	   
+    uint64_t localatts;      // # attempts of local search
+    uint64_t anchoratts;     // # attempts of anchor search
+    uint64_t localindexatts; // # attempts of local index search
+    uint64_t localextatts;   // # attempts of extension search
+    uint64_t localsearchrecur;
+    uint64_t globalgenomecoords;
+    uint64_t localgenomecoords;
+	
+	MUTEX_T mutex_m;
+};
+
+/**
+ * With hierarchical indexing, HI_Aligner provides several alignment
+ * strategies that enable effective alignment of RNA-seq reads.
+ */
+template <typename index_t, typename local_index_t>
+class HI_Aligner {
+
+public:
+	
+	/**
+	 * Initialize with index.
+	 */
+	HI_Aligner(
+               const Ebwt<index_t>& ebwt,
+               bool secondary = false,
+               bool local = false,
+               uint64_t threads_rids_mindist = 0,
+               bool no_spliced_alignment = false) :
+    _secondary(secondary),
+    _local(local),
+    _gwstate(GW_CAT),
+    _gwstate_local(GW_CAT),
+    _thread_rids_mindist(threads_rids_mindist),
+    _no_spliced_alignment(no_spliced_alignment)
+    {
+        // _minK = number of base-4 digits of the genome length
+        index_t genomeLen = ebwt.eh().len();
+        _minK = 0;
+        while(genomeLen > 0) {
+            genomeLen >>= 2;
+            _minK++;
+        }
+        _minK_local = 8;
+    }
+    
+    HI_Aligner() {
+    } // NOTE(review): leaves all members uninitialized; do not use a default-constructed aligner
+    
+    /**
+     * Set up per-read state for aligning a single unpaired read (clears prior results). */
+    void initRead(Read *rd, bool nofw, bool norc, TAlScore minsc, TAlScore maxpen, bool rightendonly = false) {
+        assert(rd != NULL);
+        _rds[0] = rd;
+        _rds[1] = NULL;
+		_paired = false;
+        _rightendonly = rightendonly;
+        _nofw[0] = nofw;
+        _nofw[1] = true;
+        _norc[0] = norc;
+        _norc[1] = true;
+        _minsc[0] = minsc;
+        _minsc[1] = OFF_MASK;
+        _maxpen[0] = maxpen;
+        _maxpen[1] = OFF_MASK;
+        for(size_t fwi = 0; fwi < 2; fwi++) {
+            bool fw = (fwi == 0);
+            _hits[0][fwi].init(fw, _rds[0]->length());
+        }
+        _genomeHits.clear();
+        _concordantPairs.clear();
+        _hits_searched[0].clear();
+        assert(!_paired);
+    }
+    
+    /**
+     * Set up per-read state for aligning a paired-end read pair (clears prior results). */
+    void initReads(Read *rds[2], bool nofw[2], bool norc[2], TAlScore minsc[2], TAlScore maxpen[2]) {
+        assert(rds[0] != NULL && rds[1] != NULL);
+		_paired = true;
+        _rightendonly = false;
+        for(size_t rdi = 0; rdi < 2; rdi++) {
+            _rds[rdi] = rds[rdi];
+            _nofw[rdi] = nofw[rdi];
+            _norc[rdi] = norc[rdi];
+            _minsc[rdi] = minsc[rdi];
+            _maxpen[rdi] = maxpen[rdi];
+            for(size_t fwi = 0; fwi < 2; fwi++) {
+                bool fw = (fwi == 0);
+		        _hits[rdi][fwi].init(fw, _rds[rdi]->length());
+            }
+            _hits_searched[rdi].clear();
+        }
+        _genomeHits.clear();
+        _concordantPairs.clear();
+        assert(_paired);
+        assert(!_rightendonly);
+    }
+    
+    /**
+     * Aligns a read or a pair
+     * This function is called per read or pair
+     */
+    virtual
+    int go(
+           const Scoring&           sc,
+           const Ebwt<index_t>&     ebwtFw,
+           const Ebwt<index_t>&     ebwtBw,
+           const BitPairReference&  ref,
+           WalkMetrics&             wlm,
+           PerReadMetrics&          prm,
+           HIMetrics&               him,
+		   SpeciesMetrics&          spm,
+           RandomSource&            rnd,
+           AlnSinkWrap<index_t>&    sink) = 0;
+    
+   	/**
+     * Align a part of a read without any edits
+	 */
+    size_t partialSearch(
+                         const Ebwt<index_t>&    ebwt,    // BWT index
+                         const Read&             read,    // read to align
+                         const Scoring&          sc,      // scoring scheme
+                         bool                    fw,      // orientation to search (true = forward)
+                         size_t                  mineMax, // don't care about edit bounds > this
+                         size_t&                 mineFw,  // minimum # edits for forward read
+                         size_t&                 mineRc,  // minimum # edits for revcomp read
+                         ReadBWTHit<index_t>&    hit,     // holds all the seed hits (and exact hit)
+                         RandomSource&           rnd);
+    
+protected:
+  
+    Read *   _rds[2];
+    bool     _paired;
+    bool     _rightendonly;
+    bool     _nofw[2];
+    bool     _norc[2];
+    TAlScore _minsc[2];
+    TAlScore _maxpen[2];
+    
+    bool     _secondary;  // allow secondary alignments
+    bool     _local;      // perform local alignments
+    
+    ReadBWTHit<index_t> _hits[2][2];
+    
+    EList<index_t, 16>                                 _offs;
+    SARangeWithOffs<EListSlice<index_t, 16> >          _sas;
+    GroupWalk2S<index_t, EListSlice<index_t, 16>, 16>  _gws;
+    GroupWalkState<index_t>                            _gwstate;
+    
+    EList<local_index_t, 16>                                       _offs_local;
+    SARangeWithOffs<EListSlice<local_index_t, 16> >                _sas_local;
+    GroupWalk2S<local_index_t, EListSlice<local_index_t, 16>, 16>  _gws_local;
+    GroupWalkState<local_index_t>                                  _gwstate_local;
+            
+    // temporary and shared variables used for GenomeHit
+    // this should be defined before _genomeHits and _hits_searched
+    SharedTempVars<index_t> _sharedVars;
+    
+    // temporary and shared variables for AlnRes
+    LinkedEList<EList<Edit> > _rawEdits;
+    
+    // temporary
+    EList<GenomeHit<index_t> >     _genomeHits;
+    EList<bool>                    _genomeHits_done;
+    ELList<Coord>                  _coords;
+    
+    EList<pair<index_t, index_t> >  _concordantPairs;
+    
+    size_t _minK; // log4 of the size of a genome
+    size_t _minK_local; // log4 of the size of a local index (8)
+
+    ELList<GenomeHit<index_t> >     _local_genomeHits;
+    EList<uint8_t>                  _anchors_added;
+    uint64_t max_localindexatts;
+    
+	uint64_t bwops_;                    // Burrows-Wheeler operations
+	uint64_t bwedits_;                  // Burrows-Wheeler edits
+    
+    //
+    EList<GenomeHit<index_t> >     _hits_searched[2];
+    
+    uint64_t   _thread_rids_mindist;
+    bool _no_spliced_alignment;
+
+    // For AlnRes::matchesRef
+	ASSERT_ONLY(EList<bool> raw_matches_);
+	ASSERT_ONLY(BTDnaString tmp_rf_);
+	ASSERT_ONLY(BTDnaString tmp_rdseq_);
+	ASSERT_ONLY(BTString tmp_qseq_);
+};
+
+// Initialize SideLocus objects tloc/bloc for BW range [top, bot) in index e.
+// A single-row range needs only tloc; bloc is invalidated in that case.
+#define HIER_INIT_LOCS(top, bot, tloc, bloc, e) { \
+	if(bot - top == 1) { \
+		tloc.initFromRow(top, (e).eh(), (e).ebwt()); \
+		bloc.invalidate(); \
+	} else { \
+		SideLocus<index_t>::initFromTopBot(top, bot, (e).eh(), (e).ebwt(), tloc, bloc); \
+		assert(bloc.valid()); \
+	} \
+}
+
+// Assert (debug builds only) that the 4-way range tuples t/b and tp/bp
+// cover the same total number of BW rows.
+#define HIER_SANITY_CHECK_4TUP(t, b, tp, bp) { \
+	ASSERT_ONLY(cur_index_t tot = (b[0]-t[0])+(b[1]-t[1])+(b[2]-t[2])+(b[3]-t[3])); \
+	ASSERT_ONLY(cur_index_t totp = (bp[0]-tp[0])+(bp[1]-tp[1])+(bp[2]-tp[2])+(bp[3]-tp[3])); \
+	assert_eq(tot, totp); \
+}
+
+/**
+ * Extend an exact match of the read starting at hit._cur, walking the BWT
+ * one base at a time (the read is consumed from its right end in BW space).
+ * Records the resulting SA range as a new partial hit on 'hit' and returns
+ * the number of BW elements in that range (0 if the search dead-ends or an
+ * ambiguous base is encountered).
+ */
+template <typename index_t, typename local_index_t>
+size_t HI_Aligner<index_t, local_index_t>::partialSearch(
+                                                         const Ebwt<index_t>&      ebwt,    // BWT index
+                                                         const Read&               read,    // read to align
+                                                         const Scoring&            sc,      // scoring scheme (unused here)
+                                                         bool                      fw,      // orientation to search
+                                                         size_t                    mineMax, // don't care about edit bounds > this (unused here)
+                                                         size_t&                   mineFw,  // minimum # edits for forward read (unused here)
+                                                         size_t&                   mineRc,  // minimum # edits for revcomp read (unused here)
+                                                         ReadBWTHit<index_t>&      hit,     // holds all the seed hits (and exact hit)
+                                                         RandomSource&             rnd)     // pseudo-random source (unused here)
+
+{
+	const index_t ftabLen = ebwt.eh().ftabChars();
+	SideLocus<index_t> tloc, bloc;
+	const index_t len = (index_t)read.length();
+    const BTDnaString& seq = fw ? read.patFw : read.patRc;
+    assert(!seq.empty());
+    
+    size_t nelt = 0;
+    EList<BWTHit<index_t> >& partialHits = hit._partialHits;
+    index_t& cur = hit._cur;
+    assert_lt(cur, hit._len);
+    
+    hit._numPartialSearch++;
+    
+    index_t offset = cur;
+    index_t dep = offset;
+    index_t top = 0, bot = 0;
+    index_t topTemp = 0, botTemp = 0;
+    index_t left = len - dep;
+    assert_gt(left, 0);
+    // Remainder shorter than the ftab lookup: record an empty hit and finish
+    if(left < ftabLen) {
+        cur = hit._len;
+        partialHits.expand();
+        partialHits.back().init((index_t)OFF_MASK,
+                                (index_t)OFF_MASK,
+                                fw,
+                                (uint32_t)offset,
+                                (uint32_t)(cur - offset));
+        hit.done(true);
+		return 0;
+    }
+    // Does N interfere with use of Ftab?
+    for(index_t i = 0; i < ftabLen; i++) {
+        int c = seq[len-dep-1-i];
+        if(c > 3) {
+            // Ambiguous base: skip past it with an empty hit
+            cur += (i+1);
+            partialHits.expand();
+            partialHits.back().init((index_t)OFF_MASK,
+                                    (index_t)OFF_MASK,
+                                    fw,
+                                    (uint32_t)offset,
+                                    (uint32_t)(cur - offset));
+            if(cur >= hit._len) {
+                hit.done(true);
+            }
+			return 0;
+        }
+    }
+    
+    // Use ftab
+    ebwt.ftabLoHi(seq, len - dep - ftabLen, false, top, bot);
+    dep += ftabLen;
+    if(bot <= top) {
+        // ftab lookup found nothing: record an empty hit over the ftab chars
+        cur = dep;
+        partialHits.expand();
+        partialHits.back().init((index_t)OFF_MASK,
+                                (index_t)OFF_MASK,
+                                fw,
+                                (uint32_t)offset,
+                                (uint32_t)(cur - offset));
+        if(cur >= hit._len) {
+            hit.done(true);
+        }
+        return 0;
+    }
+    HIER_INIT_LOCS(top, bot, tloc, bloc, ebwt);
+    // Keep going
+    while(dep < len) {
+        int c = seq[len-dep-1];
+        if(c > 3) {
+            topTemp = botTemp = 0;
+        } else {
+            if(bloc.valid()) {
+                bwops_ += 2;
+                topTemp = ebwt.mapLF(tloc, c);
+                botTemp = ebwt.mapLF(bloc, c);
+            } else {
+                // range has a single row; bloc is invalid
+                bwops_++;
+                topTemp = ebwt.mapLF1(top, tloc, c);
+                if(topTemp == (index_t)OFF_MASK) {
+                    topTemp = botTemp = 0;
+                } else {
+                    botTemp = topTemp + 1;
+                }
+            }
+        }
+        if(botTemp <= topTemp) {
+            break;
+        }
+        top = topTemp;
+        bot = botTemp;
+        dep++;
+        HIER_INIT_LOCS(top, bot, tloc, bloc, ebwt);
+    }
+    
+    // Done
+    if(bot > top) {
+        // This is an exact hit
+        assert_gt(dep, offset);
+        assert_leq(dep, len);
+        partialHits.expand();
+        index_t hit_type = CANDIDATE_HIT;
+        partialHits.back().init(top,
+                                bot,
+                                fw,
+                                (uint32_t)offset,
+                                (uint32_t)(dep - offset),
+                                hit_type);
+        
+        nelt += (bot - top);
+        cur = dep;
+        if(cur >= hit._len) {
+            if(hit_type == CANDIDATE_HIT) hit._numUniqueSearch++; // always true here
+            hit.done(true);
+        }
+    }
+    return nelt;
+}
+
+#endif /*HI_ALIGNER_H_*/
diff --git a/hier_idx.h b/hier_idx.h
new file mode 100644
index 0000000..ac2b7e2
--- /dev/null
+++ b/hier_idx.h
@@ -0,0 +1,1877 @@
+/*
+ * Copyright 2013, Daehwan Kim <infphilo at gmail.com>
+ *
+ * This file is part of Beast.  Beast is based on Bowtie 2.
+ *
+ * Beast is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Beast is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Beast.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef HIEREBWT_H_
+#define HIEREBWT_H_
+
+#include "hier_idx_common.h"
+#include "bt2_idx.h"
+#include "bt2_io.h"
+#include "bt2_util.h"
+
+/**
+ * Extended Burrows-Wheeler transform data.
+ * LocalEbwt is a specialized Ebwt index that represents ~64K bps
+ * and therefore uses two bytes as offsets within 64K bps.
+ * This class has only two additional member variables to denote the genomic sequence it represents:
+ * (1) the contig index and (2) the offset within the contig.
+ *
+ */
+template <typename index_t = uint16_t, typename full_index_t = uint32_t>
+class LocalEbwt : public Ebwt<index_t> {
+	typedef Ebwt<index_t> PARENT_CLASS;
+public:
+	/// Construct an Ebwt from the given input file
+	// Reading constructor: loads one local Ebwt from the shared primary (*.5)
+	// and secondary (*.6) streams, filling in tidx/localOffset from the
+	// per-local-index header and advancing bytesRead accordingly.
+	LocalEbwt(const string& in,
+			  FILE *in5,
+			  FILE *in6,
+			  char *mmFile5,
+			  char *mmFile6,
+			  full_index_t& tidx,
+			  full_index_t& localOffset,
+			  bool switchEndian,
+			  size_t& bytesRead,
+			  int color,
+			  int needEntireReverse,
+			  bool fw,
+			  int32_t overrideOffRate, // = -1,
+			  int32_t offRatePlus, // = -1,
+			  uint32_t lineRate,
+			  uint32_t offRate,
+			  uint32_t ftabChars,
+			  bool useMm, // = false,
+			  bool useShmem, // = false,
+			  bool mmSweep, // = false,
+			  bool loadNames, // = false,
+			  bool loadSASamp, // = true,
+			  bool loadFtab, // = true,
+			  bool loadRstarts, // = true,
+			  bool verbose, // = false,
+			  bool startVerbose, // = false,
+			  bool passMemExc, // = false,
+			  bool sanityCheck) : // = false) :
+	Ebwt<index_t>(in,
+				  color,
+				  needEntireReverse,
+				  fw,
+				  overrideOffRate,
+				  offRatePlus,
+				  useMm,
+				  useShmem,
+				  mmSweep,
+				  loadNames,
+				  loadSASamp,
+				  loadFtab,
+				  loadRstarts,
+				  verbose,
+				  startVerbose,
+				  passMemExc,
+				  sanityCheck,
+				  true)
+	{
+		// NOTE(review): _in2Str is assigned the same ".5" filename as _in1Str
+		// even though the secondary stream (in6) is the ".6" file; presumably
+		// ".6." was intended for _in2Str -- confirm against upstream before
+		// changing (these strings are also used as shared-memory keys below).
+		this->_in1Str = in + ".5." + gEbwt_ext;
+		this->_in2Str = in + ".5." + gEbwt_ext;
+		// Read the header and index body for this local Ebwt; tidx and
+		// localOffset come back from the per-local-index header.
+		readIntoMemory(
+					   in5,
+					   in6,
+					   mmFile5,
+					   mmFile6,
+					   tidx,
+					   localOffset,
+					   switchEndian,
+					   bytesRead,
+					   color,
+					   needEntireReverse,
+					   loadSASamp,
+					   loadFtab,
+					   loadRstarts,
+					   false,              //justHeader
+					   lineRate,
+					   offRate,
+					   ftabChars,
+					   mmSweep,
+					   loadNames,
+					   startVerbose);
+		
+		_tidx = tidx;
+		_localOffset = localOffset;
+		
+		// If the offRate has been overridden, reflect that in the
+		// _eh._offRate field
+		if(offRatePlus > 0 && this->_overrideOffRate == -1) {
+			this->_overrideOffRate = this->_eh._offRate + offRatePlus;
+		}
+		if(this->_overrideOffRate > this->_eh._offRate) {
+			this->_eh.setOffRate(this->_overrideOffRate);
+			assert_eq(this->_overrideOffRate, this->_eh._offRate);
+		}
+		assert(this->repOk());
+	}
+
+
+	/// Construct an Ebwt from the given header parameters and string
+	/// vector, optionally using a blockwise suffix sorter with the
+	/// given 'bmax' and 'dcv' parameters.  The string vector is
+	/// ultimately joined and the joined string is passed to buildToDisk().
+	template<typename TStr>
+	LocalEbwt(
+			  TStr& s,
+			  full_index_t tidx,
+			  full_index_t local_offset,
+			  index_t local_size,
+			  bool packed,
+			  int color,
+			  int needEntireReverse,
+			  int32_t lineRate,
+			  int32_t offRate,
+			  int32_t ftabChars,
+			  const string& file,   // base filename for EBWT files
+			  bool fw,
+			  int dcv,
+			  EList<RefRecord>& szs,
+			  index_t sztot,
+			  const RefReadInParams& refparams,
+			  uint32_t seed,
+			  ostream& out5,
+			  ostream& out6,
+			  int32_t overrideOffRate = -1,
+			  bool verbose = false,
+			  bool passMemExc = false,
+			  bool sanityCheck = false) :
+	Ebwt<index_t>(packed,
+				  color,
+				  needEntireReverse,
+				  lineRate,
+				  offRate,
+				  ftabChars,
+				  file,
+				  fw,
+				  dcv,
+				  szs,
+				  sztot,
+				  refparams,
+				  seed,
+				  overrideOffRate,
+				  verbose,
+				  passMemExc,
+				  sanityCheck)
+	{
+		const EbwtParams<index_t>& eh = this->_eh;
+		assert(eh.repOk());
+		uint32_t be = this->toBe();
+		assert(out5.good());
+		assert(out6.good());
+		// Per-local-index header: owning reference index (tidx), offset of
+		// this local region within that reference, and the local length.
+		writeIndex<full_index_t>(out5, tidx, be);
+		writeIndex<full_index_t>(out5, local_offset, be);
+		writeU32(out5, eh._len,      be); // length of string (and bwt and suffix array)
+		if(eh._len > 0) {
+			assert_gt(szs.size(), 0);
+			assert_gt(sztot, 0);
+			// Not every fragment represents a distinct sequence - many
+			// fragments may correspond to a single sequence.  Count the
+			// number of sequences here by counting the number of "first"
+			// fragments.
+			this->_nPat = 0;
+			this->_nFrag = 0;
+			for(size_t i = 0; i < szs.size(); i++) {
+				if(szs[i].len > 0) this->_nFrag++;
+				if(szs[i].first && szs[i].len > 0) this->_nPat++;
+			}
+			// A local index always covers exactly one sequence.
+			assert_eq(this->_nPat, 1);
+			assert_geq(this->_nFrag, this->_nPat);
+			this->_rstarts.reset();
+			writeIndex(out5, this->_nPat, be);
+			assert_eq(this->_nPat, 1);
+			this->_plen.init(new index_t[this->_nPat], this->_nPat);
+			// For each pattern, set plen
+			int npat = -1;
+			for(size_t i = 0; i < szs.size(); i++) {
+				if(szs[i].first && szs[i].len > 0) {
+					if(npat >= 0) {
+						writeIndex(out5, this->plen()[npat], be);
+					}
+					npat++;
+					this->plen()[npat] = (szs[i].len + szs[i].off);
+				} else {
+					this->plen()[npat] += (szs[i].len + szs[i].off);
+				}
+			}
+			assert_eq((index_t)npat, this->_nPat-1);
+			writeIndex(out5, this->plen()[npat], be);
+			// Write the number of fragments
+			writeIndex(out5, this->_nFrag, be);
+			
+			if(refparams.reverse == REF_READ_REVERSE) {
+				EList<RefRecord> tmp(EBWT_CAT);
+                reverseRefRecords(szs, tmp, false, verbose);
+				this->szsToDisk(tmp, out5, refparams.reverse);
+			} else {
+				this->szsToDisk(szs, out5, refparams.reverse);
+			}
+			
+			VMSG_NL("Constructing suffix-array element generator");
+			KarkkainenBlockwiseSA<TStr> bsa(s, s.length()+1, dcv, seed, this->_sanity, this->_passMemExc, this->_verbose);
+			assert(bsa.suffixItrIsReset());
+			assert_eq(bsa.size(), s.length()+1);
+			VMSG_NL("Converting suffix-array elements to index image");
+			buildToDisk(bsa, s, out5, out6);
+		}
+		
+		out5.flush(); out6.flush();
+		if(out5.fail() || out6.fail()) {
+			cerr << "An error occurred writing the index to disk.  Please check if the disk is full." << endl;
+			throw 1;
+		}
+	}
+	
+	// Stream the Ebwt image (bwt, zOff, fchr, ftab, eftab to out1; SA sample
+	// to out2) directly to disk; see definition below.
+	template <typename TStr> void buildToDisk(
+											  InorderBlockwiseSA<TStr>& sa,
+											  const TStr& s,
+											  ostream& out1, 
+											  ostream& out2);
+	
+	// I/O
+	void readIntoMemory(
+						FILE *in5,
+						FILE *in6,
+						char *mmFile5,
+						char *mmFile6,
+						full_index_t& tidx,
+						full_index_t& localOffset,
+						bool switchEndian,
+						size_t bytesRead,
+						int color,
+						int needEntireRev, 
+						bool loadSASamp, 
+						bool loadFtab,
+						bool loadRstarts, 
+						bool justHeader, 
+						int32_t lineRate,
+						int32_t offRate,
+						int32_t ftabChars,
+						bool mmSweep, 
+						bool loadNames, 
+						bool startVerbose);
+	
+	/**
+	 * Sanity-check various pieces of the Ebwt
+	 */
+	void sanityCheckAll(int reverse) const {
+		if(this->_eh._len > 0) {
+			PARENT_CLASS::sanityCheckAll(reverse);
+		}
+	}
+    
+    // True iff this local index covers no sequence (zero-length region).
+    bool empty() const { return this->_eh._len == 0; }
+	
+public:
+	// Index of the reference sequence (contig) this local Ebwt represents.
+	full_index_t _tidx;
+	// Offset within that reference where the local region begins.
+	full_index_t _localOffset;
+};
+
+/**
+ * Build an Ebwt from a string 's' and its suffix array 'sa' (which
+ * might actually be a suffix array *builder* that builds blocks of the
+ * array on demand).  The bulk of the Ebwt, i.e. the ebwt and offs
+ * arrays, is written directly to disk.  This is by design: keeping
+ * those arrays in memory needlessly increases the footprint of the
+ * building process.  Instead, we prefer to build the Ebwt directly
+ * "to disk" and then read it back into memory later as necessary.
+ *
+ * It is assumed that the header values and join-related values (nPat,
+ * plen) have already been written to 'out1' before this function
+ * is called.  When this function is finished, it will have
+ * additionally written ebwt, zOff, fchr, ftab and eftab to the primary
+ * file and offs to the secondary file.
+ *
+ * Assume DNA/RNA/any alphabet with 4 or fewer elements.
+ * Assume occ array entries are 32 bits each.
+ *
+ * @param sa            the suffix array to convert to a Ebwt
+ * @param s             the original string
+ * @param out
+ */
+template <typename index_t, typename full_index_t>
+template <typename TStr>
+void LocalEbwt<index_t, full_index_t>::buildToDisk(
+									 InorderBlockwiseSA<TStr>& sa,
+									 const TStr& s,
+									 ostream& out5,
+									 ostream& out6)
+{
+	// out5 receives the primary image (ebwt sides, zOff, fchr, ftab, eftab);
+	// out6 receives only the sampled suffix-array offsets.
+	assert_leq(s.length(), std::numeric_limits<index_t>::max());
+	const EbwtParams<index_t>& eh = this->_eh;
+	
+	assert(eh.repOk());
+	assert_eq(s.length()+1, sa.size());
+	assert_eq(s.length(), eh._len);
+	assert_gt(eh._lineRate, 3);
+	assert(sa.suffixItrIsReset());
+	
+	index_t len = eh._len;
+	index_t ftabLen = eh._ftabLen;
+	index_t sideSz = eh._sideSz;
+	index_t ebwtTotSz = eh._ebwtTotSz;
+	index_t fchr[] = {0, 0, 0, 0, 0};
+	EList<index_t> ftab(EBWT_CAT);
+	index_t zOff = (index_t)OFF_MASK;
+	
+	// Save # of occurrences of each character as we walk along the bwt
+	index_t occ[4] = {0, 0, 0, 0};
+	index_t occSave[4] = {0, 0, 0, 0};
+	
+	// Record rows that should "absorb" adjacent rows in the ftab.
+	// The absorbed rows represent suffixes shorter than the ftabChars
+	// cutoff.
+	uint8_t absorbCnt = 0;
+	EList<uint8_t> absorbFtab(EBWT_CAT);
+	try {
+		VMSG_NL("Allocating ftab, absorbFtab");
+		ftab.resize(ftabLen);
+		ftab.fillZero();
+		absorbFtab.resize(ftabLen);
+		absorbFtab.fillZero();
+	} catch(bad_alloc &e) {
+		cerr << "Out of memory allocating ftab[] or absorbFtab[] "
+		<< "in Ebwt::buildToDisk() at " << __FILE__ << ":"
+		<< __LINE__ << endl;
+		throw e;
+	}
+	
+	// Allocate the side buffer; holds a single side as its being
+	// constructed and then written to disk.  Reused across all sides.
+#ifdef SIXTY4_FORMAT
+	EList<uint64_t> ebwtSide(EBWT_CAT);
+#else
+	EList<uint8_t> ebwtSide(EBWT_CAT);
+#endif
+	try {
+#ifdef SIXTY4_FORMAT
+		ebwtSide.resize(sideSz >> 3);
+#else
+		ebwtSide.resize(sideSz);
+#endif
+	} catch(bad_alloc &e) {
+		cerr << "Out of memory allocating ebwtSide[] in "
+		<< "Ebwt::buildToDisk() at " << __FILE__ << ":"
+		<< __LINE__ << endl;
+		throw e;
+	}
+	
+	// Points to the base offset within ebwt for the side currently
+	// being written
+	index_t side = 0;
+	
+	// Whether we're assembling a forward or a reverse bucket
+	bool fw;
+	int sideCur = 0;
+	fw = true;
+	
+	// Have we skipped the '$' in the last column yet?
+	ASSERT_ONLY(bool dollarSkipped = false);
+
+	index_t si = 0;   // string offset (chars)
+	ASSERT_ONLY(uint32_t lastSufInt = 0);
+	ASSERT_ONLY(bool inSA = true); // true iff saI still points inside suffix
+	// array (as opposed to the padding at the
+	// end)
+	// Iterate over packed bwt bytes
+	VMSG_NL("Entering Ebwt loop");
+	ASSERT_ONLY(uint32_t beforeEbwtOff = (uint32_t)out5.tellp());
+	while(side < ebwtTotSz) {
+		// Sanity-check our cursor into the side buffer
+		assert_geq(sideCur, 0);
+		assert_lt(sideCur, (int)eh._sideBwtSz);
+		assert_eq(0, side % sideSz); // 'side' must be on side boundary
+		ebwtSide[sideCur] = 0; // clear
+		assert_lt(side + sideCur, ebwtTotSz);
+		// Iterate over bit-pairs in the si'th character of the BWT
+#ifdef SIXTY4_FORMAT
+		for(int bpi = 0; bpi < 32; bpi++, si++) {
+#else
+		for(int bpi = 0; bpi < 4; bpi++, si++) {
+#endif
+			int bwtChar;
+			bool count = true;
+			if(si <= len) {
+				// Still in the SA; extract the bwtChar
+				index_t saElt = (index_t)sa.nextSuffix();
+				// (that might have triggered sa to calc next suf block)
+				if(saElt == 0) {
+					// Don't add the '$' in the last column to the BWT
+					// transform; we can't encode a $ (only A C T or G)
+					// and counting it as, say, an A, will mess up the
+					// LR mapping
+					bwtChar = 0; count = false;
+					ASSERT_ONLY(dollarSkipped = true);
+					zOff = si; // remember the SA row that
+					// corresponds to the 0th suffix
+				} else {
+					bwtChar = (int)(s[saElt-1]);
+					assert_lt(bwtChar, 4);
+					// Update the fchr
+					fchr[bwtChar]++;
+				}
+				// Update ftab
+				if((len-saElt) >= (index_t)eh._ftabChars) {
+					// Turn the first ftabChars characters of the
+					// suffix into an integer index into ftab.  The
+					// leftmost (lowest index) character of the suffix
+					// goes in the most significant bit pair of the
+					// integer.
+					uint32_t sufInt = 0;
+					for(int i = 0; i < eh._ftabChars; i++) {
+						sufInt <<= 2;
+						assert_lt((index_t)i, len-saElt);
+						sufInt |= (unsigned char)(s[saElt+i]);
+					}
+					// Assert that this prefix-of-suffix is greater
+					// than or equal to the last one (true b/c the
+					// suffix array is sorted)
+#ifndef NDEBUG
+					if(lastSufInt > 0) assert_geq(sufInt, lastSufInt);
+					lastSufInt = sufInt;
+#endif
+					// Update ftab
+					assert_lt(sufInt+1, ftabLen);
+					ftab[sufInt+1]++;
+					if(absorbCnt > 0) {
+						// Absorb all short suffixes since the last
+						// transition into this transition
+						absorbFtab[sufInt] = absorbCnt;
+						absorbCnt = 0;
+					}
+				} else {
+					// Otherwise if suffix is fewer than ftabChars
+					// characters long, then add it to the 'absorbCnt';
+					// it will be absorbed into the next transition
+					assert_lt(absorbCnt, 255);
+					absorbCnt++;
+				}
+				// Suffix array offset boundary? - update offset array
+				if((si & eh._offMask) == si) {
+					assert_lt((si >> eh._offRate), eh._offsLen);
+					// Write offsets directly to the secondary output
+					// stream, thereby avoiding keeping them in memory
+					writeIndex(out6, saElt, this->toBe());
+				}
+			} else {
+				// Strayed off the end of the SA, now we're just
+				// padding out a bucket
+#ifndef NDEBUG
+				if(inSA) {
+					// Assert that we wrote all the characters in the
+					// string before now
+					assert_eq(si, len+1);
+					inSA = false;
+				}
+#endif
+				// 'A' used for padding; important that padding be
+				// counted in the occ[] array
+				bwtChar = 0;
+			}
+			if(count) occ[bwtChar]++;
+			// Append BWT char to bwt section of current side
+			if(fw) {
+				// Forward bucket: fill from least to most
+#ifdef SIXTY4_FORMAT
+				ebwtSide[sideCur] |= ((uint64_t)bwtChar << (bpi << 1));
+				if(bwtChar > 0) assert_gt(ebwtSide[sideCur], 0);
+#else
+				pack_2b_in_8b(bwtChar, ebwtSide[sideCur], bpi);
+				assert_eq((ebwtSide[sideCur] >> (bpi*2)) & 3, bwtChar);
+#endif
+			} else {
+				// Backward bucket: fill from most to least
+#ifdef SIXTY4_FORMAT
+				ebwtSide[sideCur] |= ((uint64_t)bwtChar << ((31 - bpi) << 1));
+				if(bwtChar > 0) assert_gt(ebwtSide[sideCur], 0);
+#else
+				pack_2b_in_8b(bwtChar, ebwtSide[sideCur], 3-bpi);
+				assert_eq((ebwtSide[sideCur] >> ((3-bpi)*2)) & 3, bwtChar);
+#endif
+			}
+		} // end loop over bit-pairs
+		assert_eq(dollarSkipped ? 3 : 0, (occ[0] + occ[1] + occ[2] + occ[3]) & 3);
+#ifdef SIXTY4_FORMAT
+		assert_eq(0, si & 31);
+#else
+		assert_eq(0, si & 3);
+#endif
+		
+		sideCur++;
+		if(sideCur == (int)eh._sideBwtSz) {
+			// Side is full: append the running A/C/G/T tallies (occSave,
+			// i.e. counts up to but excluding this side) and flush it.
+			sideCur = 0;
+			index_t *uside = reinterpret_cast<index_t*>(ebwtSide.ptr());
+			// Write 'A', 'C', 'G' and 'T' tallies
+			side += sideSz;
+			assert_leq(side, eh._ebwtTotSz);
+			uside[(sideSz / sizeof(index_t))-4] = endianizeIndex(occSave[0], this->toBe());
+			uside[(sideSz / sizeof(index_t))-3] = endianizeIndex(occSave[1], this->toBe());
+			uside[(sideSz / sizeof(index_t))-2] = endianizeIndex(occSave[2], this->toBe());
+			uside[(sideSz / sizeof(index_t))-1] = endianizeIndex(occSave[3], this->toBe());
+			occSave[0] = occ[0];
+			occSave[1] = occ[1];
+			occSave[2] = occ[2];
+			occSave[3] = occ[3];
+			// Write backward side to primary file
+			out5.write((const char *)ebwtSide.ptr(), sideSz);
+		}
+	}
+	VMSG_NL("Exited Ebwt loop");
+	assert_neq(zOff, (index_t)OFF_MASK);
+	if(absorbCnt > 0) {
+		// Absorb any trailing, as-yet-unabsorbed short suffixes into
+		// the last element of ftab
+		absorbFtab[ftabLen-1] = absorbCnt;
+	}
+	// Assert that our loop counter got incremented right to the end
+	assert_eq(side, eh._ebwtTotSz);
+	// Assert that we wrote the expected amount to out1
+	assert_eq(((uint32_t)out5.tellp() - beforeEbwtOff), eh._ebwtTotSz);
+	// assert that the last thing we did was write a forward bucket
+	
+	//
+	// Write zOff to primary stream
+	//
+	writeIndex(out5, zOff, this->toBe());
+	
+	//
+	// Finish building fchr
+	//
+	// Exclusive prefix sum on fchr
+	for(int i = 1; i < 4; i++) {
+		fchr[i] += fchr[i-1];
+	}
+	assert_eq(fchr[3], len);
+	// Shift everybody up by one
+	for(int i = 4; i >= 1; i--) {
+		fchr[i] = fchr[i-1];
+	}
+	fchr[0] = 0;
+	if(this->_verbose) {
+		for(int i = 0; i < 5; i++)
+			cout << "fchr[" << "ACGT$"[i] << "]: " << fchr[i] << endl;
+	}
+	// Write fchr to primary file
+	for(int i = 0; i < 5; i++) {
+		writeIndex(out5, fchr[i], this->toBe());
+	}
+	
+	//
+	// Finish building ftab and build eftab
+	//
+	// Prefix sum on ftable
+	index_t eftabLen = 0;
+	assert_eq(0, absorbFtab[0]);
+	for(index_t i = 1; i < ftabLen; i++) {
+		if(absorbFtab[i] > 0) eftabLen += 2;
+	}
+	assert_leq(eftabLen, (index_t)eh._ftabChars*2);
+	// eftab is written at its fixed maximum size (ftabChars*2) regardless
+	// of how many entries are actually used; unused slots stay zero.
+	eftabLen = eh._ftabChars*2;
+	EList<index_t> eftab(EBWT_CAT);
+	try {
+		eftab.resize(eftabLen);
+		eftab.fillZero();
+	} catch(bad_alloc &e) {
+		cerr << "Out of memory allocating eftab[] "
+		<< "in Ebwt::buildToDisk() at " << __FILE__ << ":"
+		<< __LINE__ << endl;
+		throw e;
+	}
+	index_t eftabCur = 0;
+	for(index_t i = 1; i < ftabLen; i++) {
+		index_t lo = ftab[i] + Ebwt<index_t>::ftabHi(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, i-1);
+		if(absorbFtab[i] > 0) {
+			// Skip the number of short patterns indicated by absorbFtab[i]
+			index_t hi = lo + absorbFtab[i];
+			assert_lt(eftabCur*2+1, eftabLen);
+			eftab[eftabCur*2] = lo;
+			eftab[eftabCur*2+1] = hi;
+			ftab[i] = (eftabCur++) ^ (index_t)OFF_MASK; // insert pointer into eftab
+			assert_eq(lo, Ebwt<index_t>::ftabLo(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, i));
+			assert_eq(hi, Ebwt<index_t>::ftabHi(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, i));
+		} else {
+			ftab[i] = lo;
+		}
+	}
+	assert_eq(Ebwt<index_t>::ftabHi(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, ftabLen-1), len+1);
+	// Write ftab to primary file
+	for(index_t i = 0; i < ftabLen; i++) {
+		writeIndex(out5, ftab[i], this->toBe());
+	}
+	// Write eftab to primary file
+	for(index_t i = 0; i < eftabLen; i++) {
+		writeIndex(out5, eftab[i], this->toBe());
+	}
+	
+	// Note: if you'd like to sanity-check the Ebwt, you'll have to
+	// read it back into memory first!
+	assert(!this->isInMemory());
+	VMSG_NL("Exiting Ebwt::buildToDisk()");
+}
+
+/**
+ * Read an Ebwt from file with given filename.
+ */
+template <typename index_t, typename full_index_t>
+void LocalEbwt<index_t, full_index_t>::readIntoMemory(
+										FILE *in5,
+										FILE *in6,
+										char *mmFile5,
+										char *mmFile6,
+										full_index_t& tidx,
+										full_index_t& localOffset,
+										bool switchEndian,
+										size_t bytesRead,
+										int color,
+										int entireRev,
+										bool loadSASamp,
+										bool loadFtab,
+										bool loadRstarts,
+										bool justHeader,
+										int32_t lineRate,
+										int32_t offRate,
+										int32_t ftabChars,
+										bool mmSweep,
+										bool loadNames,
+										bool startVerbose)
+{
+#ifdef BOWTIE_MM
+	char *mmFile[] = { mmFile5, mmFile6 };
+#endif
+	
+	// Reads header entries one by one from primary stream
+	tidx = readIndex<full_index_t>(in5, switchEndian); bytesRead += sizeof(full_index_t);
+	localOffset = readIndex<full_index_t>(in5, switchEndian); bytesRead += sizeof(full_index_t);
+	uint32_t len = readU32(in5, switchEndian); bytesRead += 4;
+	
+	// Create a new EbwtParams from the entries read from primary stream
+	this->_eh.init(len, lineRate, offRate, ftabChars, color, entireRev);
+	
+	if(len <= 0) {
+		return;
+	}
+	
+	// Set up overridden suffix-array-sample parameters
+	uint32_t offsLen = this->_eh._offsLen;
+	uint32_t offRateDiff = 0;
+	uint32_t offsLenSampled = offsLen;
+	if(this->_overrideOffRate > offRate) {
+		offRateDiff = this->_overrideOffRate - offRate;
+	}
+	if(offRateDiff > 0) {
+		offsLenSampled >>= offRateDiff;
+		if((offsLen & ~((index_t)OFF_MASK << offRateDiff)) != 0) {
+			offsLenSampled++;
+		}
+	}
+	
+	// Can't override the offrate or isarate and use memory-mapped
+	// files; ultimately, all processes need to copy the sparser sample
+	// into their own memory spaces.
+	if(this->_useMm && (offRateDiff)) {
+		cerr << "Error: Can't use memory-mapped files when the offrate is overridden" << endl;
+		throw 1;
+	}
+	
+	// Read nPat from primary stream
+	this->_nPat = readIndex<index_t>(in5, switchEndian);
+	assert_eq(this->_nPat, 1);
+	bytesRead += sizeof(index_t);
+	this->_plen.reset();
+	
+	// Read plen from primary stream
+	if(this->_useMm) {
+#ifdef BOWTIE_MM
+		this->_plen.init((index_t*)(mmFile[0] + bytesRead), this->_nPat, false);
+		bytesRead += this->_nPat*sizeof(index_t);
+		fseek(in5, this->_nPat*sizeof(index_t), SEEK_CUR);
+#endif
+	} else {
+		try {
+			if(this->_verbose || startVerbose) {
+				cerr << "Reading plen (" << this->_nPat << "): ";
+				logTime(cerr);
+			}
+			this->_plen.init(new index_t[this->_nPat], this->_nPat, true);
+			if(switchEndian) {
+				for(index_t i = 0; i < this->_nPat; i++) {
+					this->plen()[i] = readIndex<index_t>(in5, switchEndian);
+				}
+			} else {
+				size_t r = MM_READ(in5, (void*)(this->plen()), this->_nPat*sizeof(index_t));
+				if(r != (size_t)(this->_nPat*sizeof(index_t))) {
+					cerr << "Error reading _plen[] array: " << r << ", " << this->_nPat*sizeof(index_t) << endl;
+					throw 1;
+				}
+			}
+		} catch(bad_alloc& e) {
+			cerr << "Out of memory allocating plen[] in Ebwt::read()"
+			<< " at " << __FILE__ << ":" << __LINE__ << endl;
+			throw e;
+		}
+	}
+
+	bool shmemLeader;
+	
+	// TODO: I'm not consistent on what "header" means.  Here I'm using
+	// "header" to mean everything that would exist in memory if we
+	// started to build the Ebwt but stopped short of the build*() step
+	// (i.e. everything up to and including join()).
+	if(justHeader) return;
+	
+	this->_nFrag = readIndex<index_t>(in5, switchEndian);
+	bytesRead += sizeof(index_t);
+	if(this->_verbose || startVerbose) {
+		cerr << "Reading rstarts (" << this->_nFrag*3 << "): ";
+		logTime(cerr);
+	}
+	assert_geq(this->_nFrag, this->_nPat);
+	this->_rstarts.reset();
+	if(loadRstarts) {
+		if(this->_useMm) {
+#ifdef BOWTIE_MM
+			this->_rstarts.init((index_t*)(mmFile[0] + bytesRead), this->_nFrag*3, false);
+			bytesRead += this->_nFrag*sizeof(index_t)*3;
+			fseek(in5, this->_nFrag*sizeof(index_t)*3, SEEK_CUR);
+#endif
+		} else {
+			this->_rstarts.init(new index_t[this->_nFrag*3], this->_nFrag*3, true);
+			if(switchEndian) {
+				for(index_t i = 0; i < this->_nFrag*3; i += 3) {
+					// fragment starting position in joined reference
+					// string, text id, and fragment offset within text
+					this->rstarts()[i]   = readIndex<index_t>(in5, switchEndian);
+					this->rstarts()[i+1] = readIndex<index_t>(in5, switchEndian);
+					this->rstarts()[i+2] = readIndex<index_t>(in5, switchEndian);
+				}
+			} else {
+				size_t r = MM_READ(in5, (void *)this->rstarts(), this->_nFrag*sizeof(index_t)*3);
+				if(r != (size_t)(this->_nFrag*sizeof(index_t)*3)) {
+					cerr << "Error reading _rstarts[] array: " << r << ", " << (this->_nFrag*sizeof(index_t)*3) << endl;
+					throw 1;
+				}
+			}
+		}
+	} else {
+		// Skip em
+		assert(this->rstarts() == NULL);
+		bytesRead += this->_nFrag*sizeof(index_t)*3;
+		fseek(in5, this->_nFrag*sizeof(index_t)*3, SEEK_CUR);
+	}
+	
+	this->_ebwt.reset();
+	if(this->_useMm) {
+#ifdef BOWTIE_MM
+		this->_ebwt.init((uint8_t*)(mmFile[0] + bytesRead), this->_eh._ebwtTotLen, false);
+		bytesRead += this->_eh._ebwtTotLen;
+		fseek(in5, this->_eh._ebwtTotLen, SEEK_CUR);
+#endif
+	} else {
+		// Allocate ebwt (big allocation)
+		if(this->_verbose || startVerbose) {
+			cerr << "Reading ebwt (" << this->_eh._ebwtTotLen << "): ";
+			logTime(cerr);
+		}
+		bool shmemLeader = true;
+		if(this->useShmem_) {
+			uint8_t *tmp = NULL;
+			shmemLeader = ALLOC_SHARED_U8(
+										  (this->_in1Str + "[ebwt]"), this->_eh._ebwtTotLen, &tmp,
+										  "ebwt[]", (this->_verbose || startVerbose));
+			assert(tmp != NULL);
+			this->_ebwt.init(tmp, this->_eh._ebwtTotLen, false);
+			if(this->_verbose || startVerbose) {
+				cerr << "  shared-mem " << (shmemLeader ? "leader" : "follower") << endl;
+			}
+		} else {
+			try {
+				this->_ebwt.init(new uint8_t[this->_eh._ebwtTotLen], this->_eh._ebwtTotLen, true);
+			} catch(bad_alloc& e) {
+				cerr << "Out of memory allocating the ebwt[] array for the Bowtie index.  Please try" << endl
+				<< "again on a computer with more memory." << endl;
+				throw 1;
+			}
+		}
+		if(shmemLeader) {
+			// Read ebwt from primary stream
+			uint64_t bytesLeft = this->_eh._ebwtTotLen;
+			char *pebwt = (char*)this->ebwt();
+            
+			while (bytesLeft>0){
+				size_t r = MM_READ(in5, (void *)pebwt, bytesLeft);
+				if(MM_IS_IO_ERR(in5, r, bytesLeft)) {
+					cerr << "Error reading _ebwt[] array: " << r << ", "
+                    << bytesLeft << endl;
+					throw 1;
+				}
+				pebwt += r;
+				bytesLeft -= r;
+			}
+			if(switchEndian) {
+				uint8_t *side = this->ebwt();
+				for(size_t i = 0; i < this->_eh._numSides; i++) {
+					index_t *cums = reinterpret_cast<index_t*>(side + this->_eh._sideSz - sizeof(index_t)*2);
+					cums[0] = endianSwapIndex(cums[0]);
+					cums[1] = endianSwapIndex(cums[1]);
+					side += this->_eh._sideSz;
+				}
+			}
+#ifdef BOWTIE_SHARED_MEM
+			if(useShmem_) NOTIFY_SHARED(this->ebwt(), this->_eh._ebwtTotLen);
+#endif
+		} else {
+			// Seek past the data and wait until master is finished
+			fseek(in5, this->_eh._ebwtTotLen, SEEK_CUR);
+#ifdef BOWTIE_SHARED_MEM
+			if(useShmem_) WAIT_SHARED(this->ebwt(), this->_eh._ebwtTotLen);
+#endif
+		}
+	}
+	
+	// Read zOff from primary stream
+	this->_zOff = readIndex<index_t>(in5, switchEndian);
+	bytesRead += sizeof(index_t);
+	assert_lt(this->_zOff, len);
+	
+	try {
+		// Read fchr from primary stream
+		if(this->_verbose || startVerbose) cerr << "Reading fchr (5)" << endl;
+		this->_fchr.reset();
+		if(this->_useMm) {
+#ifdef BOWTIE_MM
+			this->_fchr.init((index_t*)(mmFile[0] + bytesRead), 5, false);
+			bytesRead += 5*sizeof(index_t);
+			fseek(in5, 5*sizeof(index_t), SEEK_CUR);
+#endif
+		} else {
+			this->_fchr.init(new index_t[5], 5, true);
+			for(index_t i = 0; i < 5; i++) {
+				this->fchr()[i] = readIndex<index_t>(in5, switchEndian);
+				assert_leq(this->fchr()[i], len);
+				assert(i <= 0 || this->fchr()[i] >= this->fchr()[i-1]);
+			}
+		}
+		assert_gt(this->fchr()[4], this->fchr()[0]);
+		// Read ftab from primary stream
+		if(this->_verbose || startVerbose) {
+			if(loadFtab) {
+				cerr << "Reading ftab (" << this->_eh._ftabLen << "): ";
+				logTime(cerr);
+			} else {
+				cerr << "Skipping ftab (" << this->_eh._ftabLen << "): ";
+			}
+		}
+		this->_ftab.reset();
+		if(loadFtab) {
+			if(this->_useMm) {
+#ifdef BOWTIE_MM
+				this->_ftab.init((index_t*)(mmFile[0] + bytesRead), this->_eh._ftabLen, false);
+				bytesRead += this->_eh._ftabLen*sizeof(index_t);
+				fseek(in5, this->_eh._ftabLen*sizeof(index_t), SEEK_CUR);
+#endif
+			} else {
+				this->_ftab.init(new index_t[this->_eh._ftabLen], this->_eh._ftabLen, true);
+				if(switchEndian) {
+					for(uint32_t i = 0; i < this->_eh._ftabLen; i++)
+						this->ftab()[i] = readIndex<index_t>(in5, switchEndian);
+				} else {
+					size_t r = MM_READ(in5, (void *)this->ftab(), this->_eh._ftabLen*sizeof(index_t));
+					if(r != (size_t)(this->_eh._ftabLen*sizeof(index_t))) {
+						cerr << "Error reading _ftab[] array: " << r << ", " << (this->_eh._ftabLen*sizeof(index_t)) << endl;
+						throw 1;
+					}
+				}
+			}
+			// Read etab from primary stream
+			if(this->_verbose || startVerbose) {
+				if(loadFtab) {
+					cerr << "Reading eftab (" << this->_eh._eftabLen << "): ";
+					logTime(cerr);
+				} else {
+					cerr << "Skipping eftab (" << this->_eh._eftabLen << "): ";
+				}
+				
+			}
+			this->_eftab.reset();
+			if(this->_useMm) {
+#ifdef BOWTIE_MM
+				this->_eftab.init((index_t*)(mmFile[0] + bytesRead), this->_eh._eftabLen, false);
+				bytesRead += this->_eh._eftabLen*sizeof(index_t);
+				fseek(in5, this->_eh._eftabLen*sizeof(index_t), SEEK_CUR);
+#endif
+			} else {
+				this->_eftab.init(new index_t[this->_eh._eftabLen], this->_eh._eftabLen, true);
+				if(switchEndian) {
+					for(uint32_t i = 0; i < this->_eh._eftabLen; i++)
+						this->eftab()[i] = readIndex<index_t>(in5, switchEndian);
+				} else {
+					size_t r = MM_READ(in5, (void *)this->eftab(), this->_eh._eftabLen*sizeof(index_t));
+					if(r != (size_t)(this->_eh._eftabLen*sizeof(index_t))) {
+						cerr << "Error reading _eftab[] array: " << r << ", " << (this->_eh._eftabLen*sizeof(index_t)) << endl;
+						throw 1;
+					}
+				}
+			}
+			for(uint32_t i = 0; i < this->_eh._eftabLen; i++) {
+				if(i > 0 && this->eftab()[i] > 0) {
+					assert_geq(this->eftab()[i], this->eftab()[i-1]);
+				} else if(i > 0 && this->eftab()[i-1] == 0) {
+					assert_eq(0, this->eftab()[i]);
+				}
+			}
+		} else {
+			assert(this->ftab() == NULL);
+			assert(this->eftab() == NULL);
+			// Skip ftab
+			bytesRead += this->_eh._ftabLen*sizeof(index_t);
+			fseek(in5, this->_eh._ftabLen*sizeof(index_t), SEEK_CUR);
+			// Skip eftab
+			bytesRead += this->_eh._eftabLen*sizeof(index_t);
+			fseek(in5, this->_eh._eftabLen*sizeof(index_t), SEEK_CUR);
+		}
+	} catch(bad_alloc& e) {
+		cerr << "Out of memory allocating fchr[], ftab[] or eftab[] arrays for the Bowtie index." << endl
+		<< "Please try again on a computer with more memory." << endl;
+		throw 1;
+	}
+	
+	this->_offs.reset();
+	if(loadSASamp) {
+		bytesRead = 4; // reset for secondary index file (already read 1-sentinel)		
+		shmemLeader = true;
+		if(this->_verbose || startVerbose) {
+			cerr << "Reading offs (" << offsLenSampled << " " << std::setw(2) << sizeof(index_t)*8 << "-bit words): ";
+			logTime(cerr);
+		}
+		
+		if(!this->_useMm) {
+			if(!this->useShmem_) {
+				// Allocate offs_
+				try {
+					this->_offs.init(new index_t[offsLenSampled], offsLenSampled, true);
+				} catch(bad_alloc& e) {
+					cerr << "Out of memory allocating the offs[] array  for the Bowtie index." << endl
+					<< "Please try again on a computer with more memory." << endl;
+					throw 1;
+				}
+			} else {
+				index_t *tmp = NULL;
+				shmemLeader = ALLOC_SHARED_U32(
+											   (this->_in2Str + "[offs]"), offsLenSampled*2, &tmp,
+											   "offs", (this->_verbose || startVerbose));
+				this->_offs.init((index_t*)tmp, offsLenSampled, false);
+			}
+		}
+		
+		if(this->_overrideOffRate < 32) {
+			if(shmemLeader) {
+				// Allocate offs (big allocation)
+				if(switchEndian || offRateDiff > 0) {
+					assert(!this->_useMm);
+					const uint32_t blockMaxSz = (2 * 1024 * 1024); // 2 MB block size
+					const uint32_t blockMaxSzUIndex = (blockMaxSz / sizeof(index_t)); // # UIndexs per block
+					char *buf;
+					try {
+						buf = new char[blockMaxSz];
+					} catch(std::bad_alloc& e) {
+						cerr << "Error: Out of memory allocating part of _offs array: '" << e.what() << "'" << endl;
+						throw e;
+					}
+					for(index_t i = 0; i < offsLen; i += blockMaxSzUIndex) {
+					  index_t block = min<index_t>((index_t)blockMaxSzUIndex, (index_t)(offsLen - i));
+						size_t r = MM_READ(in6, (void *)buf, block * sizeof(index_t));
+						if(r != (size_t)(block * sizeof(index_t))) {
+							cerr << "Error reading block of _offs[] array: " << r << ", " << (block * sizeof(index_t)) << endl;
+							throw 1;
+						}
+						index_t idx = i >> offRateDiff;
+						for(index_t j = 0; j < block; j += (1 << offRateDiff)) {
+							assert_lt(idx, offsLenSampled);
+							this->offs()[idx] = ((index_t*)buf)[j];
+							if(switchEndian) {
+								this->offs()[idx] = endianSwapIndex(this->offs()[idx]);
+							}
+							idx++;
+						}
+					}
+					delete[] buf;
+				} else {
+					if(this->_useMm) {
+#ifdef BOWTIE_MM
+						this->_offs.init((index_t*)(mmFile[1] + bytesRead), offsLen, false);
+						bytesRead += (offsLen * sizeof(index_t));
+						fseek(in6, (offsLen * sizeof(index_t)), SEEK_CUR);
+#endif
+					} else {
+						// If any of the high two bits are set
+						if((offsLen & 0xc0000000) != 0) {
+							if(sizeof(char *) <= 4) {
+								cerr << "Sanity error: sizeof(char *) <= 4 but offsLen is " << hex << offsLen << endl;
+								throw 1;
+							}
+							// offsLen << 2 overflows, so do it in four reads
+							char *offs = (char *)this->offs();
+							for(size_t i = 0; i < sizeof(index_t); i++) {
+								size_t r = MM_READ(in6, (void*)offs, offsLen);
+								if(r != (size_t)(offsLen)) {
+									cerr << "Error reading block of _offs[] array: " << r << ", " << offsLen << endl;
+									throw 1;
+								}
+								offs += offsLen;
+							}
+						} else {
+							// Do it all in one read
+							size_t r = MM_READ(in6, (void*)this->offs(), offsLen * sizeof(index_t));
+							if(r != (size_t)(offsLen * sizeof(index_t))) {
+								cerr << "Error reading _offs[] array: " << r << ", " << (offsLen * sizeof(index_t)) << endl;
+								throw 1;
+							}
+						}
+					}
+				}
+#ifdef BOWTIE_SHARED_MEM				
+				if(this->useShmem_) NOTIFY_SHARED(this->offs(), offsLenSampled*sizeof(index_t));
+#endif
+			} else {
+				// Not the shmem leader
+				fseek(in6, offsLenSampled*sizeof(index_t), SEEK_CUR);
+#ifdef BOWTIE_SHARED_MEM				
+				if(this->useShmem_) WAIT_SHARED(this->offs(), offsLenSampled*sizeof(index_t));
+#endif
+			}
+		}
+	}
+	
+	this->postReadInit(this->_eh); // Initialize fields of Ebwt not read from file
+	if(this->_verbose || startVerbose) this->print(cerr, this->_eh);
+}
+
+/**
+ * Extended Burrows-Wheeler transform data.
+ * HierEbwt is a specialized Ebwt index that represents one global index and a large set of local indexes.
+ *
+ * The local indexes (LocalEbwt) cover windows of each reference sequence:
+ * one window every local_index_interval characters, each window spanning
+ * local_index_size characters.  Both identifiers are used but not declared
+ * in this class -- presumably file-scope constants; TODO confirm.
+ */
+template <typename index_t = uint32_t, typename local_index_t = uint16_t>
+class HierEbwt : public Ebwt<index_t> {
+	typedef Ebwt<index_t> PARENT_CLASS;
+public:
+	/// Construct an Ebwt from the given input file
+	HierEbwt(const string& in,
+			 int color,
+			 int needEntireReverse,
+			 bool fw,
+			 int32_t overrideOffRate, // = -1,
+			 int32_t offRatePlus, // = -1,
+			 bool useMm, // = false,
+			 bool useShmem, // = false,
+			 bool mmSweep, // = false,
+			 bool loadNames, // = false,
+			 bool loadSASamp, // = true,
+			 bool loadFtab, // = true,
+			 bool loadRstarts, // = true,
+			 bool verbose, // = false,
+			 bool startVerbose, // = false,
+			 bool passMemExc, // = false,
+			 bool sanityCheck, // = false
+             bool skipLoading = false) :
+	         Ebwt<index_t>(in,
+						   color,
+						   needEntireReverse,
+						   fw,
+						   overrideOffRate,
+						   offRatePlus,
+						   useMm,
+						   useShmem,
+						   mmSweep,
+						   loadNames,
+						   loadSASamp,
+						   loadFtab,
+						   loadRstarts,
+						   verbose,
+						   startVerbose,
+						   passMemExc,
+						   sanityCheck,
+						   skipLoading),
+	         _in5(NULL),
+	         _in6(NULL)
+	{
+		// Names of the two hierarchical-index files that sit alongside the
+		// base Ebwt's files.
+		_in5Str = in + ".5." + gEbwt_ext;
+		_in6Str = in + ".6." + gEbwt_ext;
+        
+        // NOTE(review): the "&& false" below makes this branch unreachable,
+        // so the header is never read here at construction time; confirm
+        // this is intentionally disabled.
+        if(!skipLoading && false) {
+            readIntoMemory(
+                           color,       // expect index to be colorspace?
+                           fw ? -1 : needEntireReverse, // need REF_READ_REVERSE
+                           loadSASamp,  // load the SA sample portion?
+                           loadFtab,    // load the ftab & eftab?
+                           loadRstarts, // load the rstarts array?
+                           true,        // stop after loading the header portion?
+                           &(this->_eh),
+                           mmSweep,     // mmSweep
+                           loadNames,   // loadNames
+                           startVerbose); // startVerbose
+            // If the offRate has been overridden, reflect that in the
+            // _eh._offRate field
+            if(offRatePlus > 0 && this->_overrideOffRate == -1) {
+                this->_overrideOffRate = this->_eh._offRate + offRatePlus;
+            }
+            if(this->_overrideOffRate > this->_eh._offRate) {
+                this->_eh.setOffRate(this->_overrideOffRate);
+                assert_eq(this->_overrideOffRate, this->_eh._offRate);
+            }
+            assert(this->repOk());
+        }
+	}
+	
+	/// Construct an Ebwt from the given header parameters and string
+	/// vector, optionally using a blockwise suffix sorter with the
+	/// given 'bmax' and 'dcv' parameters.  The string vector is
+	/// ultimately joined and the joined string is passed to buildToDisk().
+	template<typename TStr>
+	HierEbwt(
+			 TStr& s,
+			 bool packed,
+			 int color,
+			 int needEntireReverse,
+			 int32_t lineRate,
+			 int32_t offRate,
+			 int32_t ftabChars,
+             int32_t localOffRate,
+             int32_t localFtabChars,
+			 const string& file,   // base filename for EBWT files
+			 bool fw,
+			 bool useBlockwise,
+			 TIndexOffU bmax,
+			 TIndexOffU bmaxSqrtMult,
+			 TIndexOffU bmaxDivN,
+			 int dcv,
+			 EList<FileBuf*>& is,
+			 EList<RefRecord>& szs,
+			 index_t sztot,
+			 const RefReadInParams& refparams,
+			 uint32_t seed,
+			 int32_t overrideOffRate = -1,
+			 bool verbose = false,
+			 bool passMemExc = false,
+			 bool sanityCheck = false);
+	        	
+	/// Destructor: frees every owned LocalEbwt (see clearLocalEbwts()).
+	~HierEbwt() {
+		clearLocalEbwts();
+	}
+    
+    /**
+	 * Load this Ebwt into memory by reading it in from the _in5 and
+	 * _in6 streams.
+	 */
+	void loadIntoMemory(
+                        int color,
+                        int needEntireReverse,
+                        bool loadSASamp,
+                        bool loadFtab,
+                        bool loadRstarts,
+                        bool loadNames,
+                        bool verbose)
+	{
+		readIntoMemory(
+                       color,       // expect index to be colorspace?
+                       needEntireReverse, // require reverse index to be concatenated reference reversed
+                       loadSASamp,  // load the SA sample portion?
+                       loadFtab,    // load the ftab (_ftab[] and _eftab[])?
+                       loadRstarts, // load the r-starts (_rstarts[])?
+                       false,       // stop after loading the header portion?
+                       NULL,        // params
+                       false,       // mmSweep
+                       loadNames,   // loadNames
+                       verbose);    // startVerbose
+	}
+	
+	// I/O
+	void readIntoMemory(
+                        int color,
+                        int needEntireRev,
+                        bool loadSASamp,
+                        bool loadFtab,
+                        bool loadRstarts,
+                        bool justHeader,
+                        EbwtParams<index_t> *params,
+                        bool mmSweep,
+                        bool loadNames,
+                        bool startVerbose);
+	
+	/**
+	 * Frees memory associated with the Ebwt.
+	 */
+	void evictFromMemory() {
+		assert(PARENT_CLASS::isInMemory());
+		clearLocalEbwts();
+		PARENT_CLASS::evictFromMemory();		
+	}
+	
+	/**
+	 * Sanity-check various pieces of the Ebwt, including every local index.
+	 */
+	void sanityCheckAll(int reverse) const {
+		PARENT_CLASS::sanityCheckAll(reverse);
+		for(size_t tidx = 0; tidx < _localEbwts.size(); tidx++) {
+			for(size_t local_idx = 0; local_idx < _localEbwts[tidx].size(); local_idx++) {
+				assert(_localEbwts[tidx][local_idx] != NULL);
+				_localEbwts[tidx][local_idx]->sanityCheckAll(reverse);
+			}
+		}
+	}
+    
+    /**
+     * Return the local index covering offset 'offset' within reference
+     * 'tidx', or NULL if the offset lies beyond the last window.
+     */
+    const LocalEbwt<local_index_t, index_t>* getLocalEbwt(index_t tidx, index_t offset) const {
+        assert_lt(tidx, _localEbwts.size());
+        const EList<LocalEbwt<local_index_t, index_t>*>& localEbwts = _localEbwts[tidx];
+        index_t offsetidx = offset / local_index_interval;
+        if(offsetidx >= localEbwts.size()) {
+            return NULL;
+        } else {
+            return localEbwts[offsetidx];
+        }
+    }
+    
+    /**
+     * Return the local index one window (local_index_interval) before
+     * 'currLocalEbwt' on the same reference, or NULL at the first window.
+     */
+    const LocalEbwt<local_index_t, index_t>* prevLocalEbwt(const LocalEbwt<local_index_t, index_t>* currLocalEbwt) const {
+        assert(currLocalEbwt != NULL);
+        index_t tidx = currLocalEbwt->_tidx;
+        index_t offset = currLocalEbwt->_localOffset;
+        if(offset < local_index_interval) {
+            return NULL;
+        } else {
+            return getLocalEbwt(tidx, offset - local_index_interval);
+        }
+    }
+    
+    /**
+     * Return the local index one window (local_index_interval) after
+     * 'currLocalEbwt' on the same reference, or NULL past the last window.
+     */
+    const LocalEbwt<local_index_t, index_t>* nextLocalEbwt(const LocalEbwt<local_index_t, index_t>* currLocalEbwt) const {
+        assert(currLocalEbwt != NULL);
+        index_t tidx = currLocalEbwt->_tidx;
+        index_t offset = currLocalEbwt->_localOffset;
+        return getLocalEbwt(tidx, offset + local_index_interval);
+    }
+	
+	/// Delete every owned LocalEbwt and clear the per-reference lists.
+	void clearLocalEbwts() {
+		for(size_t tidx = 0; tidx < _localEbwts.size(); tidx++) {
+			for(size_t local_idx = 0; local_idx < _localEbwts[tidx].size(); local_idx++) {
+				assert(_localEbwts[tidx][local_idx] != NULL);
+				delete _localEbwts[tidx][local_idx];
+			}
+			
+			_localEbwts[tidx].clear();
+		}
+		
+		_localEbwts.clear();
+	}
+	
+
+public:
+	index_t                                  _nrefs;      /// the number of reference sequences
+	EList<index_t>                           _refLens;    /// approx lens of ref seqs (excludes trailing ambig chars)
+	
+	EList<EList<LocalEbwt<local_index_t, index_t>*> > _localEbwts; // owned local indexes, indexed [ref][offset / local_index_interval]
+	index_t                                  _nlocalEbwts; // total number of local indexes across all references
+	
+	FILE                                     *_in5;    // input stream for primary (.5) index file
+	FILE                                     *_in6;    // input stream for secondary (.6) index file
+	string                                   _in5Str;  // filename of .5 index file
+	string                                   _in6Str;  // filename of .6 index file
+	
+	// NOTE(review): mmFile5_/mmFile6_ are not initialized in either
+	// constructor's init list; they look indeterminate until the mmap path
+	// of readIntoMemory() assigns them -- confirm no read-before-write.
+	char                                     *mmFile5_;
+	char                                     *mmFile6_;
+};
+    
+/// Construct an Ebwt from the given header parameters and string
+/// vector, optionally using a blockwise suffix sorter with the
+/// given 'bmax' and 'dcv' parameters.  The string vector is
+/// ultimately joined and the joined string is passed to buildToDisk().
+///
+/// In addition to the base-class (global) index, this writes the .5/.6
+/// hierarchical index files containing one small LocalEbwt per
+/// local_index_interval-spaced window of every reference sequence.
+template <typename index_t, typename local_index_t>
+template <typename TStr>
+HierEbwt<index_t, local_index_t>::HierEbwt(
+                                           TStr& s,
+                                           bool packed,
+                                           int color,
+                                           int needEntireReverse,
+                                           int32_t lineRate,
+                                           int32_t offRate,
+                                           int32_t ftabChars,
+                                           int32_t localOffRate,
+                                           int32_t localFtabChars,
+                                           const string& file,   // base filename for EBWT files
+                                           bool fw,
+                                           bool useBlockwise,
+                                           TIndexOffU bmax,
+                                           TIndexOffU bmaxSqrtMult,
+                                           TIndexOffU bmaxDivN,
+                                           int dcv,
+                                           EList<FileBuf*>& is,
+                                           EList<RefRecord>& szs,
+                                           index_t sztot,
+                                           const RefReadInParams& refparams,
+                                           uint32_t seed,
+                                           int32_t overrideOffRate,
+                                           bool verbose,
+                                           bool passMemExc,
+                                           bool sanityCheck) :
+    Ebwt<index_t>(s,
+                  packed,
+                  color,
+                  needEntireReverse,
+                  lineRate,
+                  offRate,
+                  ftabChars,
+                  file,
+                  fw,
+                  useBlockwise,
+                  bmax,
+                  bmaxSqrtMult,
+                  bmaxDivN,
+                  dcv,
+                  is,
+                  szs,
+                  sztot,
+                  refparams,
+                  seed,
+                  overrideOffRate,
+                  verbose,
+                  passMemExc,
+                  sanityCheck),
+    _in5(NULL),
+    _in6(NULL)
+{
+    _in5Str = file + ".5." + gEbwt_ext;
+    _in6Str = file + ".6." + gEbwt_ext;
+    
+    // Open output files
+    ofstream fout5(_in5Str.c_str(), ios::binary);
+    if(!fout5.good()) {
+        cerr << "Could not open index file for writing: \"" << _in5Str.c_str() << "\"" << endl
+        << "Please make sure the directory exists and that permissions allow writing by" << endl
+        << "Bowtie." << endl;
+        throw 1;
+    }
+    ofstream fout6(_in6Str.c_str(), ios::binary);
+    if(!fout6.good()) {
+        cerr << "Could not open index file for writing: \"" << _in6Str.c_str() << "\"" << endl
+        << "Please make sure the directory exists and that permissions allow writing by" << endl
+        << "Bowtie." << endl;
+        throw 1;
+    }
+    
+    // split the whole genome into a set of local indexes
+    _nrefs = 0;
+    _nlocalEbwts = 0;
+    
+    index_t cumlen = 0;
+    typedef EList<RefRecord, 1> EList_RefRecord;
+    // all_local_recs[ref][window] holds the RefRecords (ambiguous-gap 'off'
+    // plus unambiguous 'len') clipped to that window of the reference.
+    EList<EList<EList_RefRecord> > all_local_recs;
+    // For each unambiguous stretch...
+    for(index_t i = 0; i < szs.size(); i++) {
+        const RefRecord& rec = szs[i];
+        if(rec.first) {
+            if(_nrefs > 0) {
+                // refLens_ links each reference sequence with the total number
+                // of ambiguous and unambiguous characters in it.
+                _refLens.push_back(cumlen);
+            }
+            cumlen = 0;
+            _nrefs++;
+            all_local_recs.expand();
+            assert_eq(_nrefs, all_local_recs.size());
+        } else if(i == 0) {
+            cerr << "First record in reference index file was not marked as "
+            << "'first'" << endl;
+            throw 1;
+        }
+        
+        assert_gt(_nrefs, 0);
+        assert_eq(_nrefs, all_local_recs.size());
+        EList<EList_RefRecord>& ref_local_recs = all_local_recs[_nrefs-1];
+        index_t next_cumlen = cumlen + rec.off + rec.len;
+        // Start one window earlier than the window containing cumlen:
+        // windows overlap (presumably local_index_size > local_index_interval),
+        // so this record may also belong to the previous window.
+        index_t local_off = (cumlen / local_index_interval) * local_index_interval;
+        if(local_off >= local_index_interval) {
+            local_off -= local_index_interval;
+        }
+        for(;local_off < next_cumlen; local_off += local_index_interval) {
+            // Skip windows that end before this record begins.
+            if(local_off + local_index_size < cumlen) {
+                continue;
+            }
+            index_t local_idx = local_off / local_index_interval;
+            
+            if(local_idx >= ref_local_recs.size()) {
+                assert_eq(local_idx, ref_local_recs.size());
+                ref_local_recs.expand();
+                _nlocalEbwts++;
+            }
+            assert_lt(local_idx, ref_local_recs.size());
+            EList_RefRecord& local_recs = ref_local_recs[local_idx];
+            assert_gt(local_off + local_index_size, cumlen);
+            local_recs.expand();
+            // Clip the record's ambiguous gap and unambiguous length to the
+            // [local_off, local_off + local_index_size) window.
+            if(local_off + local_index_size <= cumlen + rec.off) {
+                local_recs.back().off = local_off + local_index_size - std::max(local_off, cumlen);
+                local_recs.back().len = 0;
+            } else {
+                if(local_off < cumlen + rec.off) {
+                    local_recs.back().off = rec.off - (local_off > cumlen ? local_off - cumlen : 0);
+                } else {
+                    local_recs.back().off = 0;
+                }
+                local_recs.back().len = std::min(next_cumlen, local_off + local_index_size) - std::max(local_off, cumlen + rec.off);
+            }
+            local_recs.back().first = (local_recs.size() == 1);
+        }
+        cumlen = next_cumlen;
+    }
+    
+    // Store a cap entry for the end of the last reference seq
+    _refLens.push_back(cumlen);
+    
+    // Debug-only cross-check: re-derive the global szs/sztot from the
+    // per-window records and verify they round-trip exactly.
+#ifndef NDEBUG
+    EList<RefRecord> temp_szs;
+    index_t temp_sztot = 0;
+    index_t temp_nlocalEbwts = 0;
+    for(size_t tidx = 0; tidx < all_local_recs.size(); tidx++) {
+        assert_lt(tidx, _refLens.size());
+        EList<EList_RefRecord>& ref_local_recs = all_local_recs[tidx];
+        assert_eq((_refLens[tidx] + local_index_interval - 1) / local_index_interval, ref_local_recs.size());
+        temp_szs.expand();
+        temp_szs.back().off = 0;
+        temp_szs.back().len = 0;
+        temp_szs.back().first = true;
+        index_t temp_ref_len = 0;
+        index_t temp_ref_sztot = 0;
+        temp_nlocalEbwts += ref_local_recs.size();
+        for(size_t i = 0; i < ref_local_recs.size(); i++) {
+            EList_RefRecord& local_recs = ref_local_recs[i];
+            index_t local_len = 0;
+            for(size_t j = 0; j < local_recs.size(); j++) {
+                assert(local_recs[j].off != 0 || local_recs[j].len != 0);
+                assert(j != 0 || local_recs[j].first);
+                RefRecord local_rec = local_recs[j];
+                // Only the first local_index_interval chars of a window are
+                // "owned" by it; the overlap tail is counted by the next window.
+                if(local_len < local_index_interval && local_recs[j].off > 0){
+                    if(local_len + local_recs[j].off > local_index_interval) {
+                        temp_ref_len += (local_index_interval - local_len);
+                        local_rec.off = local_index_interval - local_len;
+                    } else {
+                        temp_ref_len += local_recs[j].off;
+                    }
+                } else {
+                    local_rec.off = 0;
+                }
+                local_len += local_recs[j].off;
+                if(local_len < local_index_interval && local_recs[j].len > 0) {
+                    if(local_len + local_recs[j].len > local_index_interval) {
+                        temp_ref_len += (local_index_interval - local_len);
+                        temp_ref_sztot += (local_index_interval - local_len);
+                        local_rec.len = local_index_interval - local_len;
+                    } else {
+                        temp_ref_len += local_recs[j].len;
+                        temp_ref_sztot += local_recs[j].len;
+                    }
+                } else {
+                    local_rec.len = 0;
+                }
+                local_len += local_recs[j].len;
+                // Merge the clipped record back into the reconstructed szs.
+                if(local_rec.off > 0) {
+                    if(temp_szs.back().len > 0) {
+                        temp_szs.expand();
+                        temp_szs.back().off = local_rec.off;
+                        temp_szs.back().len = local_rec.len;
+                        temp_szs.back().first = false;
+                    } else {
+                        temp_szs.back().off += local_rec.off;
+                        temp_szs.back().len = local_rec.len;
+                    }
+                } else if(local_rec.len > 0) {
+                    temp_szs.back().len += local_rec.len;
+                }
+            }
+            if(i + 1 < ref_local_recs.size()) {
+                assert_eq(local_len, local_index_size);
+                assert_eq(temp_ref_len % local_index_interval, 0);
+            } else {
+                assert_eq(local_len, _refLens[tidx] % local_index_interval);
+            }
+        }
+        assert_eq(temp_ref_len, _refLens[tidx]);
+        temp_sztot += temp_ref_sztot;
+    }
+    assert_eq(temp_sztot, sztot);
+    for(size_t i = 0; i < temp_szs.size(); i++) {
+        assert_lt(i, szs.size());
+        assert_eq(temp_szs[i].off, szs[i].off);
+        assert_eq(temp_szs[i].len, szs[i].len);
+        assert_eq(temp_szs[i].first, szs[i].first);
+    }
+    assert_eq(temp_szs.size(), szs.size());
+    assert_eq(_nlocalEbwts, temp_nlocalEbwts);
+#endif
+    
+    uint32_t be = this->toBe();
+    assert(fout5.good());
+    assert(fout6.good());
+    
+    // When building an Ebwt, these header parameters are known
+    // "up-front", i.e., they can be written to disk immediately,
+    // before we join() or buildToDisk()
+    writeI32(fout5, 1, be); // endian hint for primary stream
+    writeI32(fout6, 1, be); // endian hint for secondary stream
+    writeIndex<index_t>(fout5, _nlocalEbwts, be); // number of local Ebwts
+    // local_lineRate: used but not declared in this chunk -- presumably a
+    // file-scope constant; TODO confirm.
+    writeI32(fout5, local_lineRate,  be); // 2^lineRate = size in bytes of 1 line
+    writeI32(fout5, 2, be); // not used
+    writeI32(fout5, (int32_t)localOffRate,   be); // every 2^offRate chars is "marked"
+    writeI32(fout5, (int32_t)localFtabChars, be); // number of 2-bit chars used to address ftab
+    int32_t flags = 1;
+    if(this->_eh._color) flags |= EBWT_COLOR;
+    if(this->_eh._entireReverse) flags |= EBWT_ENTIRE_REV;
+    writeI32(fout5, -flags, be); // BTL: chunkRate is now deprecated
+    
+    // build local FM indexes
+    index_t curr_sztot = 0;
+    bool firstIndex = true; // NOTE(review): written but never read -- appears vestigial
+    for(size_t tidx = 0; tidx < _refLens.size(); tidx++) {
+        index_t refLen = _refLens[tidx];
+        index_t local_offset = 0;
+        _localEbwts.expand();
+        assert_lt(tidx, _localEbwts.size());
+        while(local_offset < refLen) {
+            index_t index_size = std::min<index_t>(refLen - local_offset, local_index_size);
+            assert_lt(tidx, all_local_recs.size());
+            assert_lt(local_offset / local_index_interval, all_local_recs[tidx].size());
+            EList_RefRecord& local_szs = all_local_recs[tidx][local_offset / local_index_interval];
+            
+            // Tally this window's total unambiguous chars (local_sztot) and
+            // the non-overlapping first-interval portion (local_sztot_interval).
+            EList<RefRecord> conv_local_szs;
+            index_t local_len = 0, local_sztot = 0, local_sztot_interval = 0;
+            for(size_t i = 0; i < local_szs.size(); i++) {
+                assert(local_szs[i].off != 0 || local_szs[i].len != 0);
+                assert(i != 0 || local_szs[i].first);
+                conv_local_szs.push_back(local_szs[i]);
+                local_len += local_szs[i].off;
+                if(local_len < local_index_interval && local_szs[i].len > 0) {
+                    if(local_len + local_szs[i].len > local_index_interval) {
+                        local_sztot_interval += (local_index_interval - local_len);
+                    } else {
+                        local_sztot_interval += local_szs[i].len;
+                    }
+                }
+                local_sztot += local_szs[i].len;
+                local_len += local_szs[i].len;
+            }
+            // Carve this window's unambiguous characters out of the joined
+            // string; when the reference was read reversed, count from the end.
+            TStr local_s;
+            local_s.resize(local_sztot);
+            if(refparams.reverse == REF_READ_REVERSE) {
+                local_s.install(s.buf() + s.length() - curr_sztot - local_sztot, local_sztot);
+            } else {
+                local_s.install(s.buf() + curr_sztot, local_sztot);
+            }
+            LocalEbwt<local_index_t, index_t>* localEbwt = new LocalEbwt<local_index_t, index_t>(
+                                                                                                 local_s,
+                                                                                                 tidx,
+                                                                                                 local_offset,
+                                                                                                 index_size,
+                                                                                                 packed,
+                                                                                                 color,
+                                                                                                 needEntireReverse,
+                                                                                                 local_lineRate,
+                                                                                                 localOffRate,      // suffix-array sampling rate
+                                                                                                 localFtabChars,    // number of chars in initial arrow-pair calc
+                                                                                                 file,               // basename for .?.ebwt files
+                                                                                                 fw,                 // fw
+                                                                                                 dcv,                // difference-cover period
+                                                                                                 conv_local_szs,     // list of reference sizes
+                                                                                                 local_sztot,        // total size of all unambiguous ref chars
+                                                                                                 refparams,          // reference read-in parameters
+                                                                                                 seed,               // pseudo-random number generator seed
+                                                                                                 fout5,
+                                                                                                 fout6,
+                                                                                                 -1,                 // override offRate
+                                                                                                 false,              // be silent
+                                                                                                 passMemExc,         // pass exceptions up to the toplevel so that we can adjust memory settings automatically
+                                                                                                 sanityCheck);       // verify results and internal consistency
+            firstIndex = false;
+            _localEbwts[tidx].push_back(localEbwt);
+            curr_sztot += local_sztot_interval; // advance by the non-overlapping portion only
+            local_offset += local_index_interval;
+        }
+    }
+    assert_eq(curr_sztot, sztot);
+    
+    
+    // Terminating NUL sentinel after the last local index.
+    fout5 << '\0';
+    fout5.flush(); fout6.flush();
+    if(fout5.fail() || fout6.fail()) {
+        cerr << "An error occurred writing the index to disk.  Please check if the disk is full." << endl;
+        throw 1;
+    }
+    VMSG_NL("Returning from initFromVector");
+    
+    // Close output files; verify the on-disk sizes match what we wrote.
+    fout5.flush();
+    int64_t tellpSz5 = (int64_t)fout5.tellp();
+    VMSG_NL("Wrote " << fout5.tellp() << " bytes to primary EBWT file: " << _in5Str.c_str());
+    fout5.close();
+    bool err = false;
+    if(tellpSz5 > fileSize(_in5Str.c_str())) {
+        err = true;
+        cerr << "Index is corrupt: File size for " << _in5Str.c_str() << " should have been " << tellpSz5
+        << " but is actually " << fileSize(_in5Str.c_str()) << "." << endl;
+    }
+    fout6.flush();
+    int64_t tellpSz6 = (int64_t)fout6.tellp();
+    VMSG_NL("Wrote " << fout6.tellp() << " bytes to secondary EBWT file: " << _in6Str.c_str());
+    fout6.close();
+    if(tellpSz6 > fileSize(_in6Str.c_str())) {
+        err = true;
+        cerr << "Index is corrupt: File size for " << _in6Str.c_str() << " should have been " << tellpSz6
+        << " but is actually " << fileSize(_in6Str.c_str()) << "." << endl;
+    }
+    if(err) {
+        cerr << "Please check if there is a problem with the disk or if disk is full." << endl;
+        throw 1;
+    }
+    // Reopen as input streams
+    // NOTE(review): the message below says "_in5 and _in5"; presumably meant
+    // "_in5 and _in6" (runtime string left unchanged in this review).
+    VMSG_NL("Re-opening _in5 and _in5 as input streams");
+    if(this->_sanity) {
+        VMSG_NL("Sanity-checking Bt2");
+        assert(!this->isInMemory());
+        readIntoMemory(
+                       color,                       // colorspace?
+                       fw ? -1 : needEntireReverse, // 1 -> need the reverse to be reverse-of-concat
+                       true,                        // load SA sample (_offs[])?
+                       true,                        // load ftab (_ftab[] & _eftab[])?
+                       true,                        // load r-starts (_rstarts[])?
+                       false,                       // just load header?
+                       NULL,                        // Params object to fill
+                       false,                       // mm sweep?
+                       true,                        // load names?
+                       false);                      // verbose startup?
+        sanityCheckAll(refparams.reverse);
+        evictFromMemory();
+        assert(!this->isInMemory());
+    }
+    VMSG_NL("Returning from HierEbwt constructor");
+}
+
+    
+/**
+ * Read this HierEbwt into memory from the index files, delegating the
+ * global-index portion to Ebwt::readIntoMemory.
+ */
+template <typename index_t, typename local_index_t>
+void HierEbwt<index_t, local_index_t>::readIntoMemory(
+													  int color,
+													  int needEntireRev,
+													  bool loadSASamp,
+													  bool loadFtab,
+													  bool loadRstarts,
+													  bool justHeader,
+													  EbwtParams<index_t> *params,
+													  bool mmSweep,
+													  bool loadNames,
+													  bool startVerbose)
+{
+    PARENT_CLASS::readIntoMemory(color,
+                                 needEntireRev,
+                                 loadSASamp,
+                                 loadFtab,
+                                 loadRstarts,
+                                 justHeader || needEntireRev == 1,
+                                 params,
+                                 mmSweep,
+                                 loadNames,
+                                 startVerbose);
+    
+    return;
+
+	bool switchEndian; // dummy; caller doesn't care
+#ifdef BOWTIE_MM
+	char *mmFile[] = { NULL, NULL };
+#endif
+	if(_in5Str.length() > 0) {
+		if(this->_verbose || startVerbose) {
+			cerr << "  About to open input files: ";
+			logTime(cerr);
+		}
+        // Initialize our primary and secondary input-stream fields
+		if(_in5 != NULL) fclose(_in5);
+		if(this->_verbose || startVerbose) cerr << "Opening \"" << _in5Str.c_str() << "\"" << endl;
+		if((_in5 = fopen(_in5Str.c_str(), "rb")) == NULL) {
+			cerr << "Could not open index file " << _in5Str.c_str() << endl;
+		}
+		if(loadSASamp) {
+			if(_in6 != NULL) fclose(_in6);
+			if(this->_verbose || startVerbose) cerr << "Opening \"" << _in6Str.c_str() << "\"" << endl;
+			if((_in6 = fopen(_in6Str.c_str(), "rb")) == NULL) {
+				cerr << "Could not open index file " << _in6Str.c_str() << endl;
+			}
+		}
+		if(this->_verbose || startVerbose) {
+			cerr << "  Finished opening input files: ";
+			logTime(cerr);
+		}
+		
+#ifdef BOWTIE_MM
+		if(this->_useMm /*&& !justHeader*/) {
+			const char *names[] = {_in5Str.c_str(), _in6Str.c_str()};
+            int fds[] = { fileno(_in5), fileno(_in6) };
+			for(int i = 0; i < (loadSASamp ? 2 : 1); i++) {
+				if(this->_verbose || startVerbose) {
+					cerr << "  ¯ " << (i+1) << ": ";
+					logTime(cerr);
+				}
+				struct stat sbuf;
+				if (stat(names[i], &sbuf) == -1) {
+					perror("stat");
+					cerr << "Error: Could not stat index file " << names[i] << " prior to memory-mapping" << endl;
+					throw 1;
+				}
+                mmFile[i] = (char*)mmap((void *)0, (size_t)sbuf.st_size,
+										PROT_READ, MAP_SHARED, fds[(size_t)i], 0);
+				if(mmFile[i] == (void *)(-1)) {
+					perror("mmap");
+					cerr << "Error: Could not memory-map the index file " << names[i] << endl;
+					throw 1;
+				}
+				if(mmSweep) {
+					int sum = 0;
+					for(off_t j = 0; j < sbuf.st_size; j += 1024) {
+						sum += (int) mmFile[i][j];
+					}
+					if(startVerbose) {
+						cerr << "  Swept the memory-mapped ebwt index file 1; checksum: " << sum << ": ";
+						logTime(cerr);
+					}
+				}
+			}
+			mmFile5_ = mmFile[0];
+			mmFile6_ = loadSASamp ? mmFile[1] : NULL;
+		}
+#endif
+	}
+#ifdef BOWTIE_MM
+	else if(this->_useMm && !justHeader) {
+		mmFile[0] = mmFile5_;
+		mmFile[1] = mmFile6_;
+	}
+	if(this->_useMm && !justHeader) {
+		assert(mmFile[0] == mmFile5_);
+		assert(mmFile[1] == mmFile6_);
+	}
+#endif
+	
+	if(this->_verbose || startVerbose) {
+		cerr << "  Reading header: ";
+		logTime(cerr);
+	}
+	
+	// Read endianness hints from both streams
+	size_t bytesRead = 0;
+	switchEndian = false;
+	uint32_t one = readU32(_in5, switchEndian); // 1st word of primary stream
+	bytesRead += 4;
+	if(loadSASamp) {
+#ifndef NDEBUG
+		assert_eq(one, readU32(_in6, switchEndian)); // should match!
+#else
+		readU32(_in6, switchEndian);
+#endif
+	}
+	if(one != 1) {
+		assert_eq((1u<<24), one);
+		assert_eq(1, endianSwapU32(one));
+		switchEndian = true;
+	}
+	
+	// Can't switch endianness and use memory-mapped files; in order to
+	// support this, someone has to modify the file to switch
+	// endiannesses appropriately, and we can't do this inside Bowtie
+	// or we might be setting up a race condition with other processes.
+	if(switchEndian && this->_useMm) {
+		cerr << "Error: Can't use memory-mapped files when the index is the opposite endianness" << endl;
+		throw 1;
+	}	
+	
+	_nlocalEbwts      = readIndex<index_t>(_in5, switchEndian); bytesRead += sizeof(index_t);
+	int32_t lineRate  = readI32(_in5, switchEndian); bytesRead += 4;
+	readI32(_in5, switchEndian); bytesRead += 4;
+	int32_t offRate   = readI32(_in5, switchEndian); bytesRead += 4;
+	// TODO: add isaRate to the actual file format (right now, the
+	// user has to tell us whether there's an ISA sample and what the
+	// sampling rate is.
+	int32_t ftabChars = readI32(_in5, switchEndian); bytesRead += 4;
+	/*int32_t flag  =*/ readI32(_in5, switchEndian); bytesRead += 4;
+    
+    if(this->_verbose || startVerbose) {
+        cerr << "    number of local indexes: " << _nlocalEbwts << endl
+             << "    local offRate: " << offRate << endl
+             << "    local ftabLen: " << (1 << (2 * ftabChars)) << endl
+             << "    local ftabSz: "  << (2 << (2 * ftabChars)) << endl
+        ;
+    }
+	
+	clearLocalEbwts();
+	
+	index_t tidx = 0, localOffset = 0;
+	string base = "";
+	for(size_t i = 0; i < _nlocalEbwts; i++) {
+		LocalEbwt<local_index_t, index_t> *localEbwt = new LocalEbwt<local_index_t, index_t>(base,
+                                                                                             _in5,
+                                                                                             _in6,
+                                                                                             mmFile5_,
+                                                                                             mmFile6_,
+                                                                                             tidx,
+                                                                                             localOffset,
+                                                                                             switchEndian,
+                                                                                             bytesRead,
+                                                                                             color,
+                                                                                             needEntireRev,
+                                                                                             this->fw_,
+                                                                                             -1, // overrideOffRate
+                                                                                             -1, // offRatePlus
+                                                                                             (uint32_t)lineRate,
+                                                                                             (uint32_t)offRate,
+                                                                                             (uint32_t)ftabChars,
+                                                                                             this->_useMm,
+                                                                                             this->useShmem_,
+                                                                                             mmSweep,
+                                                                                             loadNames,
+                                                                                             loadSASamp,
+                                                                                             loadFtab,
+                                                                                             loadRstarts,
+                                                                                             false,  // _verbose
+                                                                                             false,
+                                                                                             this->_passMemExc,
+                                                                                             this->_sanity);
+		
+		if(tidx >= _localEbwts.size()) {
+			assert_eq(tidx, _localEbwts.size());
+			_localEbwts.expand();
+		}
+		assert_eq(tidx + 1, _localEbwts.size());
+		_localEbwts.back().push_back(localEbwt);
+	}	
+		
+#ifdef BOWTIE_MM
+    fseek(_in5, 0, SEEK_SET);
+	fseek(_in6, 0, SEEK_SET);
+#else
+	rewind(_in5); rewind(_in6);
+#endif
+}
+
+#endif /*HIEREBWT_H_*/
diff --git a/hier_idx_common.h b/hier_idx_common.h
new file mode 100644
index 0000000..16efca2
--- /dev/null
+++ b/hier_idx_common.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2013, Daehwan Kim <infphilo at gmail.com>
+ *
+ * This file is part of Beast.  Beast is based on Bowtie 2.
+ *
+ * Beast is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Beast is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Beast.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
#ifndef HIEREBWT_COMMON_H_
#define HIEREBWT_COMMON_H_

// maximum size of a sequence represented by a local index
// NOTE(review): the value subtracts 1 << 8, but the trailing comment
// mentions 1 << 5 as the slack needed for the eftab index -- TODO
// confirm which amount is actually required.
static const uint32_t local_index_size     = (1 << 16) - (1 << 8);  // 1 << 5 is necessary for eftab index

// size of the overlapped sequence between the sequences represented by two consecutive local indexes
static const uint32_t local_index_overlap  = 1024;

// interval between two consecutive local indexes 
// (i.e. consecutive local indexes start this many bases apart, so each
// shares local_index_overlap bases with its successor)
static const uint32_t local_index_interval = local_index_size - local_index_overlap;

// line rate in local indexes
static const int32_t local_lineRate = 6;

// how many rows are marked in a local index, every 2^<int>th row is marked
static const int32_t  local_offRate        = 3;

// the lookup table (ftab) in a local index has 4^<int> entries
static const int32_t  local_ftabChars      = 6;

#endif /*HIEREBWT_COMMON_H_*/
diff --git a/hyperloglogbias.h b/hyperloglogbias.h
new file mode 100644
index 0000000..013bd5b
--- /dev/null
+++ b/hyperloglogbias.h
@@ -0,0 +1,133 @@
+/*
+ * hyperloglogbias.h
+ *
+ *  Created on: Apr 25, 2015
+ *      Author: fbreitwieser
+ */
+
+#ifndef HYPERLOGLOGBIAS_H_
+#define HYPERLOGLOGBIAS_H_
+
+const double rawEstimateData_precision4[] = {
+    11, 11.717, 12.207, 12.7896, 13.2882, 13.8204, 14.3772, 14.9342, 15.5202, 16.161, 16.7722, 17.4636, 18.0396, 18.6766, 19.3566, 20.0454, 20.7936, 21.4856, 22.2666, 22.9946, 23.766, 24.4692, 25.3638, 26.0764, 26.7864, 27.7602, 28.4814, 29.433, 30.2926, 31.0664, 31.9996, 32.7956, 33.5366, 34.5894, 35.5738, 36.2698, 37.3682, 38.0544, 39.2342, 40.0108, 40.7966, 41.9298, 42.8704, 43.6358, 44.5194, 45.773, 46.6772, 47.6174, 48.4888, 49.3304, 50.2506, 51.4996, 52.3824, 53.3078, 54.3984, 55.5 [...]
+};
+
+const double rawEstimateData_precision5[] = {
+    23, 23.1194, 23.8208, 24.2318, 24.77, 25.2436, 25.7774, 26.2848, 26.8224, 27.3742, 27.9336, 28.503, 29.0494, 29.6292, 30.2124, 30.798, 31.367, 31.9728, 32.5944, 33.217, 33.8438, 34.3696, 35.0956, 35.7044, 36.324, 37.0668, 37.6698, 38.3644, 39.049, 39.6918, 40.4146, 41.082, 41.687, 42.5398, 43.2462, 43.857, 44.6606, 45.4168, 46.1248, 46.9222, 47.6804, 48.447, 49.3454, 49.9594, 50.7636, 51.5776, 52.331, 53.19, 53.9676, 54.7564, 55.5314, 56.4442, 57.3708, 57.9774, 58.9624, 59.8796, 60.7 [...]
+};
+
+const double rawEstimateData_precision6[] = {
+    46, 46.1902, 47.271, 47.8358, 48.8142, 49.2854, 50.317, 51.354, 51.8924, 52.9436, 53.4596, 54.5262, 55.6248, 56.1574, 57.2822, 57.837, 58.9636, 60.074, 60.7042, 61.7976, 62.4772, 63.6564, 64.7942, 65.5004, 66.686, 67.291, 68.5672, 69.8556, 70.4982, 71.8204, 72.4252, 73.7744, 75.0786, 75.8344, 77.0294, 77.8098, 79.0794, 80.5732, 81.1878, 82.5648, 83.2902, 84.6784, 85.3352, 86.8946, 88.3712, 89.0852, 90.499, 91.2686, 92.6844, 94.2234, 94.9732, 96.3356, 97.2286, 98.7262, 100.3284, 101.1 [...]
+};
+
+const double rawEstimateData_precision7[] = {
+    92, 93.4934, 94.9758, 96.4574, 97.9718, 99.4954, 101.5302, 103.0756, 104.6374, 106.1782, 107.7888, 109.9522, 111.592, 113.2532, 114.9086, 116.5938, 118.9474, 120.6796, 122.4394, 124.2176, 125.9768, 128.4214, 130.2528, 132.0102, 133.8658, 135.7278, 138.3044, 140.1316, 142.093, 144.0032, 145.9092, 148.6306, 150.5294, 152.5756, 154.6508, 156.662, 159.552, 161.3724, 163.617, 165.5754, 167.7872, 169.8444, 172.7988, 174.8606, 177.2118, 179.3566, 181.4476, 184.5882, 186.6816, 189.0824, 191. [...]
+};
+
+const double rawEstimateData_precision8[] = {
+    184.2152, 187.2454, 190.2096, 193.6652, 196.6312, 199.6822, 203.249, 206.3296, 210.0038, 213.2074, 216.4612, 220.27, 223.5178, 227.4412, 230.8032, 234.1634, 238.1688, 241.6074, 245.6946, 249.2664, 252.8228, 257.0432, 260.6824, 264.9464, 268.6268, 272.2626, 276.8376, 280.4034, 284.8956, 288.8522, 292.7638, 297.3552, 301.3556, 305.7526, 309.9292, 313.8954, 318.8198, 322.7668, 327.298, 331.6688, 335.9466, 340.9746, 345.1672, 349.3474, 354.3028, 358.8912, 364.114, 368.4646, 372.9744, 378 [...]
+};
+
+const double rawEstimateData_precision9[] = {
+    369, 374.8294, 381.2452, 387.6698, 394.1464, 400.2024, 406.8782, 413.6598, 420.462, 427.2826, 433.7102, 440.7416, 447.9366, 455.1046, 462.285, 469.0668, 476.306, 483.8448, 491.301, 498.9886, 506.2422, 513.8138, 521.7074, 529.7428, 537.8402, 545.1664, 553.3534, 561.594, 569.6886, 577.7876, 585.65, 594.228, 602.8036, 611.1666, 620.0818, 628.0824, 637.2574, 646.302, 655.1644, 664.0056, 672.3802, 681.7192, 690.5234, 700.2084, 708.831, 718.485, 728.1112, 737.4764, 746.76, 756.3368, 766.55 [...]
+};
+
+const double rawEstimateData_precision10[] = {
+    738.1256, 750.4234, 763.1064, 775.4732, 788.4636, 801.0644, 814.488, 827.9654, 841.0832, 854.7864, 868.1992, 882.2176, 896.5228, 910.1716, 924.7752, 938.899, 953.6126, 968.6492, 982.9474, 998.5214, 1013.1064, 1028.6364, 1044.2468, 1059.4588, 1075.3832, 1091.0584, 1106.8606, 1123.3868, 1139.5062, 1156.1862, 1172.463, 1189.339, 1206.1936, 1223.1292, 1240.1854, 1257.2908, 1275.3324, 1292.8518, 1310.5204, 1328.4854, 1345.9318, 1364.552, 1381.4658, 1400.4256, 1419.849, 1438.152, 1456.8956 [...]
+};
+
+const double rawEstimateData_precision11[] = {
+    1477, 1501.6014, 1526.5802, 1551.7942, 1577.3042, 1603.2062, 1629.8402, 1656.2292, 1682.9462, 1709.9926, 1737.3026, 1765.4252, 1793.0578, 1821.6092, 1849.626, 1878.5568, 1908.527, 1937.5154, 1967.1874, 1997.3878, 2027.37, 2058.1972, 2089.5728, 2120.1012, 2151.9668, 2183.292, 2216.0772, 2247.8578, 2280.6562, 2313.041, 2345.714, 2380.3112, 2414.1806, 2447.9854, 2481.656, 2516.346, 2551.5154, 2586.8378, 2621.7448, 2656.6722, 2693.5722, 2729.1462, 2765.4124, 2802.8728, 2838.898, 2876.408 [...]
+};
+
+const double rawEstimateData_precision12[] = {
+    2954, 3003.4782, 3053.3568, 3104.3666, 3155.324, 3206.9598, 3259.648, 3312.539, 3366.1474, 3420.2576, 3474.8376, 3530.6076, 3586.451, 3643.38, 3700.4104, 3757.5638, 3815.9676, 3875.193, 3934.838, 3994.8548, 4055.018, 4117.1742, 4178.4482, 4241.1294, 4304.4776, 4367.4044, 4431.8724, 4496.3732, 4561.4304, 4627.5326, 4693.949, 4761.5532, 4828.7256, 4897.6182, 4965.5186, 5034.4528, 5104.865, 5174.7164, 5244.6828, 5316.6708, 5387.8312, 5459.9036, 5532.476, 5604.8652, 5679.6718, 5753.757,  [...]
+};
+
+const double rawEstimateData_precision13[] = {
+    5908.5052, 6007.2672, 6107.347, 6208.5794, 6311.2622, 6414.5514, 6519.3376, 6625.6952, 6732.5988, 6841.3552, 6950.5972, 7061.3082, 7173.5646, 7287.109, 7401.8216, 7516.4344, 7633.3802, 7751.2962, 7870.3784, 7990.292, 8110.79, 8233.4574, 8356.6036, 8482.2712, 8607.7708, 8735.099, 8863.1858, 8993.4746, 9123.8496, 9255.6794, 9388.5448, 9522.7516, 9657.3106, 9792.6094, 9930.5642, 10068.794, 10206.7256, 10347.81, 10490.3196, 10632.0778, 10775.9916, 10920.4662, 11066.124, 11213.073, 11358. [...]
+};
+
+const double rawEstimateData_precision14[] = {
+    11817.475, 12015.0046, 12215.3792, 12417.7504, 12623.1814, 12830.0086, 13040.0072, 13252.503, 13466.178, 13683.2738, 13902.0344, 14123.9798, 14347.394, 14573.7784, 14802.6894, 15033.6824, 15266.9134, 15502.8624, 15741.4944, 15980.7956, 16223.8916, 16468.6316, 16715.733, 16965.5726, 17217.204, 17470.666, 17727.8516, 17986.7886, 18247.6902, 18510.9632, 18775.304, 19044.7486, 19314.4408, 19587.202, 19862.2576, 20135.924, 20417.0324, 20697.9788, 20979.6112, 21265.0274, 21550.723, 21841.6 [...]
+};
+
+const double rawEstimateData_precision15[] = {
+    23635.0036, 24030.8034, 24431.4744, 24837.1524, 25246.7928, 25661.326, 26081.3532, 26505.2806, 26933.9892, 27367.7098, 27805.318, 28248.799, 28696.4382, 29148.8244, 29605.5138, 30066.8668, 30534.2344, 31006.32, 31480.778, 31962.2418, 32447.3324, 32938.0232, 33432.731, 33930.728, 34433.9896, 34944.1402, 35457.5588, 35974.5958, 36497.3296, 37021.9096, 37554.326, 38088.0826, 38628.8816, 39171.3192, 39723.2326, 40274.5554, 40832.3142, 41390.613, 41959.5908, 42532.5466, 43102.0344, 43683. [...]
+};
+
+const double rawEstimateData_precision16[] = {
+    47271, 48062.3584, 48862.7074, 49673.152, 50492.8416, 51322.9514, 52161.03, 53009.407, 53867.6348, 54734.206, 55610.5144, 56496.2096, 57390.795, 58297.268, 59210.6448, 60134.665, 61068.0248, 62010.4472, 62962.5204, 63923.5742, 64895.0194, 65876.4182, 66862.6136, 67862.6968, 68868.8908, 69882.8544, 70911.271, 71944.0924, 72990.0326, 74040.692, 75100.6336, 76174.7826, 77252.5998, 78340.2974, 79438.2572, 80545.4976, 81657.2796, 82784.6336, 83915.515, 85059.7362, 86205.9368, 87364.4424,  [...]
+};
+
+const double rawEstimateData_precision17[] = {
+    94542, 96125.811, 97728.019, 99348.558, 100987.9705, 102646.7565, 104324.5125, 106021.7435, 107736.7865, 109469.272, 111223.9465, 112995.219, 114787.432, 116593.152, 118422.71, 120267.2345, 122134.6765, 124020.937, 125927.2705, 127851.255, 129788.9485, 131751.016, 133726.8225, 135722.592, 137736.789, 139770.568, 141821.518, 143891.343, 145982.1415, 148095.387, 150207.526, 152355.649, 154515.6415, 156696.05, 158887.7575, 161098.159, 163329.852, 165569.053, 167837.4005, 170121.6165, 17 [...]
+};
+
+const double rawEstimateData_precision18[] = {
+    189084, 192250.913, 195456.774, 198696.946, 201977.762, 205294.444, 208651.754, 212042.099, 215472.269, 218941.91, 222443.912, 225996.845, 229568.199, 233193.568, 236844.457, 240543.233, 244279.475, 248044.27, 251854.588, 255693.2, 259583.619, 263494.621, 267445.385, 271454.061, 275468.769, 279549.456, 283646.446, 287788.198, 291966.099, 296181.164, 300431.469, 304718.618, 309024.004, 313393.508, 317760.803, 322209.731, 326675.061, 331160.627, 335654.47, 340241.442, 344841.833, 34946 [...]
+};
+
+
+const double biasData_precision4[] = {
+    10, 9.717, 9.207, 8.7896, 8.2882, 7.8204, 7.3772, 6.9342, 6.5202, 6.161, 5.7722, 5.4636, 5.0396, 4.6766, 4.3566, 4.0454, 3.7936, 3.4856, 3.2666, 2.9946, 2.766, 2.4692, 2.3638, 2.0764, 1.7864, 1.7602, 1.4814, 1.433, 1.2926, 1.0664, 0.999600000000001, 0.7956, 0.5366, 0.589399999999998, 0.573799999999999, 0.269799999999996, 0.368200000000002, 0.0544000000000011, 0.234200000000001, 0.0108000000000033, -0.203400000000002, -0.0701999999999998, -0.129600000000003, -0.364199999999997, -0.480 [...]
+};
+
+const double biasData_precision5[] = {
+    22, 21.1194, 20.8208, 20.2318, 19.77, 19.2436, 18.7774, 18.2848, 17.8224, 17.3742, 16.9336, 16.503, 16.0494, 15.6292, 15.2124, 14.798, 14.367, 13.9728, 13.5944, 13.217, 12.8438, 12.3696, 12.0956, 11.7044, 11.324, 11.0668, 10.6698, 10.3644, 10.049, 9.6918, 9.4146, 9.082, 8.687, 8.5398, 8.2462, 7.857, 7.6606, 7.4168, 7.1248, 6.9222, 6.6804, 6.447, 6.3454, 5.9594, 5.7636, 5.5776, 5.331, 5.19, 4.9676, 4.7564, 4.5314, 4.4442, 4.3708, 3.9774, 3.9624, 3.8796, 3.755, 3.472, 3.2076, 3.1024, 2 [...]
+};
+
+const double biasData_precision6[] = {
+    45, 44.1902, 43.271, 42.8358, 41.8142, 41.2854, 40.317, 39.354, 38.8924, 37.9436, 37.4596, 36.5262, 35.6248, 35.1574, 34.2822, 33.837, 32.9636, 32.074, 31.7042, 30.7976, 30.4772, 29.6564, 28.7942, 28.5004, 27.686, 27.291, 26.5672, 25.8556, 25.4982, 24.8204, 24.4252, 23.7744, 23.0786, 22.8344, 22.0294, 21.8098, 21.0794, 20.5732, 20.1878, 19.5648, 19.2902, 18.6784, 18.3352, 17.8946, 17.3712, 17.0852, 16.499, 16.2686, 15.6844, 15.2234, 14.9732, 14.3356, 14.2286, 13.7262, 13.3284, 13.104 [...]
+};
+
+const double biasData_precision7[] = {
+    91, 89.4934, 87.9758, 86.4574, 84.9718, 83.4954, 81.5302, 80.0756, 78.6374, 77.1782, 75.7888, 73.9522, 72.592, 71.2532, 69.9086, 68.5938, 66.9474, 65.6796, 64.4394, 63.2176, 61.9768, 60.4214, 59.2528, 58.0102, 56.8658, 55.7278, 54.3044, 53.1316, 52.093, 51.0032, 49.9092, 48.6306, 47.5294, 46.5756, 45.6508, 44.662, 43.552, 42.3724, 41.617, 40.5754, 39.7872, 38.8444, 37.7988, 36.8606, 36.2118, 35.3566, 34.4476, 33.5882, 32.6816, 32.0824, 31.0258, 30.6048, 29.4436, 28.7274, 27.957, 27.1 [...]
+};
+
+const double biasData_precision8[] = {
+    183.2152, 180.2454, 177.2096, 173.6652, 170.6312, 167.6822, 164.249, 161.3296, 158.0038, 155.2074, 152.4612, 149.27, 146.5178, 143.4412, 140.8032, 138.1634, 135.1688, 132.6074, 129.6946, 127.2664, 124.8228, 122.0432, 119.6824, 116.9464, 114.6268, 112.2626, 109.8376, 107.4034, 104.8956, 102.8522, 100.7638, 98.3552, 96.3556, 93.7526, 91.9292, 89.8954, 87.8198, 85.7668, 83.298, 81.6688, 79.9466, 77.9746, 76.1672, 74.3474, 72.3028, 70.8912, 69.114, 67.4646, 65.9744, 64.4092, 62.6022, 60. [...]
+};
+
+const double biasData_precision9[] = {
+    368, 361.8294, 355.2452, 348.6698, 342.1464, 336.2024, 329.8782, 323.6598, 317.462, 311.2826, 305.7102, 299.7416, 293.9366, 288.1046, 282.285, 277.0668, 271.306, 265.8448, 260.301, 254.9886, 250.2422, 244.8138, 239.7074, 234.7428, 229.8402, 225.1664, 220.3534, 215.594, 210.6886, 205.7876, 201.65, 197.228, 192.8036, 188.1666, 184.0818, 180.0824, 176.2574, 172.302, 168.1644, 164.0056, 160.3802, 156.7192, 152.5234, 149.2084, 145.831, 142.485, 139.1112, 135.4764, 131.76, 129.3368, 126.55 [...]
+};
+
+const double biasData_precision10[] = {
+    737.1256, 724.4234, 711.1064, 698.4732, 685.4636, 673.0644, 660.488, 647.9654, 636.0832, 623.7864, 612.1992, 600.2176, 588.5228, 577.1716, 565.7752, 554.899, 543.6126, 532.6492, 521.9474, 511.5214, 501.1064, 490.6364, 480.2468, 470.4588, 460.3832, 451.0584, 440.8606, 431.3868, 422.5062, 413.1862, 404.463, 395.339, 386.1936, 378.1292, 369.1854, 361.2908, 353.3324, 344.8518, 337.5204, 329.4854, 321.9318, 314.552, 306.4658, 299.4256, 292.849, 286.152, 278.8956, 271.8792, 265.118, 258.62 [...]
+};
+
+const double biasData_precision11[] = {
+    1476, 1449.6014, 1423.5802, 1397.7942, 1372.3042, 1347.2062, 1321.8402, 1297.2292, 1272.9462, 1248.9926, 1225.3026, 1201.4252, 1178.0578, 1155.6092, 1132.626, 1110.5568, 1088.527, 1066.5154, 1045.1874, 1024.3878, 1003.37, 982.1972, 962.5728, 942.1012, 922.9668, 903.292, 884.0772, 864.8578, 846.6562, 828.041, 809.714, 792.3112, 775.1806, 757.9854, 740.656, 724.346, 707.5154, 691.8378, 675.7448, 659.6722, 645.5722, 630.1462, 614.4124, 600.8728, 585.898, 572.408, 558.4926, 544.4938, 531 [...]
+};
+
+const double biasData_precision12[] = {
+    2953, 2900.4782, 2848.3568, 2796.3666, 2745.324, 2694.9598, 2644.648, 2595.539, 2546.1474, 2498.2576, 2450.8376, 2403.6076, 2357.451, 2311.38, 2266.4104, 2221.5638, 2176.9676, 2134.193, 2090.838, 2048.8548, 2007.018, 1966.1742, 1925.4482, 1885.1294, 1846.4776, 1807.4044, 1768.8724, 1731.3732, 1693.4304, 1657.5326, 1621.949, 1586.5532, 1551.7256, 1517.6182, 1483.5186, 1450.4528, 1417.865, 1385.7164, 1352.6828, 1322.6708, 1291.8312, 1260.9036, 1231.476, 1201.8652, 1173.6718, 1145.757,  [...]
+};
+
+const double biasData_precision13[] = {
+    5907.5052, 5802.2672, 5697.347, 5593.5794, 5491.2622, 5390.5514, 5290.3376, 5191.6952, 5093.5988, 4997.3552, 4902.5972, 4808.3082, 4715.5646, 4624.109, 4533.8216, 4444.4344, 4356.3802, 4269.2962, 4183.3784, 4098.292, 4014.79, 3932.4574, 3850.6036, 3771.2712, 3691.7708, 3615.099, 3538.1858, 3463.4746, 3388.8496, 3315.6794, 3244.5448, 3173.7516, 3103.3106, 3033.6094, 2966.5642, 2900.794, 2833.7256, 2769.81, 2707.3196, 2644.0778, 2583.9916, 2523.4662, 2464.124, 2406.073, 2347.0362, 2292 [...]
+};
+
+const double biasData_precision14[] = {
+    11816.475, 11605.0046, 11395.3792, 11188.7504, 10984.1814, 10782.0086, 10582.0072, 10384.503, 10189.178, 9996.2738, 9806.0344, 9617.9798, 9431.394, 9248.7784, 9067.6894, 8889.6824, 8712.9134, 8538.8624, 8368.4944, 8197.7956, 8031.8916, 7866.6316, 7703.733, 7544.5726, 7386.204, 7230.666, 7077.8516, 6926.7886, 6778.6902, 6631.9632, 6487.304, 6346.7486, 6206.4408, 6070.202, 5935.2576, 5799.924, 5671.0324, 5541.9788, 5414.6112, 5290.0274, 5166.723, 5047.6906, 4929.162, 4815.1406, 4699.12 [...]
+};
+
+const double biasData_precision15[] = {
+    23634.0036, 23210.8034, 22792.4744, 22379.1524, 21969.7928, 21565.326, 21165.3532, 20770.2806, 20379.9892, 19994.7098, 19613.318, 19236.799, 18865.4382, 18498.8244, 18136.5138, 17778.8668, 17426.2344, 17079.32, 16734.778, 16397.2418, 16063.3324, 15734.0232, 15409.731, 15088.728, 14772.9896, 14464.1402, 14157.5588, 13855.5958, 13559.3296, 13264.9096, 12978.326, 12692.0826, 12413.8816, 12137.3192, 11870.2326, 11602.5554, 11340.3142, 11079.613, 10829.5908, 10583.5466, 10334.0344, 10095. [...]
+};
+
+const double biasData_precision16[] = {
+    47270, 46423.3584, 45585.7074, 44757.152, 43938.8416, 43130.9514, 42330.03, 41540.407, 40759.6348, 39988.206, 39226.5144, 38473.2096, 37729.795, 36997.268, 36272.6448, 35558.665, 34853.0248, 34157.4472, 33470.5204, 32793.5742, 32127.0194, 31469.4182, 30817.6136, 30178.6968, 29546.8908, 28922.8544, 28312.271, 27707.0924, 27114.0326, 26526.692, 25948.6336, 25383.7826, 24823.5998, 24272.2974, 23732.2572, 23201.4976, 22674.2796, 22163.6336, 21656.515, 21161.7362, 20669.9368, 20189.4424,  [...]
+};
+
+const double biasData_precision17[] = {
+    94541, 92848.811, 91174.019, 89517.558, 87879.9705, 86262.7565, 84663.5125, 83083.7435, 81521.7865, 79977.272, 78455.9465, 76950.219, 75465.432, 73994.152, 72546.71, 71115.2345, 69705.6765, 68314.937, 66944.2705, 65591.255, 64252.9485, 62938.016, 61636.8225, 60355.592, 59092.789, 57850.568, 56624.518, 55417.343, 54231.1415, 53067.387, 51903.526, 50774.649, 49657.6415, 48561.05, 47475.7575, 46410.159, 45364.852, 44327.053, 43318.4005, 42325.6165, 41348.4595, 40383.6265, 39436.77, 3850 [...]
+};
+
+const double biasData_precision18[] = {
+    189083, 185696.913, 182348.774, 179035.946, 175762.762, 172526.444, 169329.754, 166166.099, 163043.269, 159958.91, 156907.912, 153906.845, 150924.199, 147996.568, 145093.457, 142239.233, 139421.475, 136632.27, 133889.588, 131174.2, 128511.619, 125868.621, 123265.385, 120721.061, 118181.769, 115709.456, 113252.446, 110840.198, 108465.099, 106126.164, 103823.469, 101556.618, 99308.004, 97124.508, 94937.803, 92833.731, 90745.061, 88677.627, 86617.47, 84650.442, 82697.833, 80769.132, 788 [...]
+};
+
+
+#endif /* HYPERLOGLOGBIAS_H_ */
diff --git a/hyperloglogplus.h b/hyperloglogplus.h
new file mode 100644
index 0000000..33f5dc1
--- /dev/null
+++ b/hyperloglogplus.h
@@ -0,0 +1,623 @@
+/*
+ * hyperloglogplus.h
+ *
+ * Implementation of HyperLogLog++ algorithm described by Stefan Heule et al.
+ *
+ *  Created on: Apr 25, 2015
+ *      Author: fbreitwieser
+ */
+
#ifndef HYPERLOGLOGPLUS_H_
#define HYPERLOGLOGPLUS_H_

#include<set>
#include<vector>
#include<stdexcept>
#include<iostream>
#include<fstream>
#include<math.h>    //log
#include<algorithm> //vector.count
#include<bitset>

#include "hyperloglogbias.h"
#include "third_party/MurmurHash3.cpp"
#include "assert_helpers.h"

// NOTE(review): 'using namespace std' in a header leaks into every
// includer; kept as-is to preserve existing translation units.
using namespace std;

//#define HLL_DEBUG
//#define NDEBUG
//#define NDEBUG2
// NOTE(review): despite its name, arr_len(a) does NOT expand to the
// element count -- it expands to a pointer one-past-the-end of array a
// (begin + count), i.e. an end iterator.  Presumably used that way with
// the rawEstimateData/biasData tables -- TODO confirm at call sites.
#define arr_len(a) (a + sizeof a / sizeof a[0])

// experimentally determined threshold values for  p - 4
// (threshold[p-4] is used to decide between the linear-counting and the
// bias-corrected raw estimate; see Heule et al., "HyperLogLog in Practice")
static const uint32_t threshold[] = {10, 20, 40, 80, 220, 400, 900, 1800, 3100,
							  6500, 11500, 20000, 50000, 120000, 350000};
+
+
+///////////////////////
+
+//
/**
 * Linear-counting cardinality estimate for m bins, v of which are empty:
 * m * ln(m / v).
 *
 * @param m number of bins in the matrix
 * @param v number of empty (zero-valued) bins -- NOTE(review): the
 *          original comment said "non-zero", but m*ln(m/v) is the
 *          standard linear-counting estimator over the count of ZERO
 *          registers; TODO confirm against callers
 * @return estimated cardinality
 * @throws std::invalid_argument if v > m or v == 0
 */
double linearCounting(uint32_t m, uint32_t v) {
	if (v > m) {
	    throw std::invalid_argument("number of v should not be greater than m");
	}
	if (v == 0) {
		// log(m/0) would silently yield +inf; surface the misuse instead
		throw std::invalid_argument("number of v should be greater than 0");
	}
	double fm = double(m);
	return fm * std::log(fm/double(v));
}
+
/**
 * 64-bit integer hash from Numerical Recipes, 3rd Edition, p. 352.
 * Two xorshift rounds interleaved with multiplications by large odd
 * constants.
 *
 * @param u value to hash
 * @return 64-bit hash of u
 */
inline uint64_t ranhash (uint64_t u) {
  uint64_t h = u * 3935559000370003845 + 2691343689449507681;

  h ^= h >> 21;
  h ^= h << 37;
  h ^= h >>  4;

  h *= 4768777513237032717;

  h ^= h << 20;
  h ^= h >> 41;
  h ^= h <<  5;

  return h;
}
+
/**
 * MurmurHash3 64-bit finalizer (fmix64) applied to key + 1.
 * The +1 offset prevents the input 0 from hashing to 0.
 *
 * @param key value to hash
 * @return finalized 64-bit hash
 */
inline uint64_t murmurhash3_finalizer (uint64_t key)  {
	uint64_t h = key + 1; // avoid the 0 -> 0 fixed point
	h = (h ^ (h >> 33)) * 0xff51afd7ed558ccd;
	h = (h ^ (h >> 33)) * 0xc4ceb9fe1a85ec53;
	return h ^ (h >> 33);
}
+
+/**
+ * Bias correction factors for specific m's
+ * @param m
+ * @return
+ */
+double alpha(uint32_t m)  {
+	switch (m) {
+	case 16: return 0.673;
+	case 32: return 0.697;
+	case 64: return 0.709;
+	}
+
+	// m >= 128
+	return 0.7213 / (1 + 1.079/double(m));
+}
+
+/**
+ * calculate the raw estimate as harmonic mean of the ranks in the register
+ * @param array
+ * @return
+ */
+double calculateEstimate(vector<uint8_t> array) {
+	double inverseSum = 0.0;
+	for (size_t i = 0; i < array.size(); ++i) {
+		// TODO: pre-calculate the power calculation
+		inverseSum += pow(2,-array[i]);
+	}
+	return alpha(array.size()) * double(array.size() * array.size()) * 1 / inverseSum;
+}
+
/**
 * Count the zero-valued registers in s.
 * Takes the vector by const reference (the original copied it on
 * every call).
 *
 * @param s register vector
 * @return number of entries equal to 0
 */
uint32_t countZeros(const std::vector<uint8_t>& s) {
	return (uint32_t)std::count(s.begin(), s.end(), 0);
}
+
/**
 * Extract the bit field [lo, hi) of value, LSB-0 numbering (bit hi
 * itself is excluded; the field is hi-lo bits wide starting at lo).
 *
 * @param bits       source value
 * @param hi         one past the highest bit of the field
 * @param lo         lowest bit of the field (included)
 * @param shift_left if false, return the field right-aligned (shifted
 *                   down by lo); if true, return it left-aligned at
 *                   the top of T
 * @return the extracted field
 */
template<typename T>
T extractBits(T value, uint8_t hi, uint8_t lo, bool shift_left = false) {
	// ((T(1) << (hi - lo)) - 1) is a run of hi-lo one-bits; shifting it
	// up by lo places the run at positions [lo, hi).  Using T(1) keeps
	// the shift in the destination type (no 32-bit overflow).
	// TODO: consider creating the bitmask only once up front.
	const T field = value & ((((T)1 << (hi - lo)) - 1) << lo);

	if (shift_left) {
		// align the field with the most significant end of T
		return field << (sizeof(T) * 8 - hi);
	}
	// align the field with the least significant end of T
	return field >> lo;
}
+
/**
 * Keep only the top `hi` bits of `bits` (LSB-0 numbering).  The result
 * is NOT shifted down -- the surviving bits stay in place at the most
 * significant end of T.
 *
 * @param bits source value
 * @param hi   how many of the topmost bits to keep
 * @return bits with everything below the top `hi` bits cleared
 */
template<typename T>
T extractBits(T bits, uint8_t hi) {
	// T(-1) is all ones; shifting left leaves exactly `hi` ones at the top
	const T mask = (T)(-1) << (sizeof(T) * 8 - hi);
	return bits & mask;
}
+
// functions for counting the number of leading 0-bits (clz)
//           and counting the number of trailing 0-bits (ctz)
//#ifdef __GNUC__

// TODO: switch between builtin clz and 64_clz based on architecture
//#define clz(x) __builtin_clz(x)
#if 0
// Portable software fallback, kept for reference only -- compiled out
// via '#if 0' above; the active implementations below use the GCC/Clang
// __builtin_clz intrinsic instead.
static int clz_manual(uint64_t x)
{
  // This uses a binary search (counting down) algorithm from Hacker's Delight.
   uint64_t y;
   int n = 64;
   y = x >>32;  if (y != 0) {n -= 32;  x = y;}
   y = x >>16;  if (y != 0) {n -= 16;  x = y;}
   y = x >> 8;  if (y != 0) {n -=  8;  x = y;}
   y = x >> 4;  if (y != 0) {n -=  4;  x = y;}
   y = x >> 2;  if (y != 0) {n -=  2;  x = y;}
   y = x >> 1;  if (y != 0) return n - 2;
   return n - x;
}
#endif
+
/**
 * Count leading zero bits of a 32-bit value.
 *
 * __builtin_clz(0) is undefined behavior, so zero is handled
 * explicitly and returns the full width (32) -- the same convention
 * the 64-bit overload already uses for an all-zero word.
 *
 * @param x value to examine
 * @return number of leading zero bits (32 for x == 0)
 */
inline uint32_t clz(const uint32_t x) {
	if (x == 0) return 32; // __builtin_clz(0) is UB
	return __builtin_clz(x);
}
+
/**
 * Count leading zero bits of a 64-bit value using two 32-bit halves
 * (each all-zero half contributes 32, so clz(0) == 64).
 *
 * @param x value to examine
 * @return number of leading zero bits (64 for x == 0)
 */
inline uint32_t clz(const uint64_t x) {
    const uint32_t hi32 = (uint32_t)(x >> 32);
    if (hi32 != 0) {
        // leading zeros are confined to the high half
        return __builtin_clz(hi32);
    }
    const uint32_t lo32 = (uint32_t)(x & 0xFFFFFFFFUL);
    return 32 + (lo32 ? (uint32_t)__builtin_clz(lo32) : 32);
}
+//#else
+
/**
 * Floating-point fallback for counting leading zeros:
 * 63 - floor(log2(w)).  Only meaningful for w > 0 (log2(0) is -inf).
 *
 * @param w value to examine (must be nonzero)
 * @return number of leading zero bits of w
 */
uint32_t clz_log2(const uint64_t w) {
	const double lg = std::floor(std::log2((double)w));
	return (uint32_t)(63 - lg);
}
+//#endif
+
+
// TODO: the sparse list may be encoded with variable length encoding
//   see Heule et al., section 5.3.2
// Also, using sets might give a larger overhead as each insertion costs more
//  consider using vector and sort/unique when merging.
typedef set<uint32_t> SparseListType; // holds the 32-bit encoded hashes in sparse mode
typedef uint64_t HashSize;            // width of the full (un-encoded) hash value

/**
 * HyperLogLogPlusMinus class
 * typename T corresponds to the hash size - usually either uint32_t or uint64_t (implemented for uint64_t)
 */

// NOTE(review): this file-scope typedef is immediately shadowed by the
// template parameter of the same name on HyperLogLogPlusMinus declared
// right below; it acts only as a documentation hint, not a binding.
typedef uint64_t T_KEY;
+template <typename T_KEY>
+class HyperLogLogPlusMinus {
+
+private:
+
+	vector<uint8_t> M;  // registers (M) of size m
+	uint8_t p;            // precision
+	uint32_t m;           // number of registers
+	bool sparse;          // sparse representation of the data?
+	SparseListType sparseList; // TODO: use a compressed list instead
+
+	// vectors containing data for bias correction
+	vector<vector<double> > rawEstimateData; // TODO: make this static
+	vector<vector<double> > biasData;
+
+	// sparse versions of p and m
+	static const uint8_t  pPrime = 25; // precision when using a sparse representation
+	                                   // fixed to 25, because 25 + 6 bits for rank + 1 flag bit = 32
+	static const uint32_t mPrime = 1 << (pPrime -1); // 2^pPrime
+
+
+public:
+
+	~HyperLogLogPlusMinus() {};
+
+	/**
+	 * Create new HyperLogLogPlusMinus counter
+	 * @param precision
+	 * @param sparse
+	 */
+	HyperLogLogPlusMinus(uint8_t precision=10, bool sparse=true):p(precision),sparse(sparse) {
+		if (precision > 18 || precision < 4) {
+	        throw std::invalid_argument("precision (number of register = 2^precision) must be between 4 and 18");
+		}
+
+		this->m = 1 << precision;
+
+		if (sparse) {
+			this->sparseList = SparseListType(); // TODO: if SparseListType is changed, initialize with appropriate size
+		} else {
+			this->M = vector<uint8_t>(m);
+		}
+	}
+
+	/**
+	 * Add a new item to the counter.
+	 * @param item
+	 */
+	void add(T_KEY item) {
+		add(item, sizeof(T_KEY));
+	}
+
+	/**
+	 * Add a new item to the counter.
+	 * @param item
+	 * @param size  size of item
+	 */
+	void add(T_KEY item, size_t size) {
+
+		// compute hash for item
+		HashSize hash_value = murmurhash3_finalizer(item);
+
+#ifdef HLL_DEBUG
+		cerr << "Value: " << item << "; hash(value): " << hash_value << endl;
+		cerr << bitset<64>(hash_value) << endl;
+#endif
+
+		if (sparse) {
+			// sparse mode: put the encoded hash into sparse list
+			uint32_t encoded_hash_value = encodeHashIn32Bit(hash_value);
+			this->sparseList.insert(encoded_hash_value);
+
+#ifdef HLL_DEBUG
+			idx_n_rank ir = getIndexAndRankFromEncodedHash(encoded_hash_value);
+			assert_eq(ir.idx,get_index(hash_value, p));
+			assert_eq(ir.rank, get_rank(hash_value, p));
+#endif
+
+			// if the sparseList is too large, switch to normal (register) representation
+			if (this->sparseList.size() > this->m) { // TODO: is the size of m correct?
+				switchToNormalRepresentation();
+			}
+		} else {
+			// normal mode
+			// take first p bits as index  {x63,...,x64-p}
+			uint32_t idx = get_index(hash_value, p);
+			// shift those p values off, and count leading zeros of the remaining string {x63-p,...,x0}
+			uint8_t rank = get_rank(hash_value, p);
+
+			// update the register if current rank is bigger
+			if (rank > this->M[idx]) {
+				this->M[idx] = rank;
+			}
+		}
+	}
+
+	void add(vector<T_KEY> words) {
+		for(size_t i = 0; i < words.size(); ++i) {
+			this->add(words[i]);
+		}
+	}
+
+	/**
+	 * Reset to its initial state.
+	 */
+	void reset() {
+		this->sparse = true;
+		this->sparseList.clear();  // 
+		this->M.clear();
+	}
+
+	/**
+	 * Convert from sparse representation (using tmpSet and sparseList) to normal (using register)
+	 */
+	void switchToNormalRepresentation() {
+#ifdef HLL_DEBUG
+		cerr << "switching to normal representation" << endl;
+		cerr << " est before: " << cardinality(true) << endl;
+#endif
+		this->sparse = false;
+		this->M = vector<uint8_t>(this->m);
+		if (sparseList.size() > 0) { //TDOD: do I need to check this, here?
+			addToRegisters(this->sparseList);
+			this->sparseList.clear();
+		}
+#ifdef HLL_DEBUG
+		cerr << " est after: " << cardinality(true) << endl;
+#endif
+	}
+
+	/**
+	 * add sparseList to the registers of M
+	 */
+	void addToRegisters(const SparseListType &sparseList) {
+		if (sparseList.size() == 0) {
+			return;
+		}
+		for (SparseListType::const_iterator encoded_hash_value_ptr = sparseList.begin(); encoded_hash_value_ptr != sparseList.end(); ++encoded_hash_value_ptr) {
+
+			idx_n_rank ir = getIndexAndRankFromEncodedHash(*encoded_hash_value_ptr);
+
+			assert_lt(ir.idx,M.size());
+			if (ir.rank > this->M[ir.idx]) {
+				this->M[ir.idx] = ir.rank;
+			}
+		}
+	}
+
+	/**
+	 * Merge another HyperLogLogPlusMinus into this. Converts to normal representation
+	 * @param other
+	 */
+	void merge(const HyperLogLogPlusMinus* other) {
+		if (this->p != other->p) {
+			throw std::invalid_argument("precisions must be equal");
+		}
+
+		if (this->sparse && other->sparse) {
+			if (this->sparseList.size()+other->sparseList.size() > this->m) {
+				switchToNormalRepresentation();
+				addToRegisters(other->sparseList);
+			} else {
+				this->sparseList.insert(other->sparseList.begin(),other->sparseList.end());
+			}
+		} else if (other->sparse) {
+			// other is sparse, but this is not
+			addToRegisters(other->sparseList);
+		} else {
+			if (this->sparse) {
+				switchToNormalRepresentation();
+			}
+
+			// merge registers
+			for (size_t i = 0; i < other->M.size(); ++i) {
+				if (other->M[i] > this->M[i]) {
+					this->M[i] = other->M[i];
+				}
+			}
+		}
+	}
+
+	/**
+	 *
+	 * @return cardinality estimate
+	 */
+	uint64_t cardinality(bool verbose=true) {
+		if (sparse) {
+			// if we are still 'sparse', then use linear counting, which is more
+			//  accurate for low cardinalities, and use increased precision pPrime
+			return uint64_t(linearCounting(mPrime, mPrime-uint32_t(sparseList.size())));
+		}
+
+		// initialize bias correction data
+		if (rawEstimateData.empty()) { initRawEstimateData(); }
+		if (biasData.empty())        { initBiasData(); }
+
+		// calculate raw estimate on registers
+		//double est = alpha(m) * harmonicMean(M, m);
+		double est = calculateEstimate(M);
+
+		// correct for biases if estimate is smaller than 5m
+		if (est <= double(m)*5.0) {
+			est -= getEstimateBias(est);
+		}
+
+		uint32_t v = countZeros(M);
+		if (v > 2) {
+			// calculate linear counting (lc) estimate if there are more than 2 zeros in the matrix
+			double lc_estimate = linearCounting(m, v);
+
+			// check if the lc estimate is below the threshold
+			if (lc_estimate <= double(threshold[p-4])) {
+				if (lc_estimate < 0) { throw; }
+				// return lc estimate of cardinality
+				return lc_estimate;
+			}
+			return lc_estimate; // always use lc_estimate when available
+		}
+
+		// return bias-corrected hyperloglog estimate of cardinality
+		return uint64_t(est);
+	}
+
+private:
+
+    uint8_t rank(HashSize x, uint8_t b) {
+        uint8_t v = 1;
+        while (v <= b && !(x & 0x80000000)) {
+            v++;
+            x <<= 1;
+        }
+        return v;
+    }
+
+    template<typename T> inline uint32_t get_index(const T hash_value, const uint8_t p, const uint8_t size) const {
+    	// take first p bits as index  {x63,...,x64-p}
+    	assert_lt(p,size);
+    	uint32_t idx = hash_value >> (size - p);
+    	return idx;
+    }
+
+    inline uint32_t get_index(const uint64_t hash_value, const uint8_t p) const {
+        return get_index(hash_value, p, 64);
+    }
+
+    inline uint32_t get_index(const uint32_t hash_value, const uint8_t p) const {
+    	return get_index(hash_value, p, 32);
+    }
+
+    template<typename T> inline
+	T get_trailing_ones(const uint8_t p) const {
+    	return (T(1) << p ) - 1;
+    }
+
+    template<typename T> inline
+    uint8_t get_rank(const T hash_value, const uint8_t p) const {
+    	// shift p values off, and count leading zeros of the remaining string {x63-p,...,x0}
+    	T_KEY rank_bits = (hash_value << p | get_trailing_ones<T>(p));
+#ifdef HLL_DEBUG
+    	cerr << "rank bits: " << bitset<32>(rank_bits) << endl;
+#endif
+
+    	uint8_t rank_val = (uint8_t) (clz(rank_bits)) + 1;
+    	assert_leq(rank_val,64-p+1);
+    	return rank_val;
+    }
+
+	void initRawEstimateData() {
+	    rawEstimateData = vector<vector<double> >();
+
+	    rawEstimateData.push_back(vector<double>(rawEstimateData_precision4,arr_len(rawEstimateData_precision4)));
+	    rawEstimateData.push_back(vector<double>(rawEstimateData_precision5,arr_len(rawEstimateData_precision5)));
+	    rawEstimateData.push_back(vector<double>(rawEstimateData_precision6,arr_len(rawEstimateData_precision6)));
+	    rawEstimateData.push_back(vector<double>(rawEstimateData_precision7,arr_len(rawEstimateData_precision7)));
+	    rawEstimateData.push_back(vector<double>(rawEstimateData_precision8,arr_len(rawEstimateData_precision8)));
+	    rawEstimateData.push_back(vector<double>(rawEstimateData_precision9,arr_len(rawEstimateData_precision9)));
+	    rawEstimateData.push_back(vector<double>(rawEstimateData_precision10,arr_len(rawEstimateData_precision10)));
+	    rawEstimateData.push_back(vector<double>(rawEstimateData_precision11,arr_len(rawEstimateData_precision11)));
+	    rawEstimateData.push_back(vector<double>(rawEstimateData_precision12,arr_len(rawEstimateData_precision12)));
+	    rawEstimateData.push_back(vector<double>(rawEstimateData_precision13,arr_len(rawEstimateData_precision13)));
+	    rawEstimateData.push_back(vector<double>(rawEstimateData_precision14,arr_len(rawEstimateData_precision14)));
+	    rawEstimateData.push_back(vector<double>(rawEstimateData_precision15,arr_len(rawEstimateData_precision15)));
+	    rawEstimateData.push_back(vector<double>(rawEstimateData_precision16,arr_len(rawEstimateData_precision16)));
+	    rawEstimateData.push_back(vector<double>(rawEstimateData_precision17,arr_len(rawEstimateData_precision17)));
+	    rawEstimateData.push_back(vector<double>(rawEstimateData_precision18,arr_len(rawEstimateData_precision18)));
+
+	}
+
+	void initBiasData() {
+		biasData = vector<vector<double> >();
+
+		biasData.push_back(vector<double>(biasData_precision4,arr_len(biasData_precision4)));
+		biasData.push_back(vector<double>(biasData_precision5,arr_len(biasData_precision5)));
+		biasData.push_back(vector<double>(biasData_precision6,arr_len(biasData_precision6)));
+		biasData.push_back(vector<double>(biasData_precision7,arr_len(biasData_precision7)));
+		biasData.push_back(vector<double>(biasData_precision8,arr_len(biasData_precision8)));
+		biasData.push_back(vector<double>(biasData_precision9,arr_len(biasData_precision9)));
+		biasData.push_back(vector<double>(biasData_precision10,arr_len(biasData_precision10)));
+		biasData.push_back(vector<double>(biasData_precision11,arr_len(biasData_precision11)));
+		biasData.push_back(vector<double>(biasData_precision12,arr_len(biasData_precision12)));
+		biasData.push_back(vector<double>(biasData_precision13,arr_len(biasData_precision13)));
+		biasData.push_back(vector<double>(biasData_precision14,arr_len(biasData_precision14)));
+		biasData.push_back(vector<double>(biasData_precision15,arr_len(biasData_precision15)));
+		biasData.push_back(vector<double>(biasData_precision16,arr_len(biasData_precision16)));
+		biasData.push_back(vector<double>(biasData_precision17,arr_len(biasData_precision17)));
+		biasData.push_back(vector<double>(biasData_precision18,arr_len(biasData_precision18)));
+	}
+
+	/**
+	 * Estimate the bias using empirically determined values.
+	 * Uses weighted average of the two cells between which the estimate falls.
+	 * TODO: Check if nearest neighbor average gives better values, as proposed in the paper
+	 * @param est
+	 * @return correction value for
+	 */
+	double getEstimateBias(double estimate) {
+		vector<double> rawEstimateTable = rawEstimateData[p-4];
+		vector<double> biasTable = biasData[p-4];
+	
+		// check if estimate is lower than first entry, or larger than last
+		if (rawEstimateTable.front() >= estimate) { return rawEstimateTable.front() - biasTable.front(); }
+		if (rawEstimateTable.back()  <= estimate) { return rawEstimateTable.back() - biasTable.back(); }
+	
+		// get iterator to first element that is not smaller than estimate
+		vector<double>::const_iterator it = lower_bound(rawEstimateTable.begin(),rawEstimateTable.end(),estimate);
+		size_t pos = it - rawEstimateTable.begin();
+
+		double e1 = rawEstimateTable[pos-1];
+		double e2 = rawEstimateTable[pos];
+	
+		double c = (estimate - e1) / (e2 - e1);
+
+		return biasTable[pos-1]*(1-c) + biasTable[pos]*c;
+	}
+	
+
+	/**
+	 * Encode the 64-bit hash code x as an 32-bit integer, to be used in the sparse representation.
+	 *
+	 * Difference from the algorithm described in the paper:
+	 * The index always is in the p most significant bits
+	 *
+	 * see section 5.3 in Heule et al.
+	 * @param x the hash bits
+	 * @return encoded hash value
+	 */
+	uint32_t encodeHashIn32Bit(uint64_t hash_value) {
+		// extract first pPrime bits, and shift them onto a 32-bit integer
+		uint32_t idx = (uint32_t)(extractBits(hash_value,pPrime) >> 32);
+
+#ifdef HLL_DEBUG
+		cerr << "value:  " << bitset<64>(hash_value) << endl;
+        cerr << "index: " << std::bitset<32>(idx) << " ( bits from 64 to " << 64-pPrime << "; " << idx << ")" << endl;
+#endif
+
+		// are the bits {63-p, ..., 63-p'} all 0?
+		if (extractBits(hash_value, 64-this->p, 64-pPrime) == 0) {
+			// compute the additional rank (minimum rank is already p'-p)
+			// the maximal size will be below 2^6=64. We thus combine the 25 bits of the index with 6 bits for the rank, and one bit as flag
+			uint8_t additional_rank = get_rank(hash_value, pPrime); // this is rank - (p'-p), as we know that positions p'...p are 0
+			return idx | uint32_t(additional_rank<<1) | 1;
+		} else {
+			// else, return the idx, only - it has enough length to calculate the rank (left-shifted, last bit = 0)
+			assert_eq((idx & 1),0);
+			return idx;
+		}
+	}
+
+
+	/**
+	 * struct holding the index and rank/rho of an entry
+	 */
+	struct idx_n_rank {
+		uint32_t idx;
+		uint8_t rank;
+		idx_n_rank(uint32_t _idx, uint8_t _rank) : idx(_idx), rank(_rank) {}
+	};
+
+	//
+	//
+	/**
+	 * Decode a hash from the sparse representation.
+	 * Returns the index and number of leading zeros (nlz) with precision p stored in k
+	 * @param k the hash bits
+	 * @return index and rank in non-sparse format
+	 */
+	idx_n_rank getIndexAndRankFromEncodedHash(const uint32_t encoded_hash_value) const  {
+
+		// difference to paper: Index can be recovered in the same way for pPrime and normally encoded hashes
+		uint32_t idx = get_index(encoded_hash_value, p);
+		uint8_t rank_val;
+
+		// check if the last bit is 1
+		if ( (encoded_hash_value & 1) == 1) {
+			// if yes: the hash was stored with higher precision, bits p to pPrime were 0
+			uint8_t additional_rank = pPrime - p;
+			rank_val = additional_rank + extractBits(encoded_hash_value, 7, 1);
+		} else {
+			rank_val = get_rank(encoded_hash_value,p);
+
+			// clz counts 64 bit only, it seems
+			if (rank_val > 32)
+				rank_val -= 32;
+		}
+
+		return(idx_n_rank(idx,rank_val));
+	}
+
+};
+
+
+
+
+#endif /* HYPERLOGLOGPLUS_H_ */
diff --git a/indices/Makefile b/indices/Makefile
new file mode 100644
index 0000000..f120e55
--- /dev/null
+++ b/indices/Makefile
@@ -0,0 +1,321 @@
+#
+# Makefile
+# fbreitwieser, 2016-01-29 13:00
+#
+
+SHELL := /bin/bash
+
+THREADS?=1
+KEEP_FILES?=0
+
+get_ref_file_names = $(addprefix $(REFERENCE_SEQUENCES_DIR)/, $(addsuffix $(1), \
+	$(addprefix all-,$(COMPLETE_GENOMES)) \
+	$(addprefix all-,$(addsuffix -chromosome_level,$(CHROMOSOME_LEVEL_GENOMES))) \
+	$(addprefix mammalian-reference-,$(MAMMALIAN_TAXIDS)) \
+	$(addprefix all-compressed-,$(COMPLETE_GENOMES_COMPRESSED)) \
+	$(if $(INCLUDE_CONTAMINANTS),contaminants)))
+
+DL_DIR=downloaded-seq
+TMP_DIR?=tmp_$(IDX_NAME)
+TAXID_SUFFIX:=.map
+REFERENCE_SEQUENCES_DIR:=reference-sequences
+
+.PHONY: index index-name index-size .path-ok .dustmasker-ok
+
+define USAGE
+
+Makefile to create common indices to use with Centrifuge.
+
+  make [OPTIONS] TARGET
+
+OPTIONS:
+    THREADS=n          Number of threads for downloading, compression and
+                       index building
+
+STANDARD TARGETS:
+
+    b_compressed        Download all bacteria genomes from RefSeq,
+                        and compresses them at the species level
+
+    b_compressed+h+v    b_compressed + human genome and transcripts,
+                        contaminant sequences from UniVec and EmVec,
+                        and all viral genomes
+
+    b+h+v               As above, but with uncompressed bacterial genomes
+
+Alternatively, a IDX_NAME and one or more genomes may be specified as
+options to build a custom database.
+
+EXTENDED OPTIONS:
+	COMPLETE_GENOMES=s
+	COMPLETE_GENOMES_COMPRESSED=s
+	MAMMALIAN_TAXIDS=i
+	INCLUDE_CONTAMINANTS=1
+	DONT_DUSTMASK=1
+	IDX_NAME=s
+
+EXAMPLES:
+	# Make an index with all complete bacterial and archaeal genomes, and compress
+	# the bacterial genomes to the species level
+	make b_compressed
+
+	# same as:
+	make COMPLETE_GENOMES=archaea COMPLETE_GENOMES_COMPRESSED=bacteria IDX_NAME=b_compressed
+	
+	# Make an index with just the human genome
+	make IDX_NAME=h MAMMALIAN_TAXIDS=9606
+
+	# All archaeal genomes and contaminant sequences from UniVec and EmVec
+	make IDX_NAME=a COMPLETE_GENOMES=archaea  INCLUDE_CONTAMINANTS=1 
+
+endef
+export USAGE
+
+###################################################################################################
+ifndef IDX_NAME
+
+all: 
+	@echo "$$USAGE"
+
+IDX_NAME?=$(shell basename $(shell dirname $(abspath $(lastword $(MAKEFILE_LIST)))))
+
+INDICES=b+h+v b_compressed b_compressed+h+v refseq_microbial refseq_full nt
+
+b+h+v: export COMPLETE_GENOMES:=archaea bacteria viral
+b+h+v: export MAMMALIAN_TAXIDS:=9606
+b+h+v: export INCLUDE_CONTAMINANTS:=1
+b+h+v: export IDX_NAME:=b+h+v
+
+b_compressed: export COMPLETE_GENOMES:=
+b_compressed: export COMPLETE_GENOMES_COMPRESSED:=archaea bacteria
+b_compressed: export IDX_NAME:=b_compressed
+
+b_compressed+h+v: export COMPLETE_GENOMES:=viral
+b_compressed+h+v: export COMPLETE_GENOMES_COMPRESSED:=archaea bacteria
+b_compressed+h+v: export MAMMALIAN_TAXIDS:=9606
+b_compressed+h+v: export INCLUDE_CONTAMINANTS:=1
+b_compressed+h+v: export IDX_NAME:=b_compressed+h+v
+
+refseq_microbial: export COMPLETE_GENOMES:=archaea bacteria fungi protozoa viral
+refseq_microbial: export CHROMOSOME_LEVEL_GENOMES:=$(COMPLETE_GENOMES)
+##refseq_microbial: export SMALL_GENOMES:=mitochondrion plasmid plastid # TODO
+refseq_microbial: export MAMMALIAN_TAXIDS:=9606 10090
+refseq_microbial: export INCLUDE_CONTAMINANTS:=1
+refseq_microbial: export IDX_NAME:=refseq_microbial
+refseq_microbial: export CF_BUILD_OPTS+=--ftabchars 14
+
+refseq_full: export COMPLETE_GENOMES:=archaea bacteria fungi invertebrate plant protozoa vertebrate_mammalian vertebrate_other viral
+refseq_full: export CHROMOSOME_LEVEL_GENOMES:=$(COMPLETE_GENOMES)
+refseq_full: export SMALL_GENOMES:=mitochondrion plasmid plastid
+refseq_full: export MAMMALIAN_TAXIDS:=9606 10090
+refseq_full: export INCLUDE_CONTAMINANTS:=1
+refseq_full: export IDX_NAME:=refseq_full
+
+
+nt: export IDX_NAME:=nt
+
+$(INDICES):
+	@echo Making: $@: $(IDX_NAME)
+	$(MAKE) -f $(THIS_FILE) IDX_NAME=$(IDX_NAME)
+
+####################################################################################################
+else ## IDX_NAME is defined
+
+DONT_DUSTMASK=
+TAXONOMY_DOWNLOAD_OPTS?=
+REFERENCE_SEQUENCES=$(call get_ref_file_names,.fna)
+TAXID_MAPS=$(call get_ref_file_names,$(TAXID_SUFFIX))
+CF_BUILD_OPTS?= 
+
+ifeq (nt,$(IDX_NAME))
+REFERENCE_SEQUENCES+=nt-sorted.fna
+TAXID_MAPS+=nt.map
+TAXONOMY_DOWNLOAD_OPTS+=-g
+CF_BUILD_OPTS+=--ftabchars=14
+endif
+
+
+ifeq ($(strip $(REFERENCE_SEQUENCES)),)
+$(error REFERENCE_SEQUENCES is not set - specify at least one of COMPLETE_GENOMES, \
+COMPLETE_GENOMES_COMPRESSED, or MAMMALIAN_TAXIDS with the IDX_NAME ($(IDX_NAME)))
+endif
+
+SIZE_TABLES=$(addprefix $(REFERENCE_SEQUENCES_DIR)/all-compressed-,$(addsuffix .size,$(COMPLETE_GENOMES_COMPRESSED)))
+ifneq ($(strip $(COMPLETE_GENOMES_COMPRESSED)),)
+CF_BUILD_OPTS+=--size-table <(cat $(SIZE_TABLES))
+endif
+
+CF_DOWNLOAD_OPTS?=
+CF_COMPRESS_OPTS?=
+ifeq ($(strip $(DONT_DUSTMASK)),)
+CF_DOWNLOAD_OPTS+=-m
+else
+CF_COMPRESS_OPTS+=--noDustmasker
+endif
+
+all: $(IDX_NAME).1.cf
+
+# vim:ft=make
+endif ## ifndef IDX_NAME
+
+$(REFERENCE_SEQUENCES_DIR):
+	mkdir -p $(REFERENCE_SEQUENCES_DIR)
+
+#$(TAXID_MAPS): | $(REFERENCE_SEQUENCES_DIR)
+#	rm $(patsubst %$(TAXID_SUFFIX),%.fna, $@)
+#	$(MAKE) -f $(THIS_FILE) $(patsubst %$(TAXID_SUFFIX),%.fna, $@)
+
+nt.gz:
+	wget ftp://ftp.ncbi.nih.gov/blast/db/FASTA/nt.gz
+
+nt.fna: nt.gz
+	gunzip -c nt.gz > nt.fna
+
+nt.map:
+	wget -qO- ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_nucl.dmp.gz | gunzip -c | sed 's/^/gi|/' > nt.map
+
+nt-sorted.fna: nt.fna nt.map
+	centrifuge-sort-nt.pl nt.fna nt.map > nt-sorted.fna
+
# Download the reference genome for mammalian taxon $* (e.g. 9606) into
# $(REFERENCE_SEQUENCES_DIR), placing the sequence-to-taxid map next to it.
# BUG FIX: "verterbrate_mammalian" was misspelled in the KEEP_FILES branch;
# centrifuge-download writes into $(TMP_DIR)/vertebrate_mammalian, so the
# mkdir created a useless directory and the mv always failed.
$(REFERENCE_SEQUENCES_DIR)/mammalian-reference-%.fna: | $(REFERENCE_SEQUENCES_DIR)
	@[[ -d $(TMP_DIR) ]] && rm -rf $(TMP_DIR); mkdir -p $(TMP_DIR)
	centrifuge-download -o $(TMP_DIR) -d "vertebrate_mammalian" -a "Chromosome" -t $* -c 'reference genome' -P $(THREADS) refseq > \
		$(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX), $(notdir $@))
	cat $(TMP_DIR)/vertebrate_mammalian/*.fna > $@.tmp && mv $@.tmp $@
	mv $(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX),$(notdir $@)) $(patsubst %.fna,%$(TAXID_SUFFIX),$@)
ifeq (1,$(KEEP_FILES))
	[[ -d $(DL_DIR)/vertebrate_mammalian ]] || mkdir -p $(DL_DIR)/vertebrate_mammalian
	mv $(TMP_DIR)/vertebrate_mammalian/* $(DL_DIR)/vertebrate_mammalian
else
	rm -rf $(TMP_DIR)
endif
+
+$(REFERENCE_SEQUENCES_DIR)/all-compressed-%.fna: | $(REFERENCE_SEQUENCES_DIR) taxonomy/nodes.dmp taxonomy/names.dmp .dustmasker-ok
+	[[ -d $(TMP_DIR) ]] && rm -rf $(TMP_DIR); mkdir -p $(TMP_DIR)
+	centrifuge-download -o $(TMP_DIR) -d "$*" -P $(THREADS) refseq > $(TMP_DIR)/all-$*.map
+	time centrifuge-compress.pl $(TMP_DIR)/$* taxonomy $(CF_COMPRESS_OPTS) -map $(TMP_DIR)/all-$*.map \
+		-o $@.tmp -t $(THREADS) -maxG 50000000 2>&1 | tee centrifuge-compress-$(IDX_NAME).log && \
+	mv $@.tmp.fa $@ && mv $@.tmp.size $(patsubst %.fna,%.size,$@) && \
+	mv $@.tmp.map $(patsubst %.fna,%$(TAXID_SUFFIX),$@)
+ifeq (1,$(KEEP_FILES))
+	[[ -d $(DL_DIR)/$* ]] || mkdir -p $(DL_DIR)/$*
+	mv $(TMP_DIR)/$*/* $(DL_DIR)/$*
+else
+	rm -rf $(TMP_DIR)
+endif
+
+$(REFERENCE_SEQUENCES_DIR)/all-%.fna: | $(REFERENCE_SEQUENCES_DIR) .dustmasker-ok
+	[[ -d $(TMP_DIR) ]] && rm -rf $(TMP_DIR); mkdir -p $(TMP_DIR)
+	@echo Downloading and dust-masking $*
+	centrifuge-download -o $(TMP_DIR) $(CF_DOWNLOAD_OPTS) -d "$*" -P $(THREADS) refseq > \
+		$(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX),$(notdir $@))
+	cat $(TMP_DIR)/$*/*.fna > $@.tmp && mv $@.tmp $@
+	mv $(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX),$(notdir $@)) $(patsubst %.fna,%$(TAXID_SUFFIX),$@)
+ifeq (1,$(KEEP_FILES))
+	[[ -d $(DL_DIR)/$* ]] || mkdir -p $(DL_DIR)/$*
+	mv $(TMP_DIR)/$*/* $(DL_DIR)/$*
+else
+	rm -rf $(TMP_DIR)
+endif
+
+$(REFERENCE_SEQUENCES_DIR)/all-%-chromosome_level.fna: | $(REFERENCE_SEQUENCES_DIR) .dustmasker-ok
+	[[ -d $(TMP_DIR) ]] && rm -rf $(TMP_DIR); mkdir -p $(TMP_DIR)
+	@echo Downloading and dust-masking $*
+	centrifuge-download -o $(TMP_DIR) $(CF_DOWNLOAD_OPTS) -a "Chromosome" -d "$*" -P $(THREADS) refseq > \
+		$(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX),$(notdir $@))
+	cat $(TMP_DIR)/$*/*.fna > $@.tmp && mv $@.tmp $@
+	mv $(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX),$(notdir $@)) $(patsubst %.fna,%$(TAXID_SUFFIX),$@)
+ifeq (1,$(KEEP_FILES))
+	[[ -d $(DL_DIR)/$* ]] || mkdir -p $(DL_DIR)/$*
+	mv $(TMP_DIR)/$*/* $(DL_DIR)/$*
+else
+	rm -rf $(TMP_DIR)
+endif
+
+
+
# Download UniVec/EmVec contaminant sequences.
# BUG FIX: this is an explicit (non-pattern) rule, so the automatic variable
# $* is empty here; the KEEP_FILES branch previously moved files into
# "$(DL_DIR)/" -- it must name the contaminants directory explicitly.
$(REFERENCE_SEQUENCES_DIR)/contaminants.fna: | $(REFERENCE_SEQUENCES_DIR)
	[[ -d $(TMP_DIR) ]] && rm -rf $(TMP_DIR); mkdir -p $(TMP_DIR)
	centrifuge-download -o $(TMP_DIR) contaminants > $(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX),$(notdir $@))
	cat $(TMP_DIR)/contaminants/*.fna > $@.tmp && mv $@.tmp $@
	mv $(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX),$(notdir $@)) $(patsubst %.fna,%$(TAXID_SUFFIX),$@)
ifeq (1,$(KEEP_FILES))
	[[ -d $(DL_DIR)/contaminants ]] || mkdir -p $(DL_DIR)/contaminants
	mv $(TMP_DIR)/contaminants/* $(DL_DIR)/contaminants
else
	rm -rf $(TMP_DIR)
endif
+
+DUSTMASKER_EXISTS := $(shell command -v dustmasker)
+.dustmasker-ok:
+ifndef DUSTMASKER_EXISTS
+ifeq ($(strip $(DONT_DUSTMASK)),)
+	$(error dustmasker program does not exist. Install NCBI blast+, or set option DONT_DUSTMASK=1)
+endif
+endif
+	
+
+taxonomy/names.dmp: | taxonomy
+taxonomy/nodes.dmp: | taxonomy
+
+taxonomy: | .path-ok
+	[[ -d $(TMP_DIR) ]] && rm -rf $(TMP_DIR); mkdir -p $(TMP_DIR)
+	centrifuge-download $(TAXONOMY_DOWNLOAD_OPTS) -o $(TMP_DIR)/taxonomy taxonomy
+	mv $(TMP_DIR)/taxonomy . && rmdir $(TMP_DIR)
+
+$(IDX_NAME).1.cf: $(REFERENCE_SEQUENCES) $(SIZE_TABLES) $(TAXID_MAPS) taxonomy/nodes.dmp taxonomy/names.dmp | .path-ok
+	@echo Index building prerequisites: $^
+	[[ -d $(TMP_DIR) ]] && rm -rf $(TMP_DIR); mkdir -p $(TMP_DIR)
+	time centrifuge-build -p $(THREADS) $(CF_BUILD_OPTS) \
+		--conversion-table <(cat $(TAXID_MAPS)) \
+		--taxonomy-tree taxonomy/nodes.dmp --name-table taxonomy/names.dmp \
+		$(call join_w_comma,$(REFERENCE_SEQUENCES)) $(TMP_DIR)/$(IDX_NAME) 2>&1 | tee centrifuge-build-$(IDX_NAME).log
+	mv $(TMP_DIR)/$(IDX_NAME).*.cf . && rmdir $(TMP_DIR)
+
+
+clean:
+	# Removing input sequences (all required information is in the index)
+	rm -rf taxonomy
+	rm -rf $(DL_DIR)
+	rm -rf $(TMP_DIR)
+	rm -rf tmp_*
+	rm -rf reference-sequences
+	rm -f *.map
+	rm -f *.log
+
+# Join a list with commas
+COMMA:=,
+EMPTY:=
+SPACE:= $(EMPTY) $(EMPTY)
+join_w_comma = $(subst $(SPACE),$(COMMA),$(strip $1))
+
+
+THIS_FILE := $(lastword $(MAKEFILE_LIST))
+PATH_OK  := $(shell command -v centrifuge-build 2> /dev/null && command -v centrifuge-download 2> /dev/null )
+CF_BASE_DIR := $(shell dirname $(shell dirname $(THIS_FILE)))
+
+error_msg := centrifuge-download and centrifuge-build are not available - please make sure they are in the path.
+define n
+
+
+endef
+
+TEST_PROGRAMS=centrifuge-build centrifuge-download
+
+ifneq ("$(wildcard $(CF_BASE_DIR)/centrifuge-build)","")
+error_msg := $(error_msg)$n$nThe following command may solve this problem:$n  export PATH=$$PATH:"$(CF_BASE_DIR)"$n
+endif
+
+.path-ok:
+ifndef PATH_OK
+    $(error $n$(error_msg))
+else
+	@echo Found centrifuge-download and centrifuge-build.
+endif
+
+index-name:
+	echo $(IDX_NAME)
+
+index-size:
+	du -csh $(IDX_NAME).[123].cf
+
diff --git a/limit.cpp b/limit.cpp
new file mode 100644
index 0000000..1146090
--- /dev/null
+++ b/limit.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <limits>
+#include "limit.h"
+
// One-time definitions of the MIN_*/MAX_* convenience globals declared
// extern in limit.h; all values come from std::numeric_limits.

// unsigned fixed-width types (the MIN_* values are all 0, kept for symmetry)
uint8_t  MIN_U8  = std::numeric_limits<uint8_t>::min();
uint8_t  MAX_U8  = std::numeric_limits<uint8_t>::max();
uint16_t MIN_U16 = std::numeric_limits<uint16_t>::min();
uint16_t MAX_U16 = std::numeric_limits<uint16_t>::max();
uint32_t MIN_U32 = std::numeric_limits<uint32_t>::min();
uint32_t MAX_U32 = std::numeric_limits<uint32_t>::max();
uint64_t MIN_U64 = std::numeric_limits<uint64_t>::min();
uint64_t MAX_U64 = std::numeric_limits<uint64_t>::max();
size_t   MIN_SIZE_T = std::numeric_limits<size_t>::min();
size_t   MAX_SIZE_T = std::numeric_limits<size_t>::max();

// signed types
int      MIN_I   = std::numeric_limits<int>::min();
int      MAX_I   = std::numeric_limits<int>::max();
int8_t   MIN_I8  = std::numeric_limits<int8_t>::min();
int8_t   MAX_I8  = std::numeric_limits<int8_t>::max();
int16_t  MIN_I16 = std::numeric_limits<int16_t>::min();
int16_t  MAX_I16 = std::numeric_limits<int16_t>::max();
int32_t  MIN_I32 = std::numeric_limits<int32_t>::min();
int32_t  MAX_I32 = std::numeric_limits<int32_t>::max();
int64_t  MIN_I64 = std::numeric_limits<int64_t>::min();
int64_t  MAX_I64 = std::numeric_limits<int64_t>::max();
diff --git a/limit.h b/limit.h
new file mode 100644
index 0000000..06ea072
--- /dev/null
+++ b/limit.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
#ifndef LIMIT_H_
#define LIMIT_H_

#include <stdint.h>
#include <cstring>

// Program-wide extreme values for common integer types (defined once in
// limit.cpp). Named globals keep call sites terse compared with spelling
// out std::numeric_limits<...>::min()/max() everywhere.

// unsigned types
extern uint8_t  MIN_U8;
extern uint8_t  MAX_U8;
extern uint16_t MIN_U16;
extern uint16_t MAX_U16;
extern uint32_t MIN_U32;
extern uint32_t MAX_U32;
extern uint64_t MIN_U64;
extern uint64_t MAX_U64;
extern size_t   MIN_SIZE_T;
extern size_t   MAX_SIZE_T;

// signed types
extern int     MIN_I;
extern int     MAX_I;
extern int8_t  MIN_I8;
extern int8_t  MAX_I8;
extern int16_t MIN_I16;
extern int16_t MAX_I16;
extern int32_t MIN_I32;
extern int32_t MAX_I32;
extern int64_t MIN_I64;
extern int64_t MAX_I64;

#endif
diff --git a/ls.cpp b/ls.cpp
new file mode 100644
index 0000000..96c28c0
--- /dev/null
+++ b/ls.cpp
@@ -0,0 +1,142 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifdef MAIN_LS
+
+#include <string.h>
+#include <iostream>
+#include "sstring.h"
+#include "ls.h"
+#include "ds.h"
+
+using namespace std;
+
+int main(void) {
+	cerr << "Test LarssonSadakana for int...";
+	{
+		typedef int T;
+		const char *t = "banana";
+		EList<T> sa;
+		EList<T> isa;
+		for(size_t i = 0; i < strlen(t); i++) {
+			isa.push_back(t[i]);
+		}
+		isa.push_back(0); // disregarded
+		sa.resize(isa.size());
+		LarssonSadakane<T> ls;
+		ls.suffixsort(isa.ptr(), sa.ptr(), (T)sa.size()-1, 'z', 0);
+		assert_eq((T)'a', t[sa[1]]); assert_eq(5, sa[1]);
+		assert_eq((T)'a', t[sa[2]]); assert_eq(3, sa[2]);
+		assert_eq((T)'a', t[sa[3]]); assert_eq(1, sa[3]);
+		assert_eq((T)'b', t[sa[4]]); assert_eq(0, sa[4]);
+		assert_eq((T)'n', t[sa[5]]); assert_eq(4, sa[5]);
+		assert_eq((T)'n', t[sa[6]]); assert_eq(2, sa[6]);
+	}
+	cerr << "PASSED" << endl;
+
+	cerr << "Test LarssonSadakana for uint32_t...";
+	{
+		typedef uint32_t T;
+		const char *t = "banana";
+		EList<T> sa;
+		EList<T> isa;
+		for(size_t i = 0; i < strlen(t); i++) {
+			isa.push_back(t[i]);
+		}
+		isa.push_back(0); // disregarded
+		sa.resize(isa.size());
+		LarssonSadakane<int> ls;
+		ls.suffixsort(
+			(int*)isa.ptr(),
+			(int*)sa.ptr(),
+			(int)sa.size()-1,
+			'z',
+			0);
+		assert_eq((T)'a', t[sa[1]]); assert_eq(5, sa[1]);
+		assert_eq((T)'a', t[sa[2]]); assert_eq(3, sa[2]);
+		assert_eq((T)'a', t[sa[3]]); assert_eq(1, sa[3]);
+		assert_eq((T)'b', t[sa[4]]); assert_eq(0, sa[4]);
+		assert_eq((T)'n', t[sa[5]]); assert_eq(4, sa[5]);
+		assert_eq((T)'n', t[sa[6]]); assert_eq(2, sa[6]);
+	}
+	cerr << "PASSED" << endl;
+
+	cerr << "Last elt is < or > others ...";
+	{
+		{
+		typedef int T;
+		const char *t = "aaa";
+		EList<T> sa;
+		EList<T> isa;
+		for(size_t i = 0; i < strlen(t); i++) {
+			isa.push_back(t[i]);
+		}
+		isa.push_back(0); // disregarded
+		sa.resize(isa.size());
+		LarssonSadakane<T> ls;
+		ls.suffixsort(isa.ptr(), sa.ptr(), (T)sa.size()-1, 'z', 0);
+		assert_eq(3, sa[0]);
+		assert_eq(2, sa[1]);
+		assert_eq(1, sa[2]);
+		assert_eq(0, sa[3]);
+		}
+
+		{
+		typedef int T;
+		const char *t = "aaa";
+		EList<T> sa;
+		EList<T> isa;
+		for(size_t i = 0; i < strlen(t); i++) {
+			isa.push_back(t[i]);
+		}
+		isa.push_back('y'); // doesn't matter if this is > others
+		sa.resize(isa.size());
+		LarssonSadakane<T> ls;
+		ls.suffixsort(isa.ptr(), sa.ptr(), (T)sa.size()-1, 'z', 0);
+		assert_eq(3, sa[0]);
+		assert_eq(2, sa[1]);
+		assert_eq(1, sa[2]);
+		assert_eq(0, sa[3]);
+		}
+		
+		{
+		typedef int T;
+		const char *t = "aaa";
+		EList<T> sa;
+		EList<T> isa;
+		for(size_t i = 0; i < strlen(t); i++) {
+			isa.push_back(t[i]);
+		}
+		isa.push_back('y'); // breaks ties
+		isa.push_back(0);   // disregarded
+		sa.resize(isa.size());
+		LarssonSadakane<T> ls;
+		ls.suffixsort(isa.ptr(), sa.ptr(), (T)sa.size()-1, 'z', 0);
+		assert_eq(4, sa[0]);
+		assert_eq(0, sa[1]);
+		assert_eq(1, sa[2]);
+		assert_eq(2, sa[3]);
+		assert_eq(3, sa[4]);
+		}
+		
+	}
+	cerr << "PASSED" << endl;
+}
+
+#endif
diff --git a/ls.h b/ls.h
new file mode 100644
index 0000000..e333f7c
--- /dev/null
+++ b/ls.h
@@ -0,0 +1,333 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* Code in this file is ultimately based on:
+
+   qsufsort.c
+   Copyright 1999, N. Jesper Larsson, all rights reserved.
+
+   This file contains an implementation of the algorithm presented in "Faster
+   Suffix Sorting" by N. Jesper Larsson (jesper at cs.lth.se) and Kunihiko
+   Sadakane (sada at is.s.u-tokyo.ac.jp).
+
+   This software may be used freely for any purpose. However, when distributed,
+   the original source must be clearly stated, and, when the source code is
+   distributed, the copyright notice must be retained and any alterations in
+   the code must be clearly marked. No warranty is given regarding the quality
+   of this software.*/
+
+#ifndef LS_H_
+#define LS_H_
+
+#include <iostream>
+#include <limits>
+#include <stdint.h>
+
+/*
+ * Larsson-Sadakane "qsufsort" suffix sorter.  T is the signed integer type
+ * used both for symbols and for suffix offsets (e.g. int or int64_t; it must
+ * be signed, since negated group lengths are stored in I).  The only public
+ * entry point is suffixsort(); the other members are internal helpers that
+ * communicate through the fields I, V, r and h, so a LarssonSadakane object
+ * must not be shared across threads.
+ */
+template<typename T>
+class LarssonSadakane {
+	T *I, /* group array, ultimately suffix array.*/
+	*V,   /* inverse array, ultimately inverse of I.*/
+	r,    /* number of symbols aggregated by transform.*/
+	h;    /* length of already-sorted prefixes.*/
+
+	/* LS_KEY(p): sort key of suffix *p at the current depth h, i.e. the
+	   group number of the suffix starting h positions later. */
+	#define LS_KEY(p)          (V[*(p)+(h)])
+	#define LS_SWAP(p, q)      (tmp=*(p), *(p)=*(q), *(q)=tmp)
+	#define LS_SMED3(a, b, c)  (LS_KEY(a)<LS_KEY(b) ?                        \
+			  (LS_KEY(b)<LS_KEY(c) ? (b) : LS_KEY(a)<LS_KEY(c) ? (c) : (a))  \
+			: (LS_KEY(b)>LS_KEY(c) ? (b) : LS_KEY(a)>LS_KEY(c) ? (c) : (a)))
+	/* NOTE(review): the LS_* macros are never #undef'd, so they leak to
+	   anything included after this header. */
+
+	/* Subroutine for select_sort_split and sort_split. Sets group numbers for a
+	   group whose lowest position in I is pl and highest position is pm.*/
+
+	inline void update_group(T *pl, T *pm) {
+	   T g;
+	   g=(T)(pm-I);                 /* group number.*/
+	   V[*pl]=g;                    /* update group number of first position.*/
+	   if (pl==pm)
+		  *pl=-1;                   /* one element, sorted group.*/
+	   else
+		  do                        /* more than one element, unsorted group.*/
+			 V[*++pl]=g;            /* update group numbers.*/
+		  while (pl<pm);
+	}
+
+	/* Quadratic sorting method to use for small subarrays. To be able to update
+	   group numbers consistently, a variant of selection sorting is used.*/
+
+	inline void select_sort_split(T *p, T n) {
+	   T *pa, *pb, *pi, *pn;
+	   T f, v, tmp;
+
+	   pa=p;                        /* pa is start of group being picked out.*/
+	   pn=p+n-1;                    /* pn is last position of subarray.*/
+	   while (pa<pn) {
+		  for (pi=pb=pa+1, f=LS_KEY(pa); pi<=pn; ++pi)
+			 if ((v=LS_KEY(pi))<f) {
+				f=v;                /* f is smallest key found.*/
+				LS_SWAP(pi, pa);       /* place smallest element at beginning.*/
+				pb=pa+1;            /* pb is position for elements equal to f.*/
+			 } else if (v==f) {     /* if equal to smallest key.*/
+				LS_SWAP(pi, pb);       /* place next to other smallest elements.*/
+				++pb;
+			 }
+		  update_group(pa, pb-1);   /* update group values for new group.*/
+		  pa=pb;                    /* continue sorting rest of the subarray.*/
+	   }
+	   if (pa==pn) {                /* check if last part is single element.*/
+		  V[*pa]=(T)(pa-I);
+		  *pa=-1;                   /* sorted group.*/
+	   }
+	}
+
+	/* Subroutine for sort_split, algorithm by Bentley & McIlroy.*/
+
+	inline T choose_pivot(T *p, T n) {
+	   T *pl, *pm, *pn;
+	   T s;
+
+	   pm=p+(n>>1);                 /* small arrays, middle element.*/
+	   if (n>7) {
+		  pl=p;
+		  pn=p+n-1;
+		  if (n>40) {               /* big arrays, pseudomedian of 9.*/
+			 s=n>>3;
+			 pl=LS_SMED3(pl, pl+s, pl+s+s);
+			 pm=LS_SMED3(pm-s, pm, pm+s);
+			 pn=LS_SMED3(pn-s-s, pn-s, pn);
+		  }
+		  pm=LS_SMED3(pl, pm, pn);      /* midsize arrays, median of 3.*/
+	   }
+	   return LS_KEY(pm);
+	}
+
+	/* Sorting routine called for each unsorted group. Sorts the array of integers
+	   (suffix numbers) of length n starting at p. The algorithm is a ternary-split
+	   quicksort taken from Bentley & McIlroy, "Engineering a Sort Function",
+	   Software -- Practice and Experience 23(11), 1249-1265 (November 1993). This
+	   function is based on Program 7.*/
+
+	inline void sort_split(T *p, T n)
+	{
+	   T *pa, *pb, *pc, *pd, *pl, *pm, *pn;
+	   T f, v, s, t, tmp;
+
+	   if (n<7) {                   /* multi-selection sort smallest arrays.*/
+		  select_sort_split(p, n);
+		  return;
+	   }
+
+	   v=choose_pivot(p, n);
+	   pa=pb=p;
+	   pc=pd=p+n-1;
+	   while (1) {                  /* split-end partition.*/
+		  while (pb<=pc && (f=LS_KEY(pb))<=v) {
+			 if (f==v) {
+				LS_SWAP(pa, pb);
+				++pa;
+			 }
+			 ++pb;
+		  }
+		  while (pc>=pb && (f=LS_KEY(pc))>=v) {
+			 if (f==v) {
+				LS_SWAP(pc, pd);
+				--pd;
+			 }
+			 --pc;
+		  }
+		  if (pb>pc)
+			 break;
+		  LS_SWAP(pb, pc);
+		  ++pb;
+		  --pc;
+	   }
+	   pn=p+n;
+	   if ((s=(T)(pa-p))>(t=(T)(pb-pa)))
+		  s=t;
+	   for (pl=p, pm=pb-s; s; --s, ++pl, ++pm)
+		  LS_SWAP(pl, pm);
+	   if ((s=(T)(pd-pc))>(t=(T)(pn-pd-1)))
+		  s=t;
+	   for (pl=pb, pm=pn-s; s; --s, ++pl, ++pm)
+		  LS_SWAP(pl, pm);
+
+	   s=(T)(pb-pa);
+	   t=(T)(pd-pc);
+	   if (s>0)
+		  sort_split(p, s);
+	   update_group(p+s, p+n-t-1);
+	   if (t>0)
+		  sort_split(p+n-t, t);
+	}
+
+	/* Bucketsort for first iteration.
+
+	   Input: x[0...n-1] holds integers in the range 1...k-1, all of which appear
+	   at least once. x[n] is 0. (This is the corresponding output of transform.) k
+	   must be at most n+1. p is array of size n+1 whose contents are disregarded.
+
+	   Output: x is V and p is I after the initial sorting stage of the refined
+	   suffix sorting algorithm.*/
+
+	inline void bucketsort(T *x, T *p, T n, T k)
+	{
+	   T *pi, i, c, d, g;
+
+	   for (pi=p; pi<p+k; ++pi)
+		  *pi=-1;                   /* mark linked lists empty.*/
+	   for (i=0; i<=n; ++i) {
+		  x[i]=p[c=x[i]];           /* insert in linked list.*/
+		  p[c]=i;
+	   }
+	   for (pi=p+k-1, i=n; pi>=p; --pi) {
+		  d=x[c=*pi];               /* c is position, d is next in list.*/
+		  x[c]=g=i;                 /* last position equals group number.*/
+		  if (d == 0 || d > 0) {    /* if more than one element in group.*/
+			 p[i--]=c;              /* p is permutation for the sorted x.*/
+			 do {
+				d=x[c=d];           /* next in linked list.*/
+				x[c]=g;             /* group number in x.*/
+				p[i--]=c;           /* permutation in p.*/
+			 } while (d == 0 || d > 0); /* i.e. while d >= 0 (end marker is -1).*/
+		  } else
+			 p[i--]=-1;             /* one element, sorted group.*/
+	   }
+	}
+
+	/* Transforms the alphabet of x by attempting to aggregate several symbols into
+	   one, while preserving the suffix order of x. The alphabet may also be
+	   compacted, so that x on output comprises all integers of the new alphabet
+	   with no skipped numbers.
+
+	   Input: x is an array of size n+1 whose first n elements are positive
+	   integers in the range l...k-1. p is array of size n+1, used for temporary
+	   storage. q controls aggregation and compaction by defining the maximum value
+	   for any symbol during transformation: q must be at least k-l; if q<=n,
+	   compaction is guaranteed; if k-l>n, compaction is never done; if q is
+	   INT_MAX, the maximum number of symbols are aggregated into one.
+
+	   Output: Returns an integer j in the range 1...q representing the size of the
+	   new alphabet. If j<=n+1, the alphabet is compacted. The global variable r is
+	   set to the number of old symbols grouped into one. Only x[n] is 0.*/
+
+	inline T transform(T *x, T *p, T n, T k, T l, T q)
+	{
+	   T b, c, d, e, i, j, m, s;
+	   T *pi, *pj;
+
+	   for (s=0, i=k-l; i; i>>=1)
+		  ++s;                      /* s is number of bits in old symbol.*/
+	   e=std::numeric_limits<T>::max()>>s; /* e is for overflow checking.*/
+	   for (b=d=r=0; r<n && d<=e && (c=d<<s|(k-l))<=q; ++r) {
+		  b=b<<s|(x[r]-l+1);        /* b is start of x in chunk alphabet.*/
+		  d=c;                      /* d is max symbol in chunk alphabet.*/
+	   }
+	   m=(((T)1)<<(r-1)*s)-1;            /* m masks off top old symbol from chunk.*/
+	   x[n]=l-1;                    /* emulate zero terminator.*/
+	   if (d<=n) {                  /* if bucketing possible, compact alphabet.*/
+		  for (pi=p; pi<=p+d; ++pi)
+			 *pi=0;                 /* zero transformation table.*/
+		  for (pi=x+r, c=b; pi<=x+n; ++pi) {
+			 p[c]=1;                /* mark used chunk symbol.*/
+			 c=(c&m)<<s|(*pi-l+1);  /* shift in next old symbol in chunk.*/
+		  }
+		  for (i=1; i<r; ++i) {     /* handle last r-1 positions.*/
+			 p[c]=1;                /* mark used chunk symbol.*/
+			 c=(c&m)<<s;            /* shift in next old symbol in chunk.*/
+		  }
+		  for (pi=p, j=1; pi<=p+d; ++pi)
+			 if (*pi)
+				*pi=j++;            /* j is new alphabet size.*/
+		  for (pi=x, pj=x+r, c=b; pj<=x+n; ++pi, ++pj) {
+			 *pi=p[c];              /* transform to new alphabet.*/
+			 c=(c&m)<<s|(*pj-l+1);  /* shift in next old symbol in chunk.*/
+		  }
+		  while (pi<x+n) {          /* handle last r-1 positions.*/
+			 *pi++=p[c];            /* transform to new alphabet.*/
+			 c=(c&m)<<s;            /* shift right-end zero in chunk.*/
+		  }
+	   } else {                     /* bucketing not possible, don't compact.*/
+		  for (pi=x, pj=x+r, c=b; pj<=x+n; ++pi, ++pj) {
+			 *pi=c;                 /* transform to new alphabet.*/
+			 c=(c&m)<<s|(*pj-l+1);  /* shift in next old symbol in chunk.*/
+		  }
+		  while (pi<x+n) {          /* handle last r-1 positions.*/
+			 *pi++=c;               /* transform to new alphabet.*/
+			 c=(c&m)<<s;            /* shift right-end zero in chunk.*/
+		  }
+		  j=d+1;                    /* new alphabet size.*/
+	   }
+	   x[n]=0;                      /* end-of-string symbol is zero.*/
+	   return j;                    /* return new alphabet size.*/
+	}
+	
+	public:
+
+	/* Makes suffix array p of x. x becomes inverse of p. p and x are both of size
+	   n+1. Contents of x[0...n-1] are integers in the range l...k-1. Original
+	   contents of x[n] is disregarded, the n-th symbol being regarded as
+	   end-of-string smaller than all other symbols.*/
+
+	void suffixsort(T *x, T *p, T n, T k, T l)
+	{
+	   T *pi, *pk;
+	   T i, j, s, sl;
+
+	   V=x;                         /* set global values.*/
+	   I=p;
+
+	   if (n>=k-l) {                /* if bucketing possible,*/
+		  j=transform(V, I, n, k, l, n);
+		  bucketsort(V, I, n, j);   /* bucketsort on first r positions.*/
+	   } else {
+		  transform(V, I, n, k, l, std::numeric_limits<T>::max());
+		  for (i=0; i<=n; ++i)
+			 I[i]=i;                /* initialize I with suffix numbers.*/
+		  h=0;
+		  sort_split(I, n+1);       /* quicksort on first r positions.*/
+	   }
+	   h=r;                         /* number of symbols aggregated by transform.*/
+
+	   while (*I>=-n) {             /* until one sorted group remains (then I[0]==-(n+1)).*/
+		  pi=I;                     /* pi is first position of group.*/
+		  sl=0;                     /* sl is negated length of sorted groups.*/
+		  do {
+			 if ((s=*pi) <= 0 && (s=*pi) != 0) { /* s<0: sorted group of length -s.*/
+				pi-=s;              /* skip over sorted group.*/
+				sl+=s;              /* add negated length to sl.*/
+			 } else {
+				if (sl) {
+				   *(pi+sl)=sl;     /* combine sorted groups before pi.*/
+				   sl=0;
+				}
+				pk=I+V[s]+1;        /* pk-1 is last position of unsorted group.*/
+				sort_split(pi, (T)(pk-pi));
+				pi=pk;              /* next group.*/
+			 }
+		  } while (pi<=I+n);
+		  if (sl)                   /* if the array ends with a sorted group.*/
+			 *(pi+sl)=sl;           /* combine sorted groups at end of I.*/
+		  h=2*h;                    /* double sorted-depth.*/
+	   }
+
+	   for (i=0; i<=n; ++i)         /* reconstruct suffix array from inverse.*/
+		  I[V[i]]=i;
+	}
+};
+
+#endif /*def LS_H_*/
diff --git a/mask.cpp b/mask.cpp
new file mode 100644
index 0000000..ffefdc7
--- /dev/null
+++ b/mask.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "mask.h"
+
+// 5-bit pop count: alts5[m] is the number of set bits in the 5-bit mask m,
+// i.e. the number of alternative characters encoded by the mask.
+int alts5[32] = {
+	 0, 1, 1, 2, 1, 2, 2, 3,
+	 1, 2, 2, 3, 2, 3, 3, 4,
+	 1, 2, 2, 3, 2, 3, 3, 4,
+	 2, 3, 3, 4, 3, 4, 4, 5
+};
+
+// Index of lowest set bit: firsts5[m] is the position (0..4) of the least
+// significant set bit of 5-bit mask m, or -1 when m == 0.
+int firsts5[32] = {
+	-1, 0, 1, 0, 2, 0, 1, 0,
+	 3, 0, 1, 0, 2, 0, 1, 0,
+	 4, 0, 1, 0, 2, 0, 1, 0,
+	 3, 0, 1, 0, 2, 0, 1, 0
+};
diff --git a/mask.h b/mask.h
new file mode 100644
index 0000000..e00c194
--- /dev/null
+++ b/mask.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef MASK_H_
+#define MASK_H_
+
+#include <iostream>
+#include "random_source.h"
+
+// 5-bit pop count
+extern int alts5[32];
+
+// Index of lowest set bit
+extern int firsts5[32];
+
+/**
+ * Return 1 if a 2-bit-encoded base ('i') matches any bit in the mask ('j') and
+ * the mask < 16.  Returns -1 if either character was ambiguous: a mask
+ * j >= 16 (reference) or a base code i > 3 (read).  Returns 0 if the
+ * characters unambiguously mismatch.
+ */
+static inline int matchesEx(int i, int j) {
+	if(j >= 16 || i > 3) {
+		// read and/or ref was ambiguous
+		return -1;
+	}
+	return (((1 << i) & j) != 0) ? 1 : 0;
+}
+
+/**
+ * Return true iff a 2-bit-encoded base ('i') matches any bit in the mask
+ * ('j').  Unlike matchesEx, performs no ambiguity checking.
+ */
+static inline bool matches(int i, int j) {
+	return ((1 << i) & j) != 0;
+}
+
+/**
+ * Given a mask with up to 5 bits, return an index corresponding to a
+ * set bit in the mask, randomly chosen from among all set bits.
+ * NOTE(review): uses nextU32() % popcount, which carries a (tiny) modulo
+ * bias when the popcount doesn't divide 2^32 — harmless here, but worth
+ * knowing.  Throws 1 if the mask is inconsistent (should be unreachable).
+ */
+static inline int randFromMask(RandomSource& rnd, int mask) {
+	assert_gt(mask, 0);
+	if(alts5[mask] == 1) {
+		// only one to pick from, pick it via lookup table
+		return firsts5[mask];
+	}
+	assert_gt(mask, 0); // (redundant with the assert above)
+	assert_lt(mask, 32);
+	int r = rnd.nextU32() % alts5[mask];
+	assert_geq(r, 0);
+	assert_lt(r, alts5[mask]);
+	// could do the following via lookup table too
+	for(int i = 0; i < 5; i++) {
+		if((mask & (1 << i)) != 0) {
+			if(r == 0) return i;
+			r--;
+		}
+	}
+	std::cerr << "Shouldn't get here" << std::endl;
+	throw 1;
+	return -1;
+}
+
+#endif /*ndef MASK_H_*/
diff --git a/mem_ids.h b/mem_ids.h
new file mode 100644
index 0000000..352817b
--- /dev/null
+++ b/mem_ids.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+// Memory-category tags, presumably passed to the project's allocation-
+// tracking containers to attribute memory usage — confirm against the EList
+// constructors.  NOTE(review): category 8 is unused (jumps from 7 to 9).
+// For holding index data
+#define EBWT_CAT  ((int) 1)
+// For holding index-building data
+#define EBWTB_CAT ((int) 2)
+// For holding cache data
+#define CA_CAT    ((int) 3)
+// For holding group-walk-left bookkeeping data
+#define GW_CAT    ((int) 4)
+// For holding alignment bookkeeping data
+#define AL_CAT    ((int) 5)
+// For holding dynamic programming bookkeeping data
+#define DP_CAT    ((int) 6)
+// For holding alignment results and other hit objects
+#define RES_CAT   ((int) 7)
+// Miscellaneous allocations
+#define MISC_CAT  ((int) 9)
+// Debug-build-only allocations
+#define DEBUG_CAT ((int)10)
diff --git a/mm.h b/mm.h
new file mode 100644
index 0000000..00a2335
--- /dev/null
+++ b/mm.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef MM_H_
+#define MM_H_
+
+/**
+ * mm.h:
+ *
+ * Defines that make it easier to handle files in the two different MM
+ * contexts: i.e. on Linux and Mac where MM is supported and POSIX I/O
+ * functions work as expected, and on Windows where MM is not supported
+ * and where there isn't POSIX I/O,
+ */
+// NOTE(review): the block below is compiled out (#if 0) and kept only as a
+// record of the old memory-mapped vs. plain-I/O abstraction.
+#if 0
+#ifdef BOWTIE_MM
+#define MM_FILE_CLOSE(x) if(x > 3) { close(x); }
+#define MM_READ_RET ssize_t
+// #define MM_READ read
+#define MM_SEEK lseek
+#define MM_FILE int
+#define MM_FILE_INIT -1
+#else
+#define MM_FILE_CLOSE(x) if(x != NULL) { fclose(x); }
+#define MM_READ_RET size_t
+#define MM_SEEK fseek
+#define MM_FILE FILE*
+#define MM_FILE_INIT NULL
+#endif
+#endif
+
+// Active definitions: plain stdio-based reads.  is_fread_err is not defined
+// in this header; the includer must provide it.
+#define MM_READ(file, dest, sz) fread(dest, 1, sz, file)
+#define MM_IS_IO_ERR(file_hd, ret, count) is_fread_err(file_hd, ret, count)
+
+#endif /* MM_H_ */
diff --git a/multikey_qsort.h b/multikey_qsort.h
new file mode 100644
index 0000000..02c41e1
--- /dev/null
+++ b/multikey_qsort.h
@@ -0,0 +1,1232 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef MULTIKEY_QSORT_H_
+#define MULTIKEY_QSORT_H_
+
+#include <iostream>
+#include "sequence_io.h"
+#include "alphabet.h"
+#include "assert_helpers.h"
+#include "diff_sample.h"
+#include "sstring.h"
+#include "btypes.h"
+
+using namespace std;
+
+/**
+ * Swap elements a and b in string s.  slen is the length of s and is used
+ * only for the bounds asserts; delegates to std::swap on the elements.
+ */
+template <typename TStr, typename TPos>
+static inline void swap(TStr& s, size_t slen, TPos a, TPos b) {
+	assert_lt(a, slen);
+	assert_lt(b, slen);
+	swap(s[a], s[b]);
+}
+
+/**
+ * Swap elements a and b in array s.  Array (pointer) overload of the swap
+ * above; slen is used only for the bounds asserts.
+ */
+template <typename TVal, typename TPos>
+static inline void swap(TVal* s, size_t slen, TPos a, TPos b) {
+	assert_lt(a, slen);
+	assert_lt(b, slen);
+	swap(s[a], s[b]);
+}
+
+/**
+ * Helper macro for swapping elements a and b in s.  Does some additional
+ * sanity checking w/r/t begin and end (which are parameters to the sorting
+ * routines below).  NOTE: expects 'slen', 'begin' and 'end' to be in scope
+ * at the expansion site.
+ */
+#define SWAP(s, a, b) { \
+	assert_geq(a, begin); \
+	assert_geq(b, begin); \
+	assert_lt(a, end); \
+	assert_lt(b, end); \
+	swap(s, slen, a, b); \
+}
+
+/**
+ * Helper macro for swapping the same pair of elements a and b in two different
+ * strings s and s2.  This is a helpful variant if, for example, the caller
+ * would like to see how their input was permuted by the sort routine (in that
+ * case, the caller would let s2 be an array s2[] where s2 is the same length
+ * as s and s2[i] = i).
+ */
+#define SWAP2(s, s2, a, b) { \
+	SWAP(s, a, b); \
+	swap(s2, slen, a, b); \
+}
+
+/* Same signature as SWAP2 but ignores s2; lets callers pass either macro
+   to code written against the two-string form. */
+#define SWAP1(s, s2, a, b) { \
+	SWAP(s, a, b); \
+}
+
+/**
+ * Helper macro that swaps a range of elements [i, i+n) with another
+ * range [j, j+n) in s.  No-op when n == 0.
+ */
+#define VECSWAP(s, i, j, n) { \
+	if(n > 0) { vecswap(s, slen, i, j, n, begin, end); } \
+}
+
+/**
+ * Helper macro that swaps a range of elements [i, i+n) with another
+ * range [j, j+n) both in s and s2.  No-op when n == 0.
+ */
+#define VECSWAP2(s, s2, i, j, n) { \
+	if(n > 0) { vecswap2(s, slen, s2, i, j, n, begin, end); } \
+}
+
+/**
+ * Helper function that swaps a range of elements [i, i+n) with another
+ * range [j, j+n) in s.  begin and end represent the current range under
+ * consideration by the caller (one of the recursive multikey_quicksort
+ * routines below).  Pairs are swapped from the high index downward.
+ */
+template <typename TStr, typename TPos>
+static inline void vecswap(TStr& s, size_t slen, TPos i, TPos j, TPos n, TPos begin, TPos end) {
+	assert_geq(i, begin);
+	assert_geq(j, begin);
+	assert_lt(i, end);
+	assert_lt(j, end);
+	while(n-- > 0) {
+		assert_geq(n, 0); // (vacuous when TPos is unsigned)
+		TPos a = i+n;
+		TPos b = j+n;
+		assert_geq(a, begin);
+		assert_geq(b, begin);
+		assert_lt(a, end);
+		assert_lt(b, end);
+		swap(s, slen, a, b);
+	}
+}
+
+/** Array (pointer) overload of vecswap above; identical semantics. */
+template <typename TVal, typename TPos>
+static inline void vecswap(TVal *s, size_t slen, TPos i, TPos j, TPos n, TPos begin, TPos end) {
+	assert_geq(i, begin);
+	assert_geq(j, begin);
+	assert_lt(i, end);
+	assert_lt(j, end);
+	while(n-- > 0) {
+		assert_geq(n, 0); // (vacuous when TPos is unsigned)
+		TPos a = i+n;
+		TPos b = j+n;
+		assert_geq(a, begin);
+		assert_geq(b, begin);
+		assert_lt(a, end);
+		assert_lt(b, end);
+		swap(s, slen, a, b);
+	}
+}
+
+/**
+ * Helper function that swaps a range of elements [i, i+n) with another range
+ * [j, j+n) both in s and s2.  begin and end represent the current range under
+ * consideration by the caller (one of the recursive multikey_quicksort
+ * routines below).  s and s2 must be the same length; the same permutation
+ * is applied to both.
+ */
+template <typename TStr, typename TPos>
+static inline void vecswap2(
+	TStr& s,
+	size_t slen,
+	TStr& s2,
+	TPos i,
+	TPos j,
+	TPos n,
+	TPos begin,
+	TPos end)
+{
+	assert_geq(i, begin);
+	assert_geq(j, begin);
+	assert_lt(i, end);
+	assert_lt(j, end);
+	while(n-- > 0) {
+		assert_geq(n, 0); // (vacuous when TPos is unsigned)
+		TPos a = i+n;
+		TPos b = j+n;
+		assert_geq(a, begin);
+		assert_geq(b, begin);
+		assert_lt(a, end);
+		assert_lt(b, end);
+		swap(s, slen, a, b);
+		swap(s2, slen, a, b);
+	}
+}
+
+/** Array (pointer) overload of vecswap2 above; identical semantics. */
+template <typename TVal, typename TPos>
+static inline void vecswap2(TVal* s, size_t slen, TVal* s2, TPos i, TPos j, TPos n, TPos begin, TPos end) {
+	assert_geq(i, begin);
+	assert_geq(j, begin);
+	assert_lt(i, end);
+	assert_lt(j, end);
+	while(n-- > 0) {
+		assert_geq(n, 0); // (vacuous when TPos is unsigned)
+		TPos a = i+n;
+		TPos b = j+n;
+		assert_geq(a, begin);
+		assert_geq(b, begin);
+		assert_lt(a, end);
+		assert_lt(b, end);
+		swap(s, slen, a, b);
+		swap(s2, slen, a, b);
+	}
+}
+
+/// Retrieve an int-ized version of the aa-th character of string s[ss]
+/// (a list of strings), or, if aa goes off the end, return the
+/// (user-specified) int 'hi', greater than any alphabet character.
+/// Uses a SeqAn-style length() function; 's' and 'hi' must be in scope.
+#define CHAR_AT(ss, aa) ((length(s[ss]) > aa) ? (int)(s[ss][aa]) : hi)
+
+/// Retrieve an int-ized version of the character at offset 'off' within the
+/// suffix of 'host' starting at s[si], or 'hi' if that position falls off
+/// the end of host.  'host', 'hlen', 's' and 'hi' must be in scope.
+#define CHAR_AT_SUF(si, off) \
+	(((off + s[si]) < hlen) ? ((int)(host[off + s[si]])) : (hi))
+
+/// Same contract as CHAR_AT_SUF, but delegates to the char_at_suf_u8 helper
+/// (byte-string variant).
+
+#define CHAR_AT_SUF_U8(si, off) char_at_suf_u8(host, hlen, s, si, off, hi)
+
+// Note that CHOOSE_AND_SWAP_RANDOM_PIVOT is unused.  Both pivot macros
+// expect 'a', 'n', 'begin', 'end', 'depth', 's' and 's2' to be in scope at
+// the expansion site and leave the pivot value at position [begin].
+#define CHOOSE_AND_SWAP_RANDOM_PIVOT(sw, ch) {                            \
+	/* Note: rand() didn't really cut it here; it seemed to run out of */ \
+	/* randomness and, after a time, returned the same thing over and */  \
+	/* over again */                                                      \
+	a = (rand() % n) + begin; /* choose pivot between begin and end */  \
+	assert_lt(a, end); assert_geq(a, begin);                              \
+	sw(s, s2, begin, a); /* move pivot to beginning */                    \
+}
+
+/**
+ * Ad-hoc DNA-centric way of choose a pretty good pivot without using
+ * the pseudo-random number generator.  We try to get a 1 or 2 if
+ * possible, since they'll split things more evenly than a 0 or 4.  We
+ * also avoid swapping in the event that we choose the first element.
+ */
+#define CHOOSE_AND_SWAP_SMART_PIVOT(sw, ch) {                                    \
+	a = begin; /* choose first elt */                                            \
+	/* now try to find a better elt */                                           \
+	if(n >= 5) { /* n is the difference between begin and end */                 \
+		if     (ch(begin+1, depth) == 1 || ch(begin+1, depth) == 2) a = begin+1; \
+		else if(ch(begin+2, depth) == 1 || ch(begin+2, depth) == 2) a = begin+2; \
+		else if(ch(begin+3, depth) == 1 || ch(begin+3, depth) == 2) a = begin+3; \
+		else if(ch(begin+4, depth) == 1 || ch(begin+4, depth) == 2) a = begin+4; \
+		if(a != begin) sw(s, s2, begin, a); /* move pivot to beginning */        \
+	}                                                                            \
+	/* the element at [begin] now holds the pivot value */                       \
+}
+
+// The smart pivot is the one actually used by the sorts below.
+#define CHOOSE_AND_SWAP_PIVOT CHOOSE_AND_SWAP_SMART_PIVOT
+
+#ifndef NDEBUG
+
+/**
+ * Assert that the range of chars at depth 'depth' in strings 'begin'
+ * to 'end' in string-of-suffix-offsets s is parititioned properly
+ * according to the ternary paritioning strategy of Bentley and McIlroy
+ * (*prior to* swapping the = regions to the center).
+ * Always returns true so it can be wrapped in assert(...); the real
+ * checking is done by the assert_* calls inside.  Debug builds only
+ * (compiled under #ifndef NDEBUG).
+ */
+template<typename THost>
+bool assertPartitionedSuf(
+	const THost& host,
+	TIndexOffU *s,
+	size_t slen,
+	int hi,
+	int pivot,
+	size_t begin,
+	size_t end,
+	size_t depth)
+{
+	size_t hlen = host.length();
+	// State machine over the expected section order: = , < , > , =
+	int state = 0; // 0 -> 1st = section, 1 -> < section, 2 -> > section, 3 -> 2nd = section
+	for(size_t i = begin; i < end; i++) {
+		switch(state) {
+		case 0:
+			if       (CHAR_AT_SUF(i, depth) < pivot)  { state = 1; break; }
+			else if  (CHAR_AT_SUF(i, depth) > pivot)  { state = 2; break; }
+			assert_eq(CHAR_AT_SUF(i, depth), pivot);  break;
+		case 1:
+			if       (CHAR_AT_SUF(i, depth) > pivot)  { state = 2; break; }
+			else if  (CHAR_AT_SUF(i, depth) == pivot) { state = 3; break; }
+			assert_lt(CHAR_AT_SUF(i, depth), pivot);  break;
+		case 2:
+			if       (CHAR_AT_SUF(i, depth) == pivot) { state = 3; break; }
+			assert_gt(CHAR_AT_SUF(i, depth), pivot);	 break;
+		case 3:
+			assert_eq(CHAR_AT_SUF(i, depth), pivot);	 break;
+		}
+	}
+	return true;
+}
+
+/**
+ * Assert that the range of chars at depth 'depth' in strings 'begin'
+ * to 'end' in string-of-suffix-offsets s is parititioned properly
+ * according to the ternary paritioning strategy of Bentley and McIlroy
+ * (*after* swapping the = regions to the center).
+ * Always returns true so it can be wrapped in assert(...); debug builds
+ * only (compiled under #ifndef NDEBUG).
+ */
+template<typename THost>
+bool assertPartitionedSuf2(
+	const THost& host,
+	TIndexOffU *s,
+	size_t slen,
+	int hi,
+	int pivot,
+	size_t begin,
+	size_t end,
+	size_t depth)
+{
+	size_t hlen = host.length();
+	// State machine over the expected section order: < , = , >
+	int state = 0; // 0 -> < section, 1 -> = section, 2 -> > section
+	for(size_t i = begin; i < end; i++) {
+		switch(state) {
+		case 0:
+			if       (CHAR_AT_SUF(i, depth) == pivot) { state = 1; break; }
+			else if  (CHAR_AT_SUF(i, depth) > pivot)  { state = 2; break; }
+			assert_lt(CHAR_AT_SUF(i, depth), pivot);  break;
+		case 1:
+			if       (CHAR_AT_SUF(i, depth) > pivot)  { state = 2; break; }
+			assert_eq(CHAR_AT_SUF(i, depth), pivot);  break;
+		case 2:
+			assert_gt(CHAR_AT_SUF(i, depth), pivot);  break;
+		}
+	}
+	return true;
+}
+#endif
+
+/**
+ * Assert that string s of suffix offsets into string 'host' is a seemingly
+ * legitimate suffix-offset list (at this time, we just check that it doesn't
+ * list any suffix twice).  O(slen^2) pairwise scan — intended for debug use
+ * only.
+ */
+static inline void sanityCheckInputSufs(TIndexOffU *s, size_t slen) {
+	assert_gt(slen, 0);
+	for(size_t i = 0; i < slen; i++) {
+		// Actually, it's convenient to allow the caller to provide
+		// suffix offsets that are off the end of the host string.
+		// See, e.g., build() in diff_sample.cpp.
+		//assert_lt(s[i], length(host));
+		for(size_t j = i+1; j < slen; j++) {
+			assert_neq(s[i], s[j]);
+		}
+	}
+}
+
+/**
+ * Assert that the string s of suffix offsets into  'host' really are in
+ * lexicographical order up to depth 'upto'.  Only compares adjacent pairs
+ * in [lower, upper); does nothing in release builds (asserts compile out).
+ */
+template <typename T>
+void sanityCheckOrderedSufs(
+	const T& host,
+	size_t hlen,
+	TIndexOffU *s,
+	size_t slen,
+	size_t upto,
+	size_t lower = 0,
+	size_t upper = OFF_MASK)
+{
+	assert_lt(s[0], hlen);
+	upper = min<size_t>(upper, slen-1);
+	for(size_t i = lower; i < upper; i++) {
+		// Allow s[i+t] to point off the end of the string; this is
+		// convenient for some callers
+		if(s[i+1] >= hlen) continue;
+#ifndef NDEBUG
+		if(upto == OFF_MASK) {
+			assert(sstr_suf_lt(host, s[i], hlen, host, s[i+1], hlen, false));
+		} else {
+			// NOTE(review): the then-branch is intentionally empty — the
+			// prefix-order check below was disabled and only the comment
+			// remains to explain why.
+			if(sstr_suf_upto_lt(host, s[i], host, s[i+1], upto, false)) {
+				// operator > treats shorter strings as
+				// lexicographically smaller, but we want to opposite
+				//assert(isPrefix(suffix(host, s[i+1]), suffix(host, s[i])));
+			}
+		}
+#endif
+	}
+}
+
+/**
+ * Main multikey quicksort function for suffixes.  Based on Bentley &
+ * Sedgewick's algorithm on p.5 of their paper "Fast Algorithms for
+ * Sorting and Searching Strings".  That algorithm has been extended in
+ * three ways:
+ *
+ *  1. Deal with keys of different lengths by checking bounds and
+ *     considering off-the-end values to be 'hi' (b/c our goal is the
+ *     BWT transform, we're biased toward considring prefixes as
+ *     lexicographically *greater* than their extensions).
+ *  2. The multikey_qsort_suffixes version takes a single host string
+ *     and a list of suffix offsets as input.  This reduces memory
+ *     footprint compared to an approach that treats its input
+ *     generically as a set of strings (not necessarily suffixes), thus
+ *     requiring that we store at least two integers worth of
+ *     information for each string.
+ *  3. Sorting functions take an extra "upto" parameter that upper-
+ *     bounds the depth to which the function sorts.
+ *
+ * TODO: Consult a tie-breaker (like a difference cover sample) if two
+ * keys share a long prefix.
+ */
+template<typename T>
+void mkeyQSortSuf(
+	const T& host,
+	size_t hlen,
+	TIndexOffU *s,
+	size_t slen,
+	int hi,
+	size_t begin,
+	size_t end,
+	size_t depth,
+	size_t upto = OFF_MASK)
+{
+	// host/hlen: text whose suffixes are being sorted.  s/slen: array of
+	// suffix offsets into host.  hi: the "off-the-end" character value,
+	// which sorts above every real character.  [begin, end): subrange of
+	// s to sort.  depth: character position at which comparisons start.
+	// upto: do not sort past this depth (OFF_MASK = sort fully).
+	// Helper for making the recursive call; sanity-checks arguments to
+	// make sure that the problem actually got smaller.
+	#define MQS_RECURSE_SUF(nbegin, nend, ndepth) { \
+		assert(nbegin > begin || nend < end || ndepth > depth); \
+		if(ndepth < upto) { /* don't exceed depth of 'upto' */ \
+			mkeyQSortSuf(host, hlen, s, slen, hi, nbegin, nend, ndepth, upto); \
+		} \
+	}
+	assert_leq(begin, slen);
+	assert_leq(end, slen);
+	size_t a, b, c, d, /*e,*/ r;
+	size_t n = end - begin;
+	if(n <= 1) return;                 // 1-element list already sorted
+	CHOOSE_AND_SWAP_PIVOT(SWAP1, CHAR_AT_SUF); // pick pivot, swap it into [begin]
+	int v = CHAR_AT_SUF(begin, depth); // v <- randomly-selected pivot value
+	#ifndef NDEBUG
+	{
+		bool stillInBounds = false;
+		for(size_t i = begin; i < end; i++) {
+			if(depth < (hlen-s[i])) {
+				stillInBounds = true;
+				break;
+			} else { /* already fell off this suffix */ }
+		}
+		assert(stillInBounds); // >=1 suffix must still be in bounds
+	}
+	#endif
+	// Three-way (Bentley-Sedgewick) partition on the character at
+	// 'depth': a <-pivot block, =-pivot block, >-pivot block.
+	a = b = begin;
+	c = d = end-1;
+	while(true) {
+		// Invariant: everything before a is = pivot, everything
+		// between a and b is <
+		int bc = 0; // shouldn't have to init but gcc on Mac complains
+		while(b <= c && v >= (bc = CHAR_AT_SUF(b, depth))) {
+			if(v == bc) {
+				SWAP(s, a, b); a++;
+			}
+			b++;
+		}
+		// Invariant: everything after d is = pivot, everything
+		// between c and d is >
+		int cc = 0; // shouldn't have to init but gcc on Mac complains
+		while(b <= c && v <= (cc = CHAR_AT_SUF(c, depth))) {
+			if(v == cc) {
+				SWAP(s, c, d); d--;
+			}
+			c--;
+		}
+		if(b > c) break;
+		SWAP(s, b, c);
+		b++;
+		c--;
+	}
+	assert(a > begin || c < end-1);                      // there was at least one =s
+	assert_lt(d-c, n); // they can't all have been > pivot
+	assert_lt(b-a, n); // they can't all have been < pivot
+	assert(assertPartitionedSuf(host, s, slen, hi, v, begin, end, depth));  // check pre-=-swap invariant
+	r = min(a-begin, b-a); VECSWAP(s, begin, b-r,   r);  // swap left = to center
+	r = min(d-c, end-d-1); VECSWAP(s, b,     end-r, r);  // swap right = to center
+	assert(assertPartitionedSuf2(host, s, slen, hi, v, begin, end, depth)); // check post-=-swap invariant
+	r = b-a; // r <- # of <'s
+	if(r > 0) {
+		MQS_RECURSE_SUF(begin, begin + r, depth); // recurse on <'s
+	}
+	// Do not recurse on ='s if the pivot was the off-the-end value;
+	// they're already fully sorted
+	if(v != hi) {
+		MQS_RECURSE_SUF(begin + r, begin + r + (a-begin) + (end-d-1), depth+1); // recurse on ='s
+	}
+	r = d-c; // r <- # of >'s excluding those exhausted
+	// (when v == hi-1 the >-block holds only off-the-end values, so it
+	// is presumably already resolved and needs no recursion)
+	if(r > 0 && v < hi-1) {
+		MQS_RECURSE_SUF(end-r, end, depth); // recurse on >'s
+	}
+}
+
+/**
+ * Toplevel function for multikey quicksort over suffixes.
+ */
+template<typename T>
+void mkeyQSortSuf(
+	const T& host,
+	TIndexOffU *s,
+	size_t slen,
+	int hi,
+	bool verbose = false,
+	bool sanityCheck = false,
+	size_t upto = OFF_MASK)
+{
+	// Sorts all of s[0, slen) starting at character depth 0, optionally
+	// sanity-checking the input offsets and the final ordering.
+	// NOTE: 'verbose' is accepted for interface symmetry but unused here.
+	size_t hlen = host.length();
+	assert_gt(slen, 0);
+	if(sanityCheck) sanityCheckInputSufs(s, slen);
+	mkeyQSortSuf(host, hlen, s, slen, hi, (size_t)0, slen, (size_t)0, upto);
+	if(sanityCheck) sanityCheckOrderedSufs(host, hlen, s, slen, upto);
+}
+
+/**
+ * Just like mkeyQSortSuf but all swaps are applied to s2 as well as s.
+ * This is a helpful variant if, for example, the caller would like to
+ * see how their input was permuted by the sort routine (in that case,
+ * the caller would let s2 be an array s2[] where s2 is the same length
+ * as s and s2[i] = i).
+ */
+// One pending sort subproblem for the iterative (work-list) variants below.
+struct QSortRange {
+    size_t begin;   // inclusive start index into the suffix-offset array
+    size_t end;     // exclusive end index
+    size_t depth;   // character depth at which to resume comparing
+};
+template<typename T>
+void mkeyQSortSuf2(
+                   const T& host,
+                   size_t hlen,
+                   TIndexOffU *s,
+                   size_t slen,
+                   TIndexOffU *s2,
+                   int hi,
+                   size_t _begin,
+                   size_t _end,
+                   size_t _depth,
+                   size_t upto = OFF_MASK,
+                   EList<size_t>* boundaries = NULL)
+{
+    // Iterative variant of mkeyQSortSuf: pending [begin, end) x depth
+    // subproblems are kept on an explicit stack (block_list) instead of
+    // the call stack.  Every swap applied to s is mirrored in s2, and if
+    // 'boundaries' is non-NULL the end offsets of buckets resolved at
+    // depth 'upto' (or of size 1) are appended to it.
+    ELList<QSortRange, 3, 1024> block_list;
+    while(true) {
+        size_t begin = 0, end = 0, depth = 0;
+        if(block_list.size() == 0) {
+            // First iteration: take the caller-supplied range
+            begin = _begin;
+            end = _end;
+            depth = _depth;
+        } else {
+            if(block_list.back().size() > 0) {
+                // Take the next pending subproblem at the deepest level
+                begin = block_list.back()[0].begin;
+                end = block_list.back()[0].end;
+                depth = block_list.back()[0].depth;
+                block_list.back().erase(0);
+            } else {
+                // Deepest level exhausted; pop it and resume shallower
+                block_list.resize(block_list.size() - 1);
+                if(block_list.size() == 0) {
+                    break;
+                }
+            }
+        }
+        if(depth == upto) {
+            // Depth bound reached: record the bucket boundary and stop
+            // refining this range
+            if(boundaries != NULL) {
+                (*boundaries).push_back(end);
+            }
+            continue;
+        }
+        assert_leq(begin, slen);
+        assert_leq(end, slen);
+        size_t a, b, c, d, /*e,*/ r;
+        size_t n = end - begin;
+        if(n <= 1) { // 1-element list already sorted
+            if(n == 1 && boundaries != NULL) {
+                boundaries->push_back(end);
+            }
+            continue;
+        }
+        CHOOSE_AND_SWAP_PIVOT(SWAP2, CHAR_AT_SUF); // pick pivot, swap it into [begin]
+        int v = CHAR_AT_SUF(begin, depth); // v <- randomly-selected pivot value
+#ifndef NDEBUG
+        {
+            bool stillInBounds = false;
+            for(size_t i = begin; i < end; i++) {
+                if(depth < (hlen-s[i])) {
+                    stillInBounds = true;
+                    break;
+                } else { /* already fell off this suffix */ }
+            }
+            assert(stillInBounds); // >=1 suffix must still be in bounds
+        }
+#endif
+        // Three-way partition on the character at 'depth', mirroring
+        // every swap into s2 via SWAP2/VECSWAP2
+        a = b = begin;
+        c = d = /*e =*/ end-1;
+        while(true) {
+            // Invariant: everything before a is = pivot, everything
+            // between a and b is <
+            int bc = 0; // shouldn't have to init but gcc on Mac complains
+            while(b <= c && v >= (bc = CHAR_AT_SUF(b, depth))) {
+                if(v == bc) {
+                    SWAP2(s, s2, a, b); a++;
+                }
+                b++;
+            }
+            // Invariant: everything after d is = pivot, everything
+            // between c and d is >
+            int cc = 0; // shouldn't have to init but gcc on Mac complains
+            while(b <= c && v <= (cc = CHAR_AT_SUF(c, depth))) {
+                if(v == cc) {
+                    SWAP2(s, s2, c, d); d--; /*e--;*/
+                }
+                //else if(c == e && v == hi) e--;
+                c--;
+            }
+            if(b > c) break;
+            SWAP2(s, s2, b, c);
+            b++;
+            c--;
+        }
+        assert(a > begin || c < end-1);                      // there was at least one =s
+        assert_lt(/*e*/d-c, n); // they can't all have been > pivot
+        assert_lt(b-a, n); // they can't all have been < pivot
+        assert(assertPartitionedSuf(host, s, slen, hi, v, begin, end, depth));  // check pre-=-swap invariant
+        r = min(a-begin, b-a); VECSWAP2(s, s2, begin, b-r,   r);  // swap left = to center
+        r = min(d-c, end-d-1); VECSWAP2(s, s2, b,     end-r, r);  // swap right = to center
+        assert(assertPartitionedSuf2(host, s, slen, hi, v, begin, end, depth)); // check post-=-swap invariant
+        r = b-a; // r <- # of <'s
+        // Push the child subproblems (in left-to-right order) onto a new
+        // work-list level rather than recursing
+        block_list.expand();
+        block_list.back().clear();
+        if(r > 0) { // recurse on <'s
+            block_list.back().expand();
+            block_list.back().back().begin = begin;
+            block_list.back().back().end = begin + r;
+            block_list.back().back().depth = depth;
+        }
+        // Do not recurse on ='s if the pivot was the off-the-end value;
+        // they're already fully sorted
+        if(v != hi) { // recurse on ='s
+            block_list.back().expand();
+            block_list.back().back().begin = begin + r;
+            block_list.back().back().end = begin + r + (a-begin) + (end-d-1);
+            block_list.back().back().depth = depth + 1;
+        }
+        r = d-c;   // r <- # of >'s excluding those exhausted
+        if(r > 0 && v < hi-1) { // recurse on >'s
+            block_list.back().expand();
+            block_list.back().back().begin = end - r;
+            block_list.back().back().end = end;
+            block_list.back().back().depth = depth;
+        }
+    }
+}
+
+/**
+ * Toplevel function for multikey quicksort over suffixes with double
+ * swapping.
+ */
+template<typename T>
+void mkeyQSortSuf2(
+                   const T& host,
+                   TIndexOffU *s,
+                   size_t slen,
+                   TIndexOffU *s2,
+                   int hi,
+                   bool verbose = false,
+                   bool sanityCheck = false,
+                   size_t upto = OFF_MASK,
+                   EList<size_t>* boundaries = NULL)
+{
+	// Sorts s[0, slen) while applying the same permutation to s2, so a
+	// caller that seeds s2[i] = i can recover how the sort permuted its
+	// input.  NOTE: 'verbose' is unused here, and unlike the single-array
+	// toplevel there is no assert_gt(slen, 0) — presumably slen == 0 is
+	// tolerated; confirm with callers.
+	size_t hlen = host.length();
+	if(sanityCheck) sanityCheckInputSufs(s, slen);
+	TIndexOffU *sOrig = NULL;
+	if(sanityCheck) {
+		// Snapshot of s so the permutation recorded in s2 can be verified
+		sOrig = new TIndexOffU[slen];
+		memcpy(sOrig, s, OFF_SIZE * slen);
+	}
+	mkeyQSortSuf2(host, hlen, s, slen, s2, hi, (size_t)0, slen, (size_t)0, upto, boundaries);
+	if(sanityCheck) {
+		sanityCheckOrderedSufs(host, hlen, s, slen, upto);
+		for(size_t i = 0; i < slen; i++) {
+			// s2 must map each output slot back to its original value
+			assert_eq(s[i], sOrig[s2[i]]);
+		}
+		delete[] sOrig;
+	}
+}
+
+// Ugly but necessary; otherwise the compiler chokes dramatically on
+// the DifferenceCoverSample<> template args to the next few functions
+template <typename T>
+class DifferenceCoverSample;
+
+/**
+ * Constant time
+ */
+template<typename T1, typename T2> inline
+bool sufDcLt(
+	const T1& host,
+	const T2& s1,
+	const T2& s2,
+	const DifferenceCoverSample<T1>& dc,
+	bool sanityCheck = false)
+{
+	// True iff the suffix of 'host' starting at s1 is lexicographically
+	// less than the one starting at s2.  tieBreakOff yields an offset
+	// (< dc.v()) at which the difference-cover sample can rank the two
+	// suffixes directly, making the comparison constant time.
+	size_t diff = dc.tieBreakOff(s1, s2);
+	ASSERT_ONLY(size_t hlen = host.length());
+	assert_lt(diff, dc.v());
+	assert_lt(diff, hlen-s1);
+	assert_lt(diff, hlen-s2);
+	if(sanityCheck) {
+		// The first 'diff' characters must be equal, else tieBreakOff lied
+		for(size_t i = 0; i < diff; i++) {
+			assert_eq(host[s1+i], host[s2+i]);
+		}
+	}
+	bool ret = dc.breakTie(s1+diff, s2+diff) < 0;
+#ifndef NDEBUG
+	// Cross-check against a direct (linear-time) suffix comparison
+	if(sanityCheck && ret != sstr_suf_lt(host, s1, hlen, host, s2, hlen, false)) {
+		assert(false);
+	}
+#endif
+	return ret;
+}
+
+/**
+ * k log(k)
+ */
+template<typename T> inline
+void qsortSufDc(
+	const T& host,
+	size_t hlen,
+	TIndexOffU* s,
+	size_t slen,
+	const DifferenceCoverSample<T>& dc,
+	size_t begin,
+	size_t end,
+	bool sanityCheck = false)
+{
+	// Plain quicksort of the suffix offsets in s[begin, end), using the
+	// difference-cover sample 'dc' for constant-time comparisons
+	// (sufDcLt); O(k log k) for k = end - begin.
+	assert_leq(end, slen);
+	assert_lt(begin, slen);
+	assert_gt(end, begin);
+	size_t n = end - begin;
+	if(n <= 1) return;                 // 1-element list already sorted
+	// Note: rand() didn't really cut it here; it seemed to run out of
+	// randomness and, after a time, returned the same thing over and
+	// over again
+	size_t a = (rand() % n) + begin; // choose pivot between begin and end
+	assert_lt(a, end);
+	assert_geq(a, begin);
+	SWAP(s, end-1, a); // move pivot to end
+	size_t cur = 0; // size of the <-pivot region grown at the front
+	for(size_t i = begin; i < end-1; i++) {
+		if(sufDcLt(host, s[i], s[end-1], dc, sanityCheck)) {
+			if(sanityCheck)
+				assert(dollarLt(suffix(host, s[i]), suffix(host, s[end-1])));
+			assert_lt(begin + cur, end-1);
+			SWAP(s, i, begin + cur);
+			cur++;
+		}
+	}
+	// Put pivot into place
+	assert_lt(cur, end-begin);
+	SWAP(s, end-1, begin+cur);
+	// NOTE(review): the recursive calls leave 'sanityCheck' at its
+	// default (false), so per-comparison checks run only at this level —
+	// confirm this is intentional and not an oversight.
+	if(begin+cur > begin) qsortSufDc(host, hlen, s, slen, dc, begin, begin+cur);
+	if(end > begin+cur+1) qsortSufDc(host, hlen, s, slen, dc, begin+cur+1, end);
+}
+
+/**
+ * Toplevel function for multikey quicksort over suffixes.
+ */
+template<typename T1, typename T2>
+void mkeyQSortSufDcU8(
+	const T1& host1,
+	const T2& host,
+	size_t hlen,
+	TIndexOffU* s,
+	size_t slen,
+	const DifferenceCoverSample<T1>& dc,
+	int hi,
+	bool verbose = false,
+	bool sanityCheck = false)
+{
+	// Toplevel wrapper: sorts all of s[0, slen) starting at depth 0 with
+	// optional input/output sanity checks.  NOTE: 'verbose' is accepted
+	// for interface symmetry but unused here.
+	if(sanityCheck) sanityCheckInputSufs(s, slen);
+	mkeyQSortSufDcU8(host1, host, hlen, s, slen, dc, hi, 0, slen, 0, sanityCheck);
+	if(sanityCheck) sanityCheckOrderedSufs(host1, hlen, s, slen, OFF_MASK);
+}
+
+/**
+ * Return a boolean indicating whether s1 < s2 using the difference
+ * cover to break the tie.
+ */
+template<typename T1, typename T2> inline
+bool sufDcLtU8(
+	const T1& host1,
+	const T2& host,
+	size_t hlen,
+	size_t s1,
+	size_t s2,
+	const DifferenceCoverSample<T1>& dc,
+	bool sanityCheck = false)
+{
+	hlen += 0; // no-op; presumably here to silence an unused-parameter warning in NDEBUG builds
+	size_t diff = dc.tieBreakOff((TIndexOffU)s1, (TIndexOffU)s2);
+	assert_lt(diff, dc.v());
+	assert_lt(diff, hlen-s1);
+	assert_lt(diff, hlen-s2);
+	if(sanityCheck) {
+		// NOTE(review): host[s1+i] is compared against host1[s2+i] —
+		// this assumes host and host1 hold the same text in different
+		// representations; confirm against the callers.
+		for(size_t i = 0; i < diff; i++) {
+			assert_eq(host[s1+i], host1[s2+i]);
+		}
+	}
+	bool ret = dc.breakTie((TIndexOffU)(s1+diff), (TIndexOffU)(s2+diff)) < 0;
+	// Sanity-check return value using dollarLt
+#ifndef NDEBUG
+	bool ret2 = sstr_suf_lt(host1, s1, hlen, host, s2, hlen, false);
+	assert(!sanityCheck || ret == ret2);
+#endif
+	return ret;
+}
+
+/**
+ * k log(k)
+ */
+template<typename T1, typename T2> inline
+void qsortSufDcU8(
+	const T1& host1,
+	const T2& host,
+	size_t hlen,
+	TIndexOffU* s,
+	size_t slen,
+	const DifferenceCoverSample<T1>& dc,
+	size_t begin,
+	size_t end,
+	bool sanityCheck = false)
+{
+	// uint8_t-host analogue of qsortSufDc: quicksort of s[begin, end)
+	// with constant-time comparisons via sufDcLtU8; O(k log k) for
+	// k = end - begin.
+	assert_leq(end, slen);
+	assert_lt(begin, slen);
+	assert_gt(end, begin);
+	size_t n = end - begin;
+	if(n <= 1) return;                 // 1-element list already sorted
+	// Note: rand() didn't really cut it here; it seemed to run out of
+	// randomness and, after a time, returned the same thing over and
+	// over again
+	size_t a = (rand() % n) + begin; // choose pivot between begin and end
+	assert_lt(a, end);
+	assert_geq(a, begin);
+	SWAP(s, end-1, a); // move pivot to end
+	size_t cur = 0; // size of the <-pivot region grown at the front
+	for(size_t i = begin; i < end-1; i++) {
+		if(sufDcLtU8(host1, host, hlen, s[i], s[end-1], dc, sanityCheck)) {
+#ifndef NDEBUG
+			if(sanityCheck) {
+				assert(sstr_suf_lt(host1, s[i], hlen, host1, s[end-1], hlen, false));
+			}
+			assert_lt(begin + cur, end-1);
+#endif
+			SWAP(s, i, begin + cur);
+			cur++;
+		}
+	}
+	// Put pivot into place
+	assert_lt(cur, end-begin);
+	SWAP(s, end-1, begin+cur);
+	// NOTE(review): as in qsortSufDc, 'sanityCheck' is not propagated to
+	// the recursive calls (defaults to false) — confirm intentional.
+	if(begin+cur > begin) qsortSufDcU8(host1, host, hlen, s, slen, dc, begin, begin+cur);
+	if(end > begin+cur+1) qsortSufDcU8(host1, host, hlen, s, slen, dc, begin+cur+1, end);
+}
+
+#define BUCKET_SORT_CUTOFF (4 * 1024 * 1024)
+#define SELECTION_SORT_CUTOFF 6
+
+/**
+ * Straightforwardly obtain a uint8_t-ized version of t[off].  This
+ * works fine as long as TStr is not packed.
+ */
+template<typename TStr>
+inline uint8_t get_uint8(const TStr& t, size_t off) {
+	return t[off]; // direct element access; fine as long as TStr is not packed
+}
+
+/**
+ * For incomprehensible generic-programming reasons, getting a uint8_t
+ * version of a character in a packed String<> requires casting first
+ * to Dna then to uint8_t.
+ */
+template<>
+inline uint8_t get_uint8<S2bDnaString>(const S2bDnaString& t, size_t off) {
+	return (uint8_t)t[off]; // explicit cast required for the packed 2-bit representation
+}
+
+/**
+ * Return character at offset 'off' from the 'si'th suffix in the array
+ * 's' of suffixes.  If the character is out-of-bounds, return hi.
+ */
+template<typename TStr>
+static inline int char_at_suf_u8(
+	const TStr& host,
+	size_t hlen,
+	TIndexOffU* s,
+	size_t si,
+	size_t off,
+	uint8_t hi)
+{
+	// In bounds: the character at offset 'off' of the si'th suffix;
+	// off the end of host: 'hi', the sentinel that sorts above all
+	// real characters.
+	return ((off+s[si]) < hlen) ? get_uint8(host, off+s[si]) : (hi);
+}
+
+template<typename T1, typename T2>
+static void selectionSortSufDcU8(
+		const T1& host1,
+		const T2& host,
+        size_t hlen,
+        TIndexOffU* s,
+        size_t slen,
+        const DifferenceCoverSample<T1>& dc,
+        uint8_t hi,
+        size_t begin,
+        size_t end,
+        size_t depth,
+        bool sanityCheck = false)
+{
+	// Selection sort for tiny ranges (end-begin <= SELECTION_SORT_CUTOFF):
+	// repeatedly find the smallest remaining suffix by comparing up to
+	// 'lim' characters starting at 'depth', falling back on the
+	// difference cover (sufDcLtU8) when two suffixes agree that far.
+#define ASSERT_SUF_LT(l, r) \
+	if(sanityCheck && \
+	   !sstr_suf_lt(host1, s[l], hlen, host1, s[r], hlen, false)) { \
+		assert(false); \
+	}
+
+	assert_gt(end, begin+1);
+	assert_leq(end-begin, SELECTION_SORT_CUTOFF);
+	assert_eq(hi, 4);
+	size_t v = dc.v();
+	if(end == begin+2) {
+		// Two-element special case: the difference cover can tell us the
+		// exact offset at which the two suffixes first become rankable
+		size_t off = dc.tieBreakOff(s[begin], s[begin+1]);
+		if(off + s[begin] >= hlen ||
+		   off + s[begin+1] >= hlen)
+		{
+			off = OFF_MASK;
+		}
+		if(off != OFF_MASK) {
+			if(off < depth) {
+				// Tie-break point already passed; rank via the cover
+				qsortSufDcU8<T1,T2>(host1, host, hlen, s, slen, dc,
+				                    begin, end, sanityCheck);
+				// It's helpful for debugging if we call this here
+				if(sanityCheck) {
+					sanityCheckOrderedSufs(host1, hlen, s, slen,
+					                       OFF_MASK, begin, end);
+				}
+				return;
+			}
+			v = off - depth + 1;
+		}
+	}
+	assert_leq(v, dc.v());
+	size_t lim = v; // max # of characters to compare before tie-breaking
+	assert_geq(lim, 0);
+	for(size_t i = begin; i < end-1; i++) {
+		// targ tracks the smallest suffix seen so far in s[i, end)
+		size_t targ = i;
+		size_t targoff = depth + s[i];
+		for(size_t j = i+1; j < end; j++) {
+			assert_neq(j, targ);
+			size_t joff = depth + s[j];
+			size_t k;
+			for(k = 0; k <= lim; k++) {
+				assert_neq(j, targ);
+				uint8_t jc = (k + joff < hlen)    ? get_uint8(host, k + joff)    : hi;
+				uint8_t tc = (k + targoff < hlen) ? get_uint8(host, k + targoff) : hi;
+				assert(jc != hi || tc != hi);
+				if(jc > tc) {
+					// the jth suffix is greater than the current
+					// smallest suffix
+					ASSERT_SUF_LT(targ, j);
+					break;
+				} else if(jc < tc) {
+					// the jth suffix is less than the current smallest
+					// suffix, so update smallest to be j
+					ASSERT_SUF_LT(j, targ);
+					targ = j;
+					targoff = joff;
+					break;
+				} else if(k == lim) {
+					// Check whether either string ends immediately
+					// after this character
+					assert_leq(k + joff + 1, hlen);
+					assert_leq(k + targoff + 1, hlen);
+					if(k + joff + 1 == hlen) {
+						// targ < j
+						assert_neq(k + targoff + 1, hlen);
+						ASSERT_SUF_LT(targ, j);
+						break;
+					} else if(k + targoff + 1 == hlen) {
+						// j < targ
+						ASSERT_SUF_LT(j, targ);
+						targ = j;
+						targoff = joff;
+						break;
+					}
+				} else {
+					// They're equal so far, keep going
+				}
+			}
+			// The jth suffix was equal to the current smallest suffix
+			// up to the difference-cover period, so disambiguate with
+			// difference cover
+			if(k == lim+1) {
+				assert_neq(j, targ);
+				if(sufDcLtU8(host1, host, hlen, s[j], s[targ], dc, sanityCheck)) {
+					// j < targ
+					assert(!sufDcLtU8(host1, host, hlen, s[targ], s[j], dc, sanityCheck));
+					ASSERT_SUF_LT(j, targ);
+					targ = j;
+					targoff = joff;
+				} else {
+					assert(sufDcLtU8(host1, host, hlen, s[targ], s[j], dc, sanityCheck));
+					ASSERT_SUF_LT(targ, j); // !
+				}
+			}
+		}
+		if(i != targ) {
+			ASSERT_SUF_LT(targ, i);
+			// swap i and targ
+			TIndexOffU tmp = s[i];
+			s[i] = s[targ];
+			s[targ] = tmp;
+		}
+		for(size_t j = i+1; j < end; j++) {
+			ASSERT_SUF_LT(i, j);
+		}
+	}
+	if(sanityCheck) {
+		sanityCheckOrderedSufs(host1, hlen, s, slen, OFF_MASK, begin, end);
+	}
+}
+
+template<typename T1, typename T2>
+static void bucketSortSufDcU8(
+		const T1& host1,
+		const T2& host,
+        size_t hlen,
+        TIndexOffU* s,
+        size_t slen,
+        const DifferenceCoverSample<T1>& dc,
+        uint8_t hi,
+        size_t _begin,
+        size_t _end,
+        size_t _depth,
+        bool sanityCheck = false)
+{
+    // Iterative radix/bucket sort on one character at a time.  Four
+    // heap-allocated buckets of BUCKET_SORT_CUTOFF entries each hold the
+    // suffixes whose character at the current depth is C, G, T or the
+    // off-the-end sentinel; suffixes with character 0 ('A') are compacted
+    // into s in place.  (Original comment said "5 64-element buckets",
+    // which does not match the code.)
+    TIndexOffU* bkts[4];
+    for(size_t i = 0; i < 4; i++) {
+        bkts[i] = new TIndexOffU[4 * 1024 * 1024];
+    }
+    // Work list: each level stores the boundaries of the buckets created
+    // at that depth; depth is implied by the list's current size
+    ELList<size_t, 5, 1024> block_list;
+    while(true) {
+        size_t begin = 0, end = 0;
+        if(block_list.size() == 0) {
+            begin = _begin;
+            end = _end;
+        } else {
+            if(block_list.back().size() > 1) {
+                // Pop the rightmost pending [begin, end) at this level
+                end = block_list.back().back(); block_list.back().pop_back();
+                begin = block_list.back().back();
+            } else {
+                block_list.resize(block_list.size() - 1);
+                if(block_list.size() == 0) {
+                    break;
+                }
+            }
+        }
+        size_t depth = block_list.size() + _depth;
+        assert_leq(end-begin, BUCKET_SORT_CUTOFF);
+        assert_eq(hi, 4);
+        if(end <= begin + 1) { // 1-element list already sorted
+            continue;
+        }
+        if(depth > dc.v()) {
+            // Quicksort the remaining suffixes using difference cover
+            // for constant-time comparisons; this is O(k*log(k)) where
+            // k=(end-begin)
+            qsortSufDcU8<T1,T2>(host1, host, hlen, s, slen, dc, begin, end, sanityCheck);
+            continue;
+        }
+        if(end-begin <= SELECTION_SORT_CUTOFF) {
+            // Bucket sort remaining items
+            selectionSortSufDcU8(host1, host, hlen, s, slen, dc, hi,
+                                 begin, end, depth, sanityCheck);
+            if(sanityCheck) {
+                sanityCheckOrderedSufs(host1, hlen, s, slen,
+                                       OFF_MASK, begin, end);
+            }
+            continue;
+        }
+        // Distribute s[begin, end) into buckets by the character at
+        // 'depth'; cnts[c] counts occupancy of character c (0..4)
+        size_t cnts[] = { 0, 0, 0, 0, 0 };
+        for(size_t i = begin; i < end; i++) {
+            size_t off = depth + s[i];
+            uint8_t c = (off < hlen) ? get_uint8(host, off) : hi;
+            assert_leq(c, 4);
+            if(c == 0) {
+                s[begin + cnts[0]++] = s[i];
+            } else {
+                bkts[c-1][cnts[c]++] = s[i];
+            }
+        }
+        assert_eq(cnts[0] + cnts[1] + cnts[2] + cnts[3] + cnts[4], end - begin);
+        size_t cur = begin + cnts[0];
+        // Copy buckets back in character order; the shift computes the
+        // byte count: cnts[x] << (OFF_SIZE/4 + 1) == cnts[x] * OFF_SIZE
+        // for OFF_SIZE of 4 or 8
+        if(cnts[1] > 0) { memcpy(&s[cur], bkts[0], cnts[1] << (OFF_SIZE/4 + 1)); cur += cnts[1]; }
+        if(cnts[2] > 0) { memcpy(&s[cur], bkts[1], cnts[2] << (OFF_SIZE/4 + 1)); cur += cnts[2]; }
+        if(cnts[3] > 0) { memcpy(&s[cur], bkts[2], cnts[3] << (OFF_SIZE/4 + 1)); cur += cnts[3]; }
+        if(cnts[4] > 0) { memcpy(&s[cur], bkts[3], cnts[4] << (OFF_SIZE/4 + 1)); }
+        // This frame is now totally finished with bkts[][], so recursive
+        // callees can safely clobber it; we're not done with cnts[], but
+        // that's local to the stack frame.
+        block_list.expand();
+        block_list.back().clear();
+        block_list.back().push_back(begin);
+        for(size_t i = 0; i < 4; i++) {
+            if(cnts[i] > 0) {
+                block_list.back().push_back(block_list.back().back() + cnts[i]);
+            }
+        }
+    }
+    // Done
+    
+    for(size_t i = 0; i < 4; i++) {
+        delete [] bkts[i];
+    }
+}
+
+/**
+ * Main multikey quicksort function for suffixes.  Based on Bentley &
+ * Sedgewick's algorithm on p.5 of their paper "Fast Algorithms for
+ * Sorting and Searching Strings".  That algorithm has been extended in
+ * three ways:
+ *
+ *  1. Deal with keys of different lengths by checking bounds and
+ *     considering off-the-end values to be 'hi' (b/c our goal is the
+ *     BWT transform, we're biased toward considering prefixes as
+ *     lexicographically *greater* than their extensions).
+ *  2. The multikey_qsort_suffixes version takes a single host string
+ *     and a list of suffix offsets as input.  This reduces memory
+ *     footprint compared to an approach that treats its input
+ *     generically as a set of strings (not necessarily suffixes), thus
+ *     requiring that we store at least two integers worth of
+ *     information for each string.
+ *  3. Sorting functions take an extra "upto" parameter that upper-
+ *     bounds the depth to which the function sorts.
+ */
+template<typename T1, typename T2>
+void mkeyQSortSufDcU8(
+	const T1& host1,
+	const T2& host,
+	size_t hlen,
+	TIndexOffU* s,
+	size_t slen,
+	const DifferenceCoverSample<T1>& dc,
+	int hi,
+	size_t begin,
+	size_t end,
+	size_t depth,
+	bool sanityCheck = false)
+{
+	// Multikey quicksort over the suffix offsets in s[begin, end),
+	// switching to difference-cover quicksort once depth exceeds dc.v()
+	// and to bucket sort for ranges of at most BUCKET_SORT_CUTOFF.
+	// Helper for making the recursive call; sanity-checks arguments to
+	// make sure that the problem actually got smaller.
+	#define MQS_RECURSE_SUF_DC_U8(nbegin, nend, ndepth) { \
+		assert(nbegin > begin || nend < end || ndepth > depth); \
+		mkeyQSortSufDcU8(host1, host, hlen, s, slen, dc, hi, nbegin, nend, ndepth, sanityCheck); \
+	}
+	assert_leq(begin, slen);
+	assert_leq(end, slen);
+	size_t n = end - begin;
+	if(n <= 1) return; // 1-element list already sorted
+	if(depth > dc.v()) {
+		// Quicksort the remaining suffixes using difference cover
+		// for constant-time comparisons; this is O(k*log(k)) where
+		// k=(end-begin)
+		qsortSufDcU8<T1,T2>(host1, host, hlen, s, slen, dc, begin, end, sanityCheck);
+		if(sanityCheck) {
+			sanityCheckOrderedSufs(host1, hlen, s, slen, OFF_MASK, begin, end);
+		}
+		return;
+	}
+	if(n <= BUCKET_SORT_CUTOFF) {
+		// Bucket sort remaining items
+		bucketSortSufDcU8(host1, host, hlen, s, slen, dc,
+		                  (uint8_t)hi, begin, end, depth, sanityCheck);
+		if(sanityCheck) {
+			sanityCheckOrderedSufs(host1, hlen, s, slen, OFF_MASK, begin, end);
+		}
+		return;
+	}
+	// Three-way (Bentley-Sedgewick) partition on the character at 'depth'
+	size_t a, b, c, d, r;
+	CHOOSE_AND_SWAP_PIVOT(SWAP1, CHAR_AT_SUF_U8); // choose pivot, swap to begin
+	int v = CHAR_AT_SUF_U8(begin, depth); // v <- pivot value
+	#ifndef NDEBUG
+	{
+		bool stillInBounds = false;
+		for(size_t i = begin; i < end; i++) {
+			if(depth < (hlen-s[i])) {
+				stillInBounds = true;
+				break;
+			} else { /* already fell off this suffix */ }
+		}
+		assert(stillInBounds); // >=1 suffix must still be in bounds
+	}
+	#endif
+	a = b = begin;
+	c = d = end-1;
+	while(true) {
+		// Invariant: everything before a is = pivot, everything
+		// between a and b is <
+		int bc = 0; // shouldn't have to init but gcc on Mac complains
+		while(b <= c && v >= (bc = CHAR_AT_SUF_U8(b, depth))) {
+			if(v == bc) {
+				SWAP(s, a, b); a++;
+			}
+			b++;
+		}
+		// Invariant: everything after d is = pivot, everything
+		// between c and d is >
+		int cc = 0; // shouldn't have to init but gcc on Mac complains
+		//bool hiLatch = true;
+		while(b <= c && v <= (cc = CHAR_AT_SUF_U8(c, depth))) {
+			if(v == cc) {
+				SWAP(s, c, d); d--;
+			}
+			//else if(hiLatch && cc == hi) { }
+			c--;
+		}
+		if(b > c) break;
+		SWAP(s, b, c);
+		b++;
+		c--;
+	}
+	assert(a > begin || c < end-1);                      // there was at least one =s
+	assert_lt(d-c, n); // they can't all have been > pivot
+	assert_lt(b-a, n); // they can't all have been < pivot
+	r = min(a-begin, b-a); VECSWAP(s, begin, b-r,   r);  // swap left = to center
+	r = min(d-c, end-d-1); VECSWAP(s, b,     end-r, r);  // swap right = to center
+	r = b-a; // r <- # of <'s
+	if(r > 0) {
+		MQS_RECURSE_SUF_DC_U8(begin, begin + r, depth); // recurse on <'s
+	}
+	// Do not recurse on ='s if the pivot was the off-the-end value;
+	// they're already fully sorted
+	if(v != hi) {
+		MQS_RECURSE_SUF_DC_U8(begin + r, begin + r + (a-begin) + (end-d-1), depth+1); // recurse on ='s
+	}
+	r = d-c; // r <- # of >'s excluding those exhausted
+	if(r > 0 && v < hi-1) {
+		MQS_RECURSE_SUF_DC_U8(end-r, end, depth); // recurse on >'s
+	}
+}
+
+
+#endif /*MULTIKEY_QSORT_H_*/
diff --git a/opts.h b/opts.h
new file mode 100644
index 0000000..cd00d40
--- /dev/null
+++ b/opts.h
@@ -0,0 +1,179 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef OPTS_H_
+#define OPTS_H_
+
+enum {
+	// Identifiers for long command-line options; values start at 256,
+	// above the printable-ASCII range used by short (single-character)
+	// option codes.
+	ARG_ORIG = 256,             // --orig
+	ARG_SEED,                   // --seed
+	ARG_SOLEXA_QUALS,           // --solexa-quals
+	ARG_VERBOSE,                // --verbose
+	ARG_STARTVERBOSE,           // --startverbose
+	ARG_QUIET,                  // --quiet
+	ARG_METRIC_IVAL,            // --met
+	ARG_METRIC_FILE,            // --met-file
+	ARG_METRIC_STDERR,          // --met-stderr
+	ARG_METRIC_PER_READ,        // --met-per-read
+	ARG_REFIDX,                 // --refidx
+	ARG_SANITY,                 // --sanity
+	ARG_PARTITION,              // --partition
+	ARG_INTEGER_QUALS,          // --int-quals
+	ARG_FILEPAR,                // --filepar
+	ARG_SHMEM,                  // --shmem
+	ARG_MM,                     // --mm
+	ARG_MMSWEEP,                // --mmsweep
+	ARG_FF,                     // --ff
+	ARG_FR,                     // --fr
+	ARG_RF,                     // --rf
+	ARG_NO_MIXED,               // --no-mixed
+	ARG_NO_DISCORDANT,          // --no-discordant
+	ARG_CACHE_LIM,              // --
+	ARG_CACHE_SZ,               // --
+	ARG_NO_FW,                  // --nofw
+	ARG_NO_RC,                  // --norc
+	ARG_SKIP,                   // --skip
+	ARG_ONETWO,                 // --12
+	ARG_PHRED64,                // --phred64
+	ARG_PHRED33,                // --phred33
+	ARG_HADOOPOUT,              // --hadoopout
+	ARG_FUZZY,                  // --fuzzy
+	ARG_FULLREF,                // --fullref
+	ARG_USAGE,                  // --usage
+	ARG_SNPPHRED,               // --snpphred
+	ARG_SNPFRAC,                // --snpfrac
+	ARG_SAM_NO_QNAME_TRUNC,     // --sam-no-qname-trunc
+	ARG_SAM_OMIT_SEC_SEQ,       // --sam-omit-sec-seq
+	ARG_SAM_NOHEAD,             // --sam-noHD/--sam-nohead
+	ARG_SAM_NOSQ,               // --sam-nosq/--sam-noSQ
+	ARG_SAM_RG,                 // --sam-rg
+	ARG_SAM_RGID,               // --sam-rg-id
+	ARG_GAP_BAR,                // --gbar
+	ARG_QUALS1,                 // --Q1
+	ARG_QUALS2,                 // --Q2
+	ARG_QSEQ,                   // --qseq
+	ARG_SEED_SUMM,              // --seed-summary
+	ARG_OVERHANG,               // --overhang
+	ARG_NO_CACHE,               // --no-cache
+	ARG_USE_CACHE,              // --cache
+	ARG_NOISY_HPOLY,            // --454/--ion-torrent
+	ARG_LOCAL,                  // --local
+	ARG_END_TO_END,             // --end-to-end
+	ARG_SCAN_NARROWED,          // --scan-narrowed
+	ARG_QC_FILTER,              // --qc-filter
+	ARG_BWA_SW_LIKE,            // --bwa-sw-like
+	ARG_MULTISEED_IVAL,         // --multiseed
+	ARG_SCORE_MIN,              // --score-min
+	ARG_SCORE_MA,               // --ma
+	ARG_SCORE_MMP,              // --mp (mismatch penalty; prior comment said --mm, which is ARG_MM's flag)
+	ARG_SCORE_NP,               // --np (prior comment said --nm)
+	ARG_SCORE_RDG,              // --rdg
+	ARG_SCORE_RFG,              // --rfg
+	ARG_N_CEIL,                 // --n-ceil
+	ARG_DPAD,                   // --dpad
+	ARG_SAM_PRINT_YI,           // --mapq-print-inputs
+	ARG_ALIGN_POLICY,           // --policy
+	ARG_PRESET_VERY_FAST,       // --very-fast
+	ARG_PRESET_FAST,            // --fast
+	ARG_PRESET_SENSITIVE,       // --sensitive
+	ARG_PRESET_VERY_SENSITIVE,  // --very-sensitive
+	ARG_PRESET_VERY_FAST_LOCAL,      // --very-fast-local
+	ARG_PRESET_FAST_LOCAL,           // --fast-local
+	ARG_PRESET_SENSITIVE_LOCAL,      // --sensitive-local
+	ARG_PRESET_VERY_SENSITIVE_LOCAL, // --very-sensitive-local
+	ARG_NO_SCORE_PRIORITY,      // --no-score-priority
+	ARG_IGNORE_QUALS,           // --ignore-quals
+	ARG_DESC,                   // --arg-desc
+	ARG_TAB5,                   // --tab5
+	ARG_TAB6,                   // --tab6
+	ARG_WRAPPER,                // --wrapper
+	ARG_DOVETAIL,               // --dovetail
+	ARG_NO_DOVETAIL,            // --no-dovetail
+	ARG_CONTAIN,                // --contain
+	ARG_NO_CONTAIN,             // --no-contain
+	ARG_OVERLAP,                // --overlap
+	ARG_NO_OVERLAP,             // --no-overlap
+	ARG_MAPQ_V,                 // --mapq-v
+	ARG_SSE8,                   // --sse8
+	ARG_SSE8_NO,                // --no-sse8
+	ARG_UNGAPPED,               // --ungapped
+	ARG_UNGAPPED_NO,            // --no-ungapped
+	ARG_TIGHTEN,                // --tighten
+	ARG_UNGAP_THRESH,           // --ungap-thresh
+	ARG_EXACT_UPFRONT,          // --exact-upfront
+	ARG_1MM_UPFRONT,            // --1mm-upfront
+	ARG_EXACT_UPFRONT_NO,       // --no-exact-upfront
+	ARG_1MM_UPFRONT_NO,         // --no-1mm-upfront
+	ARG_1MM_MINLEN,             // --1mm-minlen
+	ARG_VERSION,                // --version
+	ARG_SEED_OFF,               // --seed-off
+	ARG_SEED_BOOST_THRESH,      // --seed-boost
+	ARG_READ_TIMES,             // --read-times
+	ARG_EXTEND_ITERS,           // --extends
+	ARG_DP_MATE_STREAK_THRESH,  // --db-mate-streak
+	ARG_DP_FAIL_STREAK_THRESH,  // --dp-fail-streak
+	ARG_UG_FAIL_STREAK_THRESH,  // --ug-fail-streak
+	ARG_EE_FAIL_STREAK_THRESH,  // --ee-fail-streak
+	ARG_DP_FAIL_THRESH,         // --dp-fails
+	ARG_UG_FAIL_THRESH,         // --ug-fails
+	ARG_MAPQ_EX,                // --mapq-extra
+	ARG_NO_EXTEND,              // --no-extend
+	ARG_REORDER,                // --reorder
+	ARG_SHOW_RAND_SEED,         // --show-rand-seed
+	ARG_READ_PASSTHRU,          // --passthrough
+	ARG_SAMPLE,                 // --sample
+	ARG_CP_MIN,                 // --cp-min
+	ARG_CP_IVAL,                // --cp-ival
+	ARG_TRI,                    // --tri
+	ARG_LOCAL_SEED_CACHE_SZ,    // --local-seed-cache-sz
+	ARG_CURRENT_SEED_CACHE_SZ,  // --seed-cache-sz
+	ARG_SAM_NO_UNAL,            // --no-unal
+	ARG_NON_DETERMINISTIC,      // --non-deterministic
+	ARG_TEST_25,                // --test-25
+	ARG_DESC_KB,                // --desc-kb
+	ARG_DESC_LANDING,           // --desc-landing
+	ARG_DESC_EXP,               // --desc-exp
+	ARG_DESC_FMOPS,             // --desc-fmops
+    // HISAT/centrifuge-specific options below (flag names not recorded
+    // in the original comments)
+    ARG_NO_TEMPSPLICESITE,
+    ARG_PEN_CANSPLICE,
+    ARG_PEN_NONCANSPLICE,
+    ARG_PEN_CONFLICTSPLICE,
+    ARG_PEN_INTRONLEN,
+    ARG_KNOWN_SPLICESITE_INFILE,
+    ARG_NOVEL_SPLICESITE_INFILE,
+    ARG_NOVEL_SPLICESITE_OUTFILE,
+    ARG_SECONDARY,
+    ARG_NO_SPLICED_ALIGNMENT,
+    ARG_RNA_STRANDNESS,
+    ARG_SPLICESITE_DB_ONLY,
+    ARG_MIN_HITLEN,              // --min-hitlen
+    ARG_MIN_TOTALLEN,            // --min-totallen
+    ARG_HOST_TAXIDS,             // --host-taxids
+	ARG_REPORT_FILE,             // --report
+    ARG_NO_ABUNDANCE,            // --no-abundance
+    ARG_NO_TRAVERSE,             // --no-traverse
+    ARG_CLASSIFICATION_RANK,
+    ARG_EXCLUDE_TAXIDS,
+#ifdef USE_SRA
+    ARG_SRA_ACC,
+#endif
+};
+
+#endif
+
diff --git a/outq.cpp b/outq.cpp
new file mode 100644
index 0000000..1141675
--- /dev/null
+++ b/outq.cpp
@@ -0,0 +1,201 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "outq.h"
+
+/**
+ * Caller is telling us that they're about to write output record(s) for
+ * the read with the given id.
+ */
+void OutputQueue::beginRead(TReadId rdid, size_t threadId) {
+	ThreadSafe t(&mutex_m, threadSafe_);
+	nstarted_++;
+	if(reorder_) {
+		assert_geq(rdid, cur_);
+		assert_eq(lines_.size(), finished_.size());
+		assert_eq(lines_.size(), started_.size());
+		if(rdid - cur_ >= lines_.size()) {
+			// Make sure there's enough room in lines_, started_ and finished_
+			size_t oldsz = lines_.size();
+			lines_.resize(rdid - cur_ + 1);
+			started_.resize(rdid - cur_ + 1);
+			finished_.resize(rdid - cur_ + 1);
+			for(size_t i = oldsz; i < lines_.size(); i++) {
+				started_[i] = finished_[i] = false;
+			}
+		}
+		started_[rdid - cur_] = true;
+		finished_[rdid - cur_] = false;
+	}
+}
+
+/**
+ * Writer is finished writing to 
+ */
+void OutputQueue::finishRead(const BTString& rec, TReadId rdid, size_t threadId) {
+	ThreadSafe t(&mutex_m, threadSafe_);
+	if(reorder_) {
+		assert_geq(rdid, cur_);
+		assert_eq(lines_.size(), finished_.size());
+		assert_eq(lines_.size(), started_.size());
+		assert_lt(rdid - cur_, lines_.size());
+		assert(started_[rdid - cur_]);
+		assert(!finished_[rdid - cur_]);
+		lines_[rdid - cur_] = rec;
+		nfinished_++;
+		finished_[rdid - cur_] = true;
+		flush(false, false); // don't force; already have lock
+	} else {
+		// obuf_ is the OutFileBuf for the output file
+		obuf_.writeString(rec);
+		nfinished_++;
+		nflushed_++;
+	}
+}
+
+/**
+ * Write already-finished lines starting from cur_.
+ */
+void OutputQueue::flush(bool force, bool getLock) {
+	if(!reorder_) {
+		return;
+	}
+	ThreadSafe t(&mutex_m, getLock && threadSafe_);
+	size_t nflush = 0;
+	while(nflush < finished_.size() && finished_[nflush]) {
+		assert(started_[nflush]);
+		nflush++;
+	}
+	// Waiting until we have several in a row to flush cuts down on copies
+	// (but requires more buffering)
+	if(force || nflush >= NFLUSH_THRESH) {
+		for(size_t i = 0; i < nflush; i++) {
+			assert(started_[i]);
+			assert(finished_[i]);
+			obuf_.writeString(lines_[i]);
+		}
+		lines_.erase(0, nflush);
+		started_.erase(0, nflush);
+		finished_.erase(0, nflush);
+		cur_ += nflush;
+		nflushed_ += nflush;
+	}
+}
+
+#ifdef OUTQ_MAIN
+
+#include <iostream>
+
+using namespace std;
+
+int main(void) {
+	cerr << "Case 1 (one thread) ... ";
+	{
+		OutFileBuf ofb;
+		OutputQueue oq(ofb, false);
+		assert_eq(0, oq.numFlushed());
+		assert_eq(0, oq.numStarted());
+		assert_eq(0, oq.numFinished());
+		oq.beginRead(1);
+		assert_eq(0, oq.numFlushed());
+		assert_eq(1, oq.numStarted());
+		assert_eq(0, oq.numFinished());
+		oq.beginRead(3);
+		assert_eq(0, oq.numFlushed());
+		assert_eq(2, oq.numStarted());
+		assert_eq(0, oq.numFinished());
+		oq.beginRead(2);
+		assert_eq(0, oq.numFlushed());
+		assert_eq(3, oq.numStarted());
+		assert_eq(0, oq.numFinished());
+		oq.flush();
+		assert_eq(0, oq.numFlushed());
+		assert_eq(3, oq.numStarted());
+		assert_eq(0, oq.numFinished());
+		oq.beginRead(0);
+		assert_eq(0, oq.numFlushed());
+		assert_eq(4, oq.numStarted());
+		assert_eq(0, oq.numFinished());
+		oq.flush();
+		assert_eq(0, oq.numFlushed());
+		assert_eq(4, oq.numStarted());
+		assert_eq(0, oq.numFinished());
+		oq.finishRead(0);
+		assert_eq(0, oq.numFlushed());
+		assert_eq(4, oq.numStarted());
+		assert_eq(1, oq.numFinished());
+		oq.flush();
+		assert_eq(0, oq.numFlushed());
+		assert_eq(4, oq.numStarted());
+		assert_eq(1, oq.numFinished());
+		oq.flush(true);
+		assert_eq(1, oq.numFlushed());
+		assert_eq(4, oq.numStarted());
+		assert_eq(1, oq.numFinished());
+		oq.finishRead(2);
+		assert_eq(1, oq.numFlushed());
+		assert_eq(4, oq.numStarted());
+		assert_eq(2, oq.numFinished());
+		oq.flush(true);
+		assert_eq(1, oq.numFlushed());
+		assert_eq(4, oq.numStarted());
+		assert_eq(2, oq.numFinished());
+		oq.finishRead(1);
+		assert_eq(1, oq.numFlushed());
+		assert_eq(4, oq.numStarted());
+		assert_eq(3, oq.numFinished());
+		oq.flush(true);
+		assert_eq(3, oq.numFlushed());
+		assert_eq(4, oq.numStarted());
+		assert_eq(3, oq.numFinished());
+	}
+	cerr << "PASSED" << endl;
+
+	cerr << "Case 2 (one thread) ... ";
+	{
+		OutFileBuf ofb;
+		OutputQueue oq(ofb, false);
+		BTString& buf1 = oq.beginRead(0);
+		BTString& buf2 = oq.beginRead(1);
+		BTString& buf3 = oq.beginRead(2);
+		BTString& buf4 = oq.beginRead(3);
+		BTString& buf5 = oq.beginRead(4);
+		assert_eq(5, oq.numStarted());
+		assert_eq(0, oq.numFinished());
+		buf1.install("A\n");
+		buf2.install("B\n");
+		buf3.install("C\n");
+		buf4.install("D\n");
+		buf5.install("E\n");
+		oq.finishRead(4);
+		oq.finishRead(1);
+		oq.finishRead(0);
+		oq.finishRead(2);
+		oq.finishRead(3);
+		oq.flush(true);
+		assert_eq(5, oq.numFlushed());
+		assert_eq(5, oq.numStarted());
+		assert_eq(5, oq.numFinished());
+		ofb.flush();
+	}
+	cerr << "PASSED" << endl;
+	return 0;
+}
+
#endif /*def OUTQ_MAIN*/
diff --git a/outq.h b/outq.h
new file mode 100644
index 0000000..00408cf
--- /dev/null
+++ b/outq.h
@@ -0,0 +1,149 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef OUTQ_H_
+#define OUTQ_H_
+
+#include "assert_helpers.h"
+#include "ds.h"
+#include "sstring.h"
+#include "read.h"
+#include "threading.h"
+#include "mem_ids.h"
+
/**
 * Encapsulates a list of lines of output.  If the earliest as-yet-unreported
 * read has id N and Bowtie 2 wants to write a record for read with id N+1, we
 * resize the lines_, started_ and finished_ lists to have at least 2 elements
 * (1 for N, 1 for N+1).  When the caller hands over the record for read N via
 * finishRead(), it is buffered until every earlier read has also finished, at
 * which point the in-order prefix is flushed to the output buffer.
 */
class OutputQueue {

	// Minimum number of in-order finished records required before a
	// non-forced flush actually writes anything (batching cuts copies)
	static const size_t NFLUSH_THRESH = 8;

public:

	/**
	 * obuf:       destination for flushed records
	 * reorder:    if true, buffer records and emit them in read-id order;
	 *             if false, records are written as soon as they finish
	 * nthreads:   number of threads expected to use this queue
	 * threadSafe: if true, guard state with mutex_m (required when
	 *             nthreads > 1, per the assert below)
	 * rdid:       id of the first read expected
	 */
	OutputQueue(
		OutFileBuf& obuf,
		bool reorder,
		size_t nthreads,
		bool threadSafe,
		TReadId rdid = 0) :
		obuf_(obuf),
		cur_(rdid),
		nstarted_(0),
		nfinished_(0),
		nflushed_(0),
		lines_(RES_CAT),
		started_(RES_CAT),
		finished_(RES_CAT),
		reorder_(reorder),
		threadSafe_(threadSafe),
        mutex_m()
	{
		assert(nthreads <= 1 || threadSafe);
	}

	/**
	 * Caller is telling us that they're about to write output record(s) for
	 * the read with the given id.
	 */
	void beginRead(TReadId rdid, size_t threadId);
	
	/**
	 * Caller is finished writing the record for read 'rdid'; buffer it
	 * (reorder mode) or write it straight through.
	 */
	void finishRead(const BTString& rec, TReadId rdid, size_t threadId);
	
	/**
	 * Return the number of records currently being buffered.
	 */
	size_t size() const {
		return lines_.size();
	}
	
	/**
	 * Return the number of records that have been flushed so far.
	 */
	TReadId numFlushed() const {
		return nflushed_;
	}

	/**
	 * Return the number of records that have been started so far.
	 */
	TReadId numStarted() const {
		return nstarted_;
	}

	/**
	 * Return the number of records that have been finished so far.
	 */
	TReadId numFinished() const {
		return nfinished_;
	}

	/**
	 * Write already-committed lines starting from cur_.
	 */
	void flush(bool force = false, bool getLock = true);

protected:

	OutFileBuf&     obuf_;       // sink for flushed records
	TReadId         cur_;        // id of oldest not-yet-flushed read
	TReadId         nstarted_;   // # beginRead calls
	TReadId         nfinished_;  // # finishRead calls
	TReadId         nflushed_;   // # records written to obuf_
	EList<BTString> lines_;      // buffered records, indexed by rdid - cur_
	EList<bool>     started_;    // parallel to lines_: beginRead seen?
	EList<bool>     finished_;   // parallel to lines_: finishRead seen?
	bool            reorder_;    // buffer & reorder vs. write-through
	bool            threadSafe_; // guard state with mutex_m?
	MUTEX_T         mutex_m;     // protects all of the above
};
+
/**
 * RAII helper bracketing the output of one read: the constructor registers
 * the read with the OutputQueue (beginRead) and the destructor hands over
 * the finished record (finishRead), so the finish step runs on every exit
 * path from the enclosing scope.
 *
 * NOTE(review): rec_ is held by reference; the referenced BTString must
 * outlive this object and contain the final record text by the time the
 * destructor runs.
 */
class OutputQueueMark {
public:
	OutputQueueMark(
		OutputQueue& q,
		const BTString& rec,
		TReadId rdid,
		size_t threadId) :
		q_(q),
		rec_(rec),
		rdid_(rdid),
		threadId_(threadId)
	{
		q_.beginRead(rdid, threadId);
	}
	
	~OutputQueueMark() {
		q_.finishRead(rec_, rdid_, threadId_);
	}
	
protected:
	OutputQueue& q_;       // queue being written to
	const BTString& rec_;  // record handed over on destruction
	TReadId rdid_;         // id of the read being output
	size_t threadId_;      // id of the writing thread
};
+
+#endif
diff --git a/pat.cpp b/pat.cpp
new file mode 100644
index 0000000..516ebda
--- /dev/null
+++ b/pat.cpp
@@ -0,0 +1,1783 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <cmath>
+#include <iostream>
+#include <string>
+#include <stdexcept>
+#include "sstring.h"
+
+#include "pat.h"
+#include "filebuf.h"
+#include "formats.h"
+
+#ifdef USE_SRA
+
+#include "tinythread.h"
+#include <ncbi-vdb/NGS.hpp>
+#include <ngs/ErrorMsg.hpp>
+#include <ngs/ReadCollection.hpp>
+#include <ngs/ReadIterator.hpp>
+#include <ngs/Read.hpp>
+
+#endif
+
+using namespace std;
+
+/**
+ * Return a new dynamically allocated PatternSource for the given
+ * format, using the given list of strings as the filenames to read
+ * from or as the sequences themselves (i.e. if -c was used).
+ */
+PatternSource* PatternSource::patsrcFromStrings(
+                                                const PatternParams& p,
+                                                const EList<string>& qs,
+                                                int nthreads)
+{
+	switch(p.format) {
+		case FASTA:       return new FastaPatternSource(qs, p);
+		case FASTA_CONT:  return new FastaContinuousPatternSource(qs, p);
+		case RAW:         return new RawPatternSource(qs, p);
+		case FASTQ:       return new FastqPatternSource(qs, p);
+		case TAB_MATE5:   return new TabbedPatternSource(qs, p, false);
+		case TAB_MATE6:   return new TabbedPatternSource(qs, p, true);
+		case CMDLINE:     return new VectorPatternSource(qs, p);
+		case QSEQ:        return new QseqPatternSource(qs, p);
+#ifdef USE_SRA
+        case SRA_FASTA:
+        case SRA_FASTQ: return new SRAPatternSource(qs, p, nthreads);
+#endif
+		default: {
+			cerr << "Internal error; bad patsrc format: " << p.format << endl;
+			throw 1;
+		}
+	}
+}
+
+/**
+ * The main member function for dispensing patterns.
+ *
+ * Returns true iff a pair was parsed succesfully.
+ */
+bool PatternSource::nextReadPair(
+	Read& ra,
+	Read& rb,
+	TReadId& rdid,
+	TReadId& endid,
+	bool& success,
+	bool& done,
+	bool& paired,
+	bool fixName)
+{
+	// nextPatternImpl does the reading from the ultimate source;
+	// it is implemented in concrete subclasses
+	success = done = paired = false;
+	nextReadPairImpl(ra, rb, rdid, endid, success, done, paired);
+	if(success) {
+		// Construct reversed versions of fw and rc seqs/quals
+		ra.finalize();
+		if(!rb.empty()) {
+			rb.finalize();
+		}
+		// Fill in the random-seed field using a combination of
+		// information from the user-specified seed and the read
+		// sequence, qualities, and name
+		ra.seed = genRandSeed(ra.patFw, ra.qual, ra.name, seed_);
+		if(!rb.empty()) {
+			rb.seed = genRandSeed(rb.patFw, rb.qual, rb.name, seed_);
+		}
+	}
+	return success;
+}
+
+/**
+ * The main member function for dispensing patterns.
+ */
+bool PatternSource::nextRead(
+	Read& r,
+	TReadId& rdid,
+	TReadId& endid,
+	bool& success,
+	bool& done)
+{
+	// nextPatternImpl does the reading from the ultimate source;
+	// it is implemented in concrete subclasses
+	nextReadImpl(r, rdid, endid, success, done);
+	if(success) {
+		// Construct the reversed versions of the fw and rc seqs
+		// and quals
+		r.finalize();
+		// Fill in the random-seed field using a combination of
+		// information from the user-specified seed and the read
+		// sequence, qualities, and name
+		r.seed = genRandSeed(r.patFw, r.qual, r.name, seed_);
+	}
+	return success;
+}
+
/**
 * Get the next paired or unpaired read from the wrapped
 * PairedPatternSource.  'success', 'done' and 'paired' report the
 * outcome; returns the value of 'success'.
 */
bool WrappedPatternSourcePerThread::nextReadPair(
	bool& success,
	bool& done,
	bool& paired,
	bool fixName)
{
	// Run the base-class step first (its side effects are defined elsewhere)
	PatternSourcePerThread::nextReadPair(success, done, paired, fixName);
	ASSERT_ONLY(TReadId lastRdId = rdid_);
	buf1_.reset();
	buf2_.reset();
	// Delegate to the shared source; fills buf1_/buf2_ and assigns ids
	patsrc_.nextReadPair(buf1_, buf2_, rdid_, endid_, success, done, paired, fixName);
	// A successful dispense must have advanced the read id
	assert(!success || rdid_ != lastRdId);
	return success;
}
+
/**
 * The main member function for dispensing pairs of reads or
 * singleton reads.  Returns true iff ra and rb contain a new
 * pair; returns false if ra contains a new unpaired read.
 */
bool PairedSoloPatternSource::nextReadPair(
	Read& ra,
	Read& rb,
	TReadId& rdid,
	TReadId& endid,
	bool& success,
	bool& done,
	bool& paired,
	bool fixName)
{
	uint32_t cur = cur_; // local snapshot of the current source index
	success = false;
	while(cur < src_->size()) {
		// Patterns from srca_[cur_] are unpaired
		do {
			(*src_)[cur]->nextReadPair(
				ra, rb, rdid, endid, success, done, paired, fixName);
		} while(!success && !done);
		if(!success) {
			assert(done);
			// If patFw is empty, that's our signal that the
			// input dried up
			lock();
			if(cur + 1 > cur_) cur_++; // advance shared index at most once
			cur = cur_;
			unlock();
			continue; // on to next pair of PatternSources
		}
		assert(success);
		// Derive per-read pseudo-random seeds from the user seed plus
		// each read's content
		ra.seed = genRandSeed(ra.patFw, ra.qual, ra.name, seed_);
		if(!rb.empty()) {
			rb.seed = genRandSeed(rb.patFw, rb.qual, rb.name, seed_);
			if(fixName) {
				ra.fixMateName(1);
				rb.fixMateName(2);
			}
		}
		ra.rdid = rdid;
		ra.endid = endid;
		if(!rb.empty()) {
			rb.rdid = rdid;
			rb.endid = endid+1;
		}
		ra.mate = 1;
		rb.mate = 2;
		// NOTE(review): returns true (and sets rb.mate) even when the
		// underlying source produced an unpaired read; callers appear to
		// rely on 'paired'/rb.empty() rather than the return value — confirm
		return true; // paired
	}
	assert_leq(cur, src_->size());
	done = (cur == src_->size());
	return false;
}
+
/**
 * The main member function for dispensing pairs of reads or
 * singleton reads.  Returns true iff ra and rb contain a new
 * pair; returns false if ra contains a new unpaired read.
 */
bool PairedDualPatternSource::nextReadPair(
	Read& ra,
	Read& rb,
	TReadId& rdid,
	TReadId& endid,
	bool& success,
	bool& done,
	bool& paired,
	bool fixName)
{
	// 'cur' indexes the current pair of PatternSources
	uint32_t cur;
	{
		lock();
		cur = cur_;
		unlock();
	}
	success = false;
	done = true;
	while(cur < srca_->size()) {
		if((*srcb_)[cur] == NULL) {
			// A NULL mate-2 slot marks this source as unpaired
			paired = false;
			// Patterns from srca_ are unpaired
			do {
				(*srca_)[cur]->nextRead(ra, rdid, endid, success, done);
			} while(!success && !done);
			if(!success) {
				assert(done);
				lock();
				if(cur + 1 > cur_) cur_++;
				cur = cur_; // Move on to next PatternSource
				unlock();
				continue; // on to next pair of PatternSources
			}
			ra.rdid = rdid;
			ra.endid = endid;
			ra.mate  = 0;
			return success;
		} else {
			paired = true;
			// Patterns from srca_[cur_] and srcb_[cur_] are paired
			TReadId rdid_a = 0, endid_a = 0;
			TReadId rdid_b = 0, endid_b = 0;
			bool success_a = false, done_a = false;
			bool success_b = false, done_b = false;
			// Lock to ensure that this thread gets parallel reads
			// in the two mate files
			lock();
			do {
				(*srca_)[cur]->nextRead(ra, rdid_a, endid_a, success_a, done_a);
			} while(!success_a && !done_a);
			do {
				(*srcb_)[cur]->nextRead(rb, rdid_b, endid_b, success_b, done_b);
			} while(!success_b && !done_b);
			if(!success_a && success_b) {
				// NOTE(review): throws while still holding the lock; the
				// process terminates here, but a scope guard would be cleaner
				cerr << "Error, fewer reads in file specified with -1 than in file specified with -2" << endl;
				throw 1;
			} else if(!success_a) {
				assert(done_a && done_b);
				if(cur + 1 > cur_) cur_++;
				cur = cur_; // Move on to next PatternSource
				unlock();
				continue; // on to next pair of PatternSources
			} else if(!success_b) {
				// NOTE(review): also thrown with the lock held; see above
				cerr << "Error, fewer reads in file specified with -2 than in file specified with -1" << endl;
				throw 1;
			}
			// Both mates parsed; ids from the two files must agree
			assert_eq(rdid_a, rdid_b);
			//assert_eq(endid_a+1, endid_b);
			assert_eq(success_a, success_b);
			unlock();
			if(fixName) {
				ra.fixMateName(1);
				rb.fixMateName(2);
			}
			rdid = rdid_a;
			endid = endid_a;
			success = success_a;
			done = done_a;
			ra.rdid = rdid;
			ra.endid = endid;
			if(!rb.empty()) {
				rb.rdid = rdid;
				rb.endid = endid+1;
			}
			ra.mate = 1;
			rb.mate = 2;
			return success;
		}
	}
	return success;
}
+
+/**
+ * Return the number of reads attempted.
+ */
+pair<TReadId, TReadId> PairedDualPatternSource::readCnt() const {
+	uint64_t rets = 0llu, retp = 0llu;
+	for(size_t i = 0; i < srca_->size(); i++) {
+		if((*srcb_)[i] == NULL) {
+			rets += (*srca_)[i]->readCnt();
+		} else {
+			assert_eq((*srca_)[i]->readCnt(), (*srcb_)[i]->readCnt());
+			retp += (*srca_)[i]->readCnt();
+		}
+	}
+	return make_pair(rets, retp);
+}
+
/**
 * Given the values for all of the various arguments used to specify
 * the read and quality input, create a list of pattern sources to
 * dispense them.
 */
PairedPatternSource* PairedPatternSource::setupPatternSources(
	const EList<string>& si,   // singles, from argv
	const EList<string>& m1,   // mate1's, from -1 arg
	const EList<string>& m2,   // mate2's, from -2 arg
	const EList<string>& m12,  // both mates on each line, from --12 arg
#ifdef USE_SRA
    const EList<string>& sra_accs,
#endif
	const EList<string>& q,    // qualities associated with singles
	const EList<string>& q1,   // qualities associated with m1
	const EList<string>& q2,   // qualities associated with m2
	const PatternParams& p,    // read-in parameters
                                                              int nthreads,
	bool verbose)              // be talkative?
{
	// a/b hold mate-1/mate-2 (or single/NULL) sources; ab holds sources
	// with both mates interleaved.  Only one of {ab} or {a,b} survives.
	EList<PatternSource*>* a  = new EList<PatternSource*>();
	EList<PatternSource*>* b  = new EList<PatternSource*>();
	EList<PatternSource*>* ab = new EList<PatternSource*>();
	// Create list of pattern sources for paired reads appearing
	// interleaved in a single file
	for(size_t i = 0; i < m12.size(); i++) {
		const EList<string>* qs = &m12;
		EList<string> tmp;
		if(p.fileParallel) {
			// Feed query files one to each PatternSource
			qs = &tmp;
			tmp.push_back(m12[i]);
			assert_eq(1, tmp.size());
		}
		ab->push_back(PatternSource::patsrcFromStrings(p, *qs, nthreads));
		if(!p.fileParallel) {
			// Not file-parallel: one PatternSource consumes all files
			break;
		}
	}

#ifdef USE_SRA
    for(size_t i = 0; i < sra_accs.size(); i++) {
        const EList<string>* qs = &sra_accs;
        EList<string> tmp;
        if(p.fileParallel) {
            // Feed query files one to each PatternSource
            qs = &tmp;
            tmp.push_back(sra_accs[i]);
            assert_eq(1, tmp.size());
        }
        ab->push_back(PatternSource::patsrcFromStrings(p, *qs, nthreads));
        if(!p.fileParallel) {
            break;
        }
    }
#endif

	// Create list of pattern sources for paired reads
	for(size_t i = 0; i < m1.size(); i++) {
		const EList<string>* qs = &m1;
		EList<string> tmpSeq;
		EList<string> tmpQual;
		if(p.fileParallel) {
			// Feed query files one to each PatternSource
			qs = &tmpSeq;
			tmpSeq.push_back(m1[i]);
			assert_eq(1, tmpSeq.size());
		}
		a->push_back(PatternSource::patsrcFromStrings(p, *qs, nthreads));
		if(!p.fileParallel) {
			break;
		}
	}

	// Create list of pattern sources for paired reads
	for(size_t i = 0; i < m2.size(); i++) {
		const EList<string>* qs = &m2;
		EList<string> tmpSeq;
		EList<string> tmpQual;
		if(p.fileParallel) {
			// Feed query files one to each PatternSource
			qs = &tmpSeq;
			tmpSeq.push_back(m2[i]);
			assert_eq(1, tmpSeq.size());
		}
		b->push_back(PatternSource::patsrcFromStrings(p, *qs, nthreads));
		if(!p.fileParallel) {
			break;
		}
	}
	// All mates/mate files must be paired
	assert_eq(a->size(), b->size());

	// Create list of pattern sources for the unpaired reads
	for(size_t i = 0; i < si.size(); i++) {
		const EList<string>* qs = &si;
		PatternSource* patsrc = NULL;
		EList<string> tmpSeq;
		EList<string> tmpQual;
		if(p.fileParallel) {
			// Feed query files one to each PatternSource
			qs = &tmpSeq;
			tmpSeq.push_back(si[i]);
			assert_eq(1, tmpSeq.size());
		}
		patsrc = PatternSource::patsrcFromStrings(p, *qs, nthreads);
		assert(patsrc != NULL);
		a->push_back(patsrc);
		b->push_back(NULL); // NULL mate-2 slot marks this source as unpaired
		if(!p.fileParallel) {
			break;
		}
	}

	PairedPatternSource *patsrc = NULL;
#ifdef USE_SRA
    if(m12.size() > 0 || sra_accs.size() > 0) {
#else
    if(m12.size() > 0) {
#endif
		// Interleaved/SRA input takes precedence; sources built from
		// si/m1/m2 are deleted rather than combined.
		// NOTE(review): confirm these option groups are mutually
		// exclusive upstream, else singles given alongside --12 are dropped.
		patsrc = new PairedSoloPatternSource(ab, p);
		for(size_t i = 0; i < a->size(); i++) delete (*a)[i];
		for(size_t i = 0; i < b->size(); i++) delete (*b)[i];
		delete a; delete b;
	} else {
		patsrc = new PairedDualPatternSource(a, b, p);
		for(size_t i = 0; i < ab->size(); i++) delete (*ab)[i];
		delete ab;
	}
	return patsrc;
}
+
/**
 * Parse the command-line-supplied read strings (-c).  Each entry of v is
 * either "SEQ" or "SEQ:QUALS".  Sequences and qualities are trimmed
 * according to gTrim5/gTrim3 (plus colorspace primer trimming) and stored
 * in the parallel v_/quals_/trimmed*/names_ lists.
 */
VectorPatternSource::VectorPatternSource(
	const EList<string>& v,
	const PatternParams& p) :
	PatternSource(p),
	cur_(p.skip),   // index of the next entry to dispense
	skip_(p.skip),
	paired_(false),
	v_(),           // parsed sequences
	quals_()        // parallel quality strings
{
	for(size_t i = 0; i < v.size(); i++) {
		EList<string> ss;
		tokenize(v[i], ":", ss, 2);
		assert_gt(ss.size(), 0);
		assert_leq(ss.size(), 2);
		// Initialize s
		string s = ss[0];
		int mytrim5 = gTrim5;
		if(gColor && s.length() > 1) {
			// This may be a primer character.  If so, keep it in the
			// 'primer' field of the read buf and parse the rest of the
			// read without it.
			int c = toupper(s[0]);
			if(asc2dnacat[c] > 0) {
				// First char is a DNA char
				int c2 = toupper(s[1]);
				// Second char is a color char
				if(asc2colcat[c2] > 0) {
					mytrim5 += 2; // trim primer and first color
				}
			}
		}
		if(gColor) {
			// Convert '0'-'3' to 'A'-'T' ('4' and '.' become 'N')
			// NOTE(review): this loop variable shadows the outer 'i';
			// harmless here but worth renaming
			for(size_t i = 0; i < s.length(); i++) {
				if(s[i] >= '0' && s[i] <= '4') {
					s[i] = "ACGTN"[(int)s[i] - '0'];
				}
				if(s[i] == '.') s[i] = 'N';
			}
		}
		if(s.length() <= (size_t)(gTrim3 + mytrim5)) {
			// Entire read is trimmed away
			s.clear();
		} else {
			// Trim on 5' (high-quality) end
			if(mytrim5 > 0) {
				s.erase(0, mytrim5);
			}
			// Trim on 3' (low-quality) end
			if(gTrim3 > 0) {
				s.erase(s.length()-gTrim3);
			}
		}
		//  Initialize vq
		string vq;
		if(ss.size() == 2) {
			vq = ss[1];
		}
		// Trim qualities
		if(vq.length() > (size_t)(gTrim3 + mytrim5)) {
			// Trim on 5' (high-quality) end
			if(mytrim5 > 0) {
				vq.erase(0, mytrim5);
			}
			// Trim on 3' (low-quality) end
			if(gTrim3 > 0) {
				vq.erase(vq.length()-gTrim3);
			}
		}
		// Pad quals with Is if necessary; this shouldn't happen
		while(vq.length() < s.length()) {
			vq.push_back('I');
		}
		// Truncate quals to match length of read if necessary;
		// this shouldn't happen
		if(vq.length() > s.length()) {
			vq.erase(s.length());
		}
		assert_eq(vq.length(), s.length());
		v_.expand();
		v_.back().installChars(s);
		quals_.push_back(BTString(vq));
		trimmed3_.push_back(gTrim3);
		trimmed5_.push_back(mytrim5);
		// Name is just the read's ordinal position
		ostringstream os;
		os << (names_.size());
		names_.push_back(BTString(os.str()));
	}
	assert_eq(v_.size(), quals_.size());
}
+	
+bool VectorPatternSource::nextReadImpl(
+	Read& r,
+	TReadId& rdid,
+	TReadId& endid,
+	bool& success,
+	bool& done)
+{
+	// Let Strings begin at the beginning of the respective bufs
+	r.reset();
+	lock();
+	if(cur_ >= v_.size()) {
+		unlock();
+		// Clear all the Strings, as a signal to the caller that
+		// we're out of reads
+		r.reset();
+		success = false;
+		done = true;
+		assert(r.empty());
+		return false;
+	}
+	// Copy v_*, quals_* strings into the respective Strings
+	r.color = gColor;
+	r.patFw  = v_[cur_];
+	r.qual = quals_[cur_];
+	r.trimmed3 = trimmed3_[cur_];
+	r.trimmed5 = trimmed5_[cur_];
+	ostringstream os;
+	os << cur_;
+	r.name = os.str();
+	cur_++;
+	done = cur_ == v_.size();
+	rdid = endid = readCnt_;
+	readCnt_++;
+	unlock();
+	success = true;
+	return true;
+}
+	
+/**
+ * This is unused, but implementation is given for completeness.
+ */
+bool VectorPatternSource::nextReadPairImpl(
+	Read& ra,
+	Read& rb,
+	TReadId& rdid,
+	TReadId& endid,
+	bool& success,
+	bool& done,
+	bool& paired)
+{
+	// Let Strings begin at the beginning of the respective bufs
+	ra.reset();
+	rb.reset();
+	paired = true;
+	if(!paired_) {
+		paired_ = true;
+		cur_ <<= 1;
+	}
+	lock();
+	if(cur_ >= v_.size()-1) {
+		unlock();
+		// Clear all the Strings, as a signal to the caller that
+		// we're out of reads
+		ra.reset();
+		rb.reset();
+		assert(ra.empty());
+		assert(rb.empty());
+		success = false;
+		done = true;
+		return false;
+	}
+	// Copy v_*, quals_* strings into the respective Strings
+	ra.patFw  = v_[cur_];
+	ra.qual = quals_[cur_];
+	ra.trimmed3 = trimmed3_[cur_];
+	ra.trimmed5 = trimmed5_[cur_];
+	cur_++;
+	rb.patFw  = v_[cur_];
+	rb.qual = quals_[cur_];
+	rb.trimmed3 = trimmed3_[cur_];
+	rb.trimmed5 = trimmed5_[cur_];
+	ostringstream os;
+	os << readCnt_;
+	ra.name = os.str();
+	rb.name = os.str();
+	ra.color = rb.color = gColor;
+	cur_++;
+	done = cur_ >= v_.size()-1;
+	rdid = endid = readCnt_;
+	readCnt_++;
+	unlock();
+	success = true;
+	return true;
+}
+
/**
 * Parse a single quality string from fb and store qualities in r.
 * Assume the next character obtained via fb.get() is the first
 * character of the quality string.  When returning, the next
 * character returned by fb.peek() or fb.get() should be the first
 * character of the following line.
 *
 * Returns the number of qualities stored (0 if all were trimmed away).
 */
int parseQuals(
	Read& r,
	FileBuf& fb,
	int firstc,    // first (already-consumed) char of the quality line
	int readLen,   // expected number of qualities
	int trim3,     // # qualities to trim from the 3' end
	int trim5,     // # qualities to trim from the 5' end
	bool intQuals, // qualities are whitespace-separated integers?
	bool phred64,  // ASCII qualities use Phred+64 encoding?
	bool solexa64) // ASCII qualities use Solexa+64 encoding?
{
	int c = firstc;
	assert(c != '\n' && c != '\r');
	r.qual.clear();
	if (intQuals) {
		// Integer qualities: parse whitespace-separated decimal numbers
		while (c != '\r' && c != '\n' && c != -1) {
			bool neg = false;
			int num = 0;
			while(!isspace(c) && !fb.eof()) {
				if(c == '-') {
					neg = true;
					assert_eq(num, 0);
				} else {
					if(!isdigit(c)) {
						char buf[2048];
						cerr << "Warning: could not parse quality line:" << endl;
						fb.getPastNewline();
						// NOTE(review): buf is printed via copyLastN's
						// return before the terminator at [2047] is set;
						// verify copyLastN null-terminates, else this can
						// print garbage
						cerr << fb.copyLastN(buf);
						buf[2047] = '\0';
						cerr << buf;
						throw 1;
					}
					assert(isdigit(c));
					num *= 10;
					num += (c - '0');
				}
				c = fb.get();
			}
			// Negative qualities are clamped to 0
			if(neg) num = 0;
			// Phred-33 ASCII encode it and add it to the back of the
			// quality string
			r.qual.append('!' + num);
			// Skip over next stretch of whitespace
			while(c != '\r' && c != '\n' && isspace(c) && !fb.eof()) {
				c = fb.get();
			}
		}
	} else {
		// ASCII qualities: convert each char to Phred-33
		while (c != '\r' && c != '\n' && c != -1) {
			r.qual.append(charToPhred33(c, solexa64, phred64));
			c = fb.get();
			while(c != '\r' && c != '\n' && isspace(c) && !fb.eof()) {
				c = fb.get();
			}
		}
	}
	// Too few qualities for the sequence (colorspace reads may be one short)
	if ((int)r.qual.length() < readLen-1 ||
	    ((int)r.qual.length() < readLen && !r.color))
	{
		tooFewQualities(r.name);
	}
	r.qual.trimEnd(trim3);
	if(r.qual.length()-trim5 < r.patFw.length()) {
		// Colorspace primer trimming accounted for one extra position
		assert(gColor && r.primer != -1);
		assert_gt(trim5, 0);
		trim5--;
	}
	r.qual.trimBegin(trim5);
	if(r.qual.length() <= 0) return 0;
	assert_eq(r.qual.length(), r.patFw.length());
	// Consume the newline(s) terminating the quality line
	while(fb.peek() == '\n' || fb.peek() == '\r') fb.get();
	return (int)r.qual.length();
}
+
/// Read another pattern from a FASTA input file.  Fills r with the next
/// record's name/sequence (qualities default to 'I' since FASTA has none),
/// assigns the read id, and sets success/done accordingly.
bool FastaPatternSource::read(
	Read& r,
	TReadId& rdid,
	TReadId& endid,
	bool& success,
	bool& done)
{
	int c, qc = 0;
	success = true;
	done = false;
	assert(fb_.isOpen());
	r.reset();
	r.color = gColor;
	// Pick off the first '>'
	c = fb_.get();
	if(c < 0) {
		bail(r); success = false; done = true; return success;
	}
	// Skip comment lines ('#' or ';') and blank lines
	while(c == '#' || c == ';' || c == '\r' || c == '\n') {
		c = fb_.peekUptoNewline();
		fb_.resetLastN();
		c = fb_.get();
	}
	assert_eq(1, fb_.lastNLen());

	// First record must start with '>' for this to be FASTA at all
	if(first_) {
		if(c != '>') {
			cerr << "Error: reads file does not look like a FASTA file" << endl;
			throw 1;
		}
		first_ = false;
	}
	assert_eq('>', c);
	c = fb_.get(); // get next char after '>'

	// Read to the end of the id line, sticking everything after the '>'
	// into *name
	//bool warning = false;
	while(true) {
		if(c < 0 || qc < 0) {
			bail(r); success = false; done = true; return success;
		}
		if(c == '\n' || c == '\r') {
			// Break at end of line, after consuming all \r's, \n's
			while(c == '\n' || c == '\r') {
				if(fb_.peek() == '>') {
					// Empty sequence
					break;
				}
				c = fb_.get();
				if(c < 0 || qc < 0) {
					bail(r); success = false; done = true; return success;
				}
			}
			break;
		}
		r.name.append(c);
		if(fb_.peek() == '>') {
			// Empty sequence
			break;
		}
		c = fb_.get();
	}
	if(c == '>') {
		// Empty sequence: warn, but still consume a read id so
		// numbering stays in sync
		cerr << "Warning: skipping empty FASTA read with name '" << r.name << "'" << endl;
		fb_.resetLastN();
		rdid = endid = readCnt_;
		readCnt_++;
		success = true; done = false; return success;
	}
	assert_neq('>', c);

	// _in now points just past the first character of a sequence
	// line, and c holds the first character
	int begin = 0;
	int mytrim5 = gTrim5;
	if(gColor) {
		// This is the primer character, keep it in the
		// 'primer' field of the read buf and keep parsing
		c = toupper(c);
		if(asc2dnacat[c] > 0) {
			// First char is a DNA char
			int c2 = toupper(fb_.peek());
			if(asc2colcat[c2] > 0) {
				// Second char is a color char
				r.primer = c;
				r.trimc = c2;
				mytrim5 += 2;
			}
		}
		if(c < 0) {
			bail(r); success = false; done = true; return success;
		}
	}
	// Accumulate sequence chars (5'-trimmed) until the next record or
	// EOF; FASTA has no qualities so every base gets 'I'
	while(c != '>' && c >= 0) {
		if(gColor) {
			if(c >= '0' && c <= '4') c = "ACGTN"[(int)c - '0'];
			if(c == '.') c = 'N';
		}
		if(asc2dnacat[c] > 0 && begin++ >= mytrim5) {
			r.patFw.append(asc2dna[c]);
			r.qual.append('I');
		}
		if(fb_.peek() == '>') break;
		c = fb_.get();
	}
	r.patFw.trimEnd(gTrim3);
	r.qual.trimEnd(gTrim3);
	r.trimmed3 = gTrim3;
	r.trimmed5 = mytrim5;
	// Set up a default name if one hasn't been set
	if(r.name.empty()) {
		char cbuf[20];
		itoa10<TReadId>(readCnt_, cbuf);
		r.name.install(cbuf);
	}
	assert_gt(r.name.length(), 0);
	// Keep the raw input for pass-through output
	r.readOrigBuf.install(fb_.lastN(), fb_.lastNLen());
	fb_.resetLastN();
	rdid = endid = readCnt_;
	readCnt_++;
	return success;
}
+
+/// Read another pattern from a FASTQ input file.
+///
+/// Parses one 4-part FASTQ record from fb_: the '@' name line, the
+/// sequence line(s) up to the '+' separator, the optional name on the
+/// '+' line, and the quality line(s).  Handles colorspace input
+/// (gColor), integer-encoded qualities (intQuals_) and "fuzzy" input
+/// carrying up to 3 alternate basecall/quality strings (fuzzy_).
+/// On return, rdid/endid receive the read's ordinal; 'success' and
+/// 'done' report whether a read was parsed and whether input is
+/// exhausted.  Returns the value of 'success'.
+bool FastqPatternSource::read(
+	Read& r,
+	TReadId& rdid,
+	TReadId& endid,
+	bool& success,
+	bool& done)
+{
+	int c;
+	int dstLen = 0;
+	success = true;
+	done = false;
+	r.reset();
+	r.color = gColor;
+	r.fuzzy = fuzzy_;
+	// Pick off the first at ('@'); only needed for the very first
+	// record, since later iterations consume the next '@' at the end
+	if(first_) {
+		c = fb_.get();
+		if(c != '@') {
+			c = getOverNewline(fb_);
+			if(c < 0) {
+				bail(r); success = false; done = true; return success;
+			}
+		}
+		if(c != '@') {
+			cerr << "Error: reads file does not look like a FASTQ file" << endl;
+			throw 1;
+		}
+		assert_eq('@', c);
+		first_ = false;
+	}
+
+	// Read to the end of the id line, sticking everything after the '@'
+	// into *name
+	while(true) {
+		c = fb_.get();
+		if(c < 0) {
+			bail(r); success = false; done = true; return success;
+		}
+		if(c == '\n' || c == '\r') {
+			// Break at end of line, after consuming all \r's, \n's
+			while(c == '\n' || c == '\r') {
+				c = fb_.get();
+				if(c < 0) {
+					bail(r); success = false; done = true; return success;
+				}
+			}
+			break;
+		}
+		r.name.append(c);
+	}
+	// fb_ now points just past the first character of a
+	// sequence line, and c holds the first character
+	int charsRead = 0;
+	BTDnaString *sbuf = &r.patFw;
+	// dstLens[0] tracks the primary sequence length; [1..3] track the
+	// alternate (fuzzy) sequences
+	int dstLens[] = {0, 0, 0, 0};
+	int *dstLenCur = &dstLens[0];
+	int mytrim5 = gTrim5;
+	int altBufIdx = 0;
+	if(gColor && c != '+') {
+		// This may be a primer character.  If so, keep it in the
+		// 'primer' field of the read buf and parse the rest of the
+		// read without it.
+		c = toupper(c);
+		if(asc2dnacat[c] > 0) {
+			// First char is a DNA char
+			int c2 = toupper(fb_.peek());
+			// Second char is a color char
+			if(asc2colcat[c2] > 0) {
+				r.primer = c;
+				r.trimc = c2;
+				mytrim5 += 2; // trim primer and first color
+			}
+		}
+		if(c < 0) {
+			bail(r); success = false; done = true; return success;
+		}
+	}
+	int trim5 = 0;
+	// Parse sequence characters until the '+' separator line
+	if(c != '+') {
+		trim5 = mytrim5;
+		while(c != '+') {
+			// Convert color numbers to letters if necessary
+			if(c == '.') c = 'N';
+			if(gColor) {
+				if(c >= '0' && c <= '4') c = "ACGTN"[(int)c - '0'];
+			}
+			if(fuzzy_ && c == '-') c = 'A';
+			if(isalpha(c)) {
+				// If it's past the 5'-end trim point
+				if(charsRead >= trim5) {
+					sbuf->append(asc2dna[c]);
+					(*dstLenCur)++;
+				}
+				charsRead++;
+			} else if(fuzzy_ && c == ' ') {
+				// Space separates the primary sequence from alternate
+				// (fuzzy) basecall strings on the same line
+				trim5 = 0; // disable 5' trimming for now
+				if(charsRead == 0) {
+					c = fb_.get();
+					continue;
+				}
+				charsRead = 0;
+				if(altBufIdx >= 3) {
+					cerr << "At most 3 alternate sequence strings permitted; offending read: " << r.name << endl;
+					throw 1;
+				}
+				// Move on to the next alternate-sequence buffer
+				sbuf = &r.altPatFw[altBufIdx++];
+				dstLenCur = &dstLens[altBufIdx];
+			}
+			c = fb_.get();
+			if(c < 0) {
+				bail(r); success = false; done = true; return success;
+			}
+		}
+		dstLen = dstLens[0];
+		charsRead = dstLen + mytrim5;
+	}
+	// Trim from 3' end
+	if(gTrim3 > 0) {
+		if((int)r.patFw.length() > gTrim3) {
+			r.patFw.resize(r.patFw.length() - gTrim3);
+			dstLen -= gTrim3;
+			assert_eq((int)r.patFw.length(), dstLen);
+		} else {
+			// Trimmed the whole read; we won't be using this read,
+			// but we proceed anyway so that fb_ is advanced
+			// properly
+			r.patFw.clear();
+			dstLen = 0;
+		}
+	}
+	assert_eq('+', c);
+
+	// Chew up the optional name on the '+' line
+	ASSERT_ONLY(int pk =) peekToEndOfLine(fb_);
+	if(charsRead == 0) {
+		// Empty sequence: skip the (empty) quality line and account
+		// for the read id anyway so counts stay in sync
+		assert_eq('@', pk);
+		fb_.get();
+		fb_.resetLastN();
+		rdid = endid = readCnt_;
+		readCnt_++;
+		return success;
+	}
+
+	// Now read the qualities
+	if (intQuals_) {
+		// Whitespace-separated integer qualities (e.g. "40 40 35 ...")
+		assert(!fuzzy_);
+		int qualsRead = 0;
+		char buf[4096];
+		if(gColor && r.primer != -1) {
+			// In case the original quality string is one shorter
+			mytrim5--;
+		}
+		qualToks_.clear();
+		tokenizeQualLine(fb_, buf, 4096, qualToks_);
+		for(unsigned int j = 0; j < qualToks_.size(); ++j) {
+			char c = intToPhred33(atoi(qualToks_[j].c_str()), solQuals_);
+			assert_geq(c, 33);
+			if (qualsRead >= mytrim5) {
+				r.qual.append(c);
+			}
+			++qualsRead;
+		} // done reading integer quality lines
+		if(gColor && r.primer != -1) mytrim5++;
+		r.qual.trimEnd(gTrim3);
+		if(r.qual.length() < r.patFw.length()) {
+			tooFewQualities(r.name);
+		} else if(r.qual.length() > r.patFw.length() + 1) {
+			tooManyQualities(r.name);
+		}
+		if(r.qual.length() == r.patFw.length()+1 && gColor && r.primer != -1) {
+			// The one extra quality belongs to the trimmed primer; drop it
+			r.qual.remove(0);
+		}
+		// Trim qualities on 3' end
+		if(r.qual.length() > r.patFw.length()) {
+			r.qual.resize(r.patFw.length());
+			assert_eq((int)r.qual.length(), dstLen);
+		}
+		peekOverNewline(fb_);
+	} else {
+		// Non-integer qualities
+		altBufIdx = 0;
+		trim5 = mytrim5;
+		int qualsRead[4] = {0, 0, 0, 0};
+		int *qualsReadCur = &qualsRead[0];
+		BTString *qbuf = &r.qual;
+		if(gColor && r.primer != -1) {
+			// In case the original quality string is one shorter
+			trim5--;
+		}
+		while(true) {
+			c = fb_.get();
+			if (!fuzzy_ && c == ' ') {
+				wrongQualityFormat(r.name);
+			} else if(c == ' ') {
+				// Space separates primary from alternate quality
+				// strings in fuzzy mode
+				trim5 = 0; // disable 5' trimming for now
+				if((*qualsReadCur) == 0) continue;
+				if(altBufIdx >= 3) {
+					cerr << "At most 3 alternate quality strings permitted; offending read: " << r.name << endl;
+					throw 1;
+				}
+				qbuf = &r.altQual[altBufIdx++];
+				qualsReadCur = &qualsRead[altBufIdx];
+				continue;
+			}
+			if(c < 0) {
+				break; // let the file end just at the end of a quality line
+				//bail(r); success = false; done = true; return success;
+			}
+			if (c != '\r' && c != '\n') {
+				if (*qualsReadCur >= trim5) {
+					c = charToPhred33(c, solQuals_, phred64Quals_);
+					assert_geq(c, 33);
+					qbuf->append(c);
+				}
+				(*qualsReadCur)++;
+			} else {
+				break;
+			}
+		}
+		qualsRead[0] -= gTrim3;
+		r.qual.trimEnd(gTrim3);
+		if(r.qual.length() < r.patFw.length()) {
+			tooFewQualities(r.name);
+		} else if(r.qual.length() > r.patFw.length()+1) {
+			tooManyQualities(r.name);
+		}
+		if(r.qual.length() == r.patFw.length()+1 && gColor && r.primer != -1) {
+			// The one extra quality belongs to the trimmed primer; drop it
+			r.qual.remove(0);
+		}
+
+		if(fuzzy_) {
+			// Trim from 3' end of alternate basecall and quality strings
+			if(gTrim3 > 0) {
+				for(int i = 0; i < 3; i++) {
+					assert_eq(r.altQual[i].length(), r.altPatFw[i].length());
+					if((int)r.altQual[i].length() > gTrim3) {
+						// NOTE(review): resizing to gTrim3 (rather than
+						// length() - gTrim3) looks inconsistent with the
+						// 3'-trim applied to r.patFw above -- confirm
+						// intended behavior
+						r.altPatFw[i].resize(gTrim3);
+						r.altQual[i].resize(gTrim3);
+					} else {
+						r.altPatFw[i].clear();
+						r.altQual[i].clear();
+					}
+					qualsRead[i+1] = dstLens[i+1] =
+						max<int>(0, dstLens[i+1] - gTrim3);
+				}
+			}
+			// Shift to RHS, and install in Strings: pad/align each
+			// alternate string so it lines up with the primary read
+			assert_eq(0, r.alts);
+			for(int i = 1; i < 4; i++) {
+				if(qualsRead[i] == 0) continue;
+				if(qualsRead[i] > dstLen) {
+					// Shift everybody up
+					int shiftAmt = qualsRead[i] - dstLen;
+					for(int j = 0; j < dstLen; j++) {
+						r.altQual[i-1].set(r.altQual[i-1][j+shiftAmt], j);
+						r.altPatFw[i-1].set(r.altPatFw[i-1][j+shiftAmt], j);
+					}
+					r.altQual[i-1].resize(dstLen);
+					r.altPatFw[i-1].resize(dstLen);
+				} else if (qualsRead[i] < dstLen) {
+					r.altQual[i-1].resize(dstLen);
+					r.altPatFw[i-1].resize(dstLen);
+					// Shift everybody down
+					int shiftAmt = dstLen - qualsRead[i];
+					for(int j = dstLen-1; j >= shiftAmt; j--) {
+						r.altQual[i-1].set(r.altQual[i-1][j-shiftAmt], j);
+						r.altPatFw[i-1].set(r.altPatFw[i-1][j-shiftAmt], j);
+					}
+					// Fill in unset positions
+					for(int j = 0; j < shiftAmt; j++) {
+						// '!' - indicates no alternate basecall at
+						// this position
+						r.altQual[i-1].set(33, j);
+					}
+				}
+				r.alts++;
+			}
+		}
+
+		if(c == '\r' || c == '\n') {
+			c = peekOverNewline(fb_);
+		} else {
+			c = peekToEndOfLine(fb_);
+		}
+	}
+	r.readOrigBuf.install(fb_.lastN(), fb_.lastNLen());
+	fb_.resetLastN();
+
+	c = fb_.get();
+	// Should either be at end of file or at beginning of next record
+	assert(c == -1 || c == '@');
+
+	// Set up a default name if one hasn't been set
+	if(r.name.empty()) {
+		char cbuf[20];
+		itoa10<TReadId>(readCnt_, cbuf);
+		r.name.install(cbuf);
+	}
+	r.trimmed3 = gTrim3;
+	r.trimmed5 = mytrim5;
+	rdid = endid = readCnt_;
+	readCnt_++;
+	return success;
+}
+
+/// Read another pattern from a tab-delimited input file, where each
+/// line holds one read as: <name> TAB <sequence> TAB <qualities>.
+/// On any field-level parse failure, the rest of the line is skipped,
+/// r is reset, and success=false/done=true are reported.
+bool TabbedPatternSource::read(
+	Read& r,
+	TReadId& rdid,
+	TReadId& endid,
+	bool& success,
+	bool& done)
+{
+	r.reset();
+	r.color = gColor;
+	success = true;
+	done = false;
+	// fb_ is about to dish out the first character of the
+	// name field
+	if(parseName(r, NULL, '\t') == -1) {
+		peekOverNewline(fb_); // skip rest of line
+		r.reset();
+		success = false;
+		done = true;
+		return false;
+	}
+	assert_neq('\t', fb_.peek());
+
+	// fb_ is about to dish out the first character of the
+	// sequence field
+	int charsRead = 0;
+	int mytrim5 = gTrim5;
+	int dstLen = parseSeq(r, charsRead, mytrim5, '\t');
+	assert_neq('\t', fb_.peek());
+	if(dstLen < 0) {
+		peekOverNewline(fb_); // skip rest of line
+		r.reset();
+		success = false;
+		done = true;
+		return false;
+	}
+
+	// fb_ is about to dish out the first character of the
+	// quality-string field
+	char ct = 0;
+	if(parseQuals(r, charsRead, dstLen, mytrim5, ct, '\n') < 0) {
+		peekOverNewline(fb_); // skip rest of line
+		r.reset();
+		success = false;
+		done = true;
+		return false;
+	}
+	r.trimmed3 = gTrim3;
+	r.trimmed5 = mytrim5;
+	assert_eq(ct, '\n');
+	assert_neq('\n', fb_.peek());
+	r.readOrigBuf.install(fb_.lastN(), fb_.lastNLen());
+	fb_.resetLastN();
+	rdid = endid = readCnt_;
+	readCnt_++;
+	return true;
+}
+
+/// Read another pair of patterns from a tab-delimited input file.
+/// A paired line looks like:
+///   <name> TAB <seq1> TAB <qual1> TAB [<name2> TAB] <seq2> TAB <qual2>
+/// (the second name field is present only when secondName_ is set).
+/// A line with just 3 fields is treated as an unpaired read; 'paired'
+/// reports which case was seen.
+bool TabbedPatternSource::readPair(
+	Read& ra,
+	Read& rb,
+	TReadId& rdid,
+	TReadId& endid,
+	bool& success,
+	bool& done,
+	bool& paired)
+{
+	success = true;
+	done = false;
+	
+	// Skip over initial vertical whitespace
+	if(fb_.peek() == '\r' || fb_.peek() == '\n') {
+		fb_.peekUptoNewline();
+		fb_.resetLastN();
+	}
+	
+	// fb_ is about to dish out the first character of the
+	// name field
+	int mytrim5_1 = gTrim5;
+	if(parseName(ra, &rb, '\t') == -1) {
+		peekOverNewline(fb_); // skip rest of line
+		ra.reset();
+		rb.reset();
+		fb_.resetLastN();
+		success = false;
+		done = true;
+		return false;
+	}
+	assert_neq('\t', fb_.peek());
+
+	// fb_ is about to dish out the first character of the
+	// sequence field for the first mate
+	int charsRead1 = 0;
+	int dstLen1 = parseSeq(ra, charsRead1, mytrim5_1, '\t');
+	if(dstLen1 < 0) {
+		peekOverNewline(fb_); // skip rest of line
+		ra.reset();
+		rb.reset();
+		fb_.resetLastN();
+		success = false;
+		done = true;
+		return false;
+	}
+	assert_neq('\t', fb_.peek());
+
+	// fb_ is about to dish out the first character of the
+	// quality-string field; quality may end at tab (paired) or
+	// newline (unpaired) -- ct records which delimiter was seen
+	char ct = 0;
+	if(parseQuals(ra, charsRead1, dstLen1, mytrim5_1, ct, '\t', '\n') < 0) {
+		peekOverNewline(fb_); // skip rest of line
+		ra.reset();
+		rb.reset();
+		fb_.resetLastN();
+		success = false;
+		done = true;
+		return false;
+	}
+	ra.trimmed3 = gTrim3;
+	ra.trimmed5 = mytrim5_1;
+	assert(ct == '\t' || ct == '\n' || ct == '\r' || ct == -1);
+	if(ct == '\r' || ct == '\n' || ct == -1) {
+		// Only had 3 fields prior to newline, so this must be an unpaired read
+		rb.reset();
+		ra.readOrigBuf.install(fb_.lastN(), fb_.lastNLen());
+		fb_.resetLastN();
+		success = true;
+		done = false;
+		paired = false;
+		rdid = endid = readCnt_;
+		readCnt_++;
+		return success;
+	}
+	paired = true;
+	assert_neq('\t', fb_.peek());
+	
+	// Saw another tab after the third field, so this must be a pair
+	if(secondName_) {
+		// The second mate has its own name
+		if(parseName(rb, NULL, '\t') == -1) {
+			peekOverNewline(fb_); // skip rest of line
+			ra.reset();
+			rb.reset();
+			fb_.resetLastN();
+			success = false;
+			done = true;
+			return false;
+		}
+		assert_neq('\t', fb_.peek());
+	}
+
+	// fb_ about to give the first character of the second mate's sequence
+	int charsRead2 = 0;
+	int mytrim5_2 = gTrim5;
+	int dstLen2 = parseSeq(rb, charsRead2, mytrim5_2, '\t');
+	if(dstLen2 < 0) {
+		peekOverNewline(fb_); // skip rest of line
+		ra.reset();
+		rb.reset();
+		fb_.resetLastN();
+		success = false;
+		done = true;
+		return false;
+	}
+	assert_neq('\t', fb_.peek());
+
+	// fb_ is about to dish out the first character of the
+	// quality-string field
+	if(parseQuals(rb, charsRead2, dstLen2, mytrim5_2, ct, '\n') < 0) {
+		peekOverNewline(fb_); // skip rest of line
+		ra.reset();
+		rb.reset();
+		fb_.resetLastN();
+		success = false;
+		done = true;
+		return false;
+	}
+	ra.readOrigBuf.install(fb_.lastN(), fb_.lastNLen());
+	fb_.resetLastN();
+	rb.trimmed3 = gTrim3;
+	rb.trimmed5 = mytrim5_2;
+	rdid = endid = readCnt_;
+	readCnt_++;
+	return true;
+}
+
+/**
+ * Parse a name from fb_ and store in r.  Assume that the next
+ * character obtained via fb_.get() is the first character of
+ * the name and the name stops at the next char upto (could
+ * be tab, newline, etc.).  If r2 is non-NULL, the same name is
+ * stored there too (used when mates share a single name field).
+ * Returns the name length, or -1 on EOF / premature end-of-line.
+ */
+int TabbedPatternSource::parseName(
+	Read& r,
+	Read* r2,
+	char upto /* = '\t' */)
+{
+	// Read the name out of the first field
+	int c = 0;
+	if(r2 != NULL) r2->name.clear();
+	r.name.clear();
+	while(true) {
+		if((c = fb_.get()) < 0) {
+			return -1;
+		}
+		if(c == upto) {
+			// Finished with first field
+			break;
+		}
+		if(c == '\n' || c == '\r') {
+			// Line ended before the delimiter: malformed record
+			return -1;
+		}
+		if(r2 != NULL) r2->name.append(c);
+		r.name.append(c);
+	}
+	// Set up a default name (the read's ordinal) if one hasn't been set
+	if(r.name.empty()) {
+		char cbuf[20];
+		itoa10<TReadId>(readCnt_, cbuf);
+		r.name.install(cbuf);
+		if(r2 != NULL) r2->name.install(cbuf);
+	}
+	return (int)r.name.length();
+}
+
+/**
+ * Parse a single sequence from fb_ and store in r.  Assume
+ * that the next character obtained via fb_.get() is the first
+ * character of the sequence and the sequence stops at the next
+ * char upto (could be tab, newline, etc.).  charsRead is
+ * incremented for every sequence character seen (including
+ * 5'-trimmed ones); trim5 may be increased by 2 in colorspace
+ * mode to account for the primer base and first color.
+ * Returns the stored (trimmed) sequence length, or -1 on EOF.
+ */
+int TabbedPatternSource::parseSeq(
+	Read& r,
+	int& charsRead,
+	int& trim5,
+	char upto /*= '\t'*/)
+{
+	int begin = 0;
+	int c = fb_.get();
+	assert(c != upto);
+	r.patFw.clear();
+	r.color = gColor;
+	if(gColor) {
+		// This may be a primer character.  If so, keep it in the
+		// 'primer' field of the read buf and parse the rest of the
+		// read without it.
+		c = toupper(c);
+		if(asc2dnacat[c] > 0) {
+			// First char is a DNA char
+			int c2 = toupper(fb_.peek());
+			// Second char is a color char
+			if(asc2colcat[c2] > 0) {
+				r.primer = c;
+				r.trimc = c2;
+				trim5 += 2; // trim primer and first color
+			}
+		}
+		if(c < 0) { return -1; }
+	}
+	while(c != upto) {
+		if(gColor) {
+			// Convert color numbers to nucleotide letters
+			if(c >= '0' && c <= '4') c = "ACGTN"[(int)c - '0'];
+			if(c == '.') c = 'N';
+		}
+		if(isalpha(c)) {
+			assert_in(toupper(c), "ACGTN");
+			if(begin++ >= trim5) {
+				assert_neq(0, asc2dnacat[c]);
+				r.patFw.append(asc2dna[c]);
+			}
+			charsRead++;
+		}
+		if((c = fb_.get()) < 0) {
+			return -1;
+		}
+	}
+	r.patFw.trimEnd(gTrim3);
+	return (int)r.patFw.length();
+}
+
+/**
+ * Parse a single quality string from fb_ and store in r.
+ * Assume that the next character obtained via fb_.get() is
+ * the first character of the quality string and the string stops
+ * at the next char upto (could be tab, newline, etc.).  upto2,
+ * when not -1, is an additional terminating delimiter; c2 is an
+ * out-parameter receiving the last character consumed (so the
+ * caller can tell which delimiter ended the field).  The quality
+ * string is resized to exactly dstLen on return.  Returns the
+ * number of qualities read, or -1 on EOF mid-read.
+ */
+int TabbedPatternSource::parseQuals(
+	Read& r,
+	int charsRead,
+	int dstLen,
+	int trim5,
+	char& c2,
+	char upto /*= '\t'*/,
+	char upto2 /*= -1*/)
+{
+	int qualsRead = 0;
+	int c = 0;
+	if (intQuals_) {
+		// Whitespace-separated integer qualities
+		char buf[4096];
+		while (qualsRead < charsRead) {
+			qualToks_.clear();
+			if(!tokenizeQualLine(fb_, buf, 4096, qualToks_)) break;
+			for (unsigned int j = 0; j < qualToks_.size(); ++j) {
+				char c = intToPhred33(atoi(qualToks_[j].c_str()), solQuals_);
+				assert_geq(c, 33);
+				if (qualsRead >= trim5) {
+					r.qual.append(c);
+				}
+				++qualsRead;
+			}
+		} // done reading integer quality lines
+		if (charsRead > qualsRead) tooFewQualities(r.name);
+	} else {
+		// Non-integer qualities
+		while((qualsRead < dstLen + trim5) && c >= 0) {
+			c = fb_.get();
+			c2 = c;
+			if (c == ' ') wrongQualityFormat(r.name);
+			if(c < 0) {
+				// EOF occurred in the middle of a read - abort
+				return -1;
+			}
+			if(!isspace(c) && c != upto && (upto2 == -1 || c != upto2)) {
+				if (qualsRead >= trim5) {
+					c = charToPhred33(c, solQuals_, phred64Quals_);
+					assert_geq(c, 33);
+					r.qual.append(c);
+				}
+				qualsRead++;
+			} else {
+				break;
+			}
+		}
+		if(qualsRead < dstLen + trim5) {
+			tooFewQualities(r.name);
+		} else if(qualsRead > dstLen + trim5) {
+			tooManyQualities(r.name);
+		}
+	}
+	r.qual.resize(dstLen);
+	// Consume trailing characters up to (and including) the delimiter,
+	// leaving c2 set to the delimiter that ended the field
+	while(c != upto && (upto2 == -1 || c != upto2) && c != -1) {
+		c = fb_.get();
+		c2 = c;
+	}
+	return qualsRead;
+}
+
+/**
+ * Report a fatal error: a space was found inside a quality string,
+ * which usually means the file uses integer qualities but was not
+ * parsed with --integer-quals.  Never returns (throws).
+ */
+void wrongQualityFormat(const BTString& read_name) {
+	cerr << "Error: Encountered one or more spaces while parsing the quality "
+	     << "string for read " << read_name << ".  If this is a FASTQ file "
+		 << "with integer (non-ASCII-encoded) qualities, try re-running with "
+		 << "the --integer-quals option." << endl;
+	throw 1;
+}
+
+/**
+ * Report a fatal error: the read has more sequence characters than
+ * quality values.  Never returns (throws).
+ */
+void tooFewQualities(const BTString& read_name) {
+	cerr << "Error: Read " << read_name << " has more read characters than "
+		 << "quality values." << endl;
+	throw 1;
+}
+
+/**
+ * Report a fatal error: the read has more quality values than
+ * sequence characters.  Never returns (throws).
+ */
+void tooManyQualities(const BTString& read_name) {
+	cerr << "Error: Read " << read_name << " has more quality values than read "
+		 << "characters." << endl;
+	throw 1;
+}
+
+#ifdef USE_SRA
+    
+    // Lightweight holder for one read fetched from an SRA accession;
+    // used as the element type of the SRA_Data ring buffer.
+    struct SRA_Read {
+        SStringExpandable<char, 64>      name;      // read name
+        SDnaStringExpandable<128, 2>     patFw;     // forward-strand sequence
+        SStringExpandable<char, 128, 2>  qual;      // quality values
+        
+        // Clear all fields so the slot can be reused
+        void reset() {
+            name.clear();
+            patFw.clear();
+            qual.clear();
+        }
+    };
+    
+    // Ring-buffer capacity contributed per consumer thread
+    static const uint64_t buffer_size_per_thread = 4096;
+    
+    // Shared state between the single SRA producer thread
+    // (SRA_IO_Worker) and consumer threads.  paired_reads is used as a
+    // ring buffer indexed by read_pos/write_pos modulo buffer_size;
+    // read_pos <= write_pos <= read_pos + buffer_size is the invariant
+    // checked by the asserts below.
+    struct SRA_Data {
+        uint64_t read_pos;     // next slot to consume
+        uint64_t write_pos;    // next slot to fill
+        uint64_t buffer_size;  // ring capacity
+        bool     done;         // producer reached end-of-input (or failed)
+        EList<pair<SRA_Read, SRA_Read> > paired_reads;
+        
+        ngs::ReadIterator* sra_it;  // not owned; supplied by SRAPatternSource
+        
+        SRA_Data() {
+            read_pos = 0;
+            write_pos = 0;
+            buffer_size = buffer_size_per_thread;
+            done = false;
+            sra_it = NULL;
+        }
+        
+        // True when no free slot remains for the producer
+        bool isFull() {
+            assert_leq(read_pos, write_pos);
+            assert_geq(read_pos + buffer_size, write_pos);
+            return read_pos + buffer_size <= write_pos;
+        }
+        
+        // True when no unconsumed read remains for consumers
+        bool isEmpty() {
+            assert_leq(read_pos, write_pos);
+            assert_geq(read_pos + buffer_size, write_pos);
+            return read_pos == write_pos;
+        }
+        
+        // Slot currently ready to be consumed
+        pair<SRA_Read, SRA_Read>& getPairForRead() {
+            assert(!isEmpty());
+            return paired_reads[read_pos % buffer_size];
+        }
+        
+        // Slot currently ready to be filled by the producer
+        pair<SRA_Read, SRA_Read>& getPairForWrite() {
+            assert(!isFull());
+            return paired_reads[write_pos % buffer_size];
+        }
+        
+        void advanceReadPos() {
+            assert(!isEmpty());
+            read_pos++;
+        }
+        
+        void advanceWritePos() {
+            assert(!isFull());
+            write_pos++;
+        }
+    };
+    
+    // Producer thread entry point: pulls reads from the SRA iterator
+    // and writes them into the shared SRA_Data ring buffer.  Spins
+    // (1 ms sleeps) while the buffer is full; sets sra_data->done and
+    // returns at end-of-input; exits the whole process on any SRA
+    // exception.
+    static void SRA_IO_Worker(void *vp)
+    {
+        SRA_Data* sra_data = (SRA_Data*)vp;
+        assert(sra_data != NULL);
+        ngs::ReadIterator* sra_it = sra_data->sra_it;
+        assert(sra_it != NULL);
+        
+        while(!sra_data->done) {
+            // Wait for a free slot in the ring buffer
+            while(sra_data->isFull()) {
+#if defined(_TTHREAD_WIN32_)
+                Sleep(1);
+#elif defined(_TTHREAD_POSIX_)
+                const static timespec ts = {0, 1000000};  // 1 millisecond
+                nanosleep(&ts, NULL);
+#endif
+            }
+            pair<SRA_Read, SRA_Read>& pair = sra_data->getPairForWrite();
+            SRA_Read& ra = pair.first;
+            SRA_Read& rb = pair.second;
+            bool exception_thrown = false;
+            try {
+                if(!sra_it->nextRead() || !sra_it->nextFragment()) {
+                    // End of input: signal consumers and stop producing
+                    ra.reset();
+                    rb.reset();
+                    sra_data->done = true;
+                    return;
+                }
+                
+                // Read the name out of the first field
+                ngs::StringRef rname = sra_it->getReadId();
+                ra.name.install(rname.data(), rname.size());
+                assert(!ra.name.empty());
+                
+                // First fragment (mate 1): apply 5'/3' trims while copying
+                ngs::StringRef ra_seq = sra_it->getFragmentBases();
+                if(gTrim5 + gTrim3 < (int)ra_seq.size()) {
+                    ra.patFw.installChars(ra_seq.data() + gTrim5, ra_seq.size() - gTrim5 - gTrim3);
+                }
+                ngs::StringRef ra_qual = sra_it->getFragmentQualities();
+                if(ra_seq.size() == ra_qual.size() && gTrim5 + gTrim3 < (int)ra_qual.size()) {
+                    ra.qual.install(ra_qual.data() + gTrim5, ra_qual.size() - gTrim5 - gTrim3);
+                } else {
+                    // Qualities missing/mismatched: substitute max quality
+                    ra.qual.resize(ra.patFw.length());
+                    ra.qual.fill('I');
+                }
+                assert_eq(ra.patFw.length(), ra.qual.length());
+                
+                if(!sra_it->nextFragment()) {
+                    // Unpaired read; leave rb empty
+                    rb.reset();
+                } else {
+                    // Second fragment (mate 2), same trimming rules
+                    // rb.name = ra.name;
+                    ngs::StringRef rb_seq = sra_it->getFragmentBases();
+                    if(gTrim5 + gTrim3 < (int)rb_seq.size()) {
+                        rb.patFw.installChars(rb_seq.data() + gTrim5, rb_seq.size() - gTrim5 - gTrim3);
+                    }
+                    ngs::StringRef rb_qual = sra_it->getFragmentQualities();
+                    if(rb_seq.size() == rb_qual.size() && gTrim5 + gTrim3 < (int)rb_qual.size()) {
+                        rb.qual.install(rb_qual.data() + gTrim5, rb_qual.size() - gTrim5 - gTrim3);
+                    } else {
+                        rb.qual.resize(rb.patFw.length());
+                        rb.qual.fill('I');
+                    }
+                    assert_eq(rb.patFw.length(), rb.qual.length());
+                }
+                sra_data->advanceWritePos();
+            } catch(ngs::ErrorMsg & x) {
+                cerr << x.toString () << endl;
+                exception_thrown = true;
+            } catch(exception & x) {
+                cerr << x.what () << endl;
+                exception_thrown = true;
+            } catch(...) {
+                cerr << "unknown exception\n";
+                exception_thrown = true;
+            }
+            
+            if(exception_thrown) {
+                ra.reset();
+                rb.reset();
+                sra_data->done = true;
+                // NOTE(review): this message says "rerun HISAT2" but the
+                // surrounding code ships with Centrifuge -- likely text
+                // inherited from HISAT2; confirm before release
+                cerr << "An error happened while fetching SRA reads. Please rerun HISAT2. You may want to disable the SRA cache if you didn't (see the instructions at https://github.com/ncbi/sra-tools/wiki/Toolkit-Configuration).\n";
+                exit(1);
+            }
+        }
+    }
+    
+    // Release the IO thread and all SRA API objects.  (The null checks
+    // are redundant -- delete on NULL is a no-op -- but harmless.)
+    SRAPatternSource::~SRAPatternSource() {
+        if(io_thread_) delete io_thread_;
+        if(sra_data_) delete sra_data_;
+        if(sra_it_) delete sra_it_;
+        if(sra_run_) delete sra_run_;
+    }
+    
+    /// Read another pair of patterns from the SRA ring buffer filled
+    /// by SRA_IO_Worker.  Blocks (1 ms sleeps) while the buffer is
+    /// empty; reports success=false/done=true once the producer has
+    /// finished and the buffer is drained.
+    bool SRAPatternSource::readPair(
+                                    Read& ra,
+                                    Read& rb,
+                                    TReadId& rdid,
+                                    TReadId& endid,
+                                    bool& success,
+                                    bool& done,
+                                    bool& paired)
+    {
+        assert(sra_run_ != NULL && sra_it_ != NULL);
+        success = true;
+        done = false;
+        while(sra_data_->isEmpty()) {
+            if(sra_data_->done && sra_data_->isEmpty()) {
+                // Producer finished and nothing is left to consume
+                ra.reset();
+                rb.reset();
+                success = false;
+                done = true;
+                return false;
+            }
+            
+#if defined(_TTHREAD_WIN32_)
+            Sleep(1);
+#elif defined(_TTHREAD_POSIX_)
+            const static timespec ts = {0, 1000000}; // 1 millisecond
+            nanosleep(&ts, NULL);
+#endif
+        }
+        
+        // Copy the next buffered pair into the caller's Read objects.
+        // Mate 2 reuses mate 1's name (SRA stores one id per spot).
+        pair<SRA_Read, SRA_Read>& pair = sra_data_->getPairForRead();
+        ra.name.install(pair.first.name.buf(), pair.first.name.length());
+        ra.patFw.install(pair.first.patFw.buf(), pair.first.patFw.length());
+        ra.qual.install(pair.first.qual.buf(), pair.first.qual.length());
+        ra.trimmed3 = gTrim3;
+        ra.trimmed5 = gTrim5;
+        if(pair.second.patFw.length() > 0) {
+            rb.name.install(pair.first.name.buf(), pair.first.name.length());
+            rb.patFw.install(pair.second.patFw.buf(), pair.second.patFw.length());
+            rb.qual.install(pair.second.qual.buf(), pair.second.qual.length());
+            rb.trimmed3 = gTrim3;
+            rb.trimmed5 = gTrim5;
+            paired = true;
+        } else {
+            // NOTE(review): 'paired' is not set to false on this branch;
+            // the caller's pre-call value is left in place -- confirm
+            // callers initialize it
+            rb.reset();
+        }
+        sra_data_->advanceReadPos();
+        
+        rdid = endid = readCnt_;
+        readCnt_++;
+        
+        return true;
+    }
+    
+    // Open the next valid SRA accession in sra_accs_: create the read
+    // collection and iterator over all its reads, allocate the shared
+    // ring buffer, and launch the producer thread (SRA_IO_Worker).
+    // Accessions that cannot be opened are warned about once and
+    // skipped; exits the process if none are valid.
+    void SRAPatternSource::open() {
+        string version = "centrifuge-";
+        version += CENTRIFUGE_VERSION;
+        ncbi::NGS::setAppVersionString(version.c_str());
+        assert(!sra_accs_.empty());
+        while(sra_acc_cur_ < sra_accs_.size()) {
+            // Open read
+            if(sra_it_) {
+                delete sra_it_;
+                sra_it_ = NULL;
+            }
+            if(sra_run_) {
+                delete sra_run_;
+                sra_run_ = NULL;
+            }
+            try {
+                // open requested accession using SRA implementation of the API
+                sra_run_ = new ngs::ReadCollection(ncbi::NGS::openReadCollection(sra_accs_[sra_acc_cur_]));
+                // compute window to iterate through
+                size_t MAX_ROW = sra_run_->getReadCount();
+                sra_it_ = new ngs::ReadIterator(sra_run_->getReadRange(1, MAX_ROW, ngs::Read::all));
+                
+                // create a buffer for SRA data
+                sra_data_ = new SRA_Data;
+                sra_data_->sra_it = sra_it_;
+                sra_data_->buffer_size = nthreads_ * buffer_size_per_thread;
+                sra_data_->paired_reads.resize(sra_data_->buffer_size);
+                
+                // create a thread for handling SRA data access
+                io_thread_ = new tthread::thread(SRA_IO_Worker, (void*)sra_data_);
+                // io_thread_->join();
+            } catch(...) {
+                // Warn once per accession, then try the next one
+                if(!errs_[sra_acc_cur_]) {
+                    cerr << "Warning: Could not access \"" << sra_accs_[sra_acc_cur_].c_str() << "\" for reading; skipping..." << endl;
+                    errs_[sra_acc_cur_] = true;
+                }
+                sra_acc_cur_++;
+                continue;
+            }
+            return;
+        }
+        cerr << "Error: No input SRA accessions were valid" << endl;
+        exit(1);
+        return;
+    }
+    
+#endif
diff --git a/pat.h b/pat.h
new file mode 100644
index 0000000..39c0432
--- /dev/null
+++ b/pat.h
@@ -0,0 +1,1788 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef PAT_H_
+#define PAT_H_
+
+#include <cassert>
+#include <cmath>
+#include <stdexcept>
+#include <vector>
+#include <string>
+#include <cstring>
+#include <ctype.h>
+#include <fstream>
+#include "alphabet.h"
+#include "assert_helpers.h"
+#include "tokenize.h"
+#include "random_source.h"
+#include "threading.h"
+#include "filebuf.h"
+#include "qual.h"
+#include "search_globals.h"
+#include "sstring.h"
+#include "ds.h"
+#include "read.h"
+#include "util.h"
+
+/**
+ * Classes and routines for reading reads from various input sources.
+ */
+
+using namespace std;
+
+/**
+ * Calculate a per-read random seed based on a combination of
+ * the read data (incl. sequence, name, quals) and the global
+ * seed in '_randSeed'.
+ */
+static inline uint32_t genRandSeed(const BTDnaString& qry,
+                                   const BTString& qual,
+                                   const BTString& name,
+                                   uint32_t seed)
+{
+	// Derive a per-read pseudo-random seed by mixing the global seed
+	// with every character of the read's sequence, quality string and
+	// name.  The mixing is pure XOR-with-shift, so it is deterministic
+	// for a given read.
+	uint32_t mixed = (seed + 101) * 59 * 61 * 67 * 71 * 73 * 79 * 83;
+	const size_t qlen = qry.length();
+	// Fold in the sequence: two bits of shift per position, wrapping
+	// every 16 positions
+	for(size_t i = 0; i < qlen; i++) {
+		const int p = (int)qry[i];
+		assert_leq(p, 4);
+		mixed ^= (p << ((i & 15) << 1));
+	}
+	// Fold in the qualities: eight bits of shift per position, wrapping
+	// every 4 positions
+	for(size_t i = 0; i < qlen; i++) {
+		const int p = (int)qual[i];
+		assert_leq(p, 255);
+		mixed ^= (p << ((i & 3) << 3));
+	}
+	// Fold in the read name, stopping at the first '/' (presumably so a
+	// mate suffix like "/1" or "/2" does not affect the seed — confirm
+	// against callers)
+	const size_t namelen = name.length();
+	for(size_t i = 0; i < namelen; i++) {
+		const int p = (int)name[i];
+		if(p == '/') break;
+		assert_leq(p, 255);
+		mixed ^= (p << ((i & 3) << 3));
+	}
+	return mixed;
+}
+
+/**
+ * Parameters affecting how reads are read in.
+ */
+struct PatternParams {
+	// Straight field-for-field copy of the constructor arguments; no
+	// validation is performed here.
+	PatternParams(
+		int format_,
+		bool fileParallel_,
+		uint32_t seed_,
+		bool useSpinlock_,
+		bool solexa64_,
+		bool phred64_,
+		bool intQuals_,
+		bool fuzzy_,
+		int sampleLen_,
+		int sampleFreq_,
+		uint32_t skip_) :
+		format(format_),
+		fileParallel(fileParallel_),
+		seed(seed_),
+		useSpinlock(useSpinlock_),
+		solexa64(solexa64_),
+		phred64(phred64_),
+		intQuals(intQuals_),
+		fuzzy(fuzzy_),
+		sampleLen(sampleLen_),
+		sampleFreq(sampleFreq_),
+		skip(skip_) { }
+
+	int format;           // file format
+	bool fileParallel;    // true -> wrap files with separate PairedPatternSources
+	uint32_t seed;        // pseudo-random seed
+	bool useSpinlock;     // use spin locks instead of pthreads
+	bool solexa64;        // true -> qualities are on solexa64 scale
+	bool phred64;         // true -> qualities are on phred64 scale
+	bool intQuals;        // true -> qualities are space-separated numbers
+	bool fuzzy;           // true -> try to parse fuzzy fastq
+	int sampleLen;        // length of sampled reads for FastaContinuous...
+	int sampleFreq;       // frequency of sampled reads for FastaContinuous...
+	uint32_t skip;        // skip the first 'skip' patterns
+};
+
+/**
+ * Encapsulates a synchronized source of patterns; usually a file.
+ * Optionally reverses reads and quality strings before returning them,
+ * though that is usually more efficiently done by the concrete
+ * subclass.  Concrete subclasses should delimit critical sections with
+ * calls to lock() and unlock().
+ */
+class PatternSource {
+
+public:
+
+	// Copies seed/spinlock settings out of PatternParams; locking is
+	// enabled by default.
+	PatternSource(const PatternParams& p) :
+		seed_(p.seed),
+		readCnt_(0),
+		numWrappers_(0),
+		doLocking_(true),
+		useSpinlock_(p.useSpinlock),
+		mutex()
+	{
+	}
+
+	virtual ~PatternSource() { }
+
+	/**
+	 * Call this whenever this PatternSource is wrapped by a new
+	 * WrappedPatternSourcePerThread.  This helps us keep track of
+	 * whether locks will be contended.
+	 */
+	void addWrapper() {
+		lock();
+		numWrappers_++;
+		unlock();
+	}
+	
+	/**
+	 * The main member function for dispensing patterns.
+	 *
+	 * Returns true iff a pair was parsed successfully.
+	 */
+	virtual bool nextReadPair(
+		Read& ra,
+		Read& rb,
+		TReadId& rdid,
+		TReadId& endid,
+		bool& success,
+		bool& done,
+		bool& paired,
+		bool fixName);
+
+	/**
+	 * The main member function for dispensing patterns.
+	 */
+	virtual bool nextRead(
+		Read& r,
+		TReadId& rdid,
+		TReadId& endid,
+		bool& success,
+		bool& done);
+
+	/**
+	 * Implementation to be provided by concrete subclasses.  An
+	 * implementation for this member is only relevant for formats that
+	 * can read in a pair of reads in a single transaction with a
+	 * single input source.  If paired-end input is given as a pair of
+	 * parallel files, this member should throw an error and exit.
+	 */
+	virtual bool nextReadPairImpl(
+		Read& ra,
+		Read& rb,
+		TReadId& rdid,
+		TReadId& endid,
+		bool& success,
+		bool& done,
+		bool& paired) = 0;
+
+	/**
+	 * Implementation to be provided by concrete subclasses.  An
+	 * implementation for this member is only relevant for formats
+	 * where individual input sources look like single-end-read
+	 * sources, e.g., formats where paired-end reads are specified in
+	 * parallel read files.
+	 */
+	virtual bool nextReadImpl(
+		Read& r,
+		TReadId& rdid, 
+		TReadId& endid, 
+		bool& success,
+		bool& done) = 0;
+
+	/// Reset state to start over again with the first read
+	virtual void reset() { readCnt_ = 0; }
+
+	/**
+	 * Concrete subclasses call lock() to enter a critical region.
+	 * What constitutes a critical region depends on the subclass.
+	 */
+	void lock() {
+		if(!doLocking_) return; // no contention
+        mutex.lock();
+	}
+
+	/**
+	 * Concrete subclasses call unlock() to exit a critical region
+	 * What constitutes a critical region depends on the subclass.
+	 */
+	void unlock() {
+		if(!doLocking_) return; // no contention
+        mutex.unlock();
+	}
+
+	/**
+	 * Return a new dynamically allocated PatternSource for the given
+	 * format, using the given list of strings as the filenames to read
+	 * from or as the sequences themselves (i.e. if -c was used).
+	 */
+	static PatternSource* patsrcFromStrings(
+                                            const PatternParams& p,
+                                            const EList<string>& qs,
+                                            int nthreads);
+
+	/**
+	 * Return the number of reads attempted.
+	 * NOTE(review): returns readCnt_ - 1, which presumably assumes
+	 * readCnt_ was already incremented past the last dispensed read —
+	 * confirm against callers before relying on the exact value.
+	 */
+	TReadId readCnt() const { return readCnt_ - 1; }
+
+protected:
+
+	uint32_t seed_;        // pseudo-random seed copied from PatternParams
+
+	/// The number of reads read by this PatternSource
+	TReadId readCnt_;
+
+	int numWrappers_;      /// # threads that own a wrapper for this PatternSource
+	bool doLocking_;       /// override whether to lock (true = don't override)
+	/// User can ask to use the normal pthreads-style lock even if
+	/// spinlocks is enabled and compiled in.  This is sometimes better
+	/// if we expect bad I/O latency on some reads.
+	bool useSpinlock_;
+	MUTEX_T mutex;         // guards the critical regions entered via lock()
+};
+
+/**
+ * Abstract parent class for synchronized sources of paired-end reads
+ * (and possibly also single-end reads).
+ */
+class PairedPatternSource {
+public:
+	PairedPatternSource(const PatternParams& p) : mutex_m(), seed_(p.seed) {}
+	virtual ~PairedPatternSource() { }
+
+	// Register one more per-thread wrapper; see PatternSource::addWrapper.
+	virtual void addWrapper() = 0;
+	// Rewind all underlying sources to their first read.
+	virtual void reset() = 0;
+	
+	// Dispense the next pair (or unpaired read); see subclasses for the
+	// exact contract of success/done/paired.
+	virtual bool nextReadPair(
+		Read& ra,
+		Read& rb,
+		TReadId& rdid,
+		TReadId& endid,
+		bool& success,
+		bool& done,
+		bool& paired,
+		bool fixName) = 0;
+	
+	virtual pair<TReadId, TReadId> readCnt() const = 0;
+
+	/**
+	 * Lock this PairedPatternSource, usually because one of its shared
+	 * fields is being updated.
+	 */
+	void lock() {
+		mutex_m.lock();
+	}
+
+	/**
+	 * Unlock this PairedPatternSource.
+	 */
+	void unlock() {
+		mutex_m.unlock();
+	}
+
+	/**
+	 * Given the values for all of the various arguments used to specify
+	 * the read and quality input, create a list of pattern sources to
+	 * dispense them.
+	 */
+	static PairedPatternSource* setupPatternSources(
+		const EList<string>& si,    // singles, from argv
+		const EList<string>& m1,    // mate1's, from -1 arg
+		const EList<string>& m2,    // mate2's, from -2 arg
+		const EList<string>& m12,   // both mates on each line, from --12 arg
+#ifdef USE_SRA
+        const EList<string>& sra_accs,
+#endif
+		const EList<string>& q,     // qualities associated with singles
+		const EList<string>& q1,    // qualities associated with m1
+		const EList<string>& q2,    // qualities associated with m2
+		const PatternParams& p,     // read-in params
+                                                    int nthreads,
+		bool verbose);              // be talkative?
+
+protected:
+
+	MUTEX_T mutex_m; /// mutex for syncing over critical regions
+	uint32_t seed_;  /// pseudo-random seed copied from PatternParams
+};
+
+/**
+ * Encapsulates a synchronized source of both paired-end reads and
+ * unpaired reads, where the paired-end must come from parallel files.
+ */
+class PairedSoloPatternSource : public PairedPatternSource {
+
+public:
+
+	// Takes ownership of `src` (deleted in the destructor); every
+	// element must be non-NULL.
+	PairedSoloPatternSource(
+		const EList<PatternSource*>* src,
+		const PatternParams& p) :
+		PairedPatternSource(p),
+		cur_(0),
+		src_(src)
+	{
+		assert(src_ != NULL);
+		for(size_t i = 0; i < src_->size(); i++) {
+			assert((*src_)[i] != NULL);
+		}
+	}
+
+	// NOTE: deletes the list only, not the PatternSource objects it
+	// points to — confirm who owns the elements before changing this.
+	virtual ~PairedSoloPatternSource() { delete src_; }
+
+	/**
+	 * Call this whenever this PairedPatternSource is wrapped by a new
+	 * WrappedPatternSourcePerThread.  This helps us keep track of
+	 * whether locks within PatternSources will be contended.
+	 */
+	virtual void addWrapper() {
+		for(size_t i = 0; i < src_->size(); i++) {
+			(*src_)[i]->addWrapper();
+		}
+	}
+
+	/**
+	 * Reset this object and all the PatternSources under it so that
+	 * the next call to nextReadPair gets the very first read pair.
+	 */
+	virtual void reset() {
+		for(size_t i = 0; i < src_->size(); i++) {
+			(*src_)[i]->reset();
+		}
+		cur_ = 0;
+	}
+
+	/**
+	 * The main member function for dispensing pairs of reads or
+	 * singleton reads.  Returns true iff ra and rb contain a new
+	 * pair; returns false if ra contains a new unpaired read.
+	 */
+	virtual bool nextReadPair(
+		Read& ra,
+		Read& rb,
+		TReadId& rdid,
+		TReadId& endid,
+		bool& success,
+		bool& done,
+		bool& paired,
+		bool fixName);
+
+	/**
+	 * Return the number of reads attempted.
+	 * Sums PatternSource::readCnt() over all sources; the second
+	 * member of the pair is always 0.
+	 */
+	virtual pair<TReadId, TReadId> readCnt() const {
+		uint64_t ret = 0llu;
+		for(size_t i = 0; i < src_->size(); i++) ret += (*src_)[i]->readCnt();
+		return make_pair(ret, 0llu);
+	}
+
+protected:
+
+	volatile uint32_t cur_; // current element in parallel srca_, srcb_ vectors
+	const EList<PatternSource*>* src_; /// PatternSources for paired-end reads
+};
+
+/**
+ * Encapsulates a synchronized source of both paired-end reads and
+ * unpaired reads, where the paired-end must come from parallel files.
+ */
+class PairedDualPatternSource : public PairedPatternSource {
+
+public:
+
+	// Takes ownership of both lists (deleted in the destructor).
+	// srca/srcb are parallel: srca_[i] holds mate-1 (or unpaired)
+	// reads, srcb_[i] holds the matching mate-2 reads or NULL.
+	PairedDualPatternSource(
+		const EList<PatternSource*>* srca,
+		const EList<PatternSource*>* srcb,
+		const PatternParams& p) :
+		PairedPatternSource(p), cur_(0), srca_(srca), srcb_(srcb)
+	{
+		assert(srca_ != NULL);
+		assert(srcb_ != NULL);
+		// srca_ and srcb_ must be parallel
+		assert_eq(srca_->size(), srcb_->size());
+		for(size_t i = 0; i < srca_->size(); i++) {
+			// Can't have NULL first-mate sources.  Second-mate sources
+			// can be NULL, in the case when the corresponding first-
+			// mate source is unpaired.
+			assert((*srca_)[i] != NULL);
+			for(size_t j = 0; j < srcb_->size(); j++) {
+				assert_neq((*srca_)[i], (*srcb_)[j]);
+			}
+		}
+	}
+
+	// NOTE: deletes the lists only, not the PatternSource elements —
+	// confirm element ownership before changing this.
+	virtual ~PairedDualPatternSource() {
+		delete srca_;
+		delete srcb_;
+	}
+
+	/**
+	 * Call this whenever this PairedPatternSource is wrapped by a new
+	 * WrappedPatternSourcePerThread.  This helps us keep track of
+	 * whether locks within PatternSources will be contended.
+	 */
+	virtual void addWrapper() {
+		for(size_t i = 0; i < srca_->size(); i++) {
+			(*srca_)[i]->addWrapper();
+			if((*srcb_)[i] != NULL) {
+				(*srcb_)[i]->addWrapper();
+			}
+		}
+	}
+
+	/**
+	 * Reset this object and all the PatternSources under it so that
+	 * the next call to nextReadPair gets the very first read pair.
+	 */
+	virtual void reset() {
+		for(size_t i = 0; i < srca_->size(); i++) {
+			(*srca_)[i]->reset();
+			if((*srcb_)[i] != NULL) {
+				(*srcb_)[i]->reset();
+			}
+		}
+		cur_ = 0;
+	}
+
+	/**
+	 * The main member function for dispensing pairs of reads or
+	 * singleton reads.  Returns true iff ra and rb contain a new
+	 * pair; returns false if ra contains a new unpaired read.
+	 */
+	virtual bool nextReadPair(
+		Read& ra,
+		Read& rb,
+		TReadId& rdid,
+		TReadId& endid,
+		bool& success,
+		bool& done,
+		bool& paired,
+		bool fixName);
+	
+	/**
+	 * Return the number of reads attempted.
+	 */
+	virtual pair<TReadId, TReadId> readCnt() const;
+
+protected:
+
+	volatile uint32_t cur_; // current element in parallel srca_, srcb_ vectors
+	const EList<PatternSource*>* srca_; /// PatternSources for 1st mates and/or unpaired reads
+	const EList<PatternSource*>* srcb_; /// PatternSources for 2nd mates
+};
+
+/**
+ * Encapsulates a single thread's interaction with the PatternSource.
+ * Most notably, this class holds the buffers into which the
+ * PatternSource will write sequences.  This class is *not* threadsafe
+ * - it doesn't need to be since there's one per thread.  PatternSource
+ * is thread-safe.
+ */
+class PatternSourcePerThread {
+
+public:
+
+	// rdid_/endid_ start at the 0xffffffff sentinel, meaning "no read
+	// dispensed yet".
+	PatternSourcePerThread() :
+		buf1_(), buf2_(), rdid_(0xffffffff), endid_(0xffffffff) { }
+
+	virtual ~PatternSourcePerThread() { }
+
+	/**
+	 * Read the next read pair.
+	 * NOTE(review): this base implementation is a no-op — it returns
+	 * whatever value the caller passed in via `success` and touches
+	 * none of the buffers.  Subclasses (e.g. the Wrapped variant) do
+	 * the real work.
+	 */
+	virtual bool nextReadPair(
+		bool& success,
+		bool& done,
+		bool& paired,
+		bool fixName)
+	{
+		return success;
+	}
+
+	Read& bufa()             { return buf1_;    }	
+	Read& bufb()             { return buf2_;    }
+	const Read& bufa() const { return buf1_;    }
+	const Read& bufb() const { return buf2_;    }
+
+	TReadId       rdid()  const { return rdid_;  }
+	TReadId       endid() const { return endid_; }
+	virtual void  reset()       { rdid_ = endid_ = 0xffffffff;  }
+	
+	/**
+	 * Return the length of mate 1 or mate 2.
+	 */
+	size_t length(int mate) const {
+		return (mate == 1) ? buf1_.length() : buf2_.length();
+	}
+
+protected:
+
+	Read  buf1_;    // read buffer for mate a
+	Read  buf2_;    // read buffer for mate b
+	TReadId rdid_;  // index of read just read
+	TReadId endid_; // index of read just read
+};
+
+/**
+ * Abstract parent factory for PatternSourcePerThreads.
+ */
+class PatternSourcePerThreadFactory {
+public:
+	virtual ~PatternSourcePerThreadFactory() { }
+	// Allocate a single per-thread source; caller owns the result.
+	virtual PatternSourcePerThread* create() const = 0;
+	// Allocate n per-thread sources in a heap-allocated list; caller
+	// owns the list and its elements (free via destroy()).
+	virtual EList<PatternSourcePerThread*>* create(uint32_t n) const = 0;
+
+	/// Free memory associated with a pattern source
+	virtual void destroy(PatternSourcePerThread* patsrc) const {
+		assert(patsrc != NULL);
+		// Free the PatternSourcePerThread
+		delete patsrc;
+	}
+
+	/// Free memory associated with a pattern source list
+	virtual void destroy(EList<PatternSourcePerThread*>* patsrcs) const {
+		assert(patsrcs != NULL);
+		// Free all of the PatternSourcePerThreads
+		for(size_t i = 0; i < patsrcs->size(); i++) {
+			if((*patsrcs)[i] != NULL) {
+				delete (*patsrcs)[i];
+				(*patsrcs)[i] = NULL;
+			}
+		}
+		// Free the vector
+		delete patsrcs;
+	}
+};
+
+/**
+ * A per-thread wrapper for a PairedPatternSource.
+ */
+class WrappedPatternSourcePerThread : public PatternSourcePerThread {
+public:
+	// Registers itself with the shared PairedPatternSource so lock
+	// contention can be tracked (see addWrapper()).
+	WrappedPatternSourcePerThread(PairedPatternSource& __patsrc) :
+		patsrc_(__patsrc)
+	{
+		patsrc_.addWrapper();
+	}
+
+	/**
+	 * Get the next paired or unpaired read from the wrapped
+	 * PairedPatternSource.
+	 */
+	virtual bool nextReadPair(
+		bool& success,
+		bool& done,
+		bool& paired,
+		bool fixName);
+
+private:
+
+	/// Container for obtaining paired reads from PatternSources
+	PairedPatternSource& patsrc_;
+};
+
+/**
+ * Abstract parent factory for PatternSourcePerThreads.
+ */
+class WrappedPatternSourcePerThreadFactory : public PatternSourcePerThreadFactory {
+public:
+	// Keeps a reference to the shared PairedPatternSource; every
+	// created wrapper dispenses reads from it.
+	WrappedPatternSourcePerThreadFactory(PairedPatternSource& patsrc) :
+		patsrc_(patsrc) { }
+
+	/**
+	 * Create a new heap-allocated WrappedPatternSourcePerThreads.
+	 */
+	virtual PatternSourcePerThread* create() const {
+		return new WrappedPatternSourcePerThread(patsrc_);
+	}
+
+	/**
+	 * Create a new heap-allocated vector of heap-allocated
+	 * WrappedPatternSourcePerThreads.
+	 */
+	virtual EList<PatternSourcePerThread*>* create(uint32_t n) const {
+		EList<PatternSourcePerThread*>* v = new EList<PatternSourcePerThread*>;
+		for(size_t i = 0; i < n; i++) {
+			v->push_back(new WrappedPatternSourcePerThread(patsrc_));
+			assert(v->back() != NULL);
+		}
+		return v;
+	}
+
+private:
+	/// Container for obtaining paired reads from PatternSources
+	PairedPatternSource& patsrc_;
+};
+
+/// Skip to the end of the current string of newline chars and return
+/// the first character after the newline chars, or -1 for EOF
+static inline int getOverNewline(FileBuf& in) {
+	// Consume characters until something that is not whitespace comes
+	// back from the buffer, then return that character (-1 on EOF;
+	// isspace(-1) is 0, so EOF terminates the scan too).
+	int c = in.get();
+	while(isspace(c)) {
+		c = in.get();
+	}
+	return c;
+}
+
+/// Skip to the end of the current string of newline chars such that
+/// the next call to get() returns the first character after the
+/// whitespace
+static inline int peekOverNewline(FileBuf& in) {
+	// Consume any run of CR/LF characters, leaving the stream positioned
+	// so the next get() returns the first non-newline character.  The
+	// returned character is peeked, not consumed.
+	for(;;) {
+		const int c = in.peek();
+		if(c == '\r' || c == '\n') {
+			in.get(); // swallow the newline and keep scanning
+		} else {
+			return c;
+		}
+	}
+}
+
+/// Skip to the end of the current line; return the first character
+/// of the next line or -1 for EOF
+static inline int getToEndOfLine(FileBuf& in) {
+	// Consume the remainder of the current line, then the trailing run
+	// of CR/LF characters, and return the first character of the next
+	// line; -1 if EOF is hit at any point.
+	int c = in.get();
+	while(c >= 0 && c != '\n' && c != '\r') {
+		c = in.get();
+	}
+	if(c < 0) return -1;
+	// c is a newline character here; swallow the whole CR/LF run
+	do {
+		c = in.get();
+		if(c < 0) return -1;
+	} while(c == '\n' || c == '\r');
+	// c now holds the first character of the next line
+	return c;
+}
+
+/// Skip to the end of the current line such that the next call to
+/// get() returns the first character on the next line
+static inline int peekToEndOfLine(FileBuf& in) {
+	// Consume the remainder of the current line plus the trailing run
+	// of CR/LF characters, so that the next get() returns the first
+	// character of the next line.  The return value is that character,
+	// obtained via peek() (so it is not consumed), or -1 on EOF.
+	while(true) {
+		int c = in.get();
+		if(c < 0) return c; // EOF before the line ended
+		if(c == '\n' || c == '\r') {
+			// Swallow the whole newline run without consuming the first
+			// character of the next line.
+			c = in.peek();
+			while(c == '\n' || c == '\r') {
+				in.get(); // consume the peeked '\r' or '\n'
+				c = in.peek();
+			}
+			// Fix: the original had `if(c < 0) return c;` after the
+			// get() above, but c had just been peeked as '\n'/'\r', so
+			// that EOF check could never fire (dead code).  EOF is
+			// handled correctly here: peek() returning -1 exits the
+			// inner loop and -1 is returned below.
+			return c;
+		}
+	}
+}
+
+// Reporters for malformed quality strings (declarations only; defined
+// elsewhere).  Each takes the name of the offending read for the
+// diagnostic message.
+extern void wrongQualityFormat(const BTString& read_name);
+extern void tooFewQualities(const BTString& read_name);
+extern void tooManyQualities(const BTString& read_name);
+
+/**
+ * Encapsulates a source of patterns which is an in-memory vector.
+ */
+class VectorPatternSource : public PatternSource {
+
+public:
+
+	// Build the in-memory read set from the strings in `v` (e.g. reads
+	// supplied directly on the command line).
+	VectorPatternSource(
+		const EList<string>& v,
+		const PatternParams& p);
+	
+	virtual ~VectorPatternSource() { }
+	
+	virtual bool nextReadImpl(
+		Read& r,
+		TReadId& rdid,
+		TReadId& endid,
+		bool& success,
+		bool& done);
+	
+	/**
+	 * This is unused, but implementation is given for completeness.
+	 */
+	virtual bool nextReadPairImpl(
+		Read& ra,
+		Read& rb,
+		TReadId& rdid,
+		TReadId& endid,
+		bool& success,
+		bool& done,
+		bool& paired);
+	
+	virtual void reset() {
+		PatternSource::reset();
+		cur_ = skip_;
+		paired_ = false;
+	}
+	
+private:
+
+	size_t cur_;            // index of next read to dispense
+	uint32_t skip_;         // number of leading reads to skip
+	bool paired_;
+	EList<BTDnaString> v_;  // forward sequences
+	EList<BTString> quals_; // forward qualities
+	EList<BTString> names_; // names
+	EList<int> trimmed3_;   // per-read 3'-end trim amounts (was mislabeled "names")
+	EList<int> trimmed5_;   // per-read 5'-end trim amounts (was mislabeled "names")
+};
+
+/**
+ * Abstract parent for pattern sources that read from a list of files
+ * through a shared FileBuf, advancing to the next file in the list
+ * when the current one is exhausted.
+ */
+class BufferedFilePatternSource : public PatternSource {
+public:
+	// Opens the first usable file in `infiles` immediately; at least
+	// one filename must be supplied.
+	BufferedFilePatternSource(
+		const EList<string>& infiles,
+		const PatternParams& p) :
+		PatternSource(p),
+		infiles_(infiles),
+		filecur_(0),
+		fb_(),
+		skip_(p.skip),
+		first_(true)
+	{
+		assert_gt(infiles.size(), 0);
+		errs_.resize(infiles_.size());
+		errs_.fill(0, infiles_.size(), false);
+		assert(!fb_.isOpen());
+		open(); // open first file in the list
+		filecur_++;
+	}
+
+	virtual ~BufferedFilePatternSource() {
+		if(fb_.isOpen()) fb_.close();
+	}
+
+	/**
+	 * Fill Read with the sequence, quality and name for the next
+	 * read in the list of read files.  This function gets called by
+	 * all the search threads, so we must handle synchronization.
+	 */
+	virtual bool nextReadImpl(
+		Read& r,
+		TReadId& rdid,
+		TReadId& endid,
+		bool& success,
+		bool& done)
+	{
+		// We'll be manipulating our file handle/filecur_ state
+		lock();
+		while(true) {
+			do { read(r, rdid, endid, success, done); }
+			while(!success && !done);
+			if(!success && filecur_ < infiles_.size()) {
+				// Current file exhausted; move on to the next one
+				assert(done);
+				open();
+				resetForNextFile(); // reset state to handle a fresh file
+				filecur_++;
+				continue;
+			}
+			break;
+		}
+		assert(r.repOk());
+		// Leaving critical region
+		unlock();
+		return success;
+	}
+	
+	/**
+	 * Fill a pair of Reads with the next paired-end read.  Same
+	 * locking and file-advancing behavior as nextReadImpl().
+	 */
+	virtual bool nextReadPairImpl(
+		Read& ra,
+		Read& rb,
+		TReadId& rdid,
+		TReadId& endid,
+		bool& success,
+		bool& done,
+		bool& paired)
+	{
+		// We'll be manipulating our file handle/filecur_ state
+		lock();
+		while(true) {
+			do { readPair(ra, rb, rdid, endid, success, done, paired); }
+			while(!success && !done);
+			if(!success && filecur_ < infiles_.size()) {
+				// Current file exhausted; move on to the next one
+				assert(done);
+				open();
+				resetForNextFile(); // reset state to handle a fresh file
+				filecur_++;
+				continue;
+			}
+			break;
+		}
+		assert(ra.repOk());
+		assert(rb.repOk());
+		// Leaving critical region
+		unlock();
+		return success;
+	}
+	
+	/**
+	 * Reset state so that we start reading again from the
+	 * beginning of the first file.  Should only be called by the
+	 * master thread.
+	 */
+	virtual void reset() {
+		PatternSource::reset();
+		// Fix: this statement previously ended with a comma (comma
+		// operator) instead of a semicolon; the behavior happened to be
+		// identical, but it was a typo that obscured intent.
+		filecur_ = 0;
+		open();
+		filecur_++;
+	}
+
+protected:
+
+	/// Read another pattern from the input file; this is overridden
+	/// to deal with specific file formats
+	virtual bool read(
+		Read& r,
+		TReadId& rdid,
+		TReadId& endid,
+		bool& success,
+		bool& done) = 0;
+	
+	/// Read another pattern pair from the input file; this is
+	/// overridden to deal with specific file formats
+	virtual bool readPair(
+		Read& ra,
+		Read& rb,
+		TReadId& rdid,
+		TReadId& endid,
+		bool& success,
+		bool& done,
+		bool& paired) = 0;
+	
+	/// Reset state to handle a fresh file
+	virtual void resetForNextFile() { }
+	
+	/**
+	 * Open the next openable file in infiles_, starting at filecur_.
+	 * "-" selects standard input.  Unopenable files produce a one-time
+	 * warning and are skipped; if no file can be opened the program
+	 * exits with an error.
+	 */
+	void open() {
+		if(fb_.isOpen()) fb_.close();
+		while(filecur_ < infiles_.size()) {
+			// Open read
+			FILE *in;
+			if(infiles_[filecur_] == "-") {
+				in = stdin; // "-" means read from standard input
+			} else if((in = fopen(infiles_[filecur_].c_str(), "rb")) == NULL) {
+				if(!errs_[filecur_]) {
+					cerr << "Warning: Could not open read file \"" << infiles_[filecur_].c_str() << "\" for reading; skipping..." << endl;
+					errs_[filecur_] = true;
+				}
+				filecur_++;
+				continue;
+			}
+			fb_.newFile(in);
+			return;
+		}
+		cerr << "Error: No input read files were valid" << endl;
+		exit(1);
+		return;
+	}
+	
+	EList<string> infiles_;  // filenames for read files
+	EList<bool> errs_;       // whether we've already printed an error for each file
+	size_t filecur_;         // index into infiles_ of next file to read
+	FileBuf fb_;             // read file currently being read from
+	TReadId skip_;           // number of reads to skip
+	bool first_;             // set true at construction; not read in this class
+};
+
+/**
+ * Parse a single quality string from fb and store qualities in r.
+ * Assume the next character obtained via fb.get() is the first
+ * character of the quality string.  When returning, the next
+ * character returned by fb.peek() or fb.get() should be the first
+ * character of the following line.
+ */
+int parseQuals(
+	Read& r,        // read receiving the parsed qualities
+	FileBuf& fb,    // input buffer; fb.get() yields quality characters
+	int firstc,     // first character of the quality string (already read)
+	int readLen,    // length of the read's sequence
+	int trim3,      // presumably bases trimmed from the 3' end — confirm in pat.cpp
+	int trim5,      // presumably bases trimmed from the 5' end — confirm in pat.cpp
+	bool intQuals,  // true -> qualities are space-separated integers
+	bool phred64,   // true -> qualities are on the phred64 scale
+	bool solexa64); // true -> qualities are on the solexa64 scale
+
+/**
+ * Synchronized concrete pattern source for a list of FASTA or CSFASTA
+ * (if color = true) files.
+ */
+class FastaPatternSource : public BufferedFilePatternSource {
+public:
+	FastaPatternSource(const EList<string>& infiles,
+	                   const PatternParams& p) :
+		BufferedFilePatternSource(infiles, p),
+		first_(true), solexa64_(p.solexa64), phred64_(p.phred64), intQuals_(p.intQuals)
+	{ }
+	virtual void reset() {
+		first_ = true;
+		BufferedFilePatternSource::reset();
+	}
+protected:
+	/**
+	 * Scan to the next FASTA record (starting with >) and return the first
+	 * character of the record (which will always be >).
+	 * Returns -1 if EOF is reached before a '>' is found.
+	 */
+	static int skipToNextFastaRecord(FileBuf& in) {
+		int c;
+		while((c = in.get()) != '>') {
+			if(in.eof()) return -1;
+		}
+		return c;
+	}
+
+	/// Called when we have to bail without having parsed a read.
+	void bail(Read& r) {
+		r.reset();
+		fb_.resetLastN();
+	}
+
+	/// Read another pattern from a FASTA input file
+	virtual bool read(
+		Read& r,
+		TReadId& rdid,
+		TReadId& endid,
+		bool& success,
+		bool& done);
+	
+	/// Read another pair of patterns from a FASTA input file.
+	/// Unsupported for this format: paired FASTA input comes from
+	/// parallel files, so reaching this aborts via `throw 1`.
+	virtual bool readPair(
+		Read& ra,
+		Read& rb,
+		TReadId& rdid,
+		TReadId& endid,
+		bool& success,
+		bool& done,
+		bool& paired)
+	{
+		// (For now, we shouldn't ever be here)
+		cerr << "In FastaPatternSource.readPair()" << endl;
+		throw 1;
+		return false;
+	}
+	
+	virtual void resetForNextFile() {
+		first_ = true;
+	}
+	
+private:
+	bool first_;  // true until the first record of the current file is read
+    
+public:
+	bool solexa64_;  // qualities on solexa64 scale
+	bool phred64_;   // qualities on phred64 scale
+	bool intQuals_;  // qualities are space-separated integers
+};
+
+
+/**
+ * Tokenize a line of space-separated integer quality values.
+ */
+static inline bool tokenizeQualLine(
+	FileBuf& filebuf,
+	char *buf,
+	size_t buflen,
+	EList<string>& toks)
+{
+	// Pull one line into buf; a zero-length read means there was
+	// nothing left to tokenize.
+	const size_t nread = filebuf.gets(buf, buflen);
+	if(nread == 0) {
+		return false;
+	}
+	// gets() is expected to have stripped the trailing newline
+	assert(strrchr(buf, '\n') == NULL);
+	// Split the space-separated quality values into toks
+	tokenize(string(buf), " ", toks);
+	return true;
+}
+
+/**
+ * Synchronized concrete pattern source for a list of files with tab-
+ * delimited name, seq, qual fields (or, for paired-end reads,
+ * basename, seq1, qual1, seq2, qual2).
+ */
+class TabbedPatternSource : public BufferedFilePatternSource {
+
+public:
+
+	// `secondName` controls whether a second name field is expected
+	// (see the parsing routines in the .cpp file).
+	TabbedPatternSource(
+		const EList<string>& infiles,
+		const PatternParams& p,
+		bool  secondName) :
+		BufferedFilePatternSource(infiles, p),
+		solQuals_(p.solexa64),
+		phred64Quals_(p.phred64),
+		intQuals_(p.intQuals),
+		secondName_(secondName) { }
+
+protected:
+
+	/// Read another pattern from a FASTA input file
+	virtual bool read(
+		Read& r,
+		TReadId& rdid,
+		TReadId& endid,
+		bool& success,
+		bool& done);
+
+	/// Read another pair of patterns from a FASTA input file
+	virtual bool readPair(
+		Read& ra,
+		Read& rb,
+		TReadId& rdid,
+		TReadId& endid,
+		bool& success,
+		bool& done,
+		bool& paired);
+	
+private:
+
+	/**
+	 * Parse a name from fb_ and store in r.  Assume that the next
+	 * character obtained via fb_.get() is the first character of
+	 * the sequence and the string stops at the next char upto (could
+	 * be tab, newline, etc.).
+	 */
+	int parseName(Read& r, Read* r2, char upto = '\t');
+
+	/**
+	 * Parse a single sequence from fb_ and store in r.  Assume
+	 * that the next character obtained via fb_.get() is the first
+	 * character of the sequence and the sequence stops at the next
+	 * char upto (could be tab, newline, etc.).
+	 */
+	int parseSeq(Read& r, int& charsRead, int& trim5, char upto = '\t');
+
+	/**
+	 * Parse a single quality string from fb_ and store in r.
+	 * Assume that the next character obtained via fb_.get() is
+	 * the first character of the quality string and the string stops
+	 * at the next char upto (could be tab, newline, etc.).
+	 */
+	int parseQuals(Read& r, int charsRead, int dstLen, int trim5,
+	               char& c2, char upto = '\t', char upto2 = -1);
+
+	bool solQuals_;           // qualities on solexa64 scale
+	bool phred64Quals_;       // qualities on phred64 scale
+	bool intQuals_;           // qualities are space-separated integers
+	EList<string> qualToks_;  // scratch space for tokenized qualities
+	bool secondName_;         // expect a second name field
+};
+
+/**
+ * Synchronized concrete pattern source for Illumina Qseq files.  In
+ * Qseq files, each read appears on a separate line and the tab-
+ * delimited fields are:
+ *
+ * 1. Machine name
+ * 2. Run number
+ * 3. Lane number
+ * 4. Tile number
+ * 5. X coordinate of spot
+ * 6. Y coordinate of spot
+ * 7. Index: "Index sequence or 0. For no indexing, or for a file that
+ *    has not been demultiplexed yet, this field should have a value of
+ *    0."
+ * 8. Read number: 1 for unpaired, 1 or 2 for paired
+ * 9. Sequence
+ * 10. Quality
+ * 11. Filter: 1 = passed, 0 = didn't
+ */
+class QseqPatternSource : public BufferedFilePatternSource {
+
+public:
+
+	QseqPatternSource(
+		const EList<string>& infiles,
+	    const PatternParams& p) :
+		BufferedFilePatternSource(infiles, p),
+		solQuals_(p.solexa64),
+		phred64Quals_(p.phred64),
+		intQuals_(p.intQuals) { }
+
+protected:
+
+// Abandon the current record: skip past the newline run, clear the
+// partially-filled read `r`, and report failure-with-done to the
+// caller.  Used by the parsing routines (expects `r`, `success` and
+// `done` in scope at the expansion site).
+#define BAIL_UNPAIRED() { \
+	peekOverNewline(fb_); \
+	r.reset(); \
+	success = false; \
+	done = true; \
+	return success; \
+}
+
+	/**
+	 * Parse a name from fb_ and store in r.  Assume that the next
+	 * character obtained via fb_.get() is the first character of
+	 * the sequence and the string stops at the next char upto (could
+	 * be tab, newline, etc.).
+	 */
+	int parseName(
+		Read& r,      // buffer for mate 1
+		Read* r2,     // buffer for mate 2 (NULL if mate2 is read separately)
+		bool append,     // true -> append characters, false -> skip them
+		bool clearFirst, // clear the name buffer first
+		bool warnEmpty,  // emit a warning if nothing was added to the name
+		bool useDefault, // if nothing is read, put readCnt_ as a default value
+		int upto);       // stop parsing when we first reach character 'upto'
+
+	/**
+	 * Parse a single sequence from fb_ and store in r.  Assume
+	 * that the next character obtained via fb_.get() is the first
+	 * character of the sequence and the sequence stops at the next
+	 * char upto (could be tab, newline, etc.).
+	 */
+	int parseSeq(
+		Read& r,      // buffer for read
+		int& charsRead,
+		int& trim5,
+		char upto);
+
+	/**
+	 * Parse a single quality string from fb_ and store in r.
+	 * Assume that the next character obtained via fb_.get() is
+	 * the first character of the quality string and the string stops
+	 * at the next char upto (could be tab, newline, etc.).
+	 */
+	int parseQuals(
+		Read& r,      // buffer for read
+		int charsRead,
+		int dstLen,
+		int trim5,
+		char& c2,
+		char upto,
+		char upto2);
+
+	/**
+	 * Read another pattern from a Qseq input file.
+	 */
+	virtual bool read(
+		Read& r,
+		TReadId& rdid,
+		TReadId& endid,
+		bool& success,
+		bool& done);
+
+	/**
+	 * Read a pair of patterns from 1 Qseq file.  Note: this is never used.
+	 * Reaching it aborts via `throw 1`.
+	 */
+	virtual bool readPair(
+		Read& ra,
+		Read& rb,
+		TReadId& rdid,
+		TReadId& endid,
+		bool& success,
+		bool& done,
+		bool& paired)
+	{
+		// (For now, we shouldn't ever be here)
+		cerr << "In QseqPatternSource.readPair()" << endl;
+		throw 1;
+		return false;
+	}
+
+	bool solQuals_;           // qualities on solexa64 scale
+	bool phred64Quals_;       // qualities on phred64 scale
+	bool intQuals_;           // qualities are space-separated integers
+	EList<string> qualToks_;  // scratch space for tokenized qualities
+};
+
+/**
+ * Synchronized concrete pattern source for a list of FASTA files where
+ * reads need to be extracted from long continuous sequences.  Reads of
+ * length p.sampleLen are sampled every p.sampleFreq positions along
+ * each FASTA sequence, using a circular buffer of recent DNA chars.
+ */
+class FastaContinuousPatternSource : public BufferedFilePatternSource {
+public:
+	FastaContinuousPatternSource(const EList<string>& infiles, const PatternParams& p) :
+		BufferedFilePatternSource(infiles, p),
+		length_(p.sampleLen), freq_(p.sampleFreq),
+		eat_(length_-1), beginning_(true),
+		bufCur_(0), subReadCnt_(0llu)
+	{
+		resetForNextFile();
+	}
+
+	virtual void reset() {
+		BufferedFilePatternSource::reset();
+		resetForNextFile();
+	}
+
+protected:
+
+	/// Read another pattern from a FASTA input file
+	virtual bool read(
+		Read& r,
+		TReadId& rdid,
+		TReadId& endid,
+		bool& success,
+		bool& done)
+	{
+		success = true;
+		done = false;
+		r.reset();
+		while(true) {
+			r.color = gColor;
+			int c = fb_.get();
+			if(c < 0) { success = false; done = true; return success; }
+			if(c == '>') {
+				// Start of a new FASTA record; parse the name
+				resetForNextFile();
+				c = fb_.peek();
+				bool sawSpace = false;
+				// Collect name chars up to the first whitespace.  Guard
+				// against EOF (c < 0) so that a file truncated in the
+				// middle of a name line can't spin here forever.
+				while(c >= 0 && c != '\n' && c != '\r') {
+					if(!sawSpace) {
+						sawSpace = isspace(c);
+					}
+					if(!sawSpace) {
+						nameBuf_.append(c);
+					}
+					fb_.get();
+					c = fb_.peek();
+				}
+				// Skip end-of-line chars (exits at EOF; -1 matches neither)
+				while(c == '\n' || c == '\r') {
+					fb_.get();
+					c = fb_.peek();
+				}
+				nameBuf_.append('_');
+			} else {
+				int cat = asc2dnacat[c];
+				if(cat >= 2) c = 'N';
+				if(cat == 0) {
+					// Encountered non-DNA, non-IUPAC char; skip it
+					continue;
+				} else {
+					// DNA char; push into the 1024-byte circular buffer
+					buf_[bufCur_++] = c;
+					if(bufCur_ == 1024) bufCur_ = 0;
+					if(eat_ > 0) {
+						eat_--;
+						// Try to keep readCnt_ aligned with the offset
+						// into the reference; that lets us see where
+						// the sampling gaps are by looking at the read
+						// name
+						if(!beginning_) readCnt_++;
+						continue;
+					}
+					// Copy the trailing length_ chars out of the
+					// circular buffer into the read
+					for(size_t i = 0; i < length_; i++) {
+						if(length_ - i <= bufCur_) {
+							c = buf_[bufCur_ - (length_ - i)];
+						} else {
+							// Rotate
+							c = buf_[bufCur_ - (length_ - i) + 1024];
+						}
+						r.patFw.append(asc2dna[c]);
+						r.qual.append('I');
+					}
+					// Name = FASTA record name + per-record ordinal
+					r.name = nameBuf_;
+					char cbuf[20];
+					itoa10<TReadId>(readCnt_ - subReadCnt_, cbuf);
+					r.name.append(cbuf);
+					eat_ = freq_-1;
+					readCnt_++;
+					beginning_ = false;
+					rdid = endid = readCnt_-1;
+					break;
+				}
+			}
+		}
+		return true;
+	}
+	
+	/// Shouldn't ever be here; it's not sensible to obtain read pairs
+	/// from a continuous input.
+	virtual bool readPair(
+		Read& ra,
+		Read& rb,
+		TReadId& rdid,
+		TReadId& endid,
+		bool& success,
+		bool& done,
+		bool& paired)
+	{
+		cerr << "In FastaContinuousPatternSource.readPair()" << endl;
+		throw 1;
+		return false;
+	}
+
+	/**
+	 * Reset state to be ready for the next file.
+	 */
+	virtual void resetForNextFile() {
+		eat_ = length_-1;
+		beginning_ = true;
+		bufCur_ = 0;
+		nameBuf_.clear();
+		subReadCnt_ = readCnt_;
+	}
+
+private:
+	size_t length_;     /// length of reads to generate
+	size_t freq_;       /// frequency to sample reads
+	size_t eat_;        /// number of characters we need to skip before
+	                    /// we have flushed all of the ambiguous or
+	                    /// non-existent characters out of our read
+	                    /// window
+	bool beginning_;    /// skipping over the first read length?
+	char buf_[1024];    /// read buffer
+	BTString nameBuf_;  /// read buffer for name of fasta record being
+	                    /// split into mers
+	size_t bufCur_;     /// buffer cursor; points to where we should
+	                    /// insert the next character
+	uint64_t subReadCnt_;/// number to subtract from readCnt_ to get
+	                    /// the pat id to output (so it resets to 0 for
+	                    /// each new sequence)
+};
+
+/**
+ * Read a FASTQ-format file.
+ * See: http://maq.sourceforge.net/fastq.shtml
+ *
+ * Quality-scale flags (Solexa/Phred+64/integer) are taken from the
+ * PatternParams at construction.  The read()/readPair() overrides
+ * declared here are implemented elsewhere.
+ */
+class FastqPatternSource : public BufferedFilePatternSource {
+
+public:
+
+	FastqPatternSource(const EList<string>& infiles, const PatternParams& p) :
+		BufferedFilePatternSource(infiles, p),
+		first_(true),
+		solQuals_(p.solexa64),
+		phred64Quals_(p.phred64),
+		intQuals_(p.intQuals),
+		fuzzy_(p.fuzzy)
+	{ }
+	
+	virtual void reset() {
+		first_ = true;
+		fb_.resetLastN();
+		BufferedFilePatternSource::reset();
+	}
+	
+protected:
+
+	/**
+	 * Scan to the next FASTQ record (starting with @) and return the first
+	 * character of the record (which will always be @).  Since the quality
+	 * line may start with @, we keep scanning until we've seen a line
+	 * beginning with @ where the line two lines back began with +.
+	 *
+	 * Returns '@' on success, '>' if the input looks like FASTA, and -1
+	 * on EOF or if no record start is found within 20 lines.
+	 */
+	static int skipToNextFastqRecord(FileBuf& in, bool sawPlus) {
+		int line = 0;          // # of newlines consumed so far
+		int plusLine = -1;     // line on which we last saw a leading '+'
+		int c = in.get();
+		int firstc = c;        // very first char, kept for the FASTA check
+		while(true) {
+			if(line > 20) {
+				// If we couldn't find our desired '@' in the first 20
+				// lines, it's time to give up
+				if(firstc == '>') {
+					// That firstc is '>' may be a hint that this is
+					// actually a FASTA file, so return it intact
+					return '>';
+				}
+				// Return an error
+				return -1;
+			}
+			if(c == -1) return -1;
+			if(c == '\n') {
+				c = in.get();
+				// '@' two lines after a '+' line is a record start
+				if(c == '@' && sawPlus && plusLine == (line-2)) {
+					return '@';
+				}
+				else if(c == '+') {
+					// Saw a '+' at the beginning of a line; remember where
+					// we saw it
+					sawPlus = true;
+					plusLine = line;
+				}
+				else if(c == -1) {
+					return -1;
+				}
+				line++;
+			}
+			c = in.get();
+		}
+	}
+
+	/// Read another pattern from a FASTQ input file
+	virtual bool read(
+		Read& r,
+		TReadId& rdid,
+		TReadId& endid,
+		bool& success,
+		bool& done);
+	
+	/// Read another read pair from a FASTQ input file
+	virtual bool readPair(
+		Read& ra,
+		Read& rb,
+		TReadId& rdid,
+		TReadId& endid,
+		bool& success,
+		bool& done,
+		bool& paired)
+	{
+		// (For now, we shouldn't ever be here)
+		cerr << "In FastqPatternSource.readPair()" << endl;
+		throw 1;
+		return false;
+	}
+	
+	virtual void resetForNextFile() {
+		first_ = true;
+	}
+	
+private:
+
+	/**
+	 * Do things we need to do if we have to bail in the middle of a
+	 * read, usually because we reached the end of the input without
+	 * finishing.
+	 */
+	void bail(Read& r) {
+		r.patFw.clear();
+		fb_.resetLastN();
+	}
+
+	bool first_;             // still on the first record of current file?
+	bool solQuals_;          // from p.solexa64 (Solexa quality scale)
+	bool phred64Quals_;      // from p.phred64 (Phred+64 quality offset)
+	bool intQuals_;          // from p.intQuals (integer-valued qualities)
+	bool fuzzy_;             // from p.fuzzy -- TODO confirm variant semantics
+	EList<string> qualToks_; // scratch tokens, presumably for int quals
+};
+
+/**
+ * Read a Raw-format file (one sequence per line).  No quality strings
+ * allowed.  All qualities are assumed to be 'I' (40 on the Phred-33
+ * scale).
+ */
+class RawPatternSource : public BufferedFilePatternSource {
+
+public:
+
+	RawPatternSource(const EList<string>& infiles, const PatternParams& p) :
+		BufferedFilePatternSource(infiles, p), first_(true) { }
+
+	virtual void reset() {
+		first_ = true;
+		BufferedFilePatternSource::reset();
+	}
+
+protected:
+
+	/// Read another pattern from a Raw input file
+	virtual bool read(
+		Read& r,
+		TReadId& rdid,
+		TReadId& endid,
+		bool& success,
+		bool& done)
+	{
+		int c;
+		success = true;
+		done = false;
+		r.reset();
+		// Skip newlines to the first character of the next sequence
+		c = getOverNewline(this->fb_);
+		if(c < 0) {
+			bail(r); success = false; done = true; return success;
+		}
+		assert(!isspace(c));
+		r.color = gColor;
+		int mytrim5 = gTrim5;
+		if(first_) {
+			// Check that the first character is sane for a raw file
+			int cc = c;
+			if(gColor) {
+				// Map colorspace digits (and '.') to nucleotides for
+				// the sanity check below
+				if(cc >= '0' && cc <= '4') cc = "ACGTN"[(int)cc - '0'];
+				if(cc == '.') cc = 'N';
+			}
+			if(asc2dnacat[cc] == 0) {
+				// Not a DNA char; emit an error with a format hint
+				cerr << "Error: reads file does not look like a Raw file" << endl;
+				if(c == '>') {
+					cerr << "Reads file looks like a FASTA file; please use -f" << endl;
+				}
+				if(c == '@') {
+					cerr << "Reads file looks like a FASTQ file; please use -q" << endl;
+				}
+				throw 1;
+			}
+			first_ = false;
+		}
+		if(gColor) {
+			// This may be a primer character.  If so, keep it in the
+			// 'primer' field of the read buf and parse the rest of the
+			// read without it.
+			c = toupper(c);
+			if(asc2dnacat[c] > 0) {
+				// First char is a DNA char
+				int c2 = toupper(fb_.peek());
+				// Second char is a color char
+				if(asc2colcat[c2] > 0) {
+					r.primer = c;
+					r.trimc = c2;
+					mytrim5 += 2; // trim primer and first color
+				}
+			}
+			if(c < 0) {
+				bail(r); success = false; done = true; return success;
+			}
+		}
+		// _in now points just past the first character of a sequence
+		// line, and c holds the first character
+		int chs = 0;
+		while(!isspace(c) && c >= 0) {
+			if(gColor) {
+				if(c >= '0' && c <= '4') c = "ACGTN"[(int)c - '0'];
+				if(c == '.') c = 'N';
+			}
+			// 5' trimming
+			if(isalpha(c) && chs >= mytrim5) {
+				//size_t len = chs - mytrim5;
+				//if(len >= 1024) tooManyQualities(BTString("(no name)"));
+				r.patFw.append(asc2dna[c]);
+				r.qual.append('I'); // Raw reads all get maximum quality
+			}
+			chs++;
+			if(isspace(fb_.peek())) break;
+			c = fb_.get();
+		}
+		// 3' trimming
+		r.patFw.trimEnd(gTrim3);
+		r.qual.trimEnd(gTrim3);
+		c = peekToEndOfLine(fb_);
+		r.trimmed3 = gTrim3;
+		r.trimmed5 = mytrim5;
+		// Stash the raw input text for this read, then reset the buffer
+		r.readOrigBuf.install(fb_.lastN(), fb_.lastNLen());
+		fb_.resetLastN();
+
+		// Set up name: Raw records carry no name, so use the 0-based
+		// ordinal of the read
+		char cbuf[20];
+		itoa10<TReadId>(readCnt_, cbuf);
+		r.name.install(cbuf);
+		readCnt_++;
+
+		rdid = endid = readCnt_-1;
+		return success;
+	}
+	
+	/// Shouldn't ever be here; Raw input carries no pairing information
+	virtual bool readPair(
+		Read& ra,
+		Read& rb,
+		TReadId& rdid,
+		TReadId& endid,
+		bool& success,
+		bool& done,
+		bool& paired)
+	{
+		// (For now, we shouldn't ever be here)
+		cerr << "In RawPatternSource.readPair()" << endl;
+		throw 1;
+		return false;
+	}
+	
+	virtual void resetForNextFile() {
+		first_ = true;
+	}
+	
+private:
+
+	/**
+	 * Do things we need to do if we have to bail in the middle of a
+	 * read, usually because we reached the end of the input without
+	 * finishing.
+	 */
+	void bail(Read& r) {
+		r.patFw.clear();
+		fb_.resetLastN();
+	}
+	
+	bool first_;  // still waiting for the first read of current file?
+};
+
+#ifdef USE_SRA
+
+namespace ngs {
+    class ReadCollection;
+    class ReadIterator;
+}
+
+namespace tthread {
+    class thread;
+};
+
+struct SRA_Data;
+
+/**
+ * Pattern source that draws reads from one or more SRA accessions via
+ * the NCBI NGS API (ngs::ReadCollection / ngs::ReadIterator).  open(),
+ * readPair() and the destructor are defined in the .cpp file; the
+ * sra_data_/io_thread_ members suggest reads are staged by a helper
+ * thread set up in open() -- confirm against those definitions.
+ */
+class SRAPatternSource : public PatternSource {
+public:
+    SRAPatternSource(
+                     const EList<string>& sra_accs,
+                     const PatternParams& p,
+                     const size_t nthreads = 1) :
+    PatternSource(p),
+    sra_accs_(sra_accs),
+    sra_acc_cur_(0),
+    skip_(p.skip),
+    first_(true),
+    nthreads_(nthreads),
+    sra_run_(NULL),
+    sra_it_(NULL),
+    sra_data_(NULL),
+    io_thread_(NULL)
+    {
+        assert_gt(sra_accs_.size(), 0);
+        errs_.resize(sra_accs_.size());
+        errs_.fill(0, sra_accs_.size(), false);
+        open(); // open first accession in the list
+        sra_acc_cur_++;
+    }
+    
+    virtual ~SRAPatternSource();
+    
+    /**
+     * Fill Read with the sequence, quality and name for the next
+     * read in the list of read files.  This function gets called by
+     * all the search threads, so we must handle synchronization.
+     */
+    virtual bool nextReadImpl(
+                              Read& r,
+                              TReadId& rdid,
+                              TReadId& endid,
+                              bool& success,
+                              bool& done)
+    {
+        // We'll be manipulating our file handle/filecur_ state
+        lock();
+        while(true) {
+            do { read(r, rdid, endid, success, done); }
+            while(!success && !done);
+            if(!success && sra_acc_cur_ < sra_accs_.size()) {
+                // Current accession exhausted; advance to the next one
+                assert(done);
+                open();
+                resetForNextFile(); // reset state to handle a fresh file
+                sra_acc_cur_++;
+                continue;
+            }
+            break;
+        }
+        assert(r.repOk());
+        // Leaving critical region
+        unlock();
+        return success;
+    }
+    
+    /**
+     * Paired-end counterpart of nextReadImpl(): fill ra/rb with the
+     * next pair, advancing to the next accession on exhaustion.
+     * Called by all search threads, so runs under the lock.
+     */
+    virtual bool nextReadPairImpl(
+                                  Read& ra,
+                                  Read& rb,
+                                  TReadId& rdid,
+                                  TReadId& endid,
+                                  bool& success,
+                                  bool& done,
+                                  bool& paired)
+    {
+        // We'll be manipulating our file handle/filecur_ state
+        lock();
+        while(true) {
+            do { readPair(ra, rb, rdid, endid, success, done, paired); }
+            while(!success && !done);
+            if(!success && sra_acc_cur_ < sra_accs_.size()) {
+                // Current accession exhausted; advance to the next one
+                assert(done);
+                open();
+                resetForNextFile(); // reset state to handle a fresh file
+                sra_acc_cur_++;
+                continue;
+            }
+            break;
+        }
+        assert(ra.repOk());
+        assert(rb.repOk());
+        // Leaving critical region
+        unlock();
+        return success;
+    }
+    
+    /**
+     * Reset state so that we start reading again from the beginning of
+     * the first accession.  Should only be called by the master thread.
+     */
+    virtual void reset() {
+        PatternSource::reset();
+        sra_acc_cur_ = 0; // was `= 0,` (comma operator); semicolon intended
+        open();
+        sra_acc_cur_++;
+    }
+    
+    /// Read another pattern from the input file; this is overridden
+    /// to deal with specific file formats.  NOTE(review): this stub
+    /// returns true without touching success/done; the readPair() path
+    /// (defined in the .cpp) appears to be the real entry point --
+    /// confirm against the .cpp before relying on single-end SRA input.
+    virtual bool read(
+                      Read& r,
+                      TReadId& rdid,
+                      TReadId& endid,
+                      bool& success,
+                      bool& done)
+    {
+        return true;
+    }
+    
+    /// Read another pattern pair from the input file; this is
+    /// overridden to deal with specific file formats
+    virtual bool readPair(
+                          Read& ra,
+                          Read& rb,
+                          TReadId& rdid,
+                          TReadId& endid,
+                          bool& success,
+                          bool& done,
+                          bool& paired);
+    
+protected:
+    
+    /// Reset state to handle a fresh file
+    virtual void resetForNextFile() { }
+    
+    void open();
+    
+    EList<string> sra_accs_; // SRA accessions to read from
+    EList<bool> errs_;       // whether we've already printed an error for each accession
+    size_t sra_acc_cur_;     // index into sra_accs_ of next accession to read
+    TReadId skip_;           // number of reads to skip
+    bool first_;             // true until the first read is consumed
+    
+    size_t nthreads_;        // number of search threads (default 1)
+    
+    ngs::ReadCollection* sra_run_; // open NGS run; owned, created in open()
+    ngs::ReadIterator* sra_it_;    // iterator over reads in sra_run_
+    
+    SRA_Data* sra_data_;           // shared state; see definition in .cpp
+    tthread::thread* io_thread_;   // helper thread; see open() in .cpp
+};
+
+#endif
+
+#endif /*PAT_H_*/
diff --git a/pe.cpp b/pe.cpp
new file mode 100644
index 0000000..8eea8c3
--- /dev/null
+++ b/pe.cpp
@@ -0,0 +1,940 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "assert_helpers.h"
+#include "pe.h"
+
+using namespace std;
+
+/**
+ * Return a PE_TYPE flag indicating, given a PE_POLICY and coordinates
+ * for a paired-end alignment, what type of alignment it is:
+ *
+ * 1. Straightforwardly concordant (PE_ALS_NORMAL)
+ * 2. Mates dovetail, i.e. one extends beyond the end of the other
+ *    (PE_ALS_DOVETAIL)
+ * 3. One mate contains the other but they don't dovetail
+ *    (PE_ALS_CONTAIN)
+ * 4. One mate overlaps the other but neither contains the other and
+ *    they don't dovetail (PE_ALS_OVERLAP)
+ * 5. Discordant (PE_ALS_DISCORD)
+ */
+int PairedEndPolicy::peClassifyPair(
+	int64_t  off1,   // offset of mate 1
+	size_t   len1,   // length of mate 1
+	bool     fw1,    // whether mate 1 aligned to Watson
+	int64_t  off2,   // offset of mate 2
+	size_t   len2,   // length of mate 2
+	bool     fw2)    // whether mate 2 aligned to Watson
+	const
+{
+	assert_gt(len1, 0);
+	assert_gt(len2, 0);
+	// Grow the fragment-length ceiling to fit the longer mate, if the
+	// policy allows expansion
+	size_t fragMax = maxfrag_;
+	if(expandToFit_ && len1 > fragMax) fragMax = len1;
+	if(expandToFit_ && len2 > fragMax) fragMax = len2;
+	// Fragment floor is at least 1
+	size_t fragMin = minfrag_;
+	if(fragMin < 1) {
+		fragMin = 1;
+	}
+	// Determine which mate is expected on the left, rejecting strand
+	// combinations the policy forbids
+	bool oneLeft = false;
+	switch(pol_) {
+		case PE_POLICY_FF:
+			if(fw1 != fw2) return PE_ALS_DISCORD; // strands must match
+			oneLeft = fw1;
+			break;
+		case PE_POLICY_RR:
+			if(fw1 != fw2) return PE_ALS_DISCORD; // strands must match
+			oneLeft = !fw1;
+			break;
+		case PE_POLICY_FR:
+			if(fw1 == fw2) return PE_ALS_DISCORD; // strands must differ
+			oneLeft = fw1;
+			break;
+		case PE_POLICY_RF:
+			if(fw1 == fw2) return PE_ALS_DISCORD; // strands must differ
+			oneLeft = !fw1;
+			break;
+	}
+	// Implied fragment extent and length
+	const int64_t fragLo = min<int64_t>(off1, off2);
+	const int64_t fragHi = max<int64_t>(off1+len1, off2+len2);
+	assert_gt(fragHi, fragLo);
+	const size_t frag = (size_t)(fragHi - fragLo);
+	if(frag > fragMax || frag < fragMin) {
+		// Pair is discordant by virtue of the extents
+		return PE_ALS_DISCORD;
+	}
+	const int64_t lo1 = off1, hi1 = off1 + len1 - 1;
+	const int64_t lo2 = off2, hi2 = off2 + len2 - 1;
+	// Does one mate's extent entirely contain the other's?
+	const bool contained =
+		(lo1 >= lo2 && hi1 <= hi2) ||
+		(lo2 >= lo1 && hi2 <= hi1);
+	int type = PE_ALS_NORMAL;
+	// Do the extents overlap at all?  (Containment implies overlap.)
+	const bool overlapping =
+		contained ||
+		(lo1 <= lo2 && hi1 >= lo2) ||
+		(lo1 <= hi2 && hi1 >= hi2);
+	if(overlapping) {
+		if(!olapOk_) return PE_ALS_DISCORD;
+		type = PE_ALS_OVERLAP;
+	} else if((oneLeft && lo2 < lo1) || (!oneLeft && lo1 < lo2)) {
+		// No overlap and the mates are in the wrong relative order
+		return PE_ALS_DISCORD;
+	}
+	// Containment takes precedence over plain overlap
+	if(contained) {
+		if(!containOk_) return PE_ALS_DISCORD;
+		type = PE_ALS_CONTAIN;
+	}
+	// Dovetail: the nominally-left mate extends past the right end of
+	// the right mate, or vice versa; takes precedence over the rest
+	const bool dovetailed = oneLeft ?
+		(hi1 > hi2 || lo2 < lo1) :
+		(hi2 > hi1 || lo1 < lo2);
+	if(dovetailed) {
+		if(!dovetailOk_) return PE_ALS_DISCORD;
+		type = PE_ALS_DOVETAIL;
+	}
+	return type;
+}
+
+/**
+ * Given details about how one mate aligns, and some details about the
+ * reference sequence it aligned to, calculate a window and orientation s.t.
+ * a paired-end alignment is concordant iff the opposite mate aligns in the
+ * calculated window with the calculated orientation.  The "window" is really a
+ * constraint on which positions the extreme end of the opposite mate can fall.
+ * This is a different type of constraint from the one placed on seed-extend
+ * dynamic programming problems.  That constraint requires that alignments at
+ * one point pass through one of a set of "core" columns.
+ *
+ * When the opposite mate is to the left, we're constraining where its
+ * left-hand extreme can fall, i.e., which cells in the top row of the matrix
+ * it can end in.  When the opposite mate is to the right, we're constraining
+ * where its right-hand extreme can fall, i.e., which cells in the bottom row
+ * of the matrix it can end in.  However, in practice we can only constrain
+ * where we start the backtrace, i.e. where the RHS of the alignment falls.
+ * See frameFindMateRect for details.
+ *
+ * This calculation does not consider gaps - the dynamic programming framer will
+ * take gaps into account.
+ *
+ * Returns false if no concordant alignments are possible, true otherwise.
+ */
+bool PairedEndPolicy::otherMate(
+	bool     is1,       // true -> mate 1 aligned and we're looking
+					    // for 2, false -> vice versa
+	bool     fw,        // orientation of aligned mate
+	int64_t  off,       // offset into the reference sequence
+	int64_t  maxalcols, // max # columns spanned by alignment
+	size_t   reflen,    // length of reference sequence aligned to
+	size_t   len1,      // length of mate 1
+	size_t   len2,      // length of mate 2
+	bool&    oleft,     // out: true iff opp mate must be to left of anchor
+	int64_t& oll,       // out: leftmost Watson off for LHS of opp alignment
+	int64_t& olr,       // out: rightmost Watson off for LHS of opp alignment
+	int64_t& orl,       // out: leftmost Watson off for RHS of opp alignment
+	int64_t& orr,       // out: rightmost Watson off for RHS of opp alignment
+	bool&    ofw)       // out: true iff opp mate must be on Watson strand
+	const
+{
+	assert_gt(len1, 0);
+	assert_gt(len2, 0);
+	assert_gt(maxfrag_, 0);
+	assert_geq(minfrag_, 0);
+	assert_geq(maxfrag_, minfrag_);
+	assert(maxalcols == -1 || maxalcols > 0);
+	
+	// Calculate whether opposite mate should align to left or to right
+	// of given mate, and what strand it should align to
+	pePolicyMateDir(pol_, is1, fw, oleft, ofw);
+	
+	size_t alen = is1 ? len1 : len2; // length of opposite mate
+	
+	// Expand the maximum fragment length if necessary to accommodate
+	// the longer mate
+	size_t maxfrag = maxfrag_;
+	size_t minfrag = minfrag_;
+	if(minfrag < 1) {
+		minfrag = 1;
+	}
+	if(len1 > maxfrag && expandToFit_) maxfrag = len1;
+	if(len2 > maxfrag && expandToFit_) maxfrag = len2;
+	if(!expandToFit_ && (len1 > maxfrag || len2 > maxfrag)) {
+		// Not possible to find a concordant alignment; one of the
+		// mates is too long
+		return false;
+	}
+	
+	// Now calculate bounds within which a dynamic programming
+	// algorithm should search for an alignment for the opposite mate
+	if(oleft) {
+		//    -----------FRAG MAX----------------
+		//                 -------FRAG MIN-------
+		//                               |-alen-|
+		//                             Anchor mate
+		//                               ^off
+		//                  |------|
+		//       Not concordant: LHS not outside min
+		//                 |------|
+		//                Concordant
+		//      |------|
+		//     Concordant
+		//  |------|
+		// Not concordant: LHS outside max
+		
+		//    -----------FRAG MAX----------------
+		//                 -------FRAG MIN-------
+		//                               |-alen-|
+		//                             Anchor mate
+		//                               ^off
+		//    |------------|
+		// LHS can't be outside this range
+		//                               -----------FRAG MAX----------------
+		//    |------------------------------------------------------------|
+		// LHS can't be outside this range, assuming no restrictions on
+		// flipping, dovetailing, containment, overlap, etc.
+		//                                      |-------|
+		//                                      maxalcols
+		//    |-----------------------------------------|
+		// LHS can't be outside this range, assuming no flipping
+		//    |---------------------------------|
+		// LHS can't be outside this range, assuming no dovetailing
+		//    |-------------------------|
+		// LHS can't be outside this range, assuming no overlap
+
+		oll = off + alen - maxfrag;
+		olr = off + alen - minfrag;
+		assert_geq(olr, oll);
+		
+		orl = oll;
+		orr = off + maxfrag - 1;
+		assert_geq(olr, oll);
+
+		// What if overlapping alignments are not allowed?
+		if(!olapOk_) {
+			// RHS can't be flush with or to the right of off
+			orr = min<int64_t>(orr, off-1);
+			if(orr < olr) olr = orr;
+			assert_leq(oll, olr);
+			assert_leq(orl, orr);
+			assert_geq(orr, olr);
+		}
+		// What if dovetail alignments are not allowed?
+		else if(!dovetailOk_) {
+			// RHS can't be past off+alen-1
+			orr = min<int64_t>(orr, off + alen - 1);
+			assert_leq(oll, olr);
+			assert_leq(orl, orr);
+		}
+		// What if flipped alignments are not allowed?
+		else if(!flippingOk_ && maxalcols != -1) {
+			// RHS can't be right of the anchor's RHS plus the widest
+			// possible opposite alignment
+			orr = min<int64_t>(orr, off + alen - 1 + (maxalcols-1));
+			assert_leq(oll, olr);
+			assert_leq(orl, orr);
+		}
+		assert_geq(olr, oll);
+		assert_geq(orr, orl);
+		assert_geq(orr, olr);
+		assert_geq(orl, oll);
+	} else {
+		//                             -----------FRAG MAX----------------
+		//                             -------FRAG MIN-------
+		//  -----------FRAG MAX----------------
+		//                             |-alen-|
+		//                           Anchor mate
+		//                             ^off
+		//                                          |------|
+		//                            Not concordant: RHS not outside min
+		//                                           |------|
+		//                                          Concordant
+		//                                                      |------|
+		//                                                     Concordant
+		//                                                          |------|
+		//                                      Not concordant: RHS outside max
+		//
+
+		//                             -----------FRAG MAX----------------
+		//                             -------FRAG MIN-------
+		//  -----------FRAG MAX----------------
+		//                             |-alen-|
+		//                           Anchor mate
+		//                             ^off
+		//                                                  |------------|
+		//                                      RHS can't be outside this range
+		//  |------------------------------------------------------------|
+		// LHS can't be outside this range, assuming no restrictions on
+		// dovetailing, containment, overlap, etc.
+		//                     |-------|
+		//                     maxalcols
+		//                     |-----------------------------------------|
+		//             LHS can't be outside this range, assuming no flipping
+		//                             |---------------------------------|
+		//          LHS can't be outside this range, assuming no dovetailing
+		//                                     |-------------------------|
+		//              LHS can't be outside this range, assuming no overlap
+		
+		orr = off + (maxfrag - 1);
+		orl  = off + (minfrag - 1);
+		assert_geq(orr, orl);
+		
+		oll = off + alen - maxfrag;
+		olr = orr;
+		assert_geq(olr, oll);
+		
+		// What if overlapping alignments are not allowed?
+		if(!olapOk_) {
+			// LHS can't be left of off+alen
+			oll = max<int64_t>(oll, off+alen);
+			if(oll > orl) orl = oll;
+			assert_leq(oll, olr);
+			assert_leq(orl, orr);
+			assert_geq(orl, oll);
+		}
+		// What if dovetail alignments are not allowed?
+		else if(!dovetailOk_) {
+			// LHS can't be left of off
+			oll = max<int64_t>(oll, off);
+			assert_leq(oll, olr);
+			assert_leq(orl, orr);
+		}
+		// What if flipped alignments are not allowed?
+		else if(!flippingOk_ && maxalcols != -1) {
+			// LHS can't be left of off - maxalcols + 1
+			oll = max<int64_t>(oll, off - maxalcols + 1);
+			assert_leq(oll, olr);
+			assert_leq(orl, orr);
+		}
+		assert_geq(olr, oll);
+		assert_geq(orr, orl);
+		assert_geq(orr, olr);
+		assert_geq(orl, oll);
+	}
+
+	// Boundaries and orientation determined
+	return true;
+}
+
+#ifdef MAIN_PE
+
+#include <string>
+#include <sstream>
+
+/**
+ * Build a PairedEndPolicy from the given knobs, classify the given pair
+ * of mate alignments, and check the result against the expected PE_ALS
+ * type.  Prints a PASSED line on success.
+ */
+void testCaseClassify(
+	const string& name,
+	int      pol,
+	size_t   maxfrag,
+	size_t   minfrag,
+	bool     local,
+	bool     flip,
+	bool     dove,
+	bool     cont,
+	bool     olap,
+	bool     expand,
+	int64_t  off1,
+	size_t   len1,
+	bool     fw1,
+	int64_t  off2,
+	size_t   len2,
+	bool     fw2,
+	int      expect_class)
+{
+	PairedEndPolicy pepol(
+		pol, maxfrag, minfrag, local, flip,
+		dove, cont, olap, expand);
+	const int got = pepol.peClassifyPair(off1, len1, fw1, off2, len2, fw2);
+	assert_eq(expect_class, got);
+	cout << "peClassifyPair: " << name << "...PASSED" << endl;
+}
+
+/**
+ * Build a PairedEndPolicy from the given knobs, run otherMate() with the
+ * given anchor-mate details, and check every output (window bounds, side
+ * and strand) against the expected values.  Prints a PASSED line on
+ * success.
+ */
+void testCaseOtherMate(
+	const string& name,
+	int      pol,
+	size_t   maxfrag,
+	size_t   minfrag,
+	bool     local,
+	bool     flip,
+	bool     dove,
+	bool     cont,
+	bool     olap,
+	bool     expand,
+	bool     is1,
+	bool     fw,
+	int64_t  off,
+	int64_t  maxalcols,
+	size_t   reflen,
+	size_t   len1,
+	size_t   len2,
+	bool     expect_ret,
+	bool     expect_oleft,
+	int64_t  expect_oll,
+	int64_t  expect_olr,
+	int64_t  expect_orl,
+	int64_t  expect_orr,
+	bool     expect_ofw)
+{
+	PairedEndPolicy pepol(
+		pol, maxfrag, minfrag, local, flip,
+		dove, cont, olap, expand);
+	int64_t oll = 0, olr = 0, orl = 0, orr = 0;
+	bool oleft = false, ofw = false;
+	const bool ok = pepol.otherMate(
+		is1, fw, off, maxalcols, reflen, len1, len2,
+		oleft, oll, olr, orl, orr, ofw);
+	assert(ok == expect_ret);
+	if(ok) {
+		// Only check the window when a concordant window exists
+		assert_eq(expect_oleft, oleft);
+		assert_eq(expect_oll, oll);
+		assert_eq(expect_olr, olr);
+		assert_eq(expect_orl, orl);
+		assert_eq(expect_orr, orr);
+		assert_eq(expect_ofw, ofw);
+	}
+	cout << "otherMate: " << name << "...PASSED" << endl;
+}
+
+int main(int argc, char **argv) {
+
+	// Set of 8 cases where we look for the opposite mate to the right
+	// of the anchor mate, with various combinations of policies and
+	// anchor-mate orientations.
+
+	// |--------|
+	//           |--------|
+	//           ^110     ^119
+	// |------------------|
+	//      min frag
+	//                     |--------|
+	//                     ^120     ^129
+	// |----------------------------|
+	//           max frag
+	// ^
+	// 100
+
+	{
+	int  policies[] = { PE_POLICY_FF, PE_POLICY_RR, PE_POLICY_FR, PE_POLICY_RF, PE_POLICY_FF, PE_POLICY_RR, PE_POLICY_FR, PE_POLICY_RF };
+	bool is1[]      = { true,  true,   true,  true, false, false, false, false };
+	bool fw[]       = { true,  false,  true, false, false,  true,  true, false };
+	bool oleft[]    = { false, false, false, false, false, false, false, false };
+	bool ofw[]      = { true,  false, false,  true, false,  true, false,  true };
+
+	for(int i = 0; i < 8; i++) {
+		ostringstream oss;
+		oss << "Simple";
+		oss << i;
+		testCaseOtherMate(
+			oss.str(),
+			policies[i],  // policy
+			30,           // maxfrag
+			20,           // minfrag
+			false,        // local
+			true,         // flipping OK
+			true,         // dovetail OK
+			true,         // containment OK
+			true,         // overlap OK
+			true,         // expand-to-fit
+			is1[i],       // mate 1 is anchor
+			fw[i],        // anchor aligned to Watson
+			100,          // anchor's offset into ref
+			-1,           // max # alignment cols
+			200,          // ref length
+			10,           // mate 1 length
+			10,           // mate 2 length
+			true,         // expected return val from otherMate
+			oleft[i],     // whether to look for opposite to left
+			80,           // expected leftmost pos for opp mate LHS
+			129,          // expected rightmost pos for opp mate LHS
+			119,          // expected leftmost pos for opp mate RHS
+			129,          // expected rightmost pos for opp mate RHS
+			ofw[i]);      // expected orientation in which opposite mate must align
+	}
+	}
+
+	// Set of 8 cases where we look for the opposite mate to the left
+	// of the anchor mate, with various combinations of policies and
+	// anchor-mate orientations.
+
+	// |--------|
+	// ^100     ^109
+	//           |--------|
+	//           ^110     ^119
+	//           |------------------|
+	//                 min frag
+	//                     |-Anchor-|
+	//                     ^120     ^129
+	// |----------------------------|
+	//           max frag
+	// ^
+	// 100
+
+	{
+	int  policies[] = { PE_POLICY_FF, PE_POLICY_RR, PE_POLICY_FR, PE_POLICY_RF, PE_POLICY_FF, PE_POLICY_RR, PE_POLICY_FR, PE_POLICY_RF };
+	bool is1[]      = { false, false, false, false,  true,  true,  true,  true };
+	bool fw[]       = {  true, false, false,  true, false,  true, false,  true };
+	bool oleft[]    = {  true,  true,  true,  true,  true,  true,  true,  true };
+	bool ofw[]      = {  true, false,  true, false, false,  true,  true, false };
+	
+	for(int i = 0; i < 8; i++) {
+		ostringstream oss;
+		oss << "Simple";
+		oss << (i+8);
+		testCaseOtherMate(
+			oss.str(),
+			policies[i],  // policy
+			30,           // maxfrag
+			20,           // minfrag
+			false,        // local
+			true,         // flipping OK
+			true,         // dovetail OK
+			true,         // containment OK
+			true,         // overlap OK
+			true,         // expand-to-fit
+			is1[i],       // mate 1 is anchor
+			fw[i],        // anchor aligned to Watson
+			120,          // anchor's offset into ref
+			-1,           // max # alignment cols
+			200,          // ref length
+			10,           // mate 1 length
+			10,           // mate 2 length
+			true,         // expected return val from otherMate
+			oleft[i],     // whether to look for opposite to left
+			100,          // expected leftmost pos for opp mate LHS
+			110,          // expected rightmost pos for opp mate LHS
+			100,          // expected leftmost pos for opp mate RHS
+			149,          // expected rightmost pos for opp mate RHS
+			ofw[i]);      // expected orientation in which opposite mate must align
+	}
+	}
+
+	// Case where min frag == max frag and opposite is to the right
+
+	// |----------------------------|
+	//      min frag
+	//                     |--------|
+	//                     ^120     ^129
+	// |----------------------------|
+	//           max frag
+	// ^
+	// 100
+	testCaseOtherMate(
+		"MinFragEqMax1",
+		PE_POLICY_FR, // policy
+		30,           // maxfrag
+		30,           // minfrag
+		false,        // local
+		true,         // flipping OK
+		true,         // dovetail OK
+		true,         // containment OK
+		true,         // overlap OK
+		true,         // expand-to-fit
+		false,        // mate 1 is anchor
+		false,        // anchor aligned to Watson
+		120,          // anchor's offset into ref
+		-1,           // max # alignment cols
+		200,          // ref length
+		10,           // mate 1 length
+		10,           // mate 2 length
+		true,         // expected return val from otherMate
+		true,         // whether to look for opposite to left
+		100,          // expected leftmost pos for opp mate LHS
+		100,          // expected rightmost pos for opp mate LHS
+		100,          // expected leftmost pos for opp mate RHS
+		149,          // expected rightmost pos for opp mate RHS
+		true);        // expected orientation in which opposite mate must align
+
+	// Case where min frag == max frag and opposite is to the right
+
+	// |----------------------------|
+	//      min frag                ^129
+	// |--------|
+	// ^100     ^109
+	// |----------------------------|
+	//           max frag
+	testCaseOtherMate(
+		"MinFragEqMax2",
+		PE_POLICY_FR, // policy
+		30,           // maxfrag
+		30,           // minfrag
+		false,        // local
+		true,         // flipping OK
+		true,         // dovetail OK
+		true,         // containment OK
+		true,         // overlap OK
+		true,         // expand-to-fit
+		true,         // mate 1 is anchor
+		true,         // anchor aligned to Watson
+		100,          // anchor's offset into ref
+		-1,           // max # alignment cols
+		200,          // ref length
+		10,           // mate 1 length
+		10,           // mate 2 length
+		true,         // expected return val from otherMate
+		false,        // whether to look for opposite to left
+		80,           // expected leftmost pos for opp mate LHS
+		129,          // expected rightmost pos for opp mate LHS
+		129,          // expected leftmost pos for opp mate RHS
+		129,          // expected rightmost pos for opp mate RHS
+		false);       // expected orientation in which opposite mate must align
+
+	testCaseOtherMate(
+		"MinFragEqMax4NoDove1",
+		PE_POLICY_FR, // policy
+		30,           // maxfrag
+		25,           // minfrag
+		false,        // local
+		true,         // flipping OK
+		false,        // dovetail OK
+		true,         // containment OK
+		true,         // overlap OK
+		true,         // expand-to-fit
+		true,         // mate 1 is anchor
+		true,         // anchor aligned to Watson
+		100,          // anchor's offset into ref
+		-1,           // max # alignment cols
+		200,          // ref length
+		10,           // mate 1 length
+		10,           // mate 2 length
+		true,         // expected return val from otherMate
+		false,        // whether to look for opposite to left
+		100,          // expected leftmost pos for opp mate LHS
+		129,          // expected rightmost pos for opp mate LHS
+		124,          // expected leftmost pos for opp mate RHS
+		129,          // expected rightmost pos for opp mate RHS
+		false);       // expected orientation in which opposite mate must align
+
+	testCaseOtherMate(
+		"MinFragEqMax4NoCont1",
+		PE_POLICY_FR, // policy
+		30,           // maxfrag
+		25,           // minfrag
+		false,        // local
+		true,         // flipping OK
+		false,        // dovetail OK
+		false,        // containment OK
+		true,         // overlap OK
+		true,         // expand-to-fit
+		true,         // mate 1 is anchor
+		true,         // anchor aligned to Watson
+		100,          // anchor's offset into ref
+		-1,           // max # alignment cols
+		200,          // ref length
+		10,           // mate 1 length
+		10,           // mate 2 length
+		true,         // expected return val from otherMate
+		false,        // whether to look for opposite to left
+		100,          // expected leftmost pos for opp mate LHS
+		129,          // expected rightmost pos for opp mate LHS
+		124,          // expected leftmost pos for opp mate RHS
+		129,          // expected rightmost pos for opp mate RHS
+		false);       // expected orientation in which opposite mate must align
+
+	testCaseOtherMate(
+		"MinFragEqMax4NoOlap1",
+		PE_POLICY_FR, // policy
+		30,           // maxfrag
+		25,           // minfrag
+		false,        // local
+		true,         // flipping OK
+		false,        // dovetail OK
+		false,        // containment OK
+		false,        // overlap OK
+		true,         // expand-to-fit
+		true,         // mate 1 is anchor
+		true,         // anchor aligned to Watson
+		100,          // anchor's offset into ref
+		-1,           // max # alignment cols
+		200,          // ref length
+		10,           // mate 1 length
+		10,           // mate 2 length
+		true,         // expected return val from otherMate
+		false,        // whether to look for opposite to left
+		110,          // expected leftmost pos for opp mate LHS
+		129,          // expected rightmost pos for opp mate LHS
+		124,          // expected leftmost pos for opp mate RHS
+		129,          // expected rightmost pos for opp mate RHS
+		false);       // expected orientation in which opposite mate must align
+
+	testCaseOtherMate(
+		"MinFragEqMax4NoDove2",
+		PE_POLICY_FR, // policy
+		30,           // maxfrag
+		25,           // minfrag
+		false,        // local
+		true,         // flipping OK
+		false,        // dovetail OK
+		true,         // containment OK
+		true,         // overlap OK
+		true,         // expand-to-fit
+		false,        // mate 1 is anchor
+		false,        // anchor aligned to Watson
+		120,          // anchor's offset into ref
+		-1,           // max # alignment cols
+		200,          // ref length
+		10,           // mate 1 length
+		10,           // mate 2 length
+		true,         // expected return val from otherMate
+		true,         // whether to look for opposite to left
+		100,          // expected leftmost pos for opp mate LHS
+		105,          // expected rightmost pos for opp mate LHS
+		100,          // expected leftmost pos for opp mate RHS
+		129,          // expected rightmost pos for opp mate RHS
+		true);        // expected orientation in which opposite mate must align
+
+	testCaseOtherMate(
+		"MinFragEqMax4NoOlap2",
+		PE_POLICY_FR, // policy
+		30,           // maxfrag
+		25,           // minfrag
+		false,        // local
+		true,         // flipping OK
+		false,        // dovetail OK
+		false,        // containment OK
+		false,        // overlap OK
+		true,         // expand-to-fit
+		false,        // mate 1 is anchor
+		false,        // anchor aligned to Watson
+		120,          // anchor's offset into ref
+		-1,           // max # alignment cols
+		200,          // ref length
+		10,           // mate 1 length
+		10,           // mate 2 length
+		true,         // expected return val from otherMate
+		true,         // whether to look for opposite to left
+		100,          // expected leftmost pos for opp mate LHS
+		105,          // expected rightmost pos for opp mate LHS
+		100,          // expected leftmost pos for opp mate RHS
+		119,          // expected rightmost pos for opp mate RHS
+		true);        // expected orientation in which opposite mate must align
+
+	{
+	int olls[] = { 110 };
+	int olrs[] = { 299 };
+	int orls[] = { 149 };
+	int orrs[] = { 299 };
+	for(int i = 0; i < 1; i++) {
+		ostringstream oss;
+		oss << "Overhang1_";
+		oss << (i+1);
+		testCaseOtherMate(
+			oss.str(),
+			PE_POLICY_FR, // policy
+			200,          // maxfrag
+			50,           // minfrag
+			false,        // local
+			true,         // flipping OK
+			true,         // dovetail OK
+			true,         // containment OK
+			false,        // overlap OK
+			true,         // expand-to-fit
+			true,         // mate 1 is anchor
+			true,         // anchor aligned to Watson
+			100,          // anchor's offset into ref
+			-1,           // max # alignment cols
+			200,          // ref length
+			10,           // mate 1 length
+			10,           // mate 2 length
+			true,         // expected return val from otherMate
+			false,        // whether to look for opposite to left
+			olls[i],      // expected leftmost pos for opp mate LHS
+			olrs[i],      // expected rightmost pos for opp mate LHS
+			orls[i],      // expected leftmost pos for opp mate RHS
+			orrs[i],      // expected rightmost pos for opp mate RHS
+			false);       // expected orientation in which opposite mate must align
+	}
+	}
+
+	{
+	int olls[] = { -100 };
+	int olrs[] = {   50 };
+	int orls[] = { -100 };
+	int orrs[] = {   89 };
+	for(int i = 0; i < 1; i++) {
+		ostringstream oss;
+		oss << "Overhang2_";
+		oss << (i+1);
+		testCaseOtherMate(
+			oss.str(),
+			PE_POLICY_FR, // policy
+			200,          // maxfrag
+			50,           // minfrag
+			false,        // local
+			true,         // flipping OK
+			true,         // dovetail OK
+			true,         // containment OK
+			false,        // overlap OK
+			true,         // expand-to-fit
+			true,         // mate 1 is anchor
+			false,        // anchor aligned to Watson
+			90,           // anchor's offset into ref
+			-1,           // max # alignment cols
+			200,          // ref length
+			10,           // mate 1 length
+			10,           // mate 2 length
+			true,         // expected return val from otherMate
+			true,         // whether to look for opposite to left
+			olls[i],      // expected leftmost pos for opp mate LHS
+			olrs[i],      // expected rightmost pos for opp mate LHS
+			orls[i],      // expected leftmost pos for opp mate RHS
+			orrs[i],      // expected rightmost pos for opp mate RHS
+			true);        // expected orientation in which opposite mate must align
+	}
+	}
+
+	{
+	int mate2offs[] = {           150,            149,            149,            100,              99,           299,              1,            250,            250 };
+	int mate2lens[] = {            50,             50,             51,            100,             101,             1,             50,             50,             51 };
+	int peExpects[] = { PE_ALS_NORMAL, PE_ALS_DISCORD, PE_ALS_OVERLAP, PE_ALS_CONTAIN, PE_ALS_DOVETAIL, PE_ALS_NORMAL, PE_ALS_DISCORD,  PE_ALS_NORMAL, PE_ALS_DISCORD };
+
+	for(int i = 0; i < 9; i++) {
+		ostringstream oss;
+		oss << "Simple1_";
+		oss << (i);
+		testCaseClassify(
+			oss.str(),
+			PE_POLICY_FR, // policy
+			200,          // maxfrag
+			100,          // minfrag
+			false,        // local
+			true,         // flipping OK
+			true,         // dovetail OK
+			true,         // containment OK
+			true,         // overlap OK
+			true,         // expand-to-fit
+			100,          // offset of mate 1
+			50,           // length of mate 1
+			true,         // whether mate 1 aligned to Watson
+			mate2offs[i], // offset of mate 2
+			mate2lens[i], // length of mate 2
+			false,        // whether mate 2 aligned to Watson
+			peExpects[i]);// expectation for PE_ALS flag returned
+	}
+	}
+
+	{
+	int mate1offs[] = {           200,            201,            200,            200,             200,           100,            400,            100,             99 };
+	int mate1lens[] = {            50,             49,             51,            100,             101,             1,             50,             50,             51 };
+	int peExpects[] = { PE_ALS_NORMAL, PE_ALS_DISCORD, PE_ALS_OVERLAP, PE_ALS_CONTAIN, PE_ALS_DOVETAIL, PE_ALS_NORMAL, PE_ALS_DISCORD,  PE_ALS_NORMAL, PE_ALS_DISCORD };
+
+	for(int i = 0; i < 9; i++) {
+		ostringstream oss;
+		oss << "Simple2_";
+		oss << (i);
+		testCaseClassify(
+			oss.str(),
+			PE_POLICY_FR, // policy
+			200,          // maxfrag
+			100,          // minfrag
+			false,        // local
+			true,         // flipping OK
+			true,         // dovetail OK
+			true,         // containment OK
+			true,         // overlap OK
+			true,         // expand-to-fit
+			mate1offs[i], // offset of mate 1
+			mate1lens[i], // length of mate 1
+			true,         // whether mate 1 aligned to Watson
+			250,          // offset of mate 2
+			50,           // length of mate 2
+			false,        // whether mate 2 aligned to Watson
+			peExpects[i]);// expectation for PE_ALS flag returned
+	}
+	}
+
+	testCaseOtherMate(
+		"Regression1",
+		PE_POLICY_FF, // policy
+		50,           // maxfrag
+		0,            // minfrag
+		false,        // local
+		true,         // flipping OK
+		true,         // dovetail OK
+		true,         // containment OK
+		true,         // overlap OK
+		true,         // expand-to-fit
+		true,         // mate 1 is anchor
+		false,        // anchor aligned to Watson
+		3,            // anchor's offset into ref
+		-1,           // max # alignment cols
+		53,           // ref length
+		10,           // mate 1 length
+		10,           // mate 2 length
+		true,         // expected return val from otherMate
+		true,         // whether to look for opposite to left
+		-37,          // expected leftmost pos for opp mate LHS
+		13,           // expected rightmost pos for opp mate LHS
+		-37,          // expected leftmost pos for opp mate RHS
+		52,           // expected rightmost pos for opp mate RHS
+		false);       // expected orientation in which opposite mate must align
+}
+
+#endif /*def MAIN_PE*/
diff --git a/pe.h b/pe.h
new file mode 100644
index 0000000..8dd46a4
--- /dev/null
+++ b/pe.h
@@ -0,0 +1,321 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ *  pe.h
+ *
+ * A class encapsulating a paired-end policy and routines for
+ * identifying intervals according to the policy.  For instance,
+ * contains a routine that, given a policy and details about a match
+ * for one mate, returns details about where to search for the other
+ * mate.
+ */
+
+#ifndef PE_H_
+#define PE_H_
+
+#include <iostream>
+#include <stdint.h>
+
+// In description below "To the left" = "Upstream of w/r/t the Watson strand"
+
// The 4 possible policies describing how mates 1 and 2 should be
// oriented with respect to the reference genome and each other.
// Values start at 1; a caller can use -1 (see PairedEndPolicy::reset())
// or 0 to mean "no policy set".
enum {
	// (fw) Both mates from Watson with 1 to the left, or
	// (rc) Both mates from Crick with 2 to the left
	PE_POLICY_FF = 1,

	// (fw) Both mates from Crick with 1 to the left, or
	// (rc) Both mates from Watson with 2 to the left
	PE_POLICY_RR,
	
	// (fw) Mate 1 from Watson and mate 2 from Crick with 1 to the left, or
	// (rc) Mate 2 from Watson and mate 1 from Crick with 2 to the left
	PE_POLICY_FR,
	
	// (fw) Mate 1 from Crick and mate 2 from Watson with 1 to the left, or
	// (rc) Mate 2 from Crick and mate 1 from Watson with 2 to the left
	PE_POLICY_RF
};
+
// Various distinct ways that the mates might align with respect to
// each other in a concordant alignment.  We distinguish between them
// because in some cases a user may want to consider some of these
// categories to be discordant, even if the alignment otherwise
// conforms to the paired-end policy.

enum {
	// Describes a paired-end alignment where the mates
	// straightforwardly conform to the paired-end policy without any
	// overlap between the mates
	PE_ALS_NORMAL = 1,

	// Describes a paired-end alignment where the mates overlap, but
	// neither contains the other and they do not dovetail, but the
	// alignment conforms to the paired-end policy
	PE_ALS_OVERLAP,
	
	// Describes a paired-end alignment where the mates conform to the
	// paired-end policy, but one mate strictly contains the other but
	// they don't dovetail.  We distinguish this from a "normal"
	// concordant alignment because some users may wish to categorize
	// such an alignment as discordant.
	PE_ALS_CONTAIN,
	
	// Describes a paired-end alignment where the mates conform to the
	// paired-end policy, but mates "fall off" each other.  E.g. if the
	// policy is FR and any of these happen:
	// 1:     >>>>>   >>>>>
	// 2:  <<<<<<    <<<<<<
	// And the overall extent is consistent with the minimum fragment
	// length, this is a dovetail alignment.  We distinguish this from
	// a "normal" concordant alignment because some users may wish to
	// categorize such an alignment as discordant.
	PE_ALS_DOVETAIL,
	
	// The mates are clearly discordant, owing to their orientations
	// and/or implied fragment length
	PE_ALS_DISCORD
};
+
+/**
+ * Return true iff the orientations and relative positions of mates 1
+ * and 2 are compatible with the given PE_POLICY.
+ */
+static inline bool pePolicyCompat(
+	int policy,   // PE_POLICY
+	bool oneLeft, // true iff mate 1 is to the left of mate 2
+	bool oneWat,  // true iff mate 1 aligned to Watson strand
+	bool twoWat)  // true iff mate 2 aligned to Watson strand
+{
+	switch(policy) {
+		case PE_POLICY_FF:
+			return oneWat == twoWat && oneWat == oneLeft;
+		case PE_POLICY_RR:
+			return oneWat == twoWat && oneWat != oneLeft;
+		case PE_POLICY_FR:
+			return oneWat != twoWat && oneWat == oneLeft;
+		case PE_POLICY_RF:
+			return oneWat != twoWat && oneWat != oneLeft;
+		default: {
+			std::cerr << "Bad PE_POLICY: " << policy << std::endl;
+			throw 1;
+		}
+	}
+	throw 1;
+}
+
+/**
+ * Given that the given mate aligns in the given orientation, return
+ * true iff the other mate must appear "to the right" of the given mate
+ * in order for the alignment to be concordant.
+ */
+static inline void pePolicyMateDir(
+	int   policy,// in: PE_POLICY
+	bool  is1,   // in: true iff mate 1 is the one that already aligned
+	bool  fw,    // in: true iff already-aligned mate aligned to Watson
+	bool& left,  // out: set =true iff other mate must be to the left
+	bool& mfw)   // out: set =true iff other mate must align to watson
+{
+	switch(policy) {
+		case PE_POLICY_FF: {
+			left = (is1 != fw);
+			mfw = fw;
+			break;
+		}
+		case PE_POLICY_RR: {
+			left = (is1 == fw);
+			mfw = fw;
+			break;
+		}
+		case PE_POLICY_FR: {
+			left = !fw;
+			mfw = !fw;
+			break;
+		}
+		case PE_POLICY_RF: {
+			left = fw;
+			mfw = !fw;
+			break;
+		}
+		default: {
+			std::cerr << "Error: No such PE_POLICY: " << policy << std::endl;
+			throw 1;
+		}
+	}
+	return;
+}
+
/**
 * Encapsulates paired-end alignment parameters.
 */
class PairedEndPolicy {

public:

	// Default-construct into the "unset" state (see reset()).
	PairedEndPolicy() { reset(); }
	
	// Convenience constructor; simply forwards all settings to init().
	PairedEndPolicy(
		int pol,
		size_t maxfrag,
		size_t minfrag,
		bool local,
		bool flippingOk,
		bool dovetailOk,
		bool containOk,
		bool olapOk,
		bool expandToFit)
	{
		init(
			pol,
			maxfrag,
			minfrag,
			local,
			flippingOk,
			dovetailOk,
			containOk,
			olapOk,
			expandToFit);
	}

	/** 
	 * Initialize with nonsense values.
	 * NOTE(review): 0xffffffff is not SIZE_MAX on LP64 platforms; if any
	 * caller tests for this "unset" sentinel, confirm it compares against
	 * 0xffffffff and not (size_t)-1.
	 */
	void reset() {
		init(-1, 0xffffffff, 0xffffffff, false, false, false, false, false, false);
	}

	/**
	 * Initialize given policy, maximum & minimum fragment lengths.
	 */
	void init(
		int pol,
		size_t maxfrag,
		size_t minfrag,
		bool local,
		bool flippingOk,
		bool dovetailOk,
		bool containOk,
		bool olapOk,
		bool expandToFit)
	{
		pol_         = pol;
		maxfrag_     = maxfrag;
		minfrag_     = minfrag;
		local_       = local;
		flippingOk_  = flippingOk;
		dovetailOk_  = dovetailOk;
		containOk_   = containOk;
		olapOk_      = olapOk;
		expandToFit_ = expandToFit;
	}

/**
 * Given details about how one mate aligns, and some details about the
 * reference sequence it aligned to, calculate a window and orientation s.t.
 * a paired-end alignment is concordant iff the opposite mate aligns in the
 * calculated window with the calculated orientation.  The calculation does not
 * consider gaps.  The dynamic programming framer will take gaps into account.
 *
 * Returns false if no concordant alignments are possible, true otherwise.
 */
bool otherMate(
	bool     is1,       // true -> mate 1 aligned and we're looking
						// for 2, false -> vice versa
	bool     fw,        // orientation of aligned mate
	int64_t  off,       // offset into the reference sequence
	int64_t  maxalcols, // max # columns spanned by alignment
	size_t   reflen,    // length of reference sequence aligned to
	size_t   len1,      // length of mate 1
	size_t   len2,      // length of mate 2
	bool&    oleft,     // out: true iff opp mate must be to the left of anchor
	int64_t& oll,       // out: leftmost Watson off for LHS of opp alignment
	int64_t& olr,       // out: rightmost Watson off for LHS of opp alignment
	int64_t& orl,       // out: leftmost Watson off for RHS of opp alignment
	int64_t& orr,       // out: rightmost Watson off for RHS of opp alignment
	bool&    ofw)       // out: true iff opp mate must be on Watson strand
	const;

	/**
	 * Return a PE_TYPE flag indicating, given a PE_POLICY and coordinates
	 * for a paired-end alignment, what type of alignment it is, i.e.,
	 * whether it's:
	 *
	 * 1. Straightforwardly concordant
	 * 2. Mates dovetail (one extends beyond the end of the other)
	 * 3. One mate contains the other but they don't dovetail
	 * 4. One mate overlaps the other but neither contains the other and
	 *    they don't dovetail
	 * 5. Discordant
	 */
	int peClassifyPair(
		int64_t  off1,   // offset of mate 1
		size_t   len1,   // length of mate 1
		bool     fw1,    // whether mate 1 aligned to Watson
		int64_t  off2,   // offset of mate 2
		size_t   len2,   // length of mate 2
		bool     fw2)    // whether mate 2 aligned to Watson
		const;

	// Simple accessors for policy and fragment-length bounds
	int    policy()     const { return pol_;     }
	size_t maxFragLen() const { return maxfrag_; }
	size_t minFragLen() const { return minfrag_; }

protected:

	// Use local alignment to search for the opposite mate, rather than
	// a type of alignment that requires the read to align end-to-end
	bool local_;

	// Policy governing how mates should be oriented with respect to
	// each other and the reference genome
	int pol_;
	
	// true iff settings are such that mates that violate the expected relative
	// orientation but are still consistent with maximum fragment length are OK
	bool flippingOk_;

	// true iff settings are such that dovetailed mates should be
	// considered concordant.
	bool dovetailOk_;

	// true iff paired-end alignments where one mate's alignment is
	// strictly contained within the other's should be considered
	// concordant
	bool containOk_;

	// true iff paired-end alignments where one mate's alignment
	// overlaps the other's should be considered concordant
	bool olapOk_;
	
	// What to do when a mate length is > maxfrag_?  If expandToFit_ is
	// true, we temporarily increase maxfrag_ to equal the mate length.
	// Otherwise we say that any paired-end alignment involving the
	// long mate is discordant.
	bool expandToFit_;
	
	// Maximum fragment size to consider
	size_t maxfrag_;

	// Minimum fragment size to consider
	size_t minfrag_;
};
+
+#endif /*ndef PE_H_*/
diff --git a/presets.cpp b/presets.cpp
new file mode 100644
index 0000000..9fef89c
--- /dev/null
+++ b/presets.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <iostream>
+#include "presets.h"
+#include "opts.h"
+
+using namespace std;
+
/**
 * Append the policy directives corresponding to the given preset name to
 * 'policy' (a ';'-separated directive string).  'opts' is part of the
 * Presets interface but is not modified by this implementation.
 *
 * NOTE(review): several emitted values disagree with the "Same as:"
 * tables in the comments below (e.g. very-fast emits IVAL=S,0,2.50 while
 * the table says "-i S,1,2.50"; very-sensitive emits SEED=0,20/DPS=20 vs
 * "-L 19"/"-M 25").  Confirm which side is authoritative upstream.
 */
void PresetsV0::apply(
	const std::string& preset,
	std::string& policy,
	EList<std::pair<int, std::string> >& opts)
{
	// Presets:                 Same as:
	//  For --end-to-end:
	//   --very-fast            -M 5 -R 1 -N 0 -L 22 -i S,1,2.50
	//   --fast                 -M 10 -R 2 -N 0 -L 22 -i S,1,2.50
	//   --sensitive            -M 15 -R 2 -N 0 -L 22 -i S,1,1.15
	//   --very-sensitive       -M 25 -R 3 -N 0 -L 19 -i S,1,0.50
	if(preset == "very-fast") {
		policy += ";SEED=0,22";
		policy += ";DPS=5";
		policy += ";ROUNDS=1";
		policy += ";IVAL=S,0,2.50";
	} else if(preset == "fast") {
		policy += ";SEED=0,22";
		policy += ";DPS=10";
		policy += ";ROUNDS=2";
		policy += ";IVAL=S,0,2.50";
	} else if(preset == "sensitive") {
		policy += ";SEED=0,22";
		policy += ";DPS=15";
		policy += ";ROUNDS=2";
		policy += ";IVAL=S,1,1.15";
	} else if(preset == "very-sensitive") {
		policy += ";SEED=0,20";
		policy += ";DPS=20";
		policy += ";ROUNDS=3";
		policy += ";IVAL=S,1,0.50";
	}
	//  For --local:
	//   --very-fast-local      -M 1 -N 0 -L 25 -i S,1,2.00
	//   --fast-local           -M 2 -N 0 -L 22 -i S,1,1.75
	//   --sensitive-local      -M 2 -N 0 -L 20 -i S,1,0.75 (default)
	//   --very-sensitive-local -M 3 -N 0 -L 20 -i S,1,0.50
	else if(preset == "very-fast-local") {
		policy += ";SEED=0,25";
		policy += ";DPS=5";
		policy += ";ROUNDS=1";
		policy += ";IVAL=S,1,2.00";
	} else if(preset == "fast-local") {
		policy += ";SEED=0,22";
		policy += ";DPS=10";
		policy += ";ROUNDS=2";
		policy += ";IVAL=S,1,1.75";
	} else if(preset == "sensitive-local") {
		policy += ";SEED=0,20";
		policy += ";DPS=15";
		policy += ";ROUNDS=2";
		policy += ";IVAL=S,1,0.75";
	} else if(preset == "very-sensitive-local") {
		policy += ";SEED=0,20";
		policy += ";DPS=20";
		policy += ";ROUNDS=3";
		policy += ";IVAL=S,1,0.50";
	}
	else {
		// Unknown preset: warn on stderr but continue; 'policy' is left
		// unchanged and no error is signaled to the caller.
		cerr << "Unknown preset: " << preset.c_str() << endl;
	}
}
diff --git a/presets.h b/presets.h
new file mode 100644
index 0000000..dfcec41
--- /dev/null
+++ b/presets.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * presets.h
+ *
+ * Maps simple command-line options to more complicated combinations of
+ * options for ease-of-use.
+ */
+
+#ifndef PRESETS_H_
+#define PRESETS_H_
+
+#include <string>
+#include <utility>
+#include "ds.h"
+
/**
 * Abstract interface for a collection of option presets.  A concrete
 * subclass maps a preset name to a policy string and/or a list of
 * (option-id, value) pairs.
 */
class Presets {
public:
	
	Presets() { }
	
	virtual ~Presets() { }
	
	// Apply the named preset: append directives to 'policy' and/or add
	// option pairs to 'opts'.
	virtual void apply(
		const std::string& preset,
		std::string& policy,
		EList<std::pair<int, std::string> >& opts) = 0;
	
	// Human-readable tag identifying this preset collection.
	virtual const char * name() = 0;
};
+
/**
 * Initial collection of presets: 8/14/2011 prior to first Bowtie 2 release.
 */
class PresetsV0 : public Presets {
public:
	
	PresetsV0() : Presets() { }
	
	virtual ~PresetsV0() { }
	
	// Translate 'preset' into policy directives; defined in presets.cpp.
	virtual void apply(
		const std::string& preset,
		std::string& policy,
		EList<std::pair<int, std::string> >& opts);

	// Version tag for this preset collection.
	virtual const char * name() { return "V0"; }
};
+
+#endif /*ndef PRESETS_H_*/
diff --git a/processor_support.h b/processor_support.h
new file mode 100644
index 0000000..f68ee65
--- /dev/null
+++ b/processor_support.h
@@ -0,0 +1,70 @@
+#ifndef PROCESSOR_SUPPORT_H_
+#define PROCESSOR_SUPPORT_H_
+
+// Utility class ProcessorSupport provides POPCNTenabled() to determine
+// processor support for POPCNT instruction. It uses CPUID to
+// retrieve the processor capabilities.
+// for Intel ICC compiler __cpuid() is an intrinsic 
+// for Microsoft compiler __cpuid() is provided by #include <intrin.h>
+// for GCC compiler __get_cpuid() is provided by #include <cpuid.h>
+
+// Intel compiler defines __GNUC__, so this is needed to disambiguate
+
#if defined(__INTEL_COMPILER)
#   define USING_INTEL_COMPILER
#elif defined(__GNUC__)
#   define USING_GCC_COMPILER
#   include <cpuid.h>
#elif defined(_MSC_VER)
// _MSC_VER is defined by the Microsoft compiler.
// BUGFIX: this previously read "#define USING MSC_COMPILER", which
// defined a macro named 'USING', so the USING_MSC_COMPILER branch in
// POPCNTenabled() below could never be selected on MSVC.
#   define USING_MSC_COMPILER
#endif

// Raw CPUID output registers (EAX..EDX).
struct regs_t {unsigned int EAX, EBX, ECX, EDX;};
// Single-bit mask.  'n' is parenthesized and the literal is unsigned so
// the macro is safe for compound arguments and for bit 31.
#define BIT(n) (1u << (n))
+
class ProcessorSupport {

#ifdef POPCNT_CAPABILITY

public:
    ProcessorSupport() { }

    /**
     * Return true iff the processor reports support for both SSE4.2
     * (CPUID.01H:ECX bit 20) and POPCNT (CPUID.01H:ECX bit 23).
     *
     * from: Intel 64 and IA-32 Architectures Software Developer's Manual,
     * 325462-036US, March 2013: before an application attempts to use the
     * POPCNT instruction, it must check that the processor supports SSE4.2
     * "(if CPUID.01H:ECX.SSE4_2[bit 20] = 1) and POPCNT
     * (if CPUID.01H:ECX.POPCNT[bit 23] = 1)"
     *
     * see p.272 of http://download.intel.com/products/processor/manual/253667.pdf
     * available at
     * http://www.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html
     * Also http://en.wikipedia.org/wiki/SSE4 talks about availability on
     * Intel & AMD processors.
     */
    bool POPCNTenabled()
    {
    regs_t regs;

    try {
#if ( defined(USING_INTEL_COMPILER) || defined(USING_MSC_COMPILER) )
        __cpuid((void *) &regs,0);   // probe: test if __cpuid() works
        __cpuid((void *) &regs,0x1); // leaf 1: POPCNT bit is bit 23 in ECX
#elif defined(USING_GCC_COMPILER)
        __get_cpuid(0x1, &regs.EAX, &regs.EBX, &regs.ECX, &regs.EDX);
#else
        // BUGFIX: this message previously used Unicode "smart quotes"
        // around the string literal, which is not valid C++ and broke the
        // build whenever no known compiler was detected.
        // NOTE(review): this branch needs <iostream> and <cassert>, which
        // this header does not include -- confirm they reach it via the
        // including translation unit.
        std::cerr << "ERROR: please define __cpuid() for this build.\n";
        assert(0);
#endif
        if( !( (regs.ECX & BIT(20)) && (regs.ECX & BIT(23)) ) ) return false;
    }
    catch (int) {
        // NOTE(review): a faulting CPUID raises a hardware fault, not a
        // C++ 'int' exception, so this handler is unlikely to ever run;
        // kept for compatibility with the original control flow.
        return false;
    }
    return true;
    }

#endif // POPCNT_CAPABILITY
};
+
+#endif /*PROCESSOR_SUPPORT_H_*/
+
+
+
+
diff --git a/qual.cpp b/qual.cpp
new file mode 100644
index 0000000..f62fbc7
--- /dev/null
+++ b/qual.cpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/// An array that transforms Phred qualities into their maq-like
+/// equivalents by rounding to the nearest multiple of 10, saturating
+/// at 30 (the table maps 0-4 to 0, 5-14 to 10, 15-24 to 20, and
+/// everything from 25 up to 30).  Indexed by Phred quality 0-255.
+unsigned char qualRounds[] = {
+	0, 0, 0, 0, 0,                          //   0 -   4
+	10, 10, 10, 10, 10, 10, 10, 10, 10, 10, //   5 -  14
+	20, 20, 20, 20, 20, 20, 20, 20, 20, 20, //  15 -  24
+	30, 30, 30, 30, 30, 30, 30, 30, 30, 30, //  25 -  34
+	30, 30, 30, 30, 30, 30, 30, 30, 30, 30, //  35 -  44
+	30, 30, 30, 30, 30, 30, 30, 30, 30, 30, //  45 -  54
+	30, 30, 30, 30, 30, 30, 30, 30, 30, 30, //  55 -  64
+	30, 30, 30, 30, 30, 30, 30, 30, 30, 30, //  65 -  74
+	30, 30, 30, 30, 30, 30, 30, 30, 30, 30, //  75 -  84
+	30, 30, 30, 30, 30, 30, 30, 30, 30, 30, //  85 -  94
+	30, 30, 30, 30, 30, 30, 30, 30, 30, 30, //  95 - 104
+	30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 105 - 114
+	30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 115 - 124
+	30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 125 - 134
+	30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 135 - 144
+	30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 145 - 154
+	30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 155 - 164
+	30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 165 - 174
+	30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 175 - 184
+	30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 185 - 194
+	30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 195 - 204
+	30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 205 - 214
+	30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 215 - 224
+	30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 225 - 234
+	30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 235 - 244
+	30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 245 - 254
+	30                                      // 255
+};
+
+/**
+ * Lookup table for converting from Solexa-scaled (log-odds) quality
+ * values to Phred-scaled quality values.  Index = Solexa quality + 10;
+ * the table covers Solexa values -10 through 255 (266 entries).
+ * The two scales converge for qualities >= ~10.
+ */
+unsigned char solToPhred[] = {
+	/* -10 */ 0, 1, 1, 1, 1, 1, 1, 2, 2, 3,
+	/* 0 */ 3, 4, 4, 5, 5, 6, 7, 8, 9, 10,
+	/* 10 */ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+	/* 20 */ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+	/* 30 */ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+	/* 40 */ 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
+	/* 50 */ 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+	/* 60 */ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
+	/* 70 */ 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+	/* 80 */ 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+	/* 90 */ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
+	/* 100 */ 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
+	/* 110 */ 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+	/* 120 */ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
+	/* 130 */ 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
+	/* 140 */ 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+	/* 150 */ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
+	/* 160 */ 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
+	/* 170 */ 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+	/* 180 */ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
+	/* 190 */ 190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
+	/* 200 */ 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+	/* 210 */ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
+	/* 220 */ 220, 221, 222, 223, 224, 225, 226, 227, 228, 229,
+	/* 230 */ 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+	/* 240 */ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
+	/* 250 */ 250, 251, 252, 253, 254, 255
+};
diff --git a/qual.h b/qual.h
new file mode 100644
index 0000000..089080b
--- /dev/null
+++ b/qual.h
@@ -0,0 +1,235 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef QUAL_H_
+#define QUAL_H_
+
+#include <stdexcept>
+#include "search_globals.h"
+#include "sstring.h"
+
+extern unsigned char qualRounds[];
+extern unsigned char solToPhred[];
+
+/// Translate a Phred+33 ASCII character into its numeric Phred
+/// quality; characters below '!' (33) clamp to quality 0.
+static inline uint8_t phredcToPhredq(char c) {
+	uint8_t ascii = (uint8_t)c;
+	if(ascii < 33) {
+		return 0;
+	}
+	return (uint8_t)(ascii - 33);
+}
+
+/**
+ * Convert a Solexa-scaled quality value into a Phred-scale quality
+ * value via the solToPhred[] lookup table.
+ *
+ * p = probability that base is miscalled
+ * Qphred = -10 * log10 (p)
+ * Qsolexa = -10 * log10 (p / (1 - p))
+ * See: http://en.wikipedia.org/wiki/FASTQ_format
+ *
+ */
+static inline uint8_t solexaToPhred(int sol) {
+	// solToPhred[] covers Solexa values -10..255; index is sol+10.
+	assert_lt(sol, 256);
+	// Solexa qualities below -10 all clamp to Phred 0.
+	if(sol < -10) return 0;
+	return solToPhred[sol+10];
+}
+
+/**
+ * Penalty scheme that uses Phred qualities directly: mismatches and
+ * deletions cost the base's quality; an insertion costs the larger of
+ * the two flanking qualities.
+ */
+class SimplePhredPenalty {
+public:
+	static uint8_t mmPenalty (uint8_t qual) {
+		return qual;
+	}
+	static uint8_t delPenalty(uint8_t qual) {
+		return qual;
+	}
+	static uint8_t insPenalty(uint8_t qual_left, uint8_t qual_right) {
+		return std::max(qual_left, qual_right);
+	}
+};
+
+/**
+ * Penalty scheme like SimplePhredPenalty, but with each penalty run
+ * through the maq-like qualRounds[] table (rounded to a multiple of
+ * 10, saturated at 30).
+ */
+class MaqPhredPenalty {
+public:
+	static uint8_t mmPenalty (uint8_t qual) {
+		return qualRounds[qual];
+	}
+	static uint8_t delPenalty(uint8_t qual) {
+		return qualRounds[qual];
+	}
+	static uint8_t insPenalty(uint8_t qual_left, uint8_t qual_right) {
+		return qualRounds[std::max(qual_left, qual_right)];
+	}
+};
+
+/// Mismatch penalty for quality 'qual', maq-rounded when 'maq' is set.
+static inline uint8_t mmPenalty(bool maq, uint8_t qual) {
+	return maq ? MaqPhredPenalty::mmPenalty(qual)
+	           : SimplePhredPenalty::mmPenalty(qual);
+}
+
+/// Deletion penalty for quality 'qual', maq-rounded when 'maq' is set.
+static inline uint8_t delPenalty(bool maq, uint8_t qual) {
+	return maq ? MaqPhredPenalty::delPenalty(qual)
+	           : SimplePhredPenalty::delPenalty(qual);
+}
+
+/// Insertion penalty given the two flanking qualities, maq-rounded
+/// when 'maq' is set.
+static inline uint8_t insPenalty(bool maq, uint8_t qual_left, uint8_t qual_right) {
+	return maq ? MaqPhredPenalty::insPenalty(qual_left, qual_right)
+	           : SimplePhredPenalty::insPenalty(qual_left, qual_right);
+}
+
+/**
+ * Take an ASCII-encoded quality value and convert it to a Phred33
+ * ASCII char.  Supports Solexa (+64 log-odds) and Phred+64 inputs via
+ * the two flags; throws int 1 (after printing a diagnostic) when the
+ * character is inconsistent with the selected encoding.
+ */
+inline static char charToPhred33(char c, bool solQuals, bool phred64Quals) {
+	using namespace std;
+	// A space usually means the qualities are space-separated integers,
+	// not ASCII-encoded.
+	if(c == ' ') {
+		std::cerr << "Saw a space but expected an ASCII-encoded quality value." << endl
+		          << "Are quality values formatted as integers?  If so, try --integer-quals." << endl;
+		throw 1;
+	}
+	if (solQuals) {
+		// Convert solexa-scaled chars to phred
+		// http://maq.sourceforge.net/fastq.shtml
+		char cc = solexaToPhred((int)c - 64) + 33;
+		if (cc < 33) {
+			std::cerr << "Saw ASCII character "
+			          << ((int)c)
+			          << " but expected 64-based Solexa qual (converts to " << (int)cc << ")." << endl
+			          << "Try not specifying --solexa-quals." << endl;
+			throw 1;
+		}
+		c = cc;
+	}
+	else if(phred64Quals) {
+		if (c < 64) {
+			cerr << "Saw ASCII character "
+			     << ((int)c)
+			     << " but expected 64-based Phred qual." << endl
+			     << "Try not specifying --solexa1.3-quals/--phred64-quals." << endl;
+			throw 1;
+		}
+		// Convert to 33-based phred
+		c -= (64-33);
+	}
+	else {
+		// Keep the phred quality
+		if (c < 33) {
+			cerr << "Saw ASCII character "
+			     << ((int)c)
+			     << " but expected 33-based Phred qual." << endl;
+			throw 1;
+		}
+	}
+	return c;
+}
+
+/**
+ * Take an integer quality value and convert it to a Phred33 ASCII
+ * char.  Throws int 1 if the value would encode a negative Phred
+ * quality.
+ */
+inline static char intToPhred33(int iQ, bool solQuals) {
+	using namespace std;
+	int pQ;
+	if (solQuals) {
+		// Convert from solexa quality to phred
+		// quality and translate to ASCII
+		// http://maq.sourceforge.net/qual.shtml
+		// NOTE(review): unlike the branch below, this result is not
+		// clamped to 93+33 -- confirm callers keep Solexa values in range.
+		pQ = solexaToPhred((int)iQ) + 33;
+	} else {
+		// Keep the phred quality and translate
+		// to ASCII
+		pQ = (iQ <= 93 ? iQ : 93) + 33;
+	}
+	if (pQ < 33) {
+		cerr << "Saw negative Phred quality " << ((int)pQ-33) << "." << endl;
+		throw 1;
+	}
+	assert_geq(pQ, 0);
+	return (int)pQ;
+}
+
+/**
+ * Round a penalty to its maq-like equivalent via qualRounds[], unless
+ * maq rounding is disabled globally (gNoMaqRound, from search_globals.h).
+ */
+inline static uint8_t roundPenalty(uint8_t p) {
+	if(gNoMaqRound) return p;
+	return qualRounds[p];
+}
+
+/**
+ * Fill the q[] array with the penalties that are determined by
+ * subtracting the quality values of the alternate basecalls from
+ * the quality of the primary basecall.
+ *
+ * @param off     position within the read
+ * @param q       out: per-base (A/C/G/T) penalties
+ * @param alts    number of alternate basecall arrays available
+ * @param qual    primary basecall qualities
+ * @param altQry  alternate basecalls (0-3 encoded)
+ * @param altQual alternate basecall qualities; 33 marks "no call"
+ * @return the smallest penalty at this position
+ */
+inline static uint8_t penaltiesAt(size_t off, uint8_t *q,
+                                  int alts,
+                                  const BTString&    qual,
+                                  const BTDnaString *altQry,
+                                  const BTString    *altQual)
+{
+	uint8_t primQ = qual[off]; // qual of primary call
+	uint8_t bestPenalty = roundPenalty(phredcToPhredq(primQ));
+	// By default, any mismatch incurs a penalty equal to the quality
+	// of the called base
+	q[0] = q[1] = q[2] = q[3] = bestPenalty;
+	for(int i = 0; i < alts; i++) {
+		uint8_t altQ = altQual[i][off]; // qual of alt call
+		// Quality 33 is the sentinel for "no alternate call here";
+		// the break presumes alt calls are stored contiguously.
+		if(altQ == 33) break; // no alt call
+		assert_leq(altQ, primQ);
+		// Penalty is the quality gap between primary and alternate.
+		uint8_t pen = roundPenalty(primQ - altQ);
+		if(pen < bestPenalty) {
+			bestPenalty = pen;
+		}
+		// Get the base
+		int altC = (int)altQry[i][off];
+		assert_lt(altC, 4);
+		q[altC] = pen;
+	}
+	// Return the best penalty so that the caller can evaluate whether
+	// any of the penalties are within-budget
+	return bestPenalty;
+}
+
+/**
+ * Return the lowest penalty at position 'off', i.e. the minimum over
+ * the primary basecall's (rounded) quality and the quality gaps to
+ * each alternate basecall.  Like penaltiesAt(), but without filling a
+ * per-base penalty array.  (The original header comment here was a
+ * copy-paste of penaltiesAt()'s; there is no q[] array in this
+ * function.)
+ */
+inline static uint8_t loPenaltyAt(size_t off, int alts,
+                                  const BTString&    qual,
+                                  const BTString    *altQual)
+{
+	uint8_t primQ = qual[off]; // qual of primary call
+	uint8_t bestPenalty = roundPenalty(phredcToPhredq(primQ));
+	for(int i = 0; i < alts; i++) {
+		uint8_t altQ = altQual[i][off]; // qual of alt call
+		if(altQ == 33) break; // no more alt calls at this position
+		assert_leq(altQ, primQ);
+		uint8_t pen = roundPenalty(primQ - altQ);
+		if(pen < bestPenalty) {
+			bestPenalty = pen;
+		}
+	}
+	return bestPenalty;
+}
+
+#endif /*QUAL_H_*/
diff --git a/random_source.cpp b/random_source.cpp
new file mode 100644
index 0000000..0311f91
--- /dev/null
+++ b/random_source.cpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "random_source.h"
+#include "random_util.h"
+
+#ifdef MERSENNE_TWISTER
+
+/**
+ * Regenerate the full 624-word state vector (the MT19937 state
+ * transition, applied in three index-wrap-free segments) and reset the
+ * read position.  Kept out-of-line because it runs only once per n
+ * draws (see nextU32()).
+ */
+void RandomSource::gen_state() {
+	for(int i = 0; i < (n - m); ++i) {
+		state_[i] = state_[i + m] ^ twiddle(state_[i], state_[i + 1]);
+	}
+	for(int i = n - m; i < (n - 1); ++i) {
+		state_[i] = state_[i + m - n] ^ twiddle(state_[i], state_[i + 1]);
+	}
+	state_[n - 1] = state_[m - 1] ^ twiddle(state_[n - 1], state_[0]);
+	p_ = 0; // reset position
+}
+
+/**
+ * Seed the Mersenne Twister from a single 32-bit value using the
+ * seeding recurrence from the MT19937 reference implementation.
+ */
+void RandomSource::init(uint32_t s) {  // init by 32 bit seed
+	reset();
+	state_[0] = s;
+	for(int i = 1; i < n; ++i) {
+		state_[i] = 1812433253UL * (state_[i - 1] ^ (state_[i - 1] >> 30)) + i;
+	}
+	p_ = n; // force gen_state() to be called for next random number
+	inited_ = true;
+}
+
+/**
+ * Seed the Mersenne Twister from an array of 32-bit values (the
+ * MT19937 array-seeding procedure): first seed with a fixed constant,
+ * then mix in the caller's array, then run a second decorrelation pass.
+ */
+void RandomSource::init(const uint32_t* array, int size) { // init by array
+	init(19650218UL);
+	int i = 1, j = 0;
+	for(int k = ((n > size) ? n : size); k; --k) {
+		state_[i] = (state_[i] ^ ((state_[i - 1] ^ (state_[i - 1] >> 30)) * 1664525UL)) + array[j] + j; // non linear
+		++j; j %= size;
+		if((++i) == n) { state_[0] = state_[n - 1]; i = 1; }
+	}
+	for(int k = n - 1; k; --k) {
+		state_[i] = (state_[i] ^ ((state_[i - 1] ^ (state_[i - 1] >> 30)) * 1566083941UL)) - i;
+		if((++i) == n) { state_[0] = state_[n - 1]; i = 1; }
+	}
+	state_[0] = 0x80000000UL; // MSB is 1; assuring non-zero initial array
+	p_ = n; // force gen_state() to be called for next random number
+	inited_ = true;
+}
+
+#endif
+
+#ifdef MAIN_RANDOM_SOURCE
+
+using namespace std;
+
+/**
+ * Ad-hoc smoke tests for RandomSource/Random1toN (built only under
+ * MAIN_RANDOM_SOURCE).  Results are printed for eyeballing, not
+ * asserted.
+ */
+int main(void) {
+	cerr << "Test 1" << endl;
+	{
+		// Count how often each of the 32 output bits is set across
+		// 10,000 draws for each of 10 seeds.  Note: cnts[] is not reset
+		// between seeds, so the printed counts are cumulative.
+		RandomSource rnd;
+		int cnts[32];
+		for(size_t i = 0; i < 32; i++) {
+			cnts[i] = 0;
+		}
+		for(uint32_t j = 0; j < 10; j++) {
+			rnd.init(j);
+			for(size_t i = 0; i < 10000; i++) {
+				uint32_t rndi = rnd.nextU32();
+				// Inner 'i' deliberately shadows the draw counter; it
+				// walks the 32 bits of this draw.
+				for(size_t i = 0; i < 32; i++) {
+					if((rndi & 1) != 0) {
+						cnts[i]++;
+					}
+					rndi >>= 1;
+				}
+			}
+			for(size_t i = 0; i < 32; i++) {
+				cerr << i << ": " << cnts[i] << endl;
+			}
+		}
+	}
+
+	cerr << "Test 2" << endl;
+	{
+		// For 100 seeds, draw all 4 elements of a Random1toN(4) and
+		// tally which value appeared at each draw position.
+		int cnts[4][4];
+		for(size_t i = 0; i < 4; i++) {
+			for(size_t j = 0; j < 4; j++) {
+				cnts[i][j] = 0;
+			}
+		}
+		RandomSource rnd;
+		Random1toN rn1n;
+		for(size_t i = 0; i < 100; i++) {
+			rnd.init((uint32_t)i);
+			rn1n.init(4, true);
+			uint32_t ri = rn1n.next(rnd);
+			cnts[ri][0]++;
+			ri = rn1n.next(rnd);
+			cnts[ri][1]++;
+			ri = rn1n.next(rnd);
+			cnts[ri][2]++;
+			ri = rn1n.next(rnd);
+			cnts[ri][3]++;
+		}
+		for(size_t i = 0; i < 4; i++) {
+			for(size_t j = 0; j < 4; j++) {
+				cerr << cnts[i][j];
+				if(j < 3) {
+					cerr << ", ";
+				}
+			}
+			cerr << endl;
+		}
+	}
+}
+
+#endif
diff --git a/random_source.h b/random_source.h
new file mode 100644
index 0000000..098d54f
--- /dev/null
+++ b/random_source.h
@@ -0,0 +1,239 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef RANDOM_GEN_H_
+#define RANDOM_GEN_H_
+
+#include <stdint.h>
+#include "assert_helpers.h"
+
+//#define MERSENNE_TWISTER
+
+#ifndef MERSENNE_TWISTER
+
+/**
+ * Simple pseudo-random linear congruential generator, a la Numerical
+ * Recipes.  nextU32() mixes two consecutive LCG steps; nextU2() and
+ * nextBool() peel individual bits off the raw state word for cheap
+ * small draws.
+ */
+class RandomSource {
+public:
+	// (sic) misspelling kept: these names are part of the public interface.
+	static const uint32_t DEFUALT_A = 1664525;
+	static const uint32_t DEFUALT_C = 1013904223;
+
+	// BUGFIX: lastOff is now initialized in every constructor (to 30, as
+	// init() does); previously only init() set it, so nextU2()/nextBool()
+	// after RandomSource(uint32_t) read an indeterminate offset.
+	RandomSource() :
+		a(DEFUALT_A), c(DEFUALT_C), lastOff(30), inited_(false) { }
+	RandomSource(uint32_t _last) :
+		a(DEFUALT_A), c(DEFUALT_C), last(_last), lastOff(30), inited_(true) { }
+	RandomSource(uint32_t _a, uint32_t _c) :
+		a(_a), c(_c), lastOff(30), inited_(false) { }
+
+	/**
+	 * Seed the generator.  lastOff is set past the last usable 2-bit
+	 * slot so the first nextU2()/nextBool() refreshes the state word.
+	 */
+	void init(uint32_t seed = 0) {
+		last = seed;
+		inited_ = true;
+		lastOff = 30;
+	}
+
+	/**
+	 * Get the next pseudo-random unsigned 32-bit integer: two LCG
+	 * steps, the first step's high half xored with the second.
+	 */
+	uint32_t nextU32() {
+		assert(inited_);
+		uint32_t ret;
+		last = a * last + c;
+		ret = last >> 16;
+		last = a * last + c;
+		ret ^= last;
+		lastOff = 0;
+		return ret;
+	}
+
+	/**
+	 * Get the next pseudo-random unsigned 64-bit integer from two
+	 * 32-bit draws.
+	 */
+	uint64_t nextU64() {
+		assert(inited_);
+		uint64_t first = nextU32();
+		first = first << 32;
+		uint64_t second = nextU32();
+		return first | second;
+	}
+
+	/**
+	 * Return a pseudo-random unsigned 32-bit integer sampled uniformly
+	 * from [lo, hi].  (Uses a modulus, so there is a small modulo bias
+	 * for ranges that do not divide 2^32.)
+	 */
+	uint32_t nextU32Range(uint32_t lo, uint32_t hi) {
+		uint32_t ret = lo;
+		if(hi > lo) {
+			// BUGFIX: when [lo, hi] spans the entire 32-bit range,
+			// hi - lo + 1 wraps to 0 and the old code computed % 0
+			// (undefined behavior).  Any 32-bit value is in range then.
+			uint32_t range = hi - lo + 1;
+			if(range == 0) {
+				return nextU32();
+			}
+			ret += (nextU32() % range);
+		}
+		return ret;
+	}
+
+	/**
+	 * Get next 2-bit unsigned integer, taken from successive 2-bit
+	 * slots of the raw state word.
+	 */
+	uint32_t nextU2() {
+		assert(inited_);
+		if(lastOff > 30) {
+			nextU32(); // refresh 'last' and rewind lastOff to 0
+		}
+		uint32_t ret = (last >> lastOff) & 3;
+		lastOff += 2;
+		return ret;
+	}
+
+	/**
+	 * Get next boolean (one bit of the raw state word).
+	 */
+	bool nextBool() {
+		assert(inited_);
+		if(lastOff > 31) {
+			nextU32(); // refresh 'last' and rewind lastOff to 0
+		}
+		uint32_t ret = (last >> lastOff) & 1;
+		lastOff++;
+		return ret;
+	}
+
+	/**
+	 * Return an unsigned int chosen by picking randomly from among
+	 * options weighted by probabilies supplied as the elements of the
+	 * 'weights' array of length 'numWeights'.  The weights should add
+	 * to 1.
+	 */
+	uint32_t nextFromProbs(
+		const float* weights,
+		size_t numWeights)
+	{
+		float f = nextFloat();
+		float tot = 0.0f; // total weight seen so far
+		for(uint32_t i = 0; i < numWeights; i++) {
+			tot += weights[i];
+			if(f < tot) return i;
+		}
+		// Fall through (rounding error / weights < 1): last option.
+		return (uint32_t)(numWeights-1);
+	}
+
+	/// Return the next pseudo-random float in [0, 1].
+	float nextFloat() {
+		assert(inited_);
+		return (float)nextU32() / (float)0xffffffff;
+	}
+
+	/// Stateless helper: advance 'last' one LCG step.
+	static uint32_t nextU32(uint32_t last,
+	                        uint32_t a = DEFUALT_A,
+	                        uint32_t c = DEFUALT_C)
+	{
+		return (a * last) + c;
+	}
+
+	uint32_t currentA() const { return a; }
+	uint32_t currentC() const { return c; }
+	uint32_t currentLast() const { return last; }
+
+private:
+	uint32_t a;       // LCG multiplier
+	uint32_t c;       // LCG increment
+	uint32_t last;    // raw LCG state word
+	uint32_t lastOff; // next bit offset within 'last' for nextU2/nextBool
+	bool inited_;     // true iff seeded
+};
+
+#else
+
+class RandomSource { // Mersenne Twister random number generator
+
+public:
+
+	// default constructor: uses default seed only if this is the first instance
+	RandomSource() {
+		reset();
+	}
+	
+	// constructor with 32 bit int as seed
+	RandomSource(uint32_t s) {
+		init(s);
+	}
+	
+	// constructor with array of size 32 bit ints as seed
+	RandomSource(const uint32_t* array, int size) {
+		init(array, size);
+	}
+	
+	// Return to the unseeded state; nextU32() asserts until init() is called.
+	void reset() {
+		state_[0] = 0;
+		p_ = 0;
+		inited_ = false;
+	}
+	
+	virtual ~RandomSource() { }
+	
+	// the two seed functions
+	void init(uint32_t); // seed with 32 bit integer
+	void init(const uint32_t*, int size); // seed with array
+
+	/**
+	 * Return next 1-bit unsigned integer.
+	 * (Note: returns true when the low bit is 0.)
+	 */
+	bool nextBool() {
+		return (nextU32() & 1) == 0;
+	}
+	
+	/**
+	 * Get next unsigned 32-bit integer (MT19937 tempering applied to
+	 * the next state word).
+	 */
+	inline uint32_t nextU32() {
+		assert(inited_);
+		if(p_ == n) {
+			gen_state(); // new state vector needed
+		}
+		// gen_state() is split off to be non-inline, because it is only called once
+		// in every 624 calls and otherwise irand() would become too big to get inlined
+		uint32_t x = state_[p_++];
+		x ^= (x >> 11);
+		x ^= (x << 7) & 0x9D2C5680UL;
+		x ^= (x << 15) & 0xEFC60000UL;
+		x ^= (x >> 18);
+		return x;
+	}
+	
+	/**
+	 * Return next float between 0 and 1.
+	 */
+	float nextFloat() {
+		assert(inited_);
+		return (float)nextU32() / (float)0xffffffff;
+	}
+	
+protected: // used by derived classes, otherwise not accessible; use the ()-operator
+
+	static const int n = 624, m = 397; // compile time constants
+
+	// per-instance generator state (the original comment claimed these
+	// were static; they are ordinary non-static members)
+	uint32_t state_[n]; // state vector array
+	int p_; // position in state array
+	
+	bool inited_; // true if init function has been called
+	
+	// private functions used to generate the pseudo random numbers
+	uint32_t twiddle(uint32_t u, uint32_t v) {
+		return (((u & 0x80000000UL) | (v & 0x7FFFFFFFUL)) >> 1) ^ ((v & 1UL) ? 0x9908B0DFUL : 0x0UL);
+	}
+	
+	void gen_state(); // generate new state
+	
+};
+
+#endif
+
+#endif /*RANDOM_GEN_H_*/
diff --git a/random_util.cpp b/random_util.cpp
new file mode 100644
index 0000000..2c5ed7d
--- /dev/null
+++ b/random_util.cpp
@@ -0,0 +1,24 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "random_util.h"
+
+// Sets smaller than this use swap-list mode from the first draw.
+const size_t Random1toN::SWAPLIST_THRESH = 128;
+// Seen-list converts to a swap-list once it holds at least
+// max(CONVERSION_THRESH, CONVERSION_FRAC * n) entries.
+const size_t Random1toN::CONVERSION_THRESH = 16;
+const float Random1toN::CONVERSION_FRAC = 0.10f;
diff --git a/random_util.h b/random_util.h
new file mode 100644
index 0000000..39f8c04
--- /dev/null
+++ b/random_util.h
@@ -0,0 +1,221 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef RANDOM_UTIL_H_
+#define RANDOM_UTIL_H_
+
+#include <algorithm>
+#include "random_source.h"
+#include "ds.h"
+
+/**
+ * Give out each integer in [0, N) exactly once, in pseudo-random
+ * order, i.e. sampling without replacement.  done() indicates when all
+ * elements have been given out.  (The original comment said "[1, N]";
+ * next() actually returns values 0..N-1.)
+ */
+class Random1toN {
+
+	typedef uint32_t T;
+
+public:
+
+	// A set with fewer than this many elements should kick us into swap-list
+	// mode immediately.  Otherwise we start in seen-list mode and then
+	// possibly proceed to swap-list mode later.
+	static const size_t SWAPLIST_THRESH;
+	
+	// Convert seen-list to swap-list after this many entries in the seen-list.
+	static const size_t CONVERSION_THRESH;
+
+	// Convert seen-list to swap-list after this (this times n_) many entries
+	// in the seen-list.
+	static const float CONVERSION_FRAC;
+
+	Random1toN(int cat = 0) :
+		sz_(0), n_(0), cur_(0),
+		list_(SWAPLIST_THRESH, cat), seen_(CONVERSION_THRESH, cat),
+		thresh_(0) {}
+	
+	Random1toN(size_t n, int cat = 0) :
+		sz_(0), n_(n), cur_(0),
+		list_(SWAPLIST_THRESH, cat), seen_(CONVERSION_THRESH, cat),
+		thresh_(0) {}
+
+	/**
+	 * Initialize the set of pseudo-randoms to be given out without replacement.
+	 * 'withoutReplacement' forces swap-list mode from the start.
+	 */
+	void init(size_t n, bool withoutReplacement) {
+		sz_ = n_ = n;
+		converted_ = false;
+		swaplist_ = n < SWAPLIST_THRESH || withoutReplacement;
+		cur_ = 0;
+		list_.clear();
+		seen_.clear();
+		thresh_ = std::max(CONVERSION_THRESH, (size_t)(CONVERSION_FRAC * n));
+	}
+	
+	/**
+	 * Reset in preparation for giving out a fresh collection of pseudo-randoms
+	 * without replacement.
+	 */
+	void reset() {
+		sz_ = n_ = cur_ = 0; swaplist_ = converted_ = false;
+		list_.clear(); seen_.clear();
+		thresh_ = 0;
+	}
+
+	/**
+	 * Get next pseudo-random element without replacement.
+	 */
+	T next(RandomSource& rnd) {
+		assert(!done());
+		if(cur_ == 0 && !converted_) {
+			// This is the first call to next()
+			if(n_ == 1) {
+				// Trivial case: set of 1
+				cur_ = 1;
+				return 0;
+			}
+			if(swaplist_) {
+				// The set is small, so we go immediately to the random
+				// swapping list
+				list_.resize(n_);
+				for(size_t i = 0; i < n_; i++) {
+					list_[i] = (T)i;
+				}
+			}
+		}
+		if(swaplist_) {
+			// Get next pseudo-random using the swap-list: Fisher-Yates
+			// style draw from the not-yet-given-out suffix
+			size_t r = cur_ + (rnd.nextU32() % (n_ - cur_));
+			if(r != cur_) {
+				std::swap(list_[cur_], list_[r]);
+			}
+			return list_[cur_++];
+		} else {
+			assert(!converted_);
+			// Get next pseudo-random but reject it if it's in the seen-list
+			bool again = true;
+			T rn = 0;
+			size_t seenSz = seen_.size();
+			while(again) {
+				rn = rnd.nextU32() % (T)n_;
+				again = false;
+				for(size_t i = 0; i < seenSz; i++) {
+					if(seen_[i] == rn) {
+						again = true;
+						break;
+					}
+				}
+			}
+			// Add it to the seen-list
+			seen_.push_back(rn);
+			cur_++;
+			assert_leq(cur_, n_);
+			// Move on to using the swap-list?
+			assert_gt(thresh_, 0);
+			if(seen_.size() >= thresh_ && cur_ < n_) {
+				// Add all elements not already in the seen list to the
+				// swap-list.  NOTE: seenSz was captured before rn was
+				// pushed, so seen_ now holds seenSz+1 sorted elements;
+				// the '<= seenSz' bound below deliberately visits every
+				// element -- it is not an off-by-one.
+				assert(!seen_.empty());
+				seen_.sort();
+				list_.resize(n_ - cur_);
+				size_t prev = 0;
+				size_t cur = 0;
+				for(size_t i = 0; i <= seenSz; i++) {
+					// Add all the elements between the previous element and
+					// this one
+					for(size_t j = prev; j < seen_[i]; j++) {
+						list_[cur++] = (T)j;
+					}
+					prev = seen_[i]+1;
+				}
+				// Elements after the largest seen element
+				for(size_t j = prev; j < n_; j++) {
+					list_[cur++] = (T)j;
+				}
+				assert_eq(cur, n_ - cur_);
+				seen_.clear();
+				cur_ = 0;
+				n_ = list_.size();
+				converted_ = true;
+				swaplist_ = true;
+			}
+			return rn;
+		}
+	}
+	
+	/**
+	 * Return true iff the generator was initialized.
+	 */
+	bool inited() const { return n_ > 0; }
+	
+	/**
+	 * Set so that there are no pseudo-randoms remaining.
+	 */
+	void setDone() { assert(inited()); cur_ = n_; }
+	
+	/**
+	 * Return true iff all pseudo-randoms have already been given out.
+	 */
+	bool done() const { return inited() && cur_ >= n_; }
+
+	/**
+	 * Return the total number of pseudo-randoms we are initialized to give
+	 * out, including ones already given out.
+	 */
+	size_t size() const { return n_; }
+	
+	/**
+	 * Return the number of pseudo-randoms left to give out.
+	 */
+	size_t left() const { return n_ - cur_; }
+
+	/**
+	 * Return the total size occupied by the Descent driver and all its
+	 * constituent parts.
+	 */
+	size_t totalSizeBytes() const {
+		return list_.totalSizeBytes() +
+		       seen_.totalSizeBytes();
+	}
+
+	/**
+	 * Return the total capacity of the Descent driver and all its constituent
+	 * parts.
+	 */
+	size_t totalCapacityBytes() const {
+		return list_.totalCapacityBytes() +
+		       seen_.totalCapacityBytes();
+	}
+
+protected:
+
+	size_t   sz_;        // domain to pick elts from
+	size_t   n_;         // number of elements in active list
+	bool     swaplist_;  // if small, use swapping
+	bool     converted_; // true iff seen-list was converted to swap-list
+	size_t   cur_;       // # times next() was called
+	EList<T> list_;      // pseudo-random swapping list
+	EList<T> seen_;      // prior to swaplist_ mode, list of
+	                     // pseudo-randoms given out
+	size_t   thresh_;    // conversion threshold for this instantiation, which
+	                     // depends both on CONVERSION_THRESH and on n_
+};
+
+#endif
diff --git a/read.h b/read.h
new file mode 100644
index 0000000..cd32248
--- /dev/null
+++ b/read.h
@@ -0,0 +1,533 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef READ_H_
+#define READ_H_
+
+#include <stdint.h>
+#include <sys/time.h>
+#include "ds.h"
+#include "sstring.h"
+#include "filebuf.h"
+#include "util.h"
+
+enum rna_strandness_format {
+    RNA_STRANDNESS_UNKNOWN = 0,
+    RNA_STRANDNESS_F,
+    RNA_STRANDNESS_R,
+    RNA_STRANDNESS_FR,
+    RNA_STRANDNESS_RF
+};
+
+typedef uint64_t TReadId;
+typedef size_t TReadOff;
+typedef int64_t TAlScore;
+
+class HitSet;
+
+/**
+ * A buffer for keeping all relevant information about a single read.
+ */
+struct Read {
+
+	Read() { reset(); }
+	
+	Read(const char *nm, const char *seq, const char *ql) { init(nm, seq, ql); }
+
+	void reset() {
+		rdid = 0;
+		endid = 0;
+		alts = 0;
+		trimmed5 = trimmed3 = 0;
+		readOrigBuf.clear();
+		patFw.clear();
+		patRc.clear();
+		qual.clear();
+		patFwRev.clear();
+		patRcRev.clear();
+		qualRev.clear();
+		name.clear();
+		for(int j = 0; j < 3; j++) {
+			altPatFw[j].clear();
+			altPatFwRev[j].clear();
+			altPatRc[j].clear();
+			altPatRcRev[j].clear();
+			altQual[j].clear();
+			altQualRev[j].clear();
+		}
+		color = fuzzy = false;
+		primer = '?';
+		trimc = '?';
+		filter = '?';
+		seed = 0;
+		ns_ = 0;
+	}
+	
+	/**
+	 * Finish initializing a new read.
+	 */
+	void finalize() {
+		for(size_t i = 0; i < patFw.length(); i++) {
+			if((int)patFw[i] > 3) {
+				ns_++;
+			}
+		}
+		constructRevComps();
+		constructReverses();
+	}
+
+	/**
+	 * Simple init function, used for testing.
+	 */
+	void init(
+		const char *nm,
+		const char *seq,
+		const char *ql)
+	{
+		reset();
+		patFw.installChars(seq);
+		qual.install(ql);
+		for(size_t i = 0; i < patFw.length(); i++) {
+			if((int)patFw[i] > 3) {
+				ns_++;
+			}
+		}
+		constructRevComps();
+		constructReverses();
+		if(nm != NULL) name.install(nm);
+	}
+
+	/// Return true iff the read (pair) is empty
+	bool empty() const {
+		return patFw.empty();
+	}
+
+	/// Return length of the read in the buffer
+	size_t length() const {
+		return patFw.length();
+	}
+	
+	/**
+	 * Return the number of Ns in the read.
+	 */
+	size_t ns() const {
+		return ns_;
+	}
+
+	/**
+	 * Construct reverse complement of the pattern and the fuzzy
+	 * alternative patters.  If read is in colorspace, just reverse
+	 * them.
+	 */
+	void constructRevComps() {
+		if(color) {
+			patRc.installReverse(patFw);
+			for(int j = 0; j < alts; j++) {
+				altPatRc[j].installReverse(altPatFw[j]);
+			}
+		} else {
+			patRc.installReverseComp(patFw);
+			for(int j = 0; j < alts; j++) {
+				altPatRc[j].installReverseComp(altPatFw[j]);
+			}
+		}
+	}
+
+	/**
+	 * Given patFw, patRc, and qual, construct the *Rev versions in
+	 * place.  Assumes constructRevComps() was called previously.
+	 */
+	void constructReverses() {
+		patFwRev.installReverse(patFw);
+		patRcRev.installReverse(patRc);
+		qualRev.installReverse(qual);
+		for(int j = 0; j < alts; j++) {
+			altPatFwRev[j].installReverse(altPatFw[j]);
+			altPatRcRev[j].installReverse(altPatRc[j]);
+			altQualRev[j].installReverse(altQual[j]);
+		}
+	}
+
+	/**
+	 * Append a "/1" or "/2" string onto the end of the name buf if
+	 * it's not already there.
+	 */
+	void fixMateName(int i) {
+		assert(i == 1 || i == 2);
+		size_t namelen = name.length();
+		bool append = false;
+		if(namelen < 2) {
+			// Name is too short to possibly have /1 or /2 on the end
+			append = true;
+		} else {
+			if(i == 1) {
+				// append = true iff mate name does not already end in /1
+				append =
+					name[namelen-2] != '/' ||
+					name[namelen-1] != '1';
+			} else {
+				// append = true iff mate name does not already end in /2
+				append =
+					name[namelen-2] != '/' ||
+					name[namelen-1] != '2';
+			}
+		}
+		if(append) {
+			name.append('/');
+			name.append("012"[i]);
+		}
+	}
+
+	/**
+	 * Dump basic information about this read to the given ostream.
+	 */
+	void dump(std::ostream& os) const {
+		using namespace std;
+		os << name << ' ';
+		if(color) {
+			os << patFw.toZBufXForm("0123.");
+		} else {
+			os << patFw;
+		}
+		os << ' ';
+		// Print out the fuzzy alternative sequences
+		for(int j = 0; j < 3; j++) {
+			bool started = false;
+			if(!altQual[j].empty()) {
+				for(size_t i = 0; i < length(); i++) {
+					if(altQual[j][i] != '!') {
+						started = true;
+					}
+					if(started) {
+						if(altQual[j][i] == '!') {
+							os << '-';
+						} else {
+							if(color) {
+								os << "0123."[(int)altPatFw[j][i]];
+							} else {
+								os << altPatFw[j][i];
+							}
+						}
+					}
+				}
+			}
+			cout << " ";
+		}
+		os << qual.toZBuf() << " ";
+		// Print out the fuzzy alternative quality strings
+		for(int j = 0; j < 3; j++) {
+			bool started = false;
+			if(!altQual[j].empty()) {
+				for(size_t i = 0; i < length(); i++) {
+					if(altQual[j][i] != '!') {
+						started = true;
+					}
+					if(started) {
+						os << altQual[j][i];
+					}
+				}
+			}
+			if(j == 2) {
+				os << endl;
+			} else {
+				os << " ";
+			}
+		}
+	}
+	
+	/**
+	 * Check whether two reads are the same in the sense that they will
+	 * lead to us finding the same set of alignments.
+	 */
+	static bool same(
+		const BTDnaString& seq1,
+		const BTString&    qual1,
+		const BTDnaString& seq2,
+		const BTString&    qual2,
+		bool qualitiesMatter)
+	{
+		if(seq1.length() != seq2.length()) {
+			return false;
+		}
+		for(size_t i = 0; i < seq1.length(); i++) {
+			if(seq1[i] != seq2[i]) return false;
+		}
+		if(qualitiesMatter) {
+			if(qual1.length() != qual2.length()) {
+				return false;
+			}
+			for(size_t i = 0; i < qual1.length(); i++) {
+				if(qual1[i] != qual2[i]) return false;
+			}
+		}
+		return true;
+	}
+
+	/**
+	 * Get the nucleotide and quality value at the given offset from 5' end.
+	 * If 'fw' is false, get the reverse complement.
+	 */
+	std::pair<int, int> get(TReadOff off5p, bool fw) const {
+		assert_lt(off5p, length());
+		int c = (int)patFw[off5p];
+        int q = qual[off5p];
+        assert_geq(q, 33);
+		return make_pair((!fw && c < 4) ? (c ^ 3) : c, q - 33);
+	}
+	
+	/**
+	 * Get the nucleotide at the given offset from 5' end.
+	 * If 'fw' is false, get the reverse complement.
+	 */
+	int getc(TReadOff off5p, bool fw) const {
+		assert_lt(off5p, length());
+		int c = (int)patFw[off5p];
+		return (!fw && c < 4) ? (c ^ 3) : c;
+	}
+	
+	/**
+	 * Get the quality value at the given offset from 5' end.
+	 */
+	int getq(TReadOff off5p) const {
+		assert_lt(off5p, length());
+        int q = qual[off5p];
+        assert_geq(q, 33);
+		return q-33;
+	}
+
+#ifndef NDEBUG
+	/**
+	 * Check that read info is internally consistent.
+	 */
+	bool repOk() const {
+		if(patFw.empty()) return true;
+		assert_eq(qual.length(), patFw.length());
+		return true;
+	}
+#endif
+
+	BTDnaString patFw;            // forward-strand sequence
+	BTDnaString patRc;            // reverse-complement sequence
+	BTString    qual;             // quality values
+
+	BTDnaString altPatFw[3];
+	BTDnaString altPatRc[3];
+	BTString    altQual[3];
+
+	BTDnaString patFwRev;
+	BTDnaString patRcRev;
+	BTString    qualRev;
+
+	BTDnaString altPatFwRev[3];
+	BTDnaString altPatRcRev[3];
+	BTString    altQualRev[3];
+
+	// For remembering the exact input text used to define a read
+	SStringExpandable<char> readOrigBuf;
+
+	BTString name;      // read name
+	TReadId  rdid;      // 0-based id based on pair's offset in read file(s)
+	TReadId  endid;     // 0-based id based on pair's offset in read file(s)
+	                    // and which mate ("end") this is
+	int      mate;      // 0 = single-end, 1 = mate1, 2 = mate2
+	uint32_t seed;      // random seed
+	size_t   ns_;       // # Ns
+	int      alts;      // number of alternatives
+	bool     fuzzy;     // whether to employ fuzziness
+	bool     color;     // whether read is in color space
+	char     primer;    // primer base, for csfasta files
+	char     trimc;     // trimmed color, for csfasta files
+	char     filter;    // if read format permits filter char, set it here
+	int      trimmed5;  // amount actually trimmed off 5' end
+	int      trimmed3;  // amount actually trimmed off 3' end
+	HitSet  *hitset;    // holds previously-found hits; for chaining
+};
+
/**
 * A string of FmStringOps represent a string of tasks performed by the
 * best-first alignment search.  We model the search as a series of FM ops
 * interspersed with reported alignments.
 */
struct FmStringOp {
	bool alignment;  // true -> this op reported an alignment
	TAlScore pen;    // penalty of the FM op or alignment
	size_t n;        // number of FM ops (only relevant for non-alignment)
};
+
/**
 * A string that summarizes the progress of an FM-index-assisted best-first
 * search.  Useful for trying to figure out what the aligner is spending its
 * time doing for a given read.
 */
struct FmString {

	/**
	 * Add one or more FM index ops to the op string.  Ops with the same
	 * penalty as the most recent entry are coalesced into it.
	 *
	 * NOTE(review): the 'nops' parameter is unused — each call records
	 * exactly one op regardless of its value.  Confirm whether callers
	 * expect 'n' to grow by 'nops' instead.
	 */
	void add(bool alignment, TAlScore pen, size_t nops) {
		if(ops.empty() || ops.back().pen != pen) {
			ops.expand();
			ops.back().alignment = alignment;
			ops.back().pen = pen;
			ops.back().n = 0;
		}
		ops.back().n++;
	}
	
	/**
	 * Reset FmString to uninitialized state.
	 */
	void reset() {
		pen = std::numeric_limits<TAlScore>::max();
		ops.clear();
	}

	/**
	 * Print a :Z optional field where certain characters (whitespace, colon
	 * and percent) are escaped using % escapes.
	 *
	 * NOTE(review): no escaping is actually performed below; the op
	 * string is emitted verbatim as ';'-separated "A,<pen>" and
	 * "F,<pen>,<n>" records ('buf' is caller-supplied scratch space).
	 */
	void print(BTString& o, char *buf) const {
		for(size_t i = 0; i < ops.size(); i++) {
			if(i > 0) {
				o.append(';');
			}
			if(ops[i].alignment) {
				o.append("A,");
				itoa10(ops[i].pen, buf);
				o.append(buf);
			} else {
				o.append("F,");
				itoa10(ops[i].pen, buf); o.append(buf);
				o.append(',');
				itoa10(ops[i].n, buf); o.append(buf);
			}
		}
	}

	TAlScore pen;          // current penalty
	EList<FmStringOp> ops; // op string
};
+
/**
 * Key per-read metrics.  These are used for thresholds, allowing us to bail
 * for unproductive reads.  They are also the basis of what's printed when
 * the user specifies --read-times.
 */
struct PerReadMetrics {

	PerReadMetrics() { reset(); }

	// Zero all counters.  tv_beg/tz_beg are not touched here; they are
	// set separately when timing begins.
	void reset() {
		nExIters =
		nExDps   = nExDpSuccs   = nExDpFails   =
		nMateDps = nMateDpSuccs = nMateDpFails =
		nExUgs   = nExUgSuccs   = nExUgFails   =
		nMateUgs = nMateUgSuccs = nMateUgFails =
		nExEes   = nExEeSuccs   = nExEeFails   =
		nRedundants =
		nEeFmops = nSdFmops = nExFmops =
		nDpFail = nDpFailStreak = nDpLastSucc =
		nUgFail = nUgFailStreak = nUgLastSucc =
		nEeFail = nEeFailStreak = nEeLastSucc =
		nFilt = 0;
		nFtabs = 0;
		nRedSkip = 0;
		nRedFail = 0;
		nRedIns = 0;
		doFmString = false;
		nSeedRanges = nSeedElts = 0;
		nSeedRangesFw = nSeedEltsFw = 0;
		nSeedRangesRc = nSeedEltsRc = 0;
		seedMedian = seedMean = 0;
		bestLtMinscMate1 =
		bestLtMinscMate2 = std::numeric_limits<TAlScore>::min();
		fmString.reset();
	}

	struct timeval  tv_beg; // timer start to measure how long alignment takes
	struct timezone tz_beg; // timer start to measure how long alignment takes

	uint64_t nExIters;      // iterations of seed hit extend loop

	uint64_t nExDps;        // # extend DPs run on this read
	uint64_t nExDpSuccs;    // # extend DPs on this read that succeeded
	uint64_t nExDpFails;    // # extend DPs on this read that failed
	
	uint64_t nExUgs;        // # extend ungapped alignments run on this read
	uint64_t nExUgSuccs;    // # extend ungapped alignments that succeeded
	uint64_t nExUgFails;    // # extend ungapped alignments that failed

	uint64_t nExEes;        // # end-to-end (Ee) alignments run on this read
	uint64_t nExEeSuccs;    // # end-to-end alignments that succeeded
	uint64_t nExEeFails;    // # end-to-end alignments that failed

	uint64_t nMateDps;      // # mate DPs run on this read
	uint64_t nMateDpSuccs;  // # mate DPs on this read that succeeded
	uint64_t nMateDpFails;  // # mate DPs on this read that failed
	
	uint64_t nMateUgs;      // # mate ungapped alignments run on this read
	uint64_t nMateUgSuccs;  // # mate ungapped alignments that succeeded
	uint64_t nMateUgFails;  // # mate ungapped alignments that failed

	uint64_t nRedundants;   // # redundant seed hits
	
	uint64_t nSeedRanges;   // # BW ranges found for seeds
	uint64_t nSeedElts;     // # BW elements found for seeds

	uint64_t nSeedRangesFw; // # BW ranges found for seeds from fw read
	uint64_t nSeedEltsFw;   // # BW elements found for seeds from fw read

	uint64_t nSeedRangesRc; // # BW ranges found for seeds from rc read
	uint64_t nSeedEltsRc;   // # BW elements found for seeds from rc read
	
	uint64_t seedMedian;    // median seed hit count
	uint64_t seedMean;      // rounded mean seed hit count
	
	uint64_t nEeFmops;      // FM Index ops for end-to-end alignment
	uint64_t nSdFmops;      // FM Index ops used to align seeds
	uint64_t nExFmops;      // FM Index ops used to resolve offsets
	
	uint64_t nFtabs;        // # ftab lookups
	uint64_t nRedSkip;      // # times redundant path was detected and aborted
	uint64_t nRedFail;      // # times a path was deemed non-redundant
	uint64_t nRedIns;       // # times a path was added to redundancy list
	
	uint64_t nDpFail;       // number of dp failures in a row up until now
	uint64_t nDpFailStreak; // longest streak of dp failures
	uint64_t nDpLastSucc;   // index of last dp attempt that succeeded
	
	uint64_t nUgFail;       // number of ungap failures in a row up until now
	uint64_t nUgFailStreak; // longest streak of ungap failures
	uint64_t nUgLastSucc;   // index of last ungap attempt that succeeded

	uint64_t nEeFail;       // number of end-to-end failures in a row up until now
	uint64_t nEeFailStreak; // longest streak of end-to-end failures
	uint64_t nEeLastSucc;   // index of last end-to-end attempt that succeeded
	
	uint64_t nFilt;         // # mates filtered
	
	TAlScore bestLtMinscMate1; // best invalid score observed for mate 1
	TAlScore bestLtMinscMate2; // best invalid score observed for mate 2
	
	// For collecting information to go into an FM string
	bool doFmString;
	FmString fmString;
};
+
+#endif /*READ_H_*/
diff --git a/read_qseq.cpp b/read_qseq.cpp
new file mode 100644
index 0000000..ced6c68
--- /dev/null
+++ b/read_qseq.cpp
@@ -0,0 +1,304 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pat.h"
+
+/**
+ * Parse a name from fb_ and store in r.  Assume that the next
+ * character obtained via fb_.get() is the first character of
+ * the sequence and the string stops at the next char upto (could
+ * be tab, newline, etc.).
+ */
+int QseqPatternSource::parseName(
+	Read& r,      // buffer for mate 1
+	Read* r2,     // buffer for mate 2 (NULL if mate2 is read separately)
+	bool append,     // true -> append characters, false -> skip them
+	bool clearFirst, // clear the name buffer first
+	bool warnEmpty,  // emit a warning if nothing was added to the name
+	bool useDefault, // if nothing is read, put readCnt_ as a default value
+	int upto)        // stop parsing when we first reach character 'upto'
+{
+	if(clearFirst) {
+		if(r2 != NULL) r2->name.clear();
+		r.name.clear();
+	}
+	while(true) {
+		int c;
+		if((c = fb_.get()) < 0) {
+			// EOF reached in the middle of the name
+			return -1;
+		}
+		if(c == '\n' || c == '\r') {
+			// EOL reached in the middle of the name
+			return -1;
+		}
+		if(c == upto) {
+			// Finished with field
+			break;
+		}
+		if(append) {
+			if(r2 != NULL) r2->name.append(c);
+			r.name.append(c);
+		}
+	}
+	// Set up a default name if one hasn't been set
+	if(r.name.empty() && useDefault && append) {
+		char cbuf[20];
+		itoa10(readCnt_, cbuf);
+		r.name.append(cbuf);
+		if(r2 != NULL) r2->name.append(cbuf);
+	}
+	if(r.name.empty() && warnEmpty) {
+		cerr << "Warning: read had an empty name field" << endl;
+	}
+	return (int)r.name.length();
+}
+
/**
 * Parse a single sequence from fb_ and store in r.  Assume
 * that the next character obtained via fb_.get() is the first
 * character of the sequence and the sequence stops at the next
 * char upto (could be tab, newline, etc.).
 *
 * Returns the final (3'-trimmed) sequence length, or -1 if EOF is
 * hit mid-sequence.  'charsRead' is incremented once per sequence
 * character consumed; 'trim5' may be increased for colorspace primers.
 */
int QseqPatternSource::parseSeq(
	Read& r,
	int& charsRead,
	int& trim5,
	char upto)
{
	int begin = 0;       // # sequence chars seen so far (pre-trim)
	int c = fb_.get();
	assert(c != upto);
	r.patFw.clear();
	r.color = gColor;
	if(gColor) {
		// NOTE: clearly this is not relevant for Illumina output, but
		// I'm keeping it here in case there's some reason to put SOLiD
		// data in this format in the future.
	
		// This may be a primer character.  If so, keep it in the
		// 'primer' field of the read buf and parse the rest of the
		// read without it.
		c = toupper(c);
		if(asc2dnacat[c] > 0) {
			// First char is a DNA char
			int c2 = toupper(fb_.peek());
			// Second char is a color char
			if(asc2colcat[c2] > 0) {
				r.primer = c;
				r.trimc = c2;
				trim5 += 2; // trim primer and first color
			}
		}
		if(c < 0) { return -1; }
	}
	while(c != upto) {
		if(c == '.') c = 'N'; // '.' is an alternate spelling of N
		if(gColor) {
			// Map color digits 0-4 onto nucleotide characters
			if(c >= '0' && c <= '4') c = "ACGTN"[(int)c - '0'];
		}
		if(isalpha(c)) {
			assert_in(toupper(c), "ACGTN");
			if(begin++ >= trim5) {
				// Past the 5'-trim point; keep this character
				assert_neq(0, asc2dnacat[c]);
				r.patFw.append(asc2dna[c]);
			}
			charsRead++;
		}
		if((c = fb_.get()) < 0) {
			return -1; // EOF in the middle of the sequence
		}
	}
	r.patFw.trimEnd(gTrim3); // apply global 3' trim
	return (int)r.patFw.length();
}
+
/**
 * Parse a single quality string from fb_ and store in r.
 * Assume that the next character obtained via fb_.get() is
 * the first character of the quality string and the string stops
 * at the next char upto (could be tab, newline, etc.).
 *
 * Returns the number of quality characters parsed, or -1 if EOF is
 * hit mid-record.  On return, 'c2' holds the last character consumed.
 *
 * NOTE(review): default arguments on an out-of-class member-function
 * definition are only well-formed if the declaration in pat.h omits
 * them — confirm against the header.
 */
int QseqPatternSource::parseQuals(
	Read& r,
	int charsRead,  // # sequence chars parsed for this read
	int dstLen,     // final (trimmed) sequence length
	int trim5,      // # leading quality chars to skip
	char& c2,       // out: last character consumed
	char upto = '\t',
	char upto2 = -1)
{
	int qualsRead = 0;
	int c = 0;
	if (intQuals_) {
		// Integer (space-separated) qualities.
		// Probably not relevant
		char buf[4096];
		while (qualsRead < charsRead) {
			qualToks_.clear();
			if(!tokenizeQualLine(fb_, buf, 4096, qualToks_)) break;
			for (unsigned int j = 0; j < qualToks_.size(); ++j) {
				char c = intToPhred33(atoi(qualToks_[j].c_str()), solQuals_);
				assert_geq(c, 33);
				if (qualsRead >= trim5) {
					r.qual.append(c);
				}
				++qualsRead;
			}
		} // done reading integer quality lines
		if (charsRead > qualsRead) tooFewQualities(r.name);
	} else {
		// Non-integer (ASCII-encoded) qualities
		while((qualsRead < dstLen + trim5) && c >= 0) {
			c = fb_.get();
			c2 = c;
			if (c == ' ') wrongQualityFormat(r.name);
			if(c < 0) {
				// EOF occurred in the middle of a read - abort
				return -1;
			}
			if(!isspace(c) && c != upto && (upto2 == -1 || c != upto2)) {
				if (qualsRead >= trim5) {
					// Convert to Phred+33; handles Solexa/Phred+64 scales
					c = charToPhred33(c, solQuals_, phred64Quals_);
					assert_geq(c, 33);
					r.qual.append(c);
				}
				qualsRead++;
			} else {
				break;
			}
		}
	}
	if(r.qual.length() < (size_t)dstLen) {
		tooFewQualities(r.name);
	}
	// TODO: How to detect too many qualities??
	r.qual.resize(dstLen);
	// Skip ahead to the field delimiter (or EOF/EOL)
	while(c != -1 && c != upto && (upto2 == -1 || c != upto2)) {
		c = fb_.get();
		c2 = c;
	}
	return qualsRead;
}
+
/**
 * Read another pattern from a Qseq input file.
 *
 * A QSEQ record is one line of 11 tab-separated fields: machine name,
 * run number, lane, tile, X, Y, index, mate number, sequence,
 * qualities, and a filter flag.  Fields 1-7 are joined with '_' (and
 * field 8 with '/') to form the read name.
 */
bool QseqPatternSource::read(
	Read& r,
	TReadId& rdid,
	TReadId& endid,
	bool& success,
	bool& done)
{
	r.reset();
	r.color = gColor;
	success = true;
	done = false;
	readCnt_++;
	rdid = endid = readCnt_-1;
	peekOverNewline(fb_);
	fb_.resetLastN(); // start recording the record's original text
	// 1. Machine name
	if(parseName(r, NULL, true, true,  true, false, '\t') == -1) BAIL_UNPAIRED();
	assert_neq('\t', fb_.peek());
	r.name.append('_');
	// 2. Run number
	if(parseName(r, NULL, true, false, true, false, '\t') == -1) BAIL_UNPAIRED();
	assert_neq('\t', fb_.peek());
	r.name.append('_');
	// 3. Lane number
	if(parseName(r, NULL, true, false, true, false, '\t') == -1) BAIL_UNPAIRED();
	assert_neq('\t', fb_.peek());
	r.name.append('_');
	// 4. Tile number
	if(parseName(r, NULL, true, false, true, false, '\t') == -1) BAIL_UNPAIRED();
	assert_neq('\t', fb_.peek());
	r.name.append('_');
	// 5. X coordinate of spot
	if(parseName(r, NULL, true, false, true, false, '\t') == -1) BAIL_UNPAIRED();
	assert_neq('\t', fb_.peek());
	r.name.append('_');
	// 6. Y coordinate of spot
	if(parseName(r, NULL, true, false, true, false, '\t') == -1) BAIL_UNPAIRED();
	assert_neq('\t', fb_.peek());
	r.name.append('_');
	// 7. Index
	if(parseName(r, NULL, true, false, true, false, '\t') == -1) BAIL_UNPAIRED();
	assert_neq('\t', fb_.peek());
	r.name.append('/');
	// 8. Mate number
	if(parseName(r, NULL, true, false, true, false, '\t') == -1) BAIL_UNPAIRED();
	// Empty sequence??
	if(fb_.peek() == '\t') {
		// Get tab that separates seq from qual
		ASSERT_ONLY(int c =) fb_.get();
		assert_eq('\t', c);
		assert_eq('\t', fb_.peek());
		// Get tab that separates qual from filter
		ASSERT_ONLY(c =) fb_.get();
		assert_eq('\t', c);
		// Next char is first char of filter flag
		assert_neq('\t', fb_.peek());
		fb_.resetLastN();
		cerr << "Warning: skipping empty QSEQ read with name '" << r.name << "'" << endl;
	} else {
		assert_neq('\t', fb_.peek());
		int charsRead = 0;
		int mytrim5 = gTrim5;
		// 9. Sequence
		int dstLen = parseSeq(r, charsRead, mytrim5, '\t');
		assert_neq('\t', fb_.peek());
		if(dstLen < 0) BAIL_UNPAIRED();
		char ct = 0;
		// 10. Qualities
		if(parseQuals(r, charsRead, dstLen, mytrim5, ct, '\t', -1) < 0) BAIL_UNPAIRED();
		r.trimmed3 = gTrim3;
		r.trimmed5 = mytrim5;
		if(ct != '\t') {
			cerr << "Error: QSEQ with name " << r.name << " did not have tab after qualities" << endl;
			throw 1;
		}
		assert_eq(ct, '\t');
	}
	// 11. Filter flag
	int filt = fb_.get();
	if(filt == -1) BAIL_UNPAIRED();
	r.filter = filt;
	if(filt != '0' && filt != '1') {
		// Bad value for filt; tolerated silently
	}
	if(fb_.peek() != -1 && fb_.peek() != '\n') {
		// Bad value right after the filt field; tolerated silently
	}
	fb_.get(); // consume the trailing newline
	r.readOrigBuf.install(fb_.lastN(), fb_.lastNLen()); // keep raw record text
	fb_.resetLastN();
	if(r.qual.length() < r.patFw.length()) {
		tooFewQualities(r.name);
	} else if(r.qual.length() > r.patFw.length()) {
		tooManyQualities(r.name);
	}
#ifndef NDEBUG
	assert_eq(r.patFw.length(), r.qual.length());
	for(size_t i = 0; i < r.qual.length(); i++) {
		assert_geq((int)r.qual[i], 33);
	}
#endif
	return true;
}
diff --git a/ref_coord.cpp b/ref_coord.cpp
new file mode 100644
index 0000000..738c6fa
--- /dev/null
+++ b/ref_coord.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "ref_coord.h"
+#include <iostream>
+
+using namespace std;
+
+ostream& operator<<(ostream& out, const Interval& c) {
+	out << c.upstream() << "+" << c.len();
+	return out;
+}
+
+ostream& operator<<(ostream& out, const Coord& c) {
+	out << c.ref() << ":" << c.off();
+	return out;
+}
diff --git a/ref_coord.h b/ref_coord.h
new file mode 100644
index 0000000..1f821e8
--- /dev/null
+++ b/ref_coord.h
@@ -0,0 +1,424 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef REF_COORD_H_
+#define REF_COORD_H_
+
+#include <stdint.h>
+#include <iostream>
+#include <limits>
+#include "assert_helpers.h"
+
typedef int64_t TRefId;
typedef int64_t TRefOff;

/**
 * A reference coordinate: a reference sequence id, a 0-based offset
 * into that sequence, and an orientation (Watson/forward vs. Crick).
 */
class Coord {

public:

	Coord() { reset(); }

	Coord(const Coord& c) { init(c); }
	
	Coord(TRefId rf, TRefOff of, bool fw) { init(rf, of, fw); }

	/**
	 * Set all three fields from the given values.
	 */
	void init(TRefId rf, TRefOff of, bool fw) {
		ref_ = rf;
		off_ = of;
		orient_ = fw ? 1 : 0;
	}

	/**
	 * Copy all fields from another Coord.
	 */
	void init(const Coord& c) {
		ref_ = c.ref_;
		off_ = c.off_;
		orient_ = c.orient_;
	}
	
	/**
	 * Equality: same reference, same offset, same strand.  Both
	 * operands must be initialized.
	 */
	bool operator==(const Coord& o) const {
		assert(inited());
		assert(o.inited());
		return ref_ == o.ref_ && off_ == o.off_ && fw() == o.fw();
	}

	/**
	 * Strict weak ordering: by reference id first, then orientation,
	 * then offset.
	 */
	bool operator<(const Coord& o) const {
		if(ref_ != o.ref_) {
			return ref_ < o.ref_;
		}
		if(orient_ != o.orient_) {
			return orient_ < o.orient_;
		}
		return off_ < o.off_;
	}
	
	/// Negation of operator<.
	bool operator>=(const Coord& o) const {
		return !((*this) < o);
	}
	
	/**
	 * Mirror of operator<: by reference id first, then orientation,
	 * then offset, all compared with '>'.
	 */
	bool operator>(const Coord& o) const {
		if(ref_ != o.ref_) {
			return ref_ > o.ref_;
		}
		if(orient_ != o.orient_) {
			return orient_ > o.orient_;
		}
		return off_ > o.off_;
	}
	
	/// Negation of operator>.
	bool operator<=(const Coord& o) const {
		return !((*this) > o);
	}
	
	/**
	 * Put the Coord back in its uninitialized state: sentinel max
	 * values for ref/off, -1 for orientation.
	 */
	void reset() {
		ref_ = std::numeric_limits<TRefId>::max();
		off_ = std::numeric_limits<TRefOff>::max();
		orient_ = -1;
	}
	
	/**
	 * Return true iff both ref and off have been set since the last
	 * call to reset().
	 */
	bool inited() const {
		const bool ini =
			ref_ != std::numeric_limits<TRefId>::max() &&
			off_ != std::numeric_limits<TRefOff>::max();
		if(ini) {
			assert(orient_ == 0 || orient_ == 1);
		}
		return ini;
	}
	
	/**
	 * Return true iff oriented along the forward (Watson) strand.
	 */
	bool fw() const {
		assert(inited());
		assert(orient_ == 0 || orient_ == 1);
		return orient_ == 1;
	}
	
#ifndef NDEBUG
	/**
	 * Check that coord is internally consistent: an initialized coord
	 * must carry a valid orientation.
	 */
	bool repOk() const {
		if(ref_ != std::numeric_limits<TRefId>::max() &&
		   off_ != std::numeric_limits<TRefOff>::max())
		{
			assert(orient_ == 0 || orient_ == 1);
		}
		return true;
	}
#endif
	
	/**
	 * Return true iff the interval [off_, off_ + len) lies entirely
	 * within [inbegin, inend).
	 */
	bool within(int64_t len, int64_t inbegin, int64_t inend) const {
		return off_ >= inbegin && off_ + len <= inend;
	}
	
	inline TRefId  ref()    const { return ref_; }
	inline TRefOff off()    const { return off_; }
	inline int     orient() const { return orient_; }
	
	inline void setRef(TRefId  id)  { ref_ = id;  }
	inline void setOff(TRefOff off) { off_ = off; }

	inline void adjustOff(TRefOff off) { off_ += off; }

protected:

	TRefId  ref_;    // reference sequence id
	TRefOff off_;    // 0-based offset into the reference
	int     orient_; // 1 = Watson/fw, 0 = Crick/rc, -1 = unset
};
+
+std::ostream& operator<<(std::ostream& out, const Coord& c);
+
/**
 * Encapsulates a reference interval, which consists of an upstream Coord
 * and a length.  The interval covers the half-open offset range
 * [off(), off() + len()).
 */
class Interval {

public:
	
	Interval() { reset(); }
	
	// 'explicit' prevents accidental implicit conversion from a Coord
	explicit Interval(const Coord& upstream, TRefOff len) {
		init(upstream, len);
	}

	explicit Interval(TRefId rf, TRefOff of, bool fw, TRefOff len) {
		init(rf, of, fw, len);
	}

	void init(const Coord& upstream, TRefOff len) {
		upstream_ = upstream;
		len_ = len;
	}
	
	void init(TRefId rf, TRefOff of, bool fw, TRefOff len) {
		upstream_.init(rf, of, fw);
		len_ = len;
	}
	
	/**
	 * Set offset.
	 */
	void setOff(TRefOff of) {
		upstream_.setOff(of);
	}

	/**
	 * Set length.
	 */
	void setLen(TRefOff len) {
		len_ = len;
	}

	/**
	 * Reset this interval to uninitialized state.
	 */
	void reset() {
		upstream_.reset();
		len_ = 0;
	}
	
	/**
	 * Return true iff this Interval is initialized.  An initialized
	 * interval must have positive length (asserted).
	 */
	bool inited() const {
		if(upstream_.inited()) {
			assert_gt(len_, 0);
			return true;
		} else {
			return false;
		}
	}
	
	/**
	 * Return true iff this Interval is equal to the given Interval,
	 * i.e. if they cover the same set of positions.
	 */
	bool operator==(const Interval& o) const {
		return upstream_ == o.upstream_ &&
		       len_ == o.len_;
	}

	/**
	 * Return true iff this Interval is less than the given Interval.
	 * One interval is less than another if its upstream location is
	 * prior to the other's or, if their upstream locations are equal,
	 * if its length is less than the other's.
	 */
	bool operator<(const Interval& o) const {
		if(upstream_ < o.upstream_) return true;
		if(upstream_ > o.upstream_) return false;
		if(len_ < o.len_) return true;
		return false;
	}
	
	/**
	 * Return opposite result from operator<.
	 */
	bool operator>=(const Interval& o) const {
		return !((*this) < o);
	}

	/**
	 * Return true iff this Interval is greater than the given
	 * Interval.  One interval is greater than another if its upstream
	 * location is after the other's or, if their upstream locations
	 * are equal, if its length is greater than the other's.
	 */
	bool operator>(const Interval& o) const {
		if(upstream_ > o.upstream_) return true;
		if(upstream_ < o.upstream_) return false;
		if(len_ > o.len_) return true;
		return false;
	}

	/**
	 * Return opposite result from operator>.
	 */
	bool operator<=(const Interval& o) const {
		return !((*this) > o);
	}
	
	/**
	 * Set upstream Coord.
	 */
	void setUpstream(const Coord& c) {
		upstream_ = c;
	}

	/**
	 * Set length.
	 */
	void setLength(TRefOff l) {
		len_ = l;
	}
	
	inline TRefId  ref()    const { return upstream_.ref(); }
	inline TRefOff off()    const { return upstream_.off(); }
	inline TRefOff dnoff()  const { return upstream_.off() + len_; }
	inline int     orient() const { return upstream_.orient(); }

	/**
	 * Return a Coord encoding the coordinate just past the downstream edge of
	 * the interval.
	 */
	inline Coord downstream() const {
		return Coord(
			upstream_.ref(),
			upstream_.off() + len_,
			upstream_.orient());
	}
	
	/**
	 * Return true iff the given Coord is inside this Interval: same
	 * reference, same orientation, offset in [off(), dnoff()).
	 */
	inline bool contains(const Coord& c) const {
		return
			c.ref()    == ref() &&
			c.orient() == orient() &&
			c.off()    >= off() &&
			c.off()    <  dnoff();
	}

	/**
	 * Return true iff the given Coord is inside this Interval, without
	 * requiring orientations to match.
	 */
	inline bool containsIgnoreOrient(const Coord& c) const {
		return
			c.ref()    == ref() &&
			c.off()    >= off() &&
			c.off()    <  dnoff();
	}

	/**
	 * Return true iff the given Interval is inside this Interval.
	 */
	inline bool contains(const Interval& c) const {
		return
			c.ref()    == ref() &&
			c.orient() == orient() &&
			c.off()    >= off() &&
			c.dnoff()  <= dnoff();
	}

	/**
	 * Return true iff the given Interval is inside this Interval, without
	 * requiring orientations to match.
	 */
	inline bool containsIgnoreOrient(const Interval& c) const {
		return
			c.ref()    == ref() &&
			c.off()    >= off() &&
			c.dnoff()  <= dnoff();
	}

	/**
	 * Return true iff the given Interval overlaps this Interval.  The
	 * four clauses test whether either interval's start or end falls
	 * within the other.
	 */
	inline bool overlaps(const Interval& c) const {
		return
			c.ref()    == upstream_.ref() &&
			c.orient() == upstream_.orient() &&
			((off() <= c.off()   && dnoff() > c.off())   ||
			 (off() <= c.dnoff() && dnoff() > c.dnoff()) ||
			 (c.off() <= off()   && c.dnoff() > off())   ||
			 (c.off() <= dnoff() && c.dnoff() > dnoff()));
	}

	/**
	 * Return true iff the given Interval overlaps this Interval, without
	 * requiring orientations to match.
	 */
	inline bool overlapsIgnoreOrient(const Interval& c) const {
		return
			c.ref()    == upstream_.ref() &&
			((off() <= c.off()   && dnoff() > c.off())   ||
			 (off() <= c.dnoff() && dnoff() > c.dnoff()) ||
			 (c.off() <= off()   && c.dnoff() > off())   ||
			 (c.off() <= dnoff() && c.dnoff() > dnoff()));
	}
	
	inline const Coord&  upstream()   const { return upstream_; }
	inline TRefOff       len()      const { return len_;      }

#ifndef NDEBUG
	/**
	 * Check that the Interval is internally consistent.
	 */
	bool repOk() const {
		assert(upstream_.repOk());
		assert_geq(len_, 0);
		return true;
	}
#endif

	inline void adjustOff(TRefOff off) { upstream_.adjustOff(off); }

protected:

	Coord   upstream_;  // upstream (leftmost) coordinate
	TRefOff len_;       // length of the interval
};
+
+std::ostream& operator<<(std::ostream& out, const Interval& c);
+
+#endif /*ndef REF_COORD_H_*/
diff --git a/ref_read.cpp b/ref_read.cpp
new file mode 100644
index 0000000..40dd454
--- /dev/null
+++ b/ref_read.cpp
@@ -0,0 +1,327 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "ref_read.h"
+
+/**
+ * Reads past the next ambiguous or unambiguous stretch of sequence
+ * from the given FASTA file and returns its length.  Does not do
+ * anything with the sequence characters themselves; this is purely for
+ * measuring lengths.
+ */
+RefRecord fastaRefReadSize(
+	FileBuf& in,                 // input FASTA stream
+	const RefReadInParams& rparms, // parsing parameters (color, nsToAs, ...)
+	bool first,                  // true iff this is the first call for 'in'
+	BitpairOutFileBuf* bpout)    // optional sink for bitpair-encoded bases
+{
+	int c;
+	// NOTE: function-local static parser state persists across calls;
+	// this function is not reentrant and not thread-safe.
+	static int lastc = '>'; // last character seen
+
+	// RefRecord params
+	TIndexOffU len = 0; // 'len' counts toward total length
+	// 'off' counts number of ambiguous characters before first
+	// unambiguous character
+	size_t off = 0;
+
+	// Pick off the first carat and any preceding whitespace
+	if(first) {
+		assert(!in.eof());
+		lastc = '>';
+		c = in.getPastWhitespace();
+		if(in.eof()) {
+			// Got eof right away; emit warning
+			cerr << "Warning: Empty input file" << endl;
+			lastc = -1;
+			return RefRecord(0, 0, true);
+		}
+		assert(c == '>');
+	}
+
+	// 'first' is repurposed below: it becomes the RefRecord's 'first'
+	// flag, i.e. whether this record starts a new reference sequence.
+	first = true;
+	// Skip to the end of the id line; if the next line is either
+	// another id line or a comment line, keep skipping
+	if(lastc == '>') {
+		// Skip to the end of the name line
+		do {
+			if((c = in.getPastNewline()) == -1) {
+				// No more input
+				cerr << "Warning: Encountered empty reference sequence" << endl;
+				lastc = -1;
+				return RefRecord(0, 0, true);
+			}
+			if(c == '>') {
+				cerr << "Warning: Encountered empty reference sequence" << endl;
+			}
+			// continue until a non-name, non-comment line
+		} while (c == '>');
+	} else {
+		first = false; // not the first in a sequence
+		off = 1; // The gap has already been consumed, so count it
+		if((c = in.get()) == -1) {
+			// Don't emit a warning, since this might legitimately be
+			// a gap on the end of the final sequence in the file
+			lastc = -1;
+			return RefRecord((TIndexOffU)off, (TIndexOffU)len, first);
+		}
+	}
+
+	// Now skip to the first DNA character, counting gap characters
+	// as we go
+	int lc = -1; // last-DNA char variable for color conversion
+	while(true) {
+		int cat = asc2dnacat[c];
+		if(rparms.nsToAs && cat >= 2) c = 'A';
+		if(cat == 1) {
+			// This is a DNA character
+			if(rparms.color) {
+				if(lc != -1) {
+					// Got two consecutive unambiguous DNAs
+					break; // to read-in loop
+				}
+				// Keep going; we need two consecutive unambiguous DNAs
+				lc = asc2dna[(int)c];
+				// The 'if(off > 0)' takes care of the case where
+				// the reference is entirely unambiguous and we don't
+				// want to incorrectly increment off.
+				if(off > 0) off++;
+			} else {
+				break; // to read-in loop
+			}
+		} else if(cat >= 2) {
+			if(lc != -1 && off == 0) off++;
+			lc = -1;
+			off++; // skip over gap character and increment
+		} else if(c == '>') {
+			// Hit the next name line before seeing any DNA: this record
+			// is gap-only or empty.
+			if(off > 0 && lastc == '>') {
+				cerr << "Warning: Encountered reference sequence with only gaps" << endl;
+			} else if(lastc == '>') {
+				cerr << "Warning: Encountered empty reference sequence" << endl;
+			}
+			lastc = '>';
+			//return RefRecord(off, 0, false);
+			return RefRecord((TIndexOffU)off, 0, first);
+		}
+		c = in.get();
+		if(c == -1) {
+			// End-of-file
+			if(off > 0 && lastc == '>') {
+				cerr << "Warning: Encountered reference sequence with only gaps" << endl;
+			} else if(lastc == '>') {
+				cerr << "Warning: Encountered empty reference sequence" << endl;
+			}
+			lastc = -1;
+			//return RefRecord(off, 0, false);
+			return RefRecord((TIndexOffU)off, 0, first);
+		}
+	}
+	assert(!rparms.color || (lc != -1));
+	assert_eq(1, asc2dnacat[c]); // C must be unambiguous base
+	if(off > 0 && rparms.color && first) {
+		// Handle the case where the first record has ambiguous
+		// characters but we're in color space; one of those counts is
+		// spurious
+		off--;
+	}
+
+	// in now points just past the first character of a sequence
+	// line, and c holds the first character
+	while(c != -1 && c != '>') {
+		if(rparms.nsToAs && asc2dnacat[c] >= 2) c = 'A';
+		uint8_t cat = asc2dnacat[c];
+		int cc = toupper(c);
+		if(rparms.bisulfite && cc == 'C') c = cc = 'T';
+		if(cat == 1) {
+			// It's a DNA character
+			assert(cc == 'A' || cc == 'C' || cc == 'G' || cc == 'T');
+			// Check for overflow (wraparound of the unsigned counter)
+			if((TIndexOffU)(len + 1) < len) {
+				throw RefTooLongException();
+			}
+			// Consume it
+			len++;
+			// Output it
+			if(bpout != NULL) {
+				if(rparms.color) {
+					// output color
+					bpout->write(dinuc2color[asc2dna[(int)c]][lc]);
+				} else if(!rparms.color) {
+					// output nucleotide
+					bpout->write(asc2dna[c]);
+				}
+			}
+			lc = asc2dna[(int)c];
+		} else if(cat >= 2) {
+			// It's an N or a gap: this record ends here; the gap itself
+			// is accounted for by the next call (see 'off = 1' above).
+			lastc = c;
+			assert(cc != 'A' && cc != 'C' && cc != 'G' && cc != 'T');
+			return RefRecord((TIndexOffU)off, (TIndexOffU)len, first);
+		} else {
+			// Not DNA and not a gap, ignore it
+#ifndef NDEBUG
+			if(!isspace(c)) {
+				cerr << "Unexpected character in sequence: ";
+				if(isprint(c)) {
+					cerr << ((char)c) << endl;
+				} else {
+					cerr << "(" << c << ")" << endl;
+				}
+			}
+#endif
+		}
+		c = in.get();
+	}
+	lastc = c;
+	return RefRecord((TIndexOffU)off, (TIndexOffU)len, first);
+}
+
+#if 0
+// Debugging helper: dump each RefRecord's fields; compiled out.
+static void
+printRecords(ostream& os, const EList<RefRecord>& l) {
+	for(size_t i = 0; i < l.size(); i++) {
+		os << l[i].first << ", " << l[i].off << ", " << l[i].len << endl;
+	}
+}
+#endif
+
+/**
+ * Reverse the 'src' list of RefRecords into the 'dst' list.  Don't
+ * modify 'src'.
+ */
+void reverseRefRecords(
+	const EList<RefRecord>& src, // input list (unmodified)
+	EList<RefRecord>& dst,       // output: reversed list
+	bool recursive,              // true when called from the self-check below
+	bool verbose)
+{
+	dst.clear();
+	{
+		// Pass 1: walk 'src' backwards, splitting each record into a
+		// len-only record and an off-only (gap) record in 'cur'.
+		EList<RefRecord> cur;
+		for(int i = (int)src.size()-1; i >= 0; i--) {
+			bool first = (i == (int)src.size()-1 || src[i+1].first);
+			// Clause after the || on next line is to deal with empty FASTA
+			// records at the end of the 'src' list, which would be wrongly
+			// omitted otherwise.
+			if(src[i].len || (first && src[i].off == 0)) {
+				cur.push_back(RefRecord(0, src[i].len, first));
+				first = false;
+			}
+			if(src[i].off) cur.push_back(RefRecord(src[i].off, 0, first));
+		}
+		// Pass 2: merge each gap-only record with the following len-only
+		// record (when they belong to the same sequence) into one record.
+		for(int i = 0; i < (int)cur.size(); i++) {
+			assert(cur[i].off == 0 || cur[i].len == 0);
+			if(i < (int)cur.size()-1 && cur[i].off != 0 && !cur[i+1].first) {
+				dst.push_back(RefRecord(cur[i].off, cur[i+1].len, cur[i].first));
+				i++;
+			} else {
+				dst.push_back(cur[i]);
+			}
+		}
+	}
+	//if(verbose) {
+	//	cout << "Source: " << endl;
+	//	printRecords(cout, src);
+	//	cout << "Dest: " << endl;
+	//	printRecords(cout, dst);
+	//}
+#ifndef NDEBUG
+	// Sanity checks: the number of 'first' records must be preserved, and
+	// reversing 'dst' again must reproduce 'src' exactly.
+	size_t srcnfirst = 0, dstnfirst = 0;
+	for(size_t i = 0; i < src.size(); i++) {
+		if(src[i].first) {
+			srcnfirst++;
+		}
+	}
+	for(size_t i = 0; i < dst.size(); i++) {
+		if(dst[i].first) {
+			dstnfirst++;
+		}
+	}
+	assert_eq(srcnfirst, dstnfirst);
+	if(!recursive) {
+		EList<RefRecord> tmp;
+		reverseRefRecords(dst, tmp, true);
+		assert_eq(tmp.size(), src.size());
+		for(size_t i = 0; i < src.size(); i++) {
+			assert_eq(src[i].len, tmp[i].len);
+			assert_eq(src[i].off, tmp[i].off);
+			assert_eq(src[i].first, tmp[i].first);
+		}
+	}
+#endif
+}
+
+/**
+ * Calculate a vector containing the sizes of all of the patterns in
+ * all of the given input files, in order.  Returns the total size of
+ * all references combined.  Rewinds each istream before returning.
+ */
+std::pair<size_t, size_t>
+fastaRefReadSizes(
+	EList<FileBuf*>& in,           // input FASTA streams (each is rewound)
+	EList<RefRecord>& recs,        // out: one RefRecord per stretch
+	const RefReadInParams& rparms, // parsing parameters
+	BitpairOutFileBuf* bpout,      // optional bitpair output sink
+	TIndexOff& numSeqs)            // incremented per sequence; caller must
+	                               // initialize it before calling
+{
+	TIndexOffU unambigTot = 0;
+	size_t bothTot = 0;
+	assert_gt(in.size(), 0);
+	// For each input istream
+	for(size_t i = 0; i < in.size(); i++) {
+		bool first = true;
+		assert(!in[i]->eof());
+		// For each pattern in this istream
+		while(!in[i]->eof()) {
+			RefRecord rec;
+			try {
+				rec = fastaRefReadSize(*in[i], rparms, first, bpout);
+				// Wraparound check: total would overflow TIndexOffU
+				if((unambigTot + rec.len) < unambigTot) {
+					throw RefTooLongException();
+				}
+			}
+			catch(RefTooLongException& e) {
+				// NOTE: rethrows as 'int 1' after printing; matches the
+				// 'throw 1' fatal-error convention used elsewhere in
+				// this codebase.
+				cerr << e.what() << endl;
+				throw 1;
+			}
+			// Add the length of this record.
+			if(rec.first) numSeqs++;
+			unambigTot += rec.len;
+			bothTot += rec.len;
+			bothTot += rec.off;
+			first = false;
+			// Drop degenerate (empty, non-first) records entirely
+			if(rec.len == 0 && rec.off == 0 && !rec.first) continue;
+			recs.push_back(rec);
+		}
+		// Reset the input stream
+		in[i]->reset();
+		assert(!in[i]->eof());
+#ifndef NDEBUG
+		// Check that it's really reset
+		int c = in[i]->get();
+		assert_eq('>', c);
+		in[i]->reset();
+		assert(!in[i]->eof());
+#endif
+	}
+	assert_geq(bothTot, 0);
+	assert_geq(unambigTot, 0);
+	return make_pair(
+		unambigTot, // total number of unambiguous DNA characters read
+		bothTot); // total number of DNA characters read, incl. ambiguous ones
+}
diff --git a/ref_read.h b/ref_read.h
new file mode 100644
index 0000000..d41c113
--- /dev/null
+++ b/ref_read.h
@@ -0,0 +1,314 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef REF_READ_H_
+#define REF_READ_H_
+
+#include <iostream>
+#include <cassert>
+#include <string>
+#include <ctype.h>
+#include <fstream>
+#include <stdexcept>
+#include "alphabet.h"
+#include "assert_helpers.h"
+#include "filebuf.h"
+#include "word_io.h"
+#include "ds.h"
+#include "endian_swap.h"
+
+using namespace std;
+
+// Thrown when a reference's length overflows the index offset type.
+// NOTE(review): the messages refer to bowtie2-build; this code is
+// inherited from Bowtie 2 — confirm wording is appropriate downstream.
+class RefTooLongException : public exception {
+
+public:
+	RefTooLongException() {
+#ifdef BOWTIE_64BIT_INDEX
+		// This should never happen!
+		msg = "Error: Reference sequence has more than 2^64-1 characters!  "
+		      "Please divide the reference into smaller chunks and index each "
+			  "independently.";
+#else
+		msg = "Error: Reference sequence has more than 2^32-1 characters!  "
+		      "Please build a large index by passing the --large-index option "
+			  "to bowtie2-build";
+#endif
+	}
+	
+	~RefTooLongException() throw() {}
+	
+	const char* what() const throw() {
+		return msg.c_str();
+	}
+
+protected:
+	
+	string msg; // formatted message returned by what()
+	
+};
+
+/**
+ * Encapsulates a stretch of the reference containing only unambiguous
+ * characters.  From an ordered list of RefRecords, one can (almost)
+ * deduce the "shape" of the reference sequences (almost because we
+ * lose information about stretches of ambiguous characters at the end
+ * of reference sequences).
+ */
+struct RefRecord {
+	RefRecord() : off(), len(), first() { }
+	RefRecord(TIndexOffU _off, TIndexOffU _len, bool _first) :
+		off(_off), len(_len), first(_first)
+	{ }
+
+	// Deserialize a record from 'in', byte-swapping fields if 'swap'.
+	RefRecord(FILE *in, bool swap) {
+		assert(in != NULL);
+		if(!fread(&off, OFF_SIZE, 1, in)) {
+			cerr << "Error reading RefRecord offset from FILE" << endl;
+			throw 1;
+		}
+		if(swap) off = endianSwapIndex(off);
+		if(!fread(&len, OFF_SIZE, 1, in)) {
+			cerr << "Error reading RefRecord offset from FILE" << endl;
+			throw 1;
+		}
+		if(swap) len = endianSwapIndex(len);
+		// NOTE(review): fgetc returns EOF (-1, nonzero) at end-of-file,
+		// which would silently yield first==true here — confirm callers
+		// never read past the last record.
+		first = fgetc(in) ? true : false;
+	}
+
+	// Serialize this record to 'out'; 'be' selects big-endian output.
+	void write(std::ostream& out, bool be) {
+		writeIndex<TIndexOffU>(out, off, be);
+		writeIndex<TIndexOffU>(out, len, be);
+		out.put(first ? 1 : 0);
+	}
+
+	TIndexOffU off; /// Offset of the first character in the record
+	TIndexOffU len; /// Length of the record
+	bool   first; /// Whether this record is the first for a reference sequence
+};
+
+// Reference-reversal modes; used as the 'reverse' field of
+// RefReadInParams and tested in fastaRefReadAppend.
+enum {
+	REF_READ_FORWARD = 0, // don't reverse reference sequence
+	REF_READ_REVERSE,     // reverse entire reference sequence
+	REF_READ_REVERSE_EACH // reverse each unambiguous stretch of reference
+};
+
+/**
+ * Parameters governing treatment of references as they're read in.
+ */
+struct RefReadInParams {
+	RefReadInParams(bool col, int r, bool nsToA, bool bisulf) :
+		color(col), reverse(r), nsToAs(nsToA), bisulfite(bisulf) { }
+	// extract colors from reference
+	bool color;
+	// reverse each reference sequence before passing it along;
+	// takes one of the REF_READ_* enum values
+	int reverse;
+	// convert ambiguous characters to As
+	bool nsToAs;
+	// bisulfite-convert the reference
+	bool bisulfite;
+};
+
+// Measure the next stretch of (un)ambiguous sequence; see ref_read.cpp.
+extern RefRecord
+fastaRefReadSize(
+	FileBuf& in,
+	const RefReadInParams& rparms,
+	bool first,
+	BitpairOutFileBuf* bpout = NULL);
+
+// Size all references across all inputs; see ref_read.cpp.
+extern std::pair<size_t, size_t>
+fastaRefReadSizes(
+	EList<FileBuf*>& in,
+	EList<RefRecord>& recs,
+	const RefReadInParams& rparms,
+	BitpairOutFileBuf* bpout,
+	TIndexOff& numSeqs);
+
+// Reverse a RefRecord list; see ref_read.cpp.
+extern void
+reverseRefRecords(
+	const EList<RefRecord>& src,
+	EList<RefRecord>& dst,
+	bool recursive = false,
+	bool verbose = false);
+
+/**
+ * Reads the next sequence from the given FASTA file and appends it to
+ * the end of dst, optionally reversing it.
+ */
+template <typename TStr>
+static RefRecord fastaRefReadAppend(
+	FileBuf& in,             // input file
+	bool first,              // true iff this is the first record in the file
+	TStr& dst,               // destination buf for parsed characters
+	TIndexOffU& dstoff,          // index of next character in dst to assign
+	RefReadInParams& rparms, // 
+	string* name = NULL)     // put parsed FASTA name here
+{
+	int c;
+	// NOTE: function-local static parser state persists across calls;
+	// this function is not reentrant and not thread-safe.
+	static int lastc = '>';
+	if(first) {
+		c = in.getPastWhitespace();
+		if(c != '>') {
+			cerr << "Reference file does not seem to be a FASTA file" << endl;
+			throw 1;
+		}
+		lastc = c;
+	}
+	assert_neq(-1, lastc);
+
+	// RefRecord params
+	size_t len = 0;
+	size_t off = 0;
+	// 'first' is repurposed: it becomes the RefRecord's 'first' flag
+	first = true;
+
+	// Remember where this sequence starts in dst (used for per-stretch
+	// reversal at 'bail')
+	size_t ilen = dstoff;
+
+	// Chew up the id line; if the next line is either
+	// another id line or a comment line, keep chewing
+	int lc = -1; // last-DNA char variable for color conversion
+	c = lastc;
+	if(c == '>' || c == '#') {
+		do {
+			while (c == '#') {
+				if((c = in.getPastNewline()) == -1) {
+					lastc = -1;
+					goto bail;
+				}
+			}
+			assert_eq('>', c);
+			while(true) {
+				c = in.get();
+				if(c == -1) {
+					lastc = -1;
+					goto bail;
+				}
+				if(c == '\n' || c == '\r') {
+					while(c == '\r' || c == '\n') c = in.get();
+					if(c == -1) {
+						lastc = -1;
+						goto bail;
+					}
+					break;
+				}
+				if (name) name->push_back(c);
+			}
+			// c holds the first character on the line after the name
+			// line
+			if(c == '>') {
+				// If there's another name line immediately after this one,
+				// discard the previous name and start fresh with the new one
+				if (name) name->clear();
+			}
+		} while (c == '>' || c == '#');
+	} else {
+		ASSERT_ONLY(int cc = toupper(c));
+		assert(cc != 'A' && cc != 'C' && cc != 'G' && cc != 'T');
+		first = false;
+	}
+
+	// Skip over an initial stretch of gaps or ambiguous characters.
+	// For colorspace we skip until we see two consecutive unambiguous
+	// characters (i.e. the first unambiguous color).
+	while(true) {
+		int cat = asc2dnacat[c];
+		if(rparms.nsToAs && cat >= 2) {
+			c = 'A';
+		}
+		int cc = toupper(c);
+		if(rparms.bisulfite && cc == 'C') c = cc = 'T';
+		if(cat == 1) {
+			// This is a DNA character
+			if(rparms.color) {
+				if(lc != -1) {
+					// Got two consecutive unambiguous DNAs
+					break; // to read-in loop
+				}
+				// Keep going; we need two consecutive unambiguous DNAs
+				lc = asc2dna[(int)c];
+				// The 'if(off > 0)' takes care of the case where
+				// the reference is entirely unambiguous and we don't
+				// want to incorrectly increment off.
+				if(off > 0) off++;
+			} else {
+				break; // to read-in loop
+			}
+		} else if(cat >= 2) {
+			if(lc != -1 && off == 0) {
+				off++;
+			}
+			lc = -1;
+			off++; // skip it
+		} else if(c == '>') {
+			lastc = '>';
+			goto bail;
+		}
+		c = in.get();
+		if(c == -1) {
+			lastc = -1;
+			goto bail;
+		}
+	}
+	if(first && rparms.color && off > 0) {
+		// Handle the case where the first record has ambiguous
+		// characters but we're in color space; one of those counts is
+		// spurious
+		off--;
+	}
+	assert(!rparms.color || lc != -1);
+	assert_eq(1, asc2dnacat[c]);
+
+	// in now points just past the first character of a sequence
+	// line, and c holds the first character
+	while(true) {
+		// Note: can't have a comment in the middle of a sequence,
+		// though a comment can end a sequence
+		int cat = asc2dnacat[c];
+		assert_neq(2, cat);
+		if(cat == 1) {
+			// Consume it
+			if(!rparms.color || lc != -1) len++;
+			// Add it to reference buffer
+			if(rparms.color) {
+				dst.set((char)dinuc2color[asc2dna[(int)c]][lc], dstoff++);
+			} else if(!rparms.color) {
+				dst.set(asc2dna[c], dstoff++);
+			}
+			assert_lt((int)dst[dstoff-1], 4);
+			lc = asc2dna[(int)c];
+		}
+		c = in.get();
+		if(rparms.nsToAs && asc2dnacat[c] >= 2) c = 'A';
+		if (c == -1 || c == '>' || c == '#' || asc2dnacat[c] >= 2) {
+			lastc = c;
+			break;
+		}
+		if(rparms.bisulfite && toupper(c) == 'C') c = 'T';
+	}
+
+  bail:
+	// Optionally reverse the portion that we just appended.
+	// ilen = length of buffer before this last sequence was appended.
+	if(rparms.reverse == REF_READ_REVERSE_EACH) {
+		// Find limits of the portion we just appended
+		size_t nlen = dstoff;
+		dst.reverseWindow(ilen, nlen);
+	}
+	return RefRecord((TIndexOffU)off, (TIndexOffU)len, first);
+}
+
+#endif /*ndef REF_READ_H_*/
diff --git a/reference.cpp b/reference.cpp
new file mode 100644
index 0000000..f56bb6f
--- /dev/null
+++ b/reference.cpp
@@ -0,0 +1,670 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <string>
+#include <string.h>
+#include "reference.h"
+#include "mem_ids.h"
+
+using namespace std;
+
+/**
+ * Load from .3.gEbwt_ext/.4.gEbwt_ext Bowtie index files.
+ */
+BitPairReference::BitPairReference(
+	const string& in,            // index basename; reads <in>.3/.4.<gEbwt_ext>
+	bool color,
+	bool sanity,                 // verify loaded data against 'infiles'/'origs'
+	EList<string>* infiles,      // original FASTA inputs (for sanity check)
+	EList<SString<char> >* origs, // pre-parsed references (for sanity check)
+	bool infilesSeq,             // 'infiles' are literal sequences, not paths
+	bool useMm,                  // memory-map the .4 file instead of reading
+	bool useShmem,               // share the buffer via shared memory
+	bool mmSweep,                // touch all mapped pages up front
+	bool verbose,
+	bool startVerbose) :
+	buf_(NULL),
+	sanityBuf_(NULL),
+	loaded_(true),
+	sanity_(sanity),
+	useMm_(useMm),
+	useShmem_(useShmem),
+	verbose_(verbose)
+{
+	string s3 = in + ".3." + gEbwt_ext;
+	string s4 = in + ".4." + gEbwt_ext;
+	
+	FILE *f3, *f4;
+	if((f3 = fopen(s3.c_str(), "rb")) == NULL) {
+	    cerr << "Could not open reference-string index file " << s3 << " for reading." << endl;
+		cerr << "This is most likely because your index was built with an older version" << endl
+		<< "(<= 0.9.8.1) of bowtie-build.  Please re-run bowtie-build to generate a new" << endl
+		<< "index (or download one from the Bowtie website) and try again." << endl;
+		loaded_ = false;
+		return;
+	}
+	// NOTE(review): f3 is not fclosed on this early-return path, and the
+	// outer f4 handle is never fclosed on the non-mmap path (a shadowing
+	// local f4 is opened and closed below) — file-handle leaks.
+    if((f4 = fopen(s4.c_str(), "rb"))  == NULL) {
+        cerr << "Could not open reference-string index file " << s4 << " for reading." << endl;
+		loaded_ = false;
+		return;
+	}
+#ifdef BOWTIE_MM
+    char *mmFile = NULL;
+	if(useMm_) {
+		if(verbose_ || startVerbose) {
+			cerr << "  Memory-mapping reference index file " << s4.c_str() << ": ";
+			logTime(cerr);
+		}
+		struct stat sbuf;
+		if (stat(s4.c_str(), &sbuf) == -1) {
+			perror("stat");
+			cerr << "Error: Could not stat index file " << s4.c_str() << " prior to memory-mapping" << endl;
+			throw 1;
+		}
+		mmFile = (char*)mmap((void *)0, (size_t)sbuf.st_size,
+				     PROT_READ, MAP_SHARED, fileno(f4), 0);
+		if(mmFile == (void *)(-1) || mmFile == NULL) {
+			perror("mmap");
+			cerr << "Error: Could not memory-map the index file " << s4.c_str() << endl;
+			throw 1;
+		}
+		if(mmSweep) {
+			// Touch one byte per KB to fault pages in up front
+			TIndexOff sum = 0;
+			for(off_t i = 0; i < sbuf.st_size; i += 1024) {
+				sum += (TIndexOff) mmFile[i];
+			}
+			if(startVerbose) {
+				cerr << "  Swept the memory-mapped ref index file; checksum: " << sum << ": ";
+				logTime(cerr);
+			}
+		}
+	}
+#endif
+	
+	// Read endianness sentinel, set 'swap'
+	uint32_t one;
+	bool swap = false;
+	one = readIndex<int32_t>(f3, swap);
+	if(one != 1) {
+		if(useMm_) {
+			cerr << "Error: Can't use memory-mapped files when the index is the opposite endianness" << endl;
+			throw 1;
+		}
+		assert_eq(0x1000000, one);
+		swap = true; // have to endian swap U32s
+	}
+	
+	// Read # records
+	TIndexOffU sz;
+	sz = readIndex<TIndexOffU>(f3, swap);
+	if(sz == 0) {
+		cerr << "Error: number of reference records is 0 in " << s3.c_str() << endl;
+		throw 1;
+	}
+	
+	// Read records
+	nrefs_ = 0;
+	
+	// Cumulative count of all unambiguous characters on a per-
+	// stretch 8-bit alignment (i.e. count of bytes we need to
+	// allocate in buf_)
+	TIndexOffU cumsz = 0;
+	TIndexOffU cumlen = 0;
+	// For each unambiguous stretch...
+	for(TIndexOffU i = 0; i < sz; i++) {
+		recs_.push_back(RefRecord(f3, swap));
+		if(recs_.back().first) {
+			// This is the first record for this reference sequence (and the
+			// last record for the one before)
+			refRecOffs_.push_back((TIndexOffU)recs_.size()-1);
+			// refOffs_ links each reference sequence with the total number of
+			// unambiguous characters preceding it in the pasted reference
+			refOffs_.push_back(cumsz);
+			if(nrefs_ > 0) {
+				// refLens_ links each reference sequence with the total number
+				// of ambiguous and unambiguous characters in it.
+				refLens_.push_back(cumlen);
+			}
+			cumlen = 0;
+			nrefs_++;
+		} else if(i == 0) {
+			cerr << "First record in reference index file was not marked as "
+			     << "'first'" << endl;
+			throw 1;
+		}
+		// Per-record running totals, used by getStretch's binary search
+		cumUnambig_.push_back(cumsz);
+		cumRefOff_.push_back(cumlen);
+		cumsz += recs_.back().len;
+		cumlen += recs_.back().off;
+		cumlen += recs_.back().len;
+	}
+	if(verbose_ || startVerbose) {
+		cerr << "Read " << nrefs_ << " reference strings from "
+		     << sz << " records: ";
+		logTime(cerr);
+	}
+	// Store a cap entry for the end of the last reference seq
+	refRecOffs_.push_back((TIndexOffU)recs_.size());
+	refOffs_.push_back(cumsz);
+	refLens_.push_back(cumlen);
+	cumUnambig_.push_back(cumsz);
+	cumRefOff_.push_back(cumlen);
+	bufSz_ = cumsz;
+	assert_eq(nrefs_, refLens_.size());
+	assert_eq(sz, recs_.size());
+	if (f3 != NULL) fclose(f3); // done with .3.gEbwt_ext file
+	// Round cumsz up to nearest byte boundary
+	if((cumsz & 3) != 0) {
+		cumsz += (4 - (cumsz & 3));
+	}
+	bufAllocSz_ = cumsz >> 2;
+	assert_eq(0, cumsz & 3); // should be rounded up to nearest 4
+	if(useMm_) {
+#ifdef BOWTIE_MM
+		buf_ = (uint8_t*)mmFile;
+		if(sanity_) {
+			// Re-read the file conventionally and compare byte-for-byte
+			// with the mapped view
+			FILE *ftmp = fopen(s4.c_str(), "rb");
+			sanityBuf_ = new uint8_t[cumsz >> 2];
+			size_t ret = fread(sanityBuf_, 1, cumsz >> 2, ftmp);
+			if(ret != (cumsz >> 2)) {
+				cerr << "Only read " << ret << " bytes (out of " << (cumsz >> 2) << ") from reference index file " << s4.c_str() << endl;
+				throw 1;
+			}
+			fclose(ftmp);
+			for(size_t i = 0; i < (cumsz >> 2); i++) {
+				assert_eq(sanityBuf_[i], buf_[i]);
+			}
+		}
+#else
+		cerr << "Shouldn't be at " << __FILE__ << ":" << __LINE__ << " without BOWTIE_MM defined" << endl;
+		throw 1;
+#endif
+	} else {
+		bool shmemLeader = true;
+		if(!useShmem_) {
+			// Allocate a buffer to hold the reference string
+			try {
+				buf_ = new uint8_t[cumsz >> 2];
+				if(buf_ == NULL) throw std::bad_alloc();
+			} catch(std::bad_alloc& e) {
+				cerr << "Error: Ran out of memory allocating space for the bitpacked reference.  Please" << endl
+				<< "re-run on a computer with more memory." << endl;
+				throw 1;
+			}
+		} else {
+			shmemLeader = ALLOC_SHARED_U8(
+										  (s4 + "[ref]"), (cumsz >> 2), &buf_,
+										  "ref", (verbose_ || startVerbose));
+		}
+		if(shmemLeader) {
+			// Open the bitpair-encoded reference file
+			// (shadows the outer f4 handle; see NOTE above)
+			FILE *f4 = fopen(s4.c_str(), "rb");
+			if(f4 == NULL) {
+				cerr << "Could not open reference-string index file " << s4.c_str() << " for reading." << endl;
+				cerr << "This is most likely because your index was built with an older version" << endl
+				<< "(<= 0.9.8.1) of bowtie-build.  Please re-run bowtie-build to generate a new" << endl
+				<< "index (or download one from the Bowtie website) and try again." << endl;
+				loaded_ = false;
+				return;
+			}
+			// Read the whole thing in
+			size_t ret = fread(buf_, 1, cumsz >> 2, f4);
+			// Didn't read all of it?
+			if(ret != (cumsz >> 2)) {
+				cerr << "Only read " << ret << " bytes (out of " << (cumsz >> 2) << ") from reference index file " << s4.c_str() << endl;
+				throw 1;
+			}
+			// Make sure there's no more
+			char c;
+			ret = fread(&c, 1, 1, f4);
+			assert_eq(0, ret); // should have failed
+			fclose(f4);
+#ifdef BOWTIE_SHARED_MEM
+			if(useShmem_) NOTIFY_SHARED(buf_, (cumsz >> 2));
+#endif
+		} else {
+#ifdef BOWTIE_SHARED_MEM
+			if(useShmem_) WAIT_SHARED(buf_, (cumsz >> 2));
+#endif
+		}
+	}
+	
+	// Populate byteToU32_: expands one packed byte (4 bitpairs) into a
+	// 32-bit word with one base per byte, honoring host endianness
+	bool big = currentlyBigEndian();
+	for(int i = 0; i < 256; i++) {
+		uint32_t word = 0;
+		if(big) {
+			word |= ((i >> 0) & 3) << 24;
+			word |= ((i >> 2) & 3) << 16;
+			word |= ((i >> 4) & 3) << 8;
+			word |= ((i >> 6) & 3) << 0;
+		} else {
+			word |= ((i >> 0) & 3) << 0;
+			word |= ((i >> 2) & 3) << 8;
+			word |= ((i >> 4) & 3) << 16;
+			word |= ((i >> 6) & 3) << 24;
+		}
+		byteToU32_[i] = word;
+	}
+	
+#ifndef NDEBUG
+	if(sanity_) {
+		// Compare the sequence we just read from the compact index
+		// file to the true reference sequence.
+		EList<SString<char> > *os; // for holding references
+		EList<SString<char> > osv(DEBUG_CAT); // for holding ref seqs
+		EList<SString<char> > osn(DEBUG_CAT); // for holding ref names
+		EList<size_t> osvLen(DEBUG_CAT); // for holding ref seq lens
+		EList<size_t> osnLen(DEBUG_CAT); // for holding ref name lens
+		SStringExpandable<uint32_t> tmp_destU32_;
+		if(infiles != NULL) {
+			if(infilesSeq) {
+				for(size_t i = 0; i < infiles->size(); i++) {
+					// Remove initial backslash; that's almost
+					// certainly being used to protect the first
+					// character of the sequence from getopts (e.g.,
+					// when the first char is -)
+					if((*infiles)[i].at(0) == '\\') {
+						(*infiles)[i].erase(0, 1);
+					}
+					osv.push_back(SString<char>((*infiles)[i]));
+				}
+			} else {
+				parseFastas(*infiles, osn, osnLen, osv, osvLen);
+			}
+			os = &osv;
+		} else {
+			assert(origs != NULL);
+			os = origs;
+		}
+		
+		// Go through the loaded reference files base-by-base and
+		// sanity check against what we get by calling getBase and
+		// getStretch
+		for(size_t i = 0; i < os->size(); i++) {
+			size_t olen = ((*os)[i]).length();
+			size_t olenU32 = (olen + 12) / 4;
+			uint32_t *buf = new uint32_t[olenU32];
+			uint8_t *bufadj = (uint8_t*)buf;
+			bufadj += getStretch(buf, i, 0, olen, tmp_destU32_);
+			for(size_t j = 0; j < olen; j++) {
+				assert_eq((int)(*os)[i][j], (int)bufadj[j]);
+				assert_eq((int)(*os)[i][j], (int)getBase(i, j));
+			}
+			delete[] buf;
+		}
+	}
+#endif
+}
+
+// Free the bitpair buffer only if we own it (not mmap'ed, not shared).
+BitPairReference::~BitPairReference() {
+	if(buf_ != NULL && !useMm_ && !useShmem_) delete[] buf_;
+	if(sanityBuf_ != NULL) delete[] sanityBuf_;
+}
+
+/**
+ * Return a single base of the reference.  Calling this repeatedly
+ * is not an efficient way to retrieve bases from the reference;
+ * use loadStretch() instead.
+ *
+ * This implementation scans linearly through the records for the
+ * unambiguous stretches of the target reference sequence.  When
+ * there are many records, binary search would be more appropriate.
+ */
+int BitPairReference::getBase(size_t tidx, size_t toff) const {
+	uint64_t reci = refRecOffs_[tidx];   // first record for target reference sequence
+	uint64_t recf = refRecOffs_[tidx+1]; // last record (exclusive) for target seq
+	assert_gt(recf, reci);
+	uint64_t bufOff = refOffs_[tidx];
+	uint64_t off = 0;
+	// For all records pertaining to the target reference sequence...
+	for(uint64_t i = reci; i < recf; i++) {
+		assert_geq(toff, off);
+		off += recs_[i].off;
+		if(toff < off) {
+			// toff falls in the ambiguous gap preceding this stretch
+			return 4;
+		}
+		assert_geq(toff, off);
+		uint64_t recOff = off + recs_[i].len;
+		if(toff < recOff) {
+			// toff falls inside this unambiguous stretch; extract the
+			// 2-bit base from the packed buffer
+			toff -= off;
+			bufOff += (uint64_t)toff;
+			assert_lt(bufOff, bufSz_);
+			const uint64_t bufElt = (bufOff) >> 2;
+			const uint64_t shift = (bufOff & 3) << 1;
+			return ((buf_[bufElt] >> shift) & 3);
+		}
+		bufOff += recs_[i].len;
+		off = recOff;
+		assert_geq(toff, off);
+	} // end for loop over records
+	// Past the last record: treated as ambiguous
+	return 4;
+}
+
+/**
+ * Load a stretch of the reference string into memory at 'dest'.
+ *
+ * This implementation scans linearly through the records for the
+ * unambiguous stretches of the target reference sequence.  When
+ * there are many records, binary search would be more appropriate.
+ */
+int BitPairReference::getStretchNaive(
+	uint32_t *destU32,  // destination buffer (written as bytes, one base each)
+	size_t tidx,        // target reference index
+	size_t toff,        // starting offset within the reference
+	size_t count) const // number of bases to copy
+{
+	uint8_t *dest = (uint8_t*)destU32;
+	uint64_t reci = refRecOffs_[tidx];   // first record for target reference sequence
+	uint64_t recf = refRecOffs_[tidx+1]; // last record (exclusive) for target seq
+	assert_gt(recf, reci);
+	uint64_t cur = 0;
+	uint64_t bufOff = refOffs_[tidx];
+	uint64_t off = 0;
+	// For all records pertaining to the target reference sequence...
+	for(uint64_t i = reci; i < recf; i++) {
+		assert_geq(toff, off);
+		off += recs_[i].off;
+		// Emit 4 (ambiguous) for positions in the gap before this stretch
+		for(; toff < off && count > 0; toff++) {
+			dest[cur++] = 4;
+			count--;
+		}
+		if(count == 0) break;
+		assert_geq(toff, off);
+		if(toff < off + recs_[i].len) {
+			bufOff += (TIndexOffU)(toff - off); // move bufOff pointer forward
+		} else {
+			bufOff += recs_[i].len;
+		}
+		off += recs_[i].len;
+		// Unpack 2-bit bases for positions inside this stretch
+		for(; toff < off && count > 0; toff++) {
+			assert_lt(bufOff, bufSz_);
+			const uint64_t bufElt = (bufOff) >> 2;
+			const uint64_t shift = (bufOff & 3) << 1;
+			dest[cur++] = (buf_[bufElt] >> shift) & 3;
+			bufOff++;
+			count--;
+		}
+		if(count == 0) break;
+		assert_geq(toff, off);
+	} // end for loop over records
+	// In any chars are left after scanning all the records,
+	// they must be ambiguous
+	while(count > 0) {
+		count--;
+		dest[cur++] = 4;
+	}
+	assert_eq(0, count);
+	return 0;
+}
+
+/**
+ * Load a stretch of the reference string into memory at 'dest'.
+ *
+ * Fast counterpart of getStretchNaive.  When the sequence has many
+ * records, a binary search over the cumulative offsets (cumRefOff_)
+ * finds the first relevant record; unambiguous bases are then copied
+ * four at a time by expanding each packed byte through byteToU32_.
+ *
+ * Returns the number of cushion characters written before the
+ * requested stretch; the caller must skip that many bytes of 'dest'.
+ */
+int BitPairReference::getStretch(
+	uint32_t *destU32,
+	size_t tidx,
+	size_t toff,
+	size_t count
+	ASSERT_ONLY(, SStringExpandable<uint32_t>& destU32_2)) const
+{
+	ASSERT_ONLY(size_t origCount = count);
+	ASSERT_ONLY(size_t origToff = toff);
+	if(count == 0) return 0;
+	uint8_t *dest = (uint8_t*)destU32;
+#ifndef NDEBUG
+	// In debug builds, ~10% of calls are cross-checked against the
+	// slow-but-simple getStretchNaive implementation
+	destU32_2.clear();
+	uint8_t *dest_2 = NULL;
+	int off2;
+	if((rand() % 10) == 0) {
+		destU32_2.resize((origCount >> 2) + 2);
+		off2 = getStretchNaive(destU32_2.wbuf(), tidx, origToff, origCount);
+		dest_2 = ((uint8_t*)destU32_2.wbuf()) + off2;
+	}
+#endif
+	destU32[0] = 0x04040404; // Add Ns, which we might end up using later
+	uint64_t reci = refRecOffs_[tidx];   // first record for target reference sequence
+	uint64_t recf = refRecOffs_[tidx+1]; // last record (exclusive) for target seq
+	assert_gt(recf, reci);
+	uint64_t cur = 4; // keep a cushion of 4 bases at the beginning
+	uint64_t bufOff = refOffs_[tidx];
+	uint64_t off = 0;
+	int64_t offset = 4;
+	bool firstStretch = true;
+	bool binarySearched = false;
+	uint64_t left  = reci;
+	uint64_t right = recf;
+	uint64_t mid   = 0;
+	// For all records pertaining to the target reference sequence...
+	for(uint64_t i = reci; i < recf; i++) {
+		uint64_t origBufOff = bufOff;
+		assert_geq(toff, off);
+		// Binary search is only worthwhile when there are many records
+		if (firstStretch && recf > reci + 16){
+			// binary search finds smallest i s.t. toff >= cumRefOff_[i]
+			while (left < right-1) {
+				mid = left + ((right - left) >> 1);
+				if (cumRefOff_[mid] <= toff)
+					left = mid;
+				else
+					right = mid;
+			}
+			off = cumRefOff_[left];
+			bufOff = cumUnambig_[left];
+			origBufOff = bufOff;
+			i = left;
+			assert(cumRefOff_[i+1] == 0 || cumRefOff_[i+1] > toff);
+			binarySearched = true;
+		}
+		off += recs_[i].off; // skip Ns at beginning of stretch
+		assert_gt(count, 0);
+		if(toff < off) {
+			// Requested range starts in the ambiguous gap: fill with Ns
+			size_t cpycnt = min((size_t)(off - toff), count);
+			memset(&dest[cur], 4, cpycnt);
+			count -= cpycnt;
+			toff += cpycnt;
+			cur += cpycnt;
+			if(count == 0) break;
+		}
+		assert_geq(toff, off);
+		if(toff < off + recs_[i].len) {
+			bufOff += toff - off; // move bufOff pointer forward
+		} else {
+			bufOff += recs_[i].len;
+		}
+		off += recs_[i].len;
+		assert(off == cumRefOff_[i+1] || cumRefOff_[i+1] == 0);
+		assert(!binarySearched || toff < off);
+		_unused(binarySearched); //make production build happy
+		if(toff < off) {
+			if(firstStretch) {
+				if(toff + 8 < off && count > 8) {
+					// We already added some Ns, so we have to do
+					// a fixup at the beginning of the buffer so
+					// that we can start clobbering at cur >> 2
+					if(cur & 3) {
+						offset -= (cur & 3);
+					}
+					uint64_t curU32 = cur >> 2;
+					// Do the initial few bases
+					if(bufOff & 3) {
+						// bufOff not word-aligned: expand the partial
+						// byte, then overwrite the bases that precede
+						// the requested start with Ns
+						const uint64_t bufElt = (bufOff) >> 2;
+						const int64_t low2 = bufOff & 3;
+						// Lots of cache misses on the following line
+						destU32[curU32] = byteToU32_[buf_[bufElt]];
+						for(int j = 0; j < low2; j++) {
+							((char *)(&destU32[curU32]))[j] = 4;
+						}
+						curU32++;
+						offset += low2;
+						const int64_t chars = 4 - low2;
+						count -= chars;
+						bufOff += chars;
+						toff += chars;
+					}
+					assert_eq(0, bufOff & 3);
+					uint64_t bufOffU32 = bufOff >> 2;
+					uint64_t countLim = count >> 2;
+					uint64_t offLim = ((off - (toff + 4)) >> 2);
+					uint64_t lim = min(countLim, offLim);
+					// Do the fast thing for as far as possible
+					for(uint64_t j = 0; j < lim; j++) {
+						// Lots of cache misses on the following line
+						destU32[curU32] = byteToU32_[buf_[bufOffU32++]];
+#ifndef NDEBUG
+						if(dest_2 != NULL) {
+							assert_eq(dest[(curU32 << 2) + 0], dest_2[(curU32 << 2) - offset + 0]);
+							assert_eq(dest[(curU32 << 2) + 1], dest_2[(curU32 << 2) - offset + 1]);
+							assert_eq(dest[(curU32 << 2) + 2], dest_2[(curU32 << 2) - offset + 2]);
+							assert_eq(dest[(curU32 << 2) + 3], dest_2[(curU32 << 2) - offset + 3]);
+						}
+#endif
+						curU32++;
+					}
+					toff += (lim << 2);
+					assert_leq(toff, off);
+					assert_leq((lim << 2), count);
+					count -= (lim << 2);
+					bufOff = bufOffU32 << 2;
+					cur = curU32 << 2;
+				}
+				// Do the slow thing for the rest
+				for(; toff < off && count > 0; toff++) {
+					assert_lt(bufOff, bufSz_);
+					const uint64_t bufElt = (bufOff) >> 2;
+					const uint64_t shift = (bufOff & 3) << 1;
+					dest[cur++] = (buf_[bufElt] >> shift) & 3;
+					bufOff++;
+					count--;
+				}
+				firstStretch = false;
+			} else {
+				// Do the slow thing
+				for(; toff < off && count > 0; toff++) {
+					assert_lt(bufOff, bufSz_);
+					const uint64_t bufElt = (bufOff) >> 2;
+					const uint64_t shift = (bufOff & 3) << 1;
+					dest[cur++] = (buf_[bufElt] >> shift) & 3;
+					bufOff++;
+					count--;
+				}
+			}
+		}
+		if(count == 0) break;
+		assert_eq(recs_[i].len, bufOff - origBufOff);
+		_unused(origBufOff); // make production build happy
+		assert_geq(toff, off);
+	} // end for loop over records
+	// If any chars are left after scanning all the records,
+	// they must be ambiguous
+	while(count > 0) {
+		count--;
+		dest[cur++] = 4;
+	}
+	assert_eq(0, count);
+	return (int)offset;
+}
+
+
+/**
+ * Parse the input fasta files, populating the szs list and writing the
+ * .3.gEbwt_ext and .4.gEbwt_ext portions of the index as we go.
+ *
+ * If 'outfile' is empty, only the size records are gathered (nothing is
+ * written).  Returns the pair reported by fastaRefReadSizes —
+ * presumably <# unambiguous chars, # total chars>; confirm against
+ * ref_read.h.
+ *
+ * NOTE(review): the 'sanity' parameter is not used anywhere in this
+ * body — confirm whether it is vestigial.
+ */
+pair<size_t, size_t>
+BitPairReference::szsFromFasta(
+	EList<FileBuf*>& is,
+	const string& outfile,
+	bool bigEndian,
+	const RefReadInParams& refparams,
+	EList<RefRecord>& szs,
+	bool sanity)
+{
+	RefReadInParams parms = refparams;
+	std::pair<size_t, size_t> sztot;
+	if(!outfile.empty()) {
+		string file3 = outfile + ".3." + gEbwt_ext;
+		string file4 = outfile + ".4." + gEbwt_ext;
+		// Open output stream for the '.3.gEbwt_ext' file which will
+		// hold the size records.
+		ofstream fout3(file3.c_str(), ios::binary);
+		if(!fout3.good()) {
+			cerr << "Could not open index file for writing: \"" << file3.c_str() << "\"" << endl
+				 << "Please make sure the directory exists and that permissions allow writing by" << endl
+				 << "Bowtie." << endl;
+			throw 1;
+		}
+		BitpairOutFileBuf bpout(file4.c_str());
+		// Read in the sizes of all the unambiguous stretches of the genome
+		// into a vector of RefRecords.  The input streams are reset once
+		// it's done.
+		writeIndex<int32_t>(fout3, 1, bigEndian); // endianness sentinel
+		bool color = parms.color;
+		if(color) {
+			// Colorspace: two passes.  First pass reads nucleotide-space
+			// records (written to .3/.4); second pass re-reads in
+			// colorspace, which is what was actually indexed.
+			parms.color = false;
+			// Make sure the .3.gEbwt_ext and .4.gEbwt_ext files contain
+			// nucleotides; not colors
+			TIndexOff numSeqs = 0;
+			ASSERT_ONLY(std::pair<size_t, size_t> sztot2 =)
+			fastaRefReadSizes(is, szs, parms, &bpout, numSeqs);
+			parms.color = true;
+			writeIndex<TIndexOffU>(fout3, (TIndexOffU)szs.size(), bigEndian); // write # records
+			for(size_t i = 0; i < szs.size(); i++) {
+				szs[i].write(fout3, bigEndian);
+			}
+			szs.clear();
+			// Now read in the colorspace size records; these are
+			// the ones that were indexed
+			TIndexOff numSeqs2 = 0;
+			sztot = fastaRefReadSizes(is, szs, parms, NULL, numSeqs2);
+			assert_eq(numSeqs, numSeqs2);
+			// One fewer color than nucleotide per sequence
+			assert_eq(sztot2.second, sztot.second + numSeqs);
+		} else {
+			TIndexOff numSeqs = 0;
+			sztot = fastaRefReadSizes(is, szs, parms, &bpout, numSeqs);
+			writeIndex<TIndexOffU>(fout3, (TIndexOffU)szs.size(), bigEndian); // write # records
+			for(size_t i = 0; i < szs.size(); i++) szs[i].write(fout3, bigEndian);
+		}
+		if(sztot.first == 0) {
+			cerr << "Error: No unambiguous stretches of characters in the input.  Aborting..." << endl;
+			throw 1;
+		}
+		assert_gt(sztot.first, 0);
+		assert_gt(sztot.second, 0);
+		bpout.close();
+		fout3.close();
+	} else {
+		// Read in the sizes of all the unambiguous stretches of the
+		// genome into a vector of RefRecords
+		TIndexOff numSeqs = 0;
+		sztot = fastaRefReadSizes(is, szs, parms, NULL, numSeqs);
+#ifndef NDEBUG
+		// Debug-only consistency check of the colorspace size records
+		if(parms.color) {
+			parms.color = false;
+			EList<RefRecord> szs2(EBWTB_CAT);
+			TIndexOff numSeqs2 = 0;
+			ASSERT_ONLY(std::pair<size_t, size_t> sztot2 =)
+			fastaRefReadSizes(is, szs2, parms, NULL, numSeqs2);
+			assert_eq(numSeqs, numSeqs2);
+			// One less color than base
+			assert_geq(sztot2.second, sztot.second + numSeqs);
+			parms.color = true;
+		}
+#endif
+	}
+	return sztot;
+}
diff --git a/reference.h b/reference.h
new file mode 100644
index 0000000..9b8e2e7
--- /dev/null
+++ b/reference.h
@@ -0,0 +1,189 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef REFERENCE_H_
+#define REFERENCE_H_
+
+#include <stdexcept>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <utility>
+#ifdef BOWTIE_MM
+#include <sys/mman.h>
+#include <sys/shm.h>
+#endif
+#include "endian_swap.h"
+#include "ref_read.h"
+#include "sequence_io.h"
+#include "mm.h"
+#include "shmem.h"
+#include "timer.h"
+#include "sstring.h"
+#include "btypes.h"
+
+
+/**
+ * Concrete reference representation that bulk-loads the reference from
+ * the bit-pair-compacted binary file and stores it in memory also in
+ * bit-pair-compacted format.  The user may request reference
+ * characters either on a per-character bases or by "stretch" using
+ * getBase(...) and getStretch(...) respectively.
+ *
+ * Most of the complexity in this class is due to the fact that we want
+ * to represent references with ambiguous (non-A/C/G/T) characters but
+ * we don't want to use more than two bits per base.  This means we
+ * need a way to encode the ambiguous stretches of the reference in a
+ * way that is external to the bitpair sequence.  To accomplish this,
+ * we use the RefRecords vector, which is stored in the .3.ebwt index
+ * file.  The bitpairs themselves are stored in the .4.ebwt index file.
+ *
+ * Once it has been loaded, a BitPairReference is read-only, and is
+ * safe for many threads to access at once.
+ */
+class BitPairReference {
+
+public:
+	/**
+	 * Load from .3.ebwt/.4.ebwt Bowtie index files.
+	 */
+	BitPairReference(
+		const string& in,
+		bool color,
+		bool sanity = false,
+		EList<string>* infiles = NULL,
+		EList<SString<char> >* origs = NULL,
+		bool infilesSeq = false,
+		bool useMm = false,
+		bool useShmem = false,
+		bool mmSweep = false,
+		bool verbose = false,
+		bool startVerbose = false);
+
+	~BitPairReference();
+
+	/**
+	 * Return a single base of the reference.  Calling this repeatedly
+	 * is not an efficient way to retrieve bases from the reference;
+	 * use loadStretch() instead.
+	 *
+	 * This implementation scans linearly through the records for the
+	 * unambiguous stretches of the target reference sequence.  When
+	 * there are many records, binary search would be more appropriate.
+	 */
+	int getBase(size_t tidx, size_t toff) const;
+
+	/**
+	 * Load a stretch of the reference string into memory at 'dest'.
+	 *
+	 * This implementation scans linearly through the records for the
+	 * unambiguous stretches of the target reference sequence.  When
+	 * there are many records, binary search would be more appropriate.
+	 */
+	int getStretchNaive(
+		uint32_t *destU32,
+		size_t tidx,
+		size_t toff,
+		size_t count) const;
+
+	/**
+	 * Load a stretch of the reference string into memory at 'dest'.
+	 *
+	 * Optimized version of getStretchNaive: binary-searches the record
+	 * list (when it is long) and copies four bases at a time.  Returns
+	 * the number of cushion characters written before the stretch.
+	 */
+	int getStretch(
+		uint32_t *destU32,
+		size_t tidx,
+		size_t toff,
+		size_t count
+		ASSERT_ONLY(, SStringExpandable<uint32_t>& destU32_2)) const;
+
+	/**
+	 * Return the number of reference sequences.
+	 */
+	TIndexOffU numRefs() const {
+		return nrefs_;
+	}
+
+	/**
+	 * Return the approximate length of a reference sequence (it might leave
+	 * off some Ns on the end).
+	 *
+	 * TODO: Is it still true that it might leave off Ns?
+	 */
+	TIndexOffU approxLen(TIndexOffU elt) const {
+		assert_lt(elt, nrefs_);
+		return refLens_[elt];
+	}
+
+	/**
+	 * Return true iff buf_ and all the vectors are populated.
+	 */
+	bool loaded() const {
+		return loaded_;
+	}
+	
+	/**
+	 * Given a reference sequence id, return its offset into the pasted
+	 * reference string; i.e., return the number of unambiguous nucleotides
+	 * preceding it.
+	 */
+	TIndexOffU pastedOffset(TIndexOffU idx) const {
+		return refOffs_[idx];
+	}
+
+	/**
+	 * Parse the input fasta files, populating the szs list and writing the
+	 * .3.ebwt and .4.ebwt portions of the index as we go.
+	 */
+	static std::pair<size_t, size_t>
+	szsFromFasta(
+		EList<FileBuf*>& is,
+		const string& outfile,
+		bool bigEndian,
+		const RefReadInParams& refparams,
+		EList<RefRecord>& szs,
+		bool sanity);
+	
+protected:
+
+	// Lookup table: expands one packed byte (4 bases, 2 bits each) into
+	// four one-byte base codes; used by getStretch's fast path
+	uint32_t byteToU32_[256];
+
+	EList<RefRecord> recs_;       /// records describing unambiguous stretches
+	// following two lists are purely for the binary search in getStretch
+	EList<TIndexOffU> cumUnambig_; // # unambig ref chars up to each record
+	EList<TIndexOffU> cumRefOff_;  // # ref chars up to each record
+	EList<TIndexOffU> refLens_;    /// approx lens of ref seqs (excludes trailing ambig chars)
+	EList<TIndexOffU> refOffs_;    /// buf_ begin offsets per ref seq
+	EList<TIndexOffU> refRecOffs_; /// record begin/end offsets per ref seq
+	uint8_t *buf_;      /// the whole reference as a big bitpacked byte array
+	uint8_t *sanityBuf_;/// for sanity-checking buf_
+	TIndexOffU bufSz_;    /// size of buf_
+	TIndexOffU bufAllocSz_;
+	TIndexOffU nrefs_;    /// the number of reference sequences
+	bool     loaded_;   /// whether it's loaded
+	bool     sanity_;   /// do sanity checking
+	bool     useMm_;    /// load the reference as a memory-mapped file
+	bool     useShmem_; /// load the reference into shared memory
+	bool     verbose_;
+	// Debug-only scratch buffer handed to getStretch for cross-checking
+	ASSERT_ONLY(SStringExpandable<uint32_t> tmp_destU32_);
+};
+
+#endif
diff --git a/scoring.cpp b/scoring.cpp
new file mode 100644
index 0000000..1348821
--- /dev/null
+++ b/scoring.cpp
@@ -0,0 +1,286 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <iostream>
+#include "scoring.h"
+
+using namespace std;
+
+/**
+ * Return true iff a read of length 'rdlen' passes the score filter, i.e.,
+ * has enough characters to rise above the minimum score threshold.
+ *
+ * Best case is assumed to be every character matching at quality 30
+ * (same assumption as maxReadGaps/maxRefGaps below).
+ */
+bool Scoring::scoreFilter(
+	int64_t minsc,
+	size_t rdlen) const
+{
+	// Best possible score: all positions match at quality 30
+	int64_t sc = (int64_t)(rdlen * match(30));
+	return sc >= minsc;
+}
+
+/**
+ * Given the score floor for valid alignments and the length of the read,
+ * calculate the maximum possible number of read gaps that could occur in a
+ * valid alignment.
+ */
+int Scoring::maxReadGaps(
+	int64_t minsc,
+	size_t rdlen) const
+{
+	// Score if all characters match.  TODO: remove assumption that match bonus
+	// is independent of quality value.
+	int64_t sc = (int64_t)(rdlen * match(30));
+	assert_geq(sc, minsc);
+	// Now convert matches to read gaps until sc falls below minsc
+	bool first = true;
+	int num = 0;
+	while(sc >= minsc) {
+		if(first) {
+			first = false;
+			// Subtract both penalties
+			// (readGapOpen() presumably already includes the first
+			// extension penalty — confirm against its definition)
+			sc -= readGapOpen();
+		} else {
+			// Subtract just the extension penalty
+			sc -= readGapExtend();
+		}
+		num++;
+	}
+	assert_gt(num, 0);
+	// Loop overshoots by one: the last gap pushed sc below minsc
+	return num-1;
+}
+
+/**
+ * Given the score floor for valid alignments and the length of the read,
+ * calculate the maximum possible number of reference gaps that could occur
+ * in a valid alignment.
+ */
+int Scoring::maxRefGaps(
+	int64_t minsc,
+	size_t rdlen) const
+{
+	// Score if all characters match.  TODO: remove assumption that match bonus
+	// is independent of quality value.
+	int64_t sc = (int64_t)(rdlen * match(30));
+	assert_geq(sc, minsc);
+	// Now convert matches to ref gaps until sc falls below minsc
+	bool first = true;
+	int num = 0;
+	while(sc >= minsc) {
+		// Unlike a read gap, a ref gap consumes a read character, so we
+		// also forfeit that character's match bonus
+		sc -= match(30);
+		if(first) {
+			first = false;
+			// Subtract both penalties
+			sc -= refGapOpen();
+		} else {
+			// Subtract just the extension penalty
+			sc -= refGapExtend();
+		}
+		num++;
+	}
+	assert_gt(num, 0);
+	// Loop overshoots by one: the last gap pushed sc below minsc
+	return num-1;
+}
+
+/**
+ * Given a read sequence, return true iff the read passes the N filter,
+ * i.e. contains no more than nCeil(read length) ambiguous characters
+ * (base code 4).
+ *
+ * Note: 'ns' is incremented for each N seen but is NOT reset here; the
+ * caller supplies (and may accumulate into) the counter.
+ */
+bool Scoring::nFilter(const BTDnaString& rd, size_t& ns) const {
+	size_t rdlen = rd.length();
+	size_t maxns = nCeil.f<size_t>((double)rdlen);
+	assert_geq(rd.length(), 0);
+	for(size_t i = 0; i < rdlen; i++) {
+		if(rd[i] == 4) {
+			ns++;
+			if(ns > maxns) {
+				return false; // doesn't pass
+			}
+		}
+	}
+	return true; // passes
+}
+
+/**
+ * Given a pair of read sequences, set filt1/filt2 to true iff the
+ * corresponding mate passes the N filter.
+ *
+ * For paired-end reads, there is a question of how to apply the filter.
+ * The filter could be applied to both mates separately, which might then
+ * prevent paired-end alignment.  Or the filter could be applied to the
+ * reads as though they're concatenated together.  The latter approach has
+ * pros and cons.  The pro is that we can use paired-end information to
+ * recover alignments for mates that would not have passed the N filter on
+ * their own.  The con is that we might not want to do that, since the
+ * non-N portion of the bad mate might contain particularly unreliable
+ * information.
+ */
+void Scoring::nFilterPair(
+	const BTDnaString* rd1, // mate 1
+	const BTDnaString* rd2, // mate 2
+	size_t& ns1,            // # Ns in mate 1
+	size_t& ns2,            // # Ns in mate 2
+	bool& filt1,            // true -> mate 1 passes the filter
+	bool& filt2)            // true -> mate 2 passes the filter
+	const
+{
+	// Both fail to pass by default
+	filt1 = filt2 = false;
+	if(rd1 != NULL && rd2 != NULL && ncatpair) {
+		// Filter the mates as though they were one concatenated read:
+		// the ceiling is computed from the combined length and compared
+		// against the combined N count.
+		size_t rdlen1 = rd1->length();
+		size_t rdlen2 = rd2->length();
+		size_t maxns = nCeil.f<size_t>((double)(rdlen1 + rdlen2));
+		for(size_t i = 0; i < rdlen1; i++) {
+			if((*rd1)[i] == 4) ns1++;
+			if(ns1 > maxns) {
+				// doesn't pass
+				return;
+			}
+		}
+		for(size_t i = 0; i < rdlen2; i++) {
+			if((*rd2)[i] == 4) ns2++;
+			// BUGFIX: compare the combined N count (ns1 + ns2) against the
+			// combined ceiling; comparing ns2 alone would let a pair with
+			// many Ns in mate 1 slip through the concatenated filter.
+			if(ns1 + ns2 > maxns) {
+				// doesn't pass
+				return;
+			}
+		}
+		// Both pass
+		filt1 = filt2 = true;
+	} else {
+		// Filter each mate independently
+		if(rd1 != NULL) filt1 = nFilter(*rd1, ns1);
+		if(rd2 != NULL) filt2 = nFilter(*rd2, ns2);
+	}
+}
+
+#ifdef SCORING_MAIN
+
+// Compiled-in self-test for the Scoring class; built only when
+// SCORING_MAIN is defined.
+// NOTE(review): the Scoring construction in Case 2 uses an argument
+// list that does not match the Scoring constructor declared in
+// scoring.h, and Scoring::base1() is not declared there either —
+// confirm this test target still compiles.
+int main() {
+	{
+		cout << "Case 1: Simple 1 ... ";
+		Scoring sc = Scoring::base1();
+		assert_eq(COST_MODEL_CONSTANT, sc.matchType);
+		
+		assert_eq(0, sc.maxRefGaps(0, 10));  // 10 - 1 - 15 = -6
+		assert_eq(0, sc.maxRefGaps(0, 11));  // 11 - 1 - 15 = -5
+		assert_eq(0, sc.maxRefGaps(0, 12));  // 12 - 1 - 15 = -4
+		assert_eq(0, sc.maxRefGaps(0, 13));  // 13 - 1 - 15 = -3
+		assert_eq(0, sc.maxRefGaps(0, 14));  // 14 - 1 - 15 = -2
+		assert_eq(0, sc.maxRefGaps(0, 15));  // 15 - 1 - 15 = -1
+		assert_eq(1, sc.maxRefGaps(0, 16));  // 16 - 1 - 15 =  0
+		assert_eq(1, sc.maxRefGaps(0, 17));  // 17 - 2 - 19 = -4
+		assert_eq(1, sc.maxRefGaps(0, 18));  // 18 - 2 - 19 = -3
+		assert_eq(1, sc.maxRefGaps(0, 19));  // 19 - 2 - 19 = -2
+		assert_eq(1, sc.maxRefGaps(0, 20));  // 20 - 2 - 19 = -1
+		assert_eq(2, sc.maxRefGaps(0, 21));  // 21 - 2 - 19 =  0
+		
+		assert_eq(0, sc.maxReadGaps(0, 10));   // 10 - 0 - 15 = -5
+		assert_eq(0, sc.maxReadGaps(0, 11));   // 11 - 0 - 15 = -4
+		assert_eq(0, sc.maxReadGaps(0, 12));   // 12 - 0 - 15 = -3
+		assert_eq(0, sc.maxReadGaps(0, 13));   // 13 - 0 - 15 = -2
+		assert_eq(0, sc.maxReadGaps(0, 14));   // 14 - 0 - 15 = -1
+		assert_eq(1, sc.maxReadGaps(0, 15));   // 15 - 0 - 15 =  0
+		assert_eq(1, sc.maxReadGaps(0, 16));   // 16 - 0 - 19 = -3
+		assert_eq(1, sc.maxReadGaps(0, 17));   // 17 - 0 - 19 = -2
+		assert_eq(1, sc.maxReadGaps(0, 18));   // 18 - 0 - 19 = -1
+		assert_eq(2, sc.maxReadGaps(0, 19));   // 19 - 0 - 19 =  0
+		assert_eq(2, sc.maxReadGaps(0, 20));   // 20 - 0 - 23 = -3
+		assert_eq(2, sc.maxReadGaps(0, 21));   // 21 - 0 - 23 = -2
+		
+		// N ceiling: const=2, linear=0.1
+		assert_eq(1, sc.nCeil(1));
+		assert_eq(2, sc.nCeil(3));
+		assert_eq(2, sc.nCeil(5));
+		assert_eq(2, sc.nCeil(7));
+		assert_eq(2, sc.nCeil(9));
+		assert_eq(3, sc.nCeil(10));
+		for(int i = 0; i < 30; i++) {
+			assert_eq(3, sc.n(i));
+			assert_eq(3, sc.mm(i));
+		}
+		assert_eq(5, sc.gapbar);
+		cout << "PASSED" << endl;
+	}
+	{
+		cout << "Case 2: Simple 2 ... ";
+		Scoring sc(
+			4,               // reward for a match
+			COST_MODEL_QUAL, // how to penalize mismatches
+			0,               // constant if mm penalty is a constant
+			30,              // penalty for nuc mm in decoded colorspace als
+			-3.0f,           // constant coeff for minimum score
+			-3.0f,           // linear coeff for minimum score
+			DEFAULT_FLOOR_CONST,  // constant coeff for score floor
+			DEFAULT_FLOOR_LINEAR, // linear coeff for score floor
+			3.0f,            // max # ref Ns allowed in alignment; const coeff
+			0.4f,            // max # ref Ns allowed in alignment; linear coeff
+			COST_MODEL_QUAL, // how to penalize Ns in the read
+			0,               // constant if N penalty is a constant
+			true,            // whether to concatenate mates before N filtering
+			25,              // constant coeff for cost of gap in the read
+			25,              // constant coeff for cost of gap in the ref
+			10,              // coeff of linear term for cost of gap in read
+			10,              // coeff of linear term for cost of gap in ref
+			5,               // 5 rows @ top/bot diagonal-entrance-only
+			-1,              // no restriction on row
+			false            // score prioritized over row
+		);
+
+		assert_eq(COST_MODEL_CONSTANT, sc.matchType);
+		assert_eq(4, sc.matchConst);
+		assert_eq(COST_MODEL_QUAL, sc.mmcostType);
+		assert_eq(COST_MODEL_QUAL, sc.npenType);
+		
+		assert_eq(0, sc.maxRefGaps(0, 8));  // 32 - 4 - 35 = -7
+		assert_eq(0, sc.maxRefGaps(0, 9));  // 36 - 4 - 35 = -3
+		assert_eq(1, sc.maxRefGaps(0, 10)); // 40 - 4 - 35 =  1
+		assert_eq(1, sc.maxRefGaps(0, 11)); // 44 - 8 - 45 = -9
+		assert_eq(1, sc.maxRefGaps(0, 12)); // 48 - 8 - 45 = -5
+		assert_eq(1, sc.maxRefGaps(0, 13)); // 52 - 8 - 45 = -1
+		assert_eq(2, sc.maxRefGaps(0, 14)); // 56 - 8 - 45 =  3
+		
+		assert_eq(0, sc.maxReadGaps(0, 8));   // 32 - 0 - 35 = -3
+		assert_eq(1, sc.maxReadGaps(0, 9));   // 36 - 0 - 35 =  1
+		assert_eq(1, sc.maxReadGaps(0, 10));  // 40 - 0 - 45 = -5
+		assert_eq(1, sc.maxReadGaps(0, 11));  // 44 - 0 - 45 = -1
+		assert_eq(2, sc.maxReadGaps(0, 12));  // 48 - 0 - 45 =  3
+		assert_eq(2, sc.maxReadGaps(0, 13));  // 52 - 0 - 55 = -3
+		assert_eq(3, sc.maxReadGaps(0, 14));  // 56 - 0 - 55 =  1
+
+		// N ceiling: const=3, linear=0.4
+		assert_eq(1, sc.nCeil(1));
+		assert_eq(2, sc.nCeil(2));
+		assert_eq(3, sc.nCeil(3));
+		assert_eq(4, sc.nCeil(4));
+		assert_eq(5, sc.nCeil(5));
+		assert_eq(5, sc.nCeil(6));
+		assert_eq(5, sc.nCeil(7));
+		assert_eq(6, sc.nCeil(8));
+		assert_eq(6, sc.nCeil(9));
+
+		for(int i = 0; i < 256; i++) {
+			assert_eq(i, sc.n(i));
+			assert_eq(i, sc.mm(i));
+		}
+
+		assert_eq(5, sc.gapbar);
+
+		cout << "PASSED" << endl;
+	}
+}
+
+#endif /*def SCORING_MAIN*/
diff --git a/scoring.h b/scoring.h
new file mode 100644
index 0000000..63b093b
--- /dev/null
+++ b/scoring.h
@@ -0,0 +1,519 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef SCORING_H_
+#define SCORING_H_
+
+#include <limits>
+#include "qual.h"
+#include "simple_func.h"
+
+// Default type of bonus to added for matches
+#define DEFAULT_MATCH_BONUS_TYPE COST_MODEL_CONSTANT
+// When match bonus type is constant, use this constant
+#define DEFAULT_MATCH_BONUS 0
+// Same settings but different defaults for --local mode
+#define DEFAULT_MATCH_BONUS_TYPE_LOCAL COST_MODEL_CONSTANT
+#define DEFAULT_MATCH_BONUS_LOCAL 2
+
+// Default type of penalty to assess against mismatches
+#define DEFAULT_MM_PENALTY_TYPE COST_MODEL_QUAL
+// Default type of penalty to assess against mismatches
+#define DEFAULT_MM_PENALTY_TYPE_IGNORE_QUALS COST_MODEL_CONSTANT
+// When mismatch penalty type is constant, use this constant
+#define DEFAULT_MM_PENALTY_MAX 6
+#define DEFAULT_MM_PENALTY_MIN 2
+
+// Default type of penalty to assess against mismatches
+#define DEFAULT_N_PENALTY_TYPE COST_MODEL_CONSTANT
+// When mismatch penalty type is constant, use this constant
+#define DEFAULT_N_PENALTY 1
+
+// Constant coefficient b in linear function f(x) = ax + b determining
+// minimum valid score f when read length is x
+#define DEFAULT_MIN_CONST (-0.6f)
+// Linear coefficient a
+#define DEFAULT_MIN_LINEAR (-0.6f)
+// Different defaults for --local mode
+#define DEFAULT_MIN_CONST_LOCAL (0.0f)
+#define DEFAULT_MIN_LINEAR_LOCAL (10.0f)
+
+// Constant coefficient b in linear function f(x) = ax + b determining
+// maximum permitted number of Ns f in a read before it is filtered &
+// the maximum number of Ns in an alignment before it is considered
+// invalid.
+#define DEFAULT_N_CEIL_CONST 0.0f
+// Linear coefficient a
+#define DEFAULT_N_CEIL_LINEAR 0.15f
+
+// Default for whether to concatenate mates before the N filter (as opposed to
+// filting each mate separately)
+#define DEFAULT_N_CAT_PAIR false
+
+// Default read gap penalties for when homopolymer calling is reliable	
+#define DEFAULT_READ_GAP_CONST 5
+#define DEFAULT_READ_GAP_LINEAR 3
+
+// Default read gap penalties for when homopolymer calling is not reliable
+#define DEFAULT_READ_GAP_CONST_BADHPOLY 3
+#define DEFAULT_READ_GAP_LINEAR_BADHPOLY 1
+
+// Default reference gap penalties for when homopolymer calling is reliable
+#define DEFAULT_REF_GAP_CONST 5
+#define DEFAULT_REF_GAP_LINEAR 3
+
+// Default reference gap penalties for when homopolymer calling is not reliable
+#define DEFAULT_REF_GAP_CONST_BADHPOLY 3
+#define DEFAULT_REF_GAP_LINEAR_BADHPOLY 1
+
+// Ways of mapping a base's quality value to a penalty or bonus
+enum {
+	COST_MODEL_ROUNDED_QUAL = 1, // penalty from quality via the qualRounds[] table
+	COST_MODEL_QUAL,             // penalty interpolated from the quality value
+	COST_MODEL_CONSTANT          // penalty is a fixed constant
+};
+
+/**
+ * How to penalize various types of sequence dissimilarity, and other settings
+ * that govern how dynamic programming tables should be filled in and how to
+ * backtrace to find solutions.
+ */
+class Scoring {
+
+	/**
+	 * Init an array that maps quality to penalty or bonus according to 'type'
+	 * and the constants 'consMin'/'consMax'.
+	 *
+	 * COST_MODEL_QUAL interpolates linearly between consMin and consMax
+	 * over quality values 0..40 (qualities above 40 are clamped);
+	 * COST_MODEL_CONSTANT fills the table with consMax; an unknown type
+	 * throws.
+	 */
+	template<typename T>
+	void initPens(
+		T *pens,     // array to fill
+		int type,    // penalty type; qual | rounded qual | constant
+		int consMin, // constant for when penalty type is constant
+		int consMax) // constant for when penalty type is constant
+	{
+		if(type == COST_MODEL_ROUNDED_QUAL) {
+			for(int i = 0; i < 256; i++) {
+				pens[i] = (T)qualRounds[i];
+			}
+		} else if(type == COST_MODEL_QUAL) {
+			assert_neq(consMin, 0);
+			assert_neq(consMax, 0);
+			for(int i = 0; i < 256; i++) {
+				int ii = min(i, 40); // TODO: Bit hacky, this
+				float frac = (float)ii / 40.0f;
+				pens[i] = consMin + (T)(frac * (consMax-consMin));
+				assert_gt(pens[i], 0);
+				//if(pens[i] == 0) {
+				//	pens[i] = ((consMax > 0) ? (T)1 : (T)-1);
+				//}
+			}
+		} else if(type == COST_MODEL_CONSTANT) {
+			for(int i = 0; i < 256; i++) {
+				pens[i] = (T)consMax;
+			}
+		} else {
+			throw 1;
+		}
+	}
+
+public:
+
+	Scoring(
+		int   mat,          // reward for a match
+		int   mmcType,      // how to penalize mismatches
+	    int   mmpMax_,      // maximum mismatch penalty
+	    int   mmpMin_,      // minimum mismatch penalty
+		const SimpleFunc& scoreMin_,   // minimum score for valid alignment; const coeff
+		const SimpleFunc& nCeil_,      // max # ref Ns allowed in alignment; const coeff
+	    int   nType,        // how to penalize Ns in the read
+	    int   n,            // constant if N penalty is a constant
+		bool  ncat,         // whether to concatenate mates before N filtering
+	    int   rdGpConst,    // constant coeff for cost of gap in the read
+	    int   rfGpConst,    // constant coeff for cost of gap in the ref
+	    int   rdGpLinear,   // coeff of linear term for cost of gap in read
+	    int   rfGpLinear,   // coeff of linear term for cost of gap in ref
+		int   gapbar_,      // # rows at top/bot can only be entered diagonally
+        int   cp_ = 0,      // canonical splicing penalty
+        int   ncp_ = 12,    // non-canonical splicing penalty
+        int   csp_ = 24,    // conflicting splice site penalty
+        const SimpleFunc* ip_ = NULL)      // penalty as to intron length
+	{
+		matchType    = COST_MODEL_CONSTANT;
+		matchConst   = mat;
+		mmcostType   = mmcType;
+		mmpMax       = mmpMax_;
+		mmpMin       = mmpMin_;
+		scoreMin     = scoreMin_;
+		nCeil        = nCeil_;
+		npenType     = nType;
+		npen         = n;
+		ncatpair     = ncat;
+		rdGapConst   = rdGpConst;
+		rfGapConst   = rfGpConst;
+		rdGapLinear  = rdGpLinear;
+		rfGapLinear  = rfGpLinear;
+		// Qualities only matter when the mismatch penalty varies with them
+		qualsMatter_ = mmcostType != COST_MODEL_CONSTANT;
+		gapbar       = gapbar_;
+		// Monotone (non-increasing) scores iff matches earn no bonus
+		monotone     = matchType == COST_MODEL_CONSTANT && matchConst == 0;
+		// Precompute per-quality penalty/bonus lookup tables
+		initPens<int>(mmpens, mmcostType, mmpMin_, mmpMax_);
+		initPens<int>(npens, npenType, npen, npen);
+		initPens<float>(matchBonuses, matchType, matchConst, matchConst);
+        cp = cp_;
+        ncp = ncp_;
+        csp = csp_;
+        if(ip_ != NULL) ip = *ip_;
+		assert(repOk());
+	}
+	
+	/**
+	 * Set a constant match bonus and rebuild the bonus lookup table.
+	 */
+	void setMatchBonus(int bonus) {
+		matchType  = COST_MODEL_CONSTANT;
+		matchConst = bonus;
+		initPens<float>(matchBonuses, matchType, matchConst, matchConst);
+		assert(repOk());
+	}
+	
+	/**
+	 * Set the mismatch penalty type and range, rebuilding the lookup table.
+	 * NOTE(review): unlike setMatchBonus, this does not re-assert repOk().
+	 */
+	void setMmPen(int mmType_, int mmpMax_, int mmpMin_) {
+		mmcostType = mmType_;
+		mmpMax     = mmpMax_;
+		mmpMin     = mmpMin_;
+		initPens<int>(mmpens, mmcostType, mmpMin, mmpMax);
+	}
+	
+	/**
+	 * Set the N penalty type and constant, rebuilding the lookup table.
+	 */
+	void setNPen(int nType, int n) {
+		npenType     = nType;
+		npen         = n;
+		initPens<int>(npens, npenType, npen, npen);
+	}
+	
+#ifndef NDEBUG
+	/**
+	 * Check that scoring scheme is internally consistent (debug builds
+	 * only): match bonus non-negative, all gap penalties positive.
+	 */
+	bool repOk() const {
+		assert_geq(matchConst, 0);
+		assert_gt(rdGapConst, 0);
+		assert_gt(rdGapLinear, 0);
+		assert_gt(rfGapConst, 0);
+		assert_gt(rfGapLinear, 0);
+        return true;
+	}
+#endif
+
+	/**
+	 * Return a linear function of x where 'cnst' is the constant coefficient
+	 * and 'lin' is the linear coefficient (computed in double precision).
+	 */
+	static float linearFunc(int64_t x, float cnst, float lin) {
+		return (float)((double)cnst + ((double)lin * x));
+	}
+
+	/**
+	 * Return the penalty incurred by a mismatch at an alignment column
+	 * with read character 'rdc', reference mask 'refm' and quality 'q'.
+	 * A read character > 3 or a reference mask > 15 denotes an N, which
+	 * is charged the N penalty rather than the mismatch penalty.
+	 *
+	 * qs should be clamped to 63 on the high end before this query.
+	 */
+	inline int mm(int rdc, int refm, int q) const {
+		assert_range(0, 255, q);
+		return (rdc > 3 || refm > 15) ? npens[q] : mmpens[q];
+	}
+	
+	/**
+	 * Return the score of the given read character with the given quality
+	 * aligning to the given reference mask.  Take Ns into account.
+	 * Positive result = match bonus; negative result = mismatch or N
+	 * penalty.  The reference mask is a 4-bit set; bit 'rdc' set means
+	 * the read character matches.
+	 */
+	inline int score(int rdc, int refm, int q) const {
+		assert_range(0, 255, q);
+		if(rdc > 3 || refm > 15) {
+			return -npens[q];
+		}
+		if((refm & (1 << rdc)) != 0) {
+			return (int)matchBonuses[q];
+		} else {
+			return -mmpens[q];
+		}
+	}
+
+	/**
+	 * Return the score of the given read character with the given quality
+	 * aligning to the given reference mask.  Take Ns into account.  Increment
+	 * the caller's counter 'ns' when the column involves an N (read char > 3
+	 * or reference mask > 15); otherwise identical to the 3-arg overload.
+	 */
+	inline int score(int rdc, int refm, int q, int& ns) const {
+		assert_range(0, 255, q);
+		if(rdc > 3 || refm > 15) {
+			ns++;
+			return -npens[q];
+		}
+		if((refm & (1 << rdc)) != 0) {
+			return (int)matchBonuses[q];
+		} else {
+			return -mmpens[q];
+		}
+	}
+
+	/**
+	 * Return the penalty incurred by a mismatch at an alignment column
+	 * with read character 'rdc' and quality 'q'.  We assume the
+	 * reference character is non-N; only the read character can be an N
+	 * (rdc > 3), in which case the N penalty applies.
+	 */
+	inline int mm(int rdc, int q) const {
+		assert_range(0, 255, q);
+		return (rdc > 3) ? npens[q] : mmpens[q];
+	}
+	
+	/**
+	 * Return the marginal penalty incurred by a mismatch at a read
+	 * position with quality 'q'.  Qualities >= 255 are clamped to the
+	 * last table entry.
+	 */
+	inline int mm(int q) const {
+		assert_geq(q, 0);
+		return q < 255 ? mmpens[q] : mmpens[255];
+	}
+
+	/**
+	 * Return the match bonus for a read position with quality 30.
+	 * (NOTE: the original comment claimed this was a mismatch penalty;
+	 * it delegates to match(q), which reads matchBonuses[].)
+	 */
+	inline int64_t match() const {
+		return match(30);
+	}
+
+	/**
+	 * Return the match bonus for a read position with quality 'q'.
+	 * The bonus table is float-valued, so 0.5f is added before the
+	 * integer truncation to round to nearest.  Qualities >= 255 are
+	 * clamped to the last table entry.
+	 */
+	inline int64_t match(int q) const {
+		assert_geq(q, 0);
+		return (int64_t)((q < 255 ? matchBonuses[q] : matchBonuses[255]) + 0.5f);
+	}
+	
+	/**
+	 * Return the best score achievable by a read of length 'rdlen'.
+	 * In monotone mode (scores can only decrease) the best possible
+	 * score is 0; otherwise it is the quality-30 match bonus times the
+	 * read length.
+	 */
+	inline int64_t perfectScore(size_t rdlen) const {
+		if(monotone) {
+			return 0;
+		} else {
+			return rdlen * match(30);
+		}
+	}
+
+	/**
+	 * Return true iff the penalties are such that two reads with the
+	 * same sequence but different qualities might yield different
+	 * alignments (i.e. the mismatch cost model is not a constant).
+	 */
+	inline bool qualitiesMatter() const { return qualsMatter_; }
+	
+	/**
+	 * Return the marginal penalty incurred by an N mismatch at a read
+	 * position with quality 'q'.  Qualities >= 255 are clamped to the
+	 * last table entry.
+	 */
+	inline int n(int q) const {
+		assert_geq(q, 0);
+		return q < 255 ? npens[q] : npens[255];
+	}
+
+	
+	/**
+	 * Return the marginal penalty incurred by a gap in the read,
+	 * given that this is the 'ext'th extension of the gap (0 = open,
+	 * 1 = first, etc).  Opening costs readGapOpen(); each further
+	 * extension costs readGapExtend().
+	 */
+	inline int ins(int ext) const {
+		assert_geq(ext, 0);
+		if(ext == 0) return readGapOpen();
+		return readGapExtend();
+	}
+
+	/**
+	 * Return the marginal penalty incurred by a gap in the reference,
+	 * given that this is the 'ext'th extension of the gap (0 = open,
+	 * 1 = first, etc).  Opening costs refGapOpen(); each further
+	 * extension costs refGapExtend().
+	 */
+	inline int del(int ext) const {
+		assert_geq(ext, 0);
+		if(ext == 0) return refGapOpen();
+		return refGapExtend();
+	}
+
+	/**
+	 * Return true iff a read of length 'rdlen' passes the score filter, i.e.,
+	 * has enough characters to rise above the minimum score threshold.
+	 */
+	bool scoreFilter(
+		int64_t minsc,
+		size_t rdlen) const;
+
+	/**
+	 * Given the score floor for valid alignments and the length of the read,
+	 * calculate the maximum possible number of read gaps that could occur in a
+	 * valid alignment.
+	 */
+	int maxReadGaps(
+		int64_t minsc,
+		size_t rdlen) const;
+
+	/**
+	 * Given the score floor for valid alignments and the length of the read,
+	 * calculate the maximum possible number of reference gaps that could occur
+	 * in a valid alignment.
+	 */
+	int maxRefGaps(
+		int64_t minsc,
+		size_t rdlen) const;
+    
+	/**
+	 * Given a read sequence, return true iff the read passes the N filter.
+	 * The N filter rejects reads with more than the number of Ns calculated by
+	 * taking nCeilConst + nCeilLinear * read length.
+	 */
+	bool nFilter(const BTDnaString& rd, size_t& ns) const;
+
+	/**
+	 * Given a read sequence, return true iff the read passes the N filter.
+	 * The N filter rejects reads with more than the number of Ns calculated by
+	 * taking nCeilConst + nCeilLinear * read length.
+	 *
+	 * For paired-end reads, there is a	question of how to apply the filter.
+	 * The filter could be applied to both mates separately, which might then
+	 * prevent paired-end alignment.  Or the filter could be applied to the
+	 * reads as though they're concatenated together.  The latter approach has
+	 * pros and cons.  The pro is that we can use paired-end information to
+	 * recover alignments for mates that would not have passed the N filter on
+	 * their own.  The con is that we might not want to do that, since the
+	 * non-N portion of the bad mate might contain particularly unreliable
+	 * information.
+	 */
+	void nFilterPair(
+		const BTDnaString* rd1, // mate 1
+		const BTDnaString* rd2, // mate 2
+		size_t& ns1,            // # Ns in mate 1
+		size_t& ns2,            // # Ns in mate 2
+		bool& filt1,            // true -> mate 1 rejected by filter
+		bool& filt2)            // true -> mate 2 rejected by filter
+		const;
+	
+	/**
+	 * The penalty associated with opening a new read gap:
+	 * constant term plus one linear (extension) term.
+	 */
+	inline int readGapOpen() const { 
+		return rdGapConst + rdGapLinear;
+	}
+
+	/**
+	 * The penalty associated with opening a new ref gap:
+	 * constant term plus one linear (extension) term.
+	 */
+	inline int refGapOpen() const { 
+		return rfGapConst + rfGapLinear;
+	}
+
+	/**
+	 * The penalty associated with extending a read gap by one character
+	 * (the linear coefficient alone).
+	 */
+	inline int readGapExtend() const { 
+		return rdGapLinear;
+	}
+
+	/**
+	 * The penalty associated with extending a ref gap by one character
+	 * (the linear coefficient alone).
+	 */
+	inline int refGapExtend() const { 
+		return rfGapLinear;
+	}
+    
+    // avg. known score: -22.96, avg. random score: -33.70
+    /**
+     * Penalty for a canonical splice site: the intron-length penalty
+     * (function 'ip' applied to intronlen, floored at 0) plus the constant
+     * 'cp'.  Returns a prohibitive 10000 when the anchor is short
+     * (minanchor < 10) and the probability score is too low.
+     */
+    inline int canSpl(int intronlen = 0, int minanchor = 100, float probscore = 0.0f) const {
+        int penintron = (intronlen > 0 ? ip.f<int>((double)intronlen) : 0);
+        if(penintron < 0) penintron = 0;
+        if(minanchor < 10 && probscore < -24.0f + (10 - minanchor)) {
+            return 10000;
+        }
+        return penintron + cp;
+    }
+    
+    /**
+     * Penalty for a non-canonical splice site: prohibitive (10000) when
+     * the anchor is shorter than 14; otherwise the intron-length penalty
+     * (floored at 0) plus the constant 'ncp'.  Note 'probscore' is unused.
+     */
+    inline int noncanSpl(int intronlen = 0, int minanchor = 100, float probscore = 0.0f) const {
+        if(minanchor < 14) return 10000;
+        int penintron = (intronlen > 0 ? ip.f<int>((double)intronlen) : 0);
+        if(penintron < 0) penintron = 0;
+        return penintron + ncp;
+    }
+    
+    // Constant penalty charged when splice-site evidence conflicts.
+    inline int conflictSpl() const { return csp; }
+
+	int     matchType;    // how to reward matches
+	int     matchConst;   // reward for a match
+	int     mmcostType;   // based on qual? rounded? just a constant?
+	int     mmpMax;       // maximum mismatch penalty
+	int     mmpMin;       // minimum mismatch penalty
+	SimpleFunc scoreMin;  // minimum score for valid alignment, constant coeff
+	SimpleFunc nCeil;     // max # Ns involved in alignment, constant coeff
+	int     npenType;     // N: based on qual? rounded? just a constant?
+	int     npen;         // N: if mmcosttype=constant, this is the const
+	bool    ncatpair;     // true -> do N filtering on concated pair
+	int     rdGapConst;   // constant term coeffecient in extend cost
+	int     rfGapConst;   // constant term coeffecient in extend cost
+	int     rdGapLinear;  // linear term coeffecient in extend cost
+	int     rfGapLinear;  // linear term coeffecient in extend cost
+	int     gapbar;       // # rows at top/bot can only be entered diagonally
+	bool    monotone;     // scores can only go down?
+	float   matchBonuses[256]; // map from qualities to match bonus
+	int     mmpens[256];       // map from qualities to mm penalty
+	int     npens[256];        // map from N qualities to penalty
+    int     cp;           // canonical splicing penalty
+    int     ncp;          // non-canonical splicing penalty
+    int     csp;          // conflicting splice site penalty
+    SimpleFunc     ip;           // intron length penalty
+
+	/**
+	 * Return a Scoring object with a baseline "base1" parameterization:
+	 * match = +1, constant mismatch/N penalty = 3, gaps cost 11 to open
+	 * (constant) + 4 per base (linear), score min = 37 + 0.3x.
+	 */
+	static Scoring base1() {
+		const double DMAX = std::numeric_limits<double>::max();
+		SimpleFunc scoreMin(SIMPLE_FUNC_LINEAR, 0.0f, DMAX, 37.0f, 0.3f);
+		SimpleFunc nCeil(SIMPLE_FUNC_LINEAR, 0.0f, DMAX, 2.0f, 0.1f);
+		return Scoring(
+			1,                       // reward for a match
+			COST_MODEL_CONSTANT,     // how to penalize mismatches
+			3,                       // max mismatch penalty
+			3,                       // min mismatch penalty
+			scoreMin,                // score min: 37 + 0.3x
+			nCeil,                   // n ceiling: 2 + 0.1x
+			COST_MODEL_CONSTANT,     // how to penalize Ns in the read
+			3,                       // constant if N penalty is a constant
+			false,                   // concatenate mates before N filtering?
+			11,                      // constant coeff for gap in read
+			11,                      // constant coeff for gap in ref
+			4,                       // linear coeff for gap in read
+			4,                       // linear coeff for gap in ref
+			5);                      // 5 rows @ top/bot diagonal-entrance-only
+	}
+
+protected:
+
+	bool qualsMatter_;
+};
+
+#endif /*SCORING_H_*/
diff --git a/search_globals.h b/search_globals.h
new file mode 100644
index 0000000..bd2704f
--- /dev/null
+++ b/search_globals.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef SEARCH_GLOBALS_H_
+#define SEARCH_GLOBALS_H_
+
+#include <stdint.h>
+
+// declared in ebwt_search.cpp
+extern bool     gColor;
+extern bool     gColorExEnds;
+extern bool     gReportOverhangs;
+extern bool     gColorSeq;
+extern bool     gColorEdit;
+extern bool     gColorQual;
+extern bool     gNoMaqRound;
+extern bool     gStrandFix;
+extern bool     gRangeMode;
+extern int      gVerbose;
+extern int      gQuiet;
+extern bool     gNofw;
+extern bool     gNorc;
+extern bool     gMate1fw;
+extern bool     gMate2fw;
+extern int      gMinInsert;
+extern int      gMaxInsert;
+extern int      gTrim5;
+extern int      gTrim3;
+extern int      gGapBarrier;
+extern int      gAllowRedundant;
+
+#endif /* SEARCH_GLOBALS_H_ */
diff --git a/sequence_io.h b/sequence_io.h
new file mode 100644
index 0000000..5a2cd6f
--- /dev/null
+++ b/sequence_io.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef SEQUENCE_IO_H_
+#define SEQUENCE_IO_H_
+
+#include <string>
+#include <stdexcept>
+#include <fstream>
+#include <stdio.h>
+#include "assert_helpers.h"
+#include "ds.h"
+#include "filebuf.h"
+#include "sstring.h"
+
+using namespace std;
+
+/**
+ * Parse the fasta file 'infile'.  Store the length of each record's name
+ * in 'namelens' and the length of each record's sequence in 'seqlens',
+ * one entry per successfully parsed record.  Records whose sequence
+ * length comes back 0 are treated as end-of-file noise and discarded.
+ * Throws 1 if the file cannot be opened.
+ */
+template<typename TFnStr>
+static void parseFastaLens(
+	const TFnStr&  infile,   // filename
+	EList<size_t>& namelens, // destination for fasta name lengths
+	EList<size_t>& seqlens)  // destination for fasta sequence lengths
+{
+	FILE *in = fopen(sstr_to_cstr(infile), "r");
+	if(in == NULL) {
+		cerr << "Could not open sequence file" << endl;
+		throw 1;
+	}
+	FileBuf fb(in);
+	while(!fb.eof()) {
+		namelens.expand(); namelens.back() = 0;
+		seqlens.expand();  seqlens.back() = 0;
+		fb.parseFastaRecordLength(namelens.back(), seqlens.back());
+		if(seqlens.back() == 0) {
+			// Couldn't read a record.  We're probably done with this file.
+			namelens.pop_back();
+			seqlens.pop_back();
+			continue;
+		}
+	}
+	fb.close();
+}
+
+/**
+ * Parse the fasta file 'infile'.  Store each name record in 'names', each
+ * sequence record in 'seqs', and append the per-record lengths to
+ * 'namelens'/'seqlens'.  A first pass (parseFastaLens) gathers the lengths
+ * used to size the buffer allocated for each record.
+ *
+ * NOTE(review): buffers for records that come back empty are popped
+ * without an explicit delete[]; presumably TNameStr/TSeqStr take
+ * ownership — confirm before changing.
+ */
+template<typename TFnStr, typename TNameStr, typename TSeqStr>
+static void parseFasta(
+	const TFnStr&    infile,   // filename
+	EList<TNameStr>& names,    // destination for fasta names
+	EList<size_t>&   namelens, // destination for fasta name lengths
+	EList<TSeqStr>&  seqs,     // destination for fasta sequences
+	EList<size_t>&   seqlens)  // destination for fasta sequence lengths
+{
+	assert_eq(namelens.size(), seqlens.size());
+	assert_eq(names.size(),    namelens.size());
+	assert_eq(seqs.size(),     seqlens.size());
+	size_t cur = namelens.size();
+	parseFastaLens(infile, namelens, seqlens);
+	FILE *in = fopen(sstr_to_cstr(infile), "r");
+	if(in == NULL) {
+		cerr << "Could not open sequence file" << endl;
+		throw 1;
+	}
+	FileBuf fb(in);
+	// Bound the loop by the number of length records gathered in the
+	// first pass so namelens[cur]/seqlens[cur] can never be read out of
+	// range.
+	while(!fb.eof() && cur < namelens.size()) {
+		// Add a new empty record to the end
+		names.expand();
+		seqs.expand();
+		names.back() = new char[namelens[cur]+1];
+		seqs.back() = new char[seqlens[cur]+1];
+		fb.parseFastaRecord(names.back(), seqs.back());
+		if(seqs.back().empty()) {
+			// Couldn't read a record.  We're probably done with this file.
+			names.pop_back();
+			seqs.pop_back();
+			continue;
+		}
+		// Bug fix: advance to the next record's lengths.  Previously
+		// 'cur' was never incremented, so every record reused the FIRST
+		// record's buffer sizes — a heap overflow for any later record
+		// longer than the first.
+		cur++;
+	}
+	fb.close();
+}
+
+/**
+ * Read a set of FASTA sequence files.  For every file in 'infiles', parse
+ * its records and append the extracted names, sequences and their lengths
+ * to 'names', 'seqs', 'namelens' and 'seqlens' via parseFasta().
+ */
+template <typename TFnStr, typename TNameStr, typename TSeqStr>
+static void parseFastas(
+	const EList<TFnStr>& infiles, // filenames
+	EList<TNameStr>& names,    // destination for fasta names
+	EList<size_t>&   namelens, // destination for fasta name lengths
+	EList<TSeqStr>&  seqs,     // destination for fasta sequences
+	EList<size_t>&   seqlens)  // destination for fasta sequence lengths
+{
+	for(size_t i = 0; i < infiles.size(); i++) {
+		parseFasta<TFnStr, TNameStr, TSeqStr>(
+			infiles[i],
+			names,
+			namelens,
+			seqs,
+			seqlens);
+	}
+}
+
+#endif /*SEQUENCE_IO_H_*/
diff --git a/shmem.cpp b/shmem.cpp
new file mode 100644
index 0000000..a4853e7
--- /dev/null
+++ b/shmem.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifdef BOWTIE_SHARED_MEM
+
+#include <iostream>
+#include <string>
+#include <unistd.h>
+#include <sys/shm.h>
+#include <errno.h>
+#include "shmem.h"
+
+using namespace std;
+
+/**
+ * Notify other users of a shared-memory chunk that the leader has
+ * finished initializing it: writes the SHMEM_INIT magic word into the
+ * 4 synchronization bytes located just past the end of the payload
+ * (see allocSharedMem, which reserves them).
+ */
+void notifySharedMem(void *mem, size_t len) {
+	((volatile uint32_t*)((char*)mem + len))[0] = SHMEM_INIT;
+}
+
+/**
+ * Wait until the leader of a shared-memory chunk has finished
+ * initializing it.  Polls the 4-byte magic word past the end of the
+ * payload once per second until it reads SHMEM_INIT.
+ */
+void waitSharedMem(void *mem, size_t len) {
+	while(((volatile uint32_t*)((char*)mem + len))[0] != SHMEM_INIT) {
+		sleep(1);
+	}
+}
+
+#endif
diff --git a/shmem.h b/shmem.h
new file mode 100644
index 0000000..b36f3ad
--- /dev/null
+++ b/shmem.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef SHMEM_H_
+#define SHMEM_H_
+
+#ifdef BOWTIE_SHARED_MEM
+
+#include <string>
+#include <sys/shm.h>
+#include <unistd.h>
+#include <sys/shm.h>
+#include <errno.h>
+#include <stdint.h>
+#include <stdexcept>
+#include "str_util.h"
+#include "btypes.h"
+
+extern void notifySharedMem(void *mem, size_t len);
+
+extern void waitSharedMem(void *mem, size_t len);
+
+#define ALLOC_SHARED_U allocSharedMem<TIndexOffU>
+#define ALLOC_SHARED_U8 allocSharedMem<uint8_t>
+#define ALLOC_SHARED_U32 allocSharedMem<uint32_t>
+#define FREE_SHARED shmdt
+#define NOTIFY_SHARED notifySharedMem
+#define WAIT_SHARED waitSharedMem
+
+#define SHMEM_UNINIT  0xafba4242
+#define SHMEM_INIT    0xffaa6161
+
+/**
+ * Tries to allocate a shared-memory chunk for a given file of a given size.
+ *
+ * The SysV segment is keyed by a hash of 'fname' and is len + 4 bytes
+ * long; the trailing 4 bytes hold a magic word used to synchronize
+ * initialization across processes (see notifySharedMem/waitSharedMem).
+ * A pre-existing segment with a mismatched size is removed (IPC_RMID)
+ * and the allocation retried.
+ *
+ * Returns true iff the calling process created the segment and the
+ * magic word is not yet SHMEM_INIT — i.e. the caller is the leader and
+ * must populate the chunk.  Returns false when attached to a segment
+ * created by another process.  Throws 1 on unrecoverable errors.
+ */
+template <typename T>
+bool allocSharedMem(std::string fname,
+                    size_t len,
+                    T ** dst,
+                    const char *memName,
+                    bool verbose)
+{
+	using namespace std;
+	int shmid = -1;
+	// Calculate key given string
+	key_t key = (key_t)hash_string(fname);
+	shmid_ds ds;
+	int ret;
+	// Reserve 4 bytes at the end for silly synchronization
+	size_t shmemLen = len + 4;
+	if(verbose) {
+		cerr << "Reading " << len << "+4 bytes into shared memory for " << memName << endl;
+	}
+	T *ptr = NULL;
+	while(true) {
+		// Create the shared-memory block (world read/write, created if absent)
+		if((shmid = shmget(key, shmemLen, IPC_CREAT | 0666)) < 0) {
+			if(errno == ENOMEM) {
+				cerr << "Out of memory allocating shared area " << memName << endl;
+			} else if(errno == EACCES) {
+				cerr << "EACCES" << endl;
+			} else if(errno == EEXIST) {
+				cerr << "EEXIST" << endl;
+			} else if(errno == EINVAL) {
+				// Existing segment is a different size; remove it and retry
+				cerr << "Warning: shared-memory chunk's segment size doesn't match expected size (" << (shmemLen) << ")" << endl
+					 << "Deleteing old shared memory block and trying again." << endl;
+				shmid = shmget(key, 0, 0);
+				if((ret = shmctl(shmid, IPC_RMID, &ds)) < 0) {
+					cerr << "shmctl returned " << ret
+						 << " for IPC_RMID, errno is " << errno
+						 << ", shmid is " << shmid << endl;
+					throw 1;
+				} else {
+					cerr << "Deleted shared mem chunk with shmid " << shmid << endl;
+				}
+				continue;
+			} else if(errno == ENOENT) {
+				cerr << "ENOENT" << endl;
+			} else if(errno == ENOSPC) {
+				cerr << "ENOSPC" << endl;
+			} else {
+				cerr << "shmget returned " << shmid << " for and errno is " << errno << endl;
+			}
+			throw 1;
+		}
+		ptr = (T*)shmat(shmid, 0, 0);
+		if(ptr == (void*)-1) {
+			cerr << "Failed to attach " << memName << " to shared memory with shmat()." << endl;
+			throw 1;
+		}
+		if(ptr == NULL) {
+			cerr << memName << " pointer returned by shmat() was NULL." << endl;
+			throw 1;
+		}
+		// Did I create it, or did I just attach to one created by
+		// another process?
+		if((ret = shmctl(shmid, IPC_STAT, &ds)) < 0) {
+			cerr << "shmctl returned " << ret << " for IPC_STAT and errno is " << errno << endl;
+			throw 1;
+		}
+		if(ds.shm_segsz != shmemLen) {
+			// Stale segment with the wrong size: remove it and loop again
+			cerr << "Warning: shared-memory chunk's segment size (" << ds.shm_segsz
+				 << ") doesn't match expected size (" << shmemLen << ")" << endl
+				 << "Deleteing old shared memory block and trying again." << endl;
+			if((ret = shmctl(shmid, IPC_RMID, &ds)) < 0) {
+				cerr << "shmctl returned " << ret << " for IPC_RMID and errno is " << errno << endl;
+				throw 1;
+			}
+		} else {
+			break;
+		}
+	} // while(true)
+	*dst = ptr;
+	bool initid = (((volatile uint32_t*)((char*)ptr + len))[0] == SHMEM_INIT);
+	if(ds.shm_cpid == getpid() && !initid) {
+		if(verbose) {
+			cerr << "  I (pid = " << getpid() << ") created the "
+			     << "shared memory for " << memName << endl;
+		}
+		// Set this value just off the end of the chunk to
+		// indicate that the data hasn't been read yet.
+		((volatile uint32_t*)((char*)ptr + len))[0] = SHMEM_UNINIT;
+		return true;
+	} else {
+		if(verbose) {
+			cerr << "  I (pid = " << getpid()
+			     << ") did not create the shared memory for "
+			     << memName << ".  Pid " << ds.shm_cpid << " did." << endl;
+		}
+		return false;
+	}
+}
+
+#else
+
+#define ALLOC_SHARED_U(...) 0
+#define ALLOC_SHARED_U8(...) 0
+#define ALLOC_SHARED_U32(...) 0
+#define FREE_SHARED(...)
+#define NOTIFY_SHARED(...)
+#define WAIT_SHARED(...)
+
+#endif /*BOWTIE_SHARED_MEM*/
+
+#endif /* SHMEM_H_ */
diff --git a/simple_func.cpp b/simple_func.cpp
new file mode 100644
index 0000000..a5b0859
--- /dev/null
+++ b/simple_func.cpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <iostream>
+#include "simple_func.h"
+#include "ds.h"
+#include "mem_ids.h"
+
+/**
+ * Map a function-type string to its SIMPLE_FUNC_* constant:
+ * "C"/"Constant", "L"/"Linear", "S"/"Sqrt", "G"/"Log".
+ * Prints an error and throws 1 on an unrecognized type.
+ */
+int SimpleFunc::parseType(const std::string& otype) {
+	string type = otype;
+	if(type == "C" || type == "Constant") {
+		return SIMPLE_FUNC_CONST;
+	} else if(type == "L" || type == "Linear") {
+		return SIMPLE_FUNC_LINEAR;
+	} else if(type == "S" || type == "Sqrt") {
+		return SIMPLE_FUNC_SQRT;
+	} else if(type == "G" || type == "Log") {
+		return SIMPLE_FUNC_LOG;
+	}
+	std::cerr << "Error: Bad function type '" << otype.c_str()
+			  << "'.  Should be C (constant), L (linear), "
+			  << "S (square root) or G (natural log)." << std::endl;
+	throw 1;
+}
+
+/**
+ * Parse a SimpleFunc from a comma-separated specification string:
+ *   type[,const[,coeff[,min[,max]]]]
+ * where 'type' is one of the codes accepted by parseType().  Any field
+ * beyond 'type' that is absent falls back to the corresponding default
+ * argument.
+ */
+SimpleFunc SimpleFunc::parse(
+	const std::string& s,
+	double defaultConst,
+	double defaultLinear,
+	double defaultMin,
+	double defaultMax)
+{
+	// Separate value into comma-separated tokens
+	EList<string> ctoks(MISC_CAT);
+	string ctok;
+	istringstream css(s);
+	SimpleFunc fv;
+	while(getline(css, ctok, ',')) {
+		ctoks.push_back(ctok);
+	}
+	if(ctoks.size() >= 1) {
+		fv.setType(parseType(ctoks[0]));
+	}
+	if(ctoks.size() >= 2) {
+		double co;
+		istringstream tmpss(ctoks[1]);
+		tmpss >> co;
+		fv.setConst(co);
+	} else {
+		fv.setConst(defaultConst);
+	}
+	if(ctoks.size() >= 3) {
+		double ce;
+		istringstream tmpss(ctoks[2]);
+		tmpss >> ce;
+		fv.setCoeff(ce);
+	} else {
+		fv.setCoeff(defaultLinear);
+	}
+	if(ctoks.size() >= 4) {
+		double mn;
+		istringstream tmpss(ctoks[3]);
+		tmpss >> mn;
+		fv.setMin(mn);
+	} else {
+		fv.setMin(defaultMin);
+	}
+	if(ctoks.size() >= 5) {
+		double mx;
+		istringstream tmpss(ctoks[4]);
+		tmpss >> mx;
+		fv.setMax(mx);
+	} else {
+		fv.setMax(defaultMax);
+	}
+	return fv;
+}
diff --git a/simple_func.h b/simple_func.h
new file mode 100644
index 0000000..ca76869
--- /dev/null
+++ b/simple_func.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef SIMPLE_FUNC_H_
+#define SIMPLE_FUNC_H_
+
+#include <math.h>
+#include <cassert>
+#include <limits>
+#include "tokenize.h"
+
+#define SIMPLE_FUNC_CONST  1
+#define SIMPLE_FUNC_LINEAR 2
+#define SIMPLE_FUNC_SQRT   3
+#define SIMPLE_FUNC_LOG    4
+
+/**
+ * A simple function of one argument, parameterized by I, X, C and L: min
+ * value, max value, constant term, and coefficient respectively:
+ *
+ * 1. Constant:    f(x) = max(I, min(X, C + L * 0))
+ * 2. Linear:      f(x) = max(I, min(X, C + L * x))
+ * 3. Square root: f(x) = max(I, min(X, C + L * sqrt(x)))
+ * 4. Log:         f(x) = max(I, min(X, C + L * ln(x)))
+ *
+ * Clearly, the return value of the Constant function doesn't depend on x.
+ */
+class SimpleFunc {
+
+public:
+
+	// Default-constructed function is "uninitialized" (type_ == 0)
+	SimpleFunc() : type_(0), I_(0.0), X_(0.0), C_(0.0), L_(0.0) { }
+
+	SimpleFunc(int type, double I, double X, double C, double L) {
+		init(type, I, X, C, L);
+	}
+	
+	// Initialize all five parameters explicitly.
+	void init(int type, double I, double X, double C, double L) {
+		type_ = type; I_ = I; X_ = X; C_ = C; L_ = L;
+	}
+
+	// Initialize with unbounded min/max (effectively no clamping).
+	void init(int type, double C, double L) {
+		type_ = type; C_ = C; L_ = L;
+		I_ = -std::numeric_limits<double>::max();
+		X_ = std::numeric_limits<double>::max();
+	}
+	
+	void setType (int type ) { type_ = type; }
+	void setMin  (double mn) { I_ = mn; }
+	void setMax  (double mx) { X_ = mx; }
+	void setConst(double co) { C_ = co; }
+	void setCoeff(double ce) { L_ = ce; }
+
+	int    getType () const { return type_; }
+	double getMin  () const { return I_; }
+	double getMax  () const { return X_; }
+	double getConst() const { return C_; }
+	double getCoeff() const { return L_; }
+	
+	// Scale all four parameters by x.
+	// NOTE(review): scaling is skipped when I_ equals +DBL_MAX; the guard
+	// compares I_ (the MIN) against the max sentinel — confirm this is
+	// the intended condition.
+	void mult(double x) {
+		if(I_ < std::numeric_limits<double>::max()) {
+			I_ *= x; X_ *= x; C_ *= x; L_ *= x;
+		}
+	}
+	
+	bool initialized() const { return type_ != 0; }
+	void reset() { type_ = 0; }
+	
+	/**
+	 * Evaluate the function at x and clamp the result to [I_, X_],
+	 * converting to T.  Sentinel results equal to double's max/min are
+	 * mapped to T's max/min.
+	 * NOTE(review): std::numeric_limits<double>::min() is the smallest
+	 * POSITIVE double, not the most negative value, so the second sentinel
+	 * branch likely never fires for negative results — confirm intent.
+	 */
+	template<typename T>
+	T f(double x) const {
+		assert(type_ >= SIMPLE_FUNC_CONST && type_ <= SIMPLE_FUNC_LOG);
+		double X;
+		if(type_ == SIMPLE_FUNC_CONST) {
+			X = 0.0;
+		} else if(type_ == SIMPLE_FUNC_LINEAR) {
+			X = x;
+		} else if(type_ == SIMPLE_FUNC_SQRT) {
+			X = sqrt(x);
+		} else if(type_ == SIMPLE_FUNC_LOG) {
+			X = log(x);
+		} else {
+			throw 1;
+		}
+		double ret = std::max(I_, std::min(X_, C_ + L_ * X));
+		if(ret == std::numeric_limits<double>::max()) {
+			return std::numeric_limits<T>::max();
+		} else if(ret == std::numeric_limits<double>::min()) {
+			return std::numeric_limits<T>::min();
+		} else {
+			return (T)ret;
+		}
+	}
+	
+	static int parseType(const std::string& otype);
+	
+	static SimpleFunc parse(
+		const std::string& s,
+		double defaultConst = 0.0,
+		double defaultLinear = 0.0,
+		double defaultMin = 0.0,
+		double defaultMax = std::numeric_limits<double>::max());
+
+protected:
+
+	int type_;          // SIMPLE_FUNC_* code; 0 = uninitialized
+	double I_, X_, C_, L_; // min, max, constant term, coefficient
+};
+
+#endif /*ndef SIMPLE_FUNC_H_*/
diff --git a/sse_util.cpp b/sse_util.cpp
new file mode 100644
index 0000000..d6310cf
--- /dev/null
+++ b/sse_util.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "sse_util.h"
+#include "aligner_swsse.h"
+#include "limit.h"
+
+/**
+ * Given a column of filled-in cells, save the checkpointed cells in cs_.
+ *
+ * NOTE(review): the body is empty in this version — column checkpointing
+ * is a no-op here; all parameters are ignored.
+ */
+void Checkpointer::commitCol(
+	__m128i *pvH,
+	__m128i *pvE,
+	__m128i *pvF,
+	size_t coli)
+{
+}
diff --git a/sse_util.h b/sse_util.h
new file mode 100644
index 0000000..b5781f1
--- /dev/null
+++ b/sse_util.h
@@ -0,0 +1,574 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef SSE_UTIL_H_
+#define SSE_UTIL_H_
+
+#include "assert_helpers.h"
+#include "ds.h"
+#include "limit.h"
+#include <iostream>
+#include <emmintrin.h>
+
+class EList_m128i {
+public:
+
+	/**
+	 * Allocate initial default of S elements.
+	 */
+	explicit EList_m128i(int cat = 0) :
+		cat_(cat), last_alloc_(NULL), list_(NULL), sz_(0), cur_(0)
+	{
+		assert_geq(cat, 0);
+	}
+
+	/**
+	 * Destructor.
+	 */
+	~EList_m128i() { free(); }
+
+	/**
+	 * Return number of elements.
+	 */
+	inline size_t size() const { return cur_; }
+
+	/**
+	 * Return number of elements allocated.
+	 */
+	inline size_t capacity() const { return sz_; }
+	
+	/**
+	 * Ensure that there is sufficient capacity to expand to include
+	 * 'thresh' more elements without having to expand.
+	 */
+	inline void ensure(size_t thresh) {
+		if(list_ == NULL) lazyInit();
+		expandCopy(cur_ + thresh);
+	}
+
+	/**
+	 * Ensure that there is sufficient capacity to include 'newsz' elements.
+	 * If there isn't enough capacity right now, expand capacity to exactly
+	 * equal 'newsz'.
+	 */
+	inline void reserveExact(size_t newsz) {
+		if(list_ == NULL) lazyInitExact(newsz);
+		expandCopyExact(newsz);
+	}
+
+	/**
+	 * Return true iff there are no elements.
+	 */
+	inline bool empty() const { return cur_ == 0; }
+	
+	/**
+	 * Return true iff list hasn't been initialized yet.
+	 */
+	inline bool null() const { return list_ == NULL; }
+
+	/**
+	 * If size is less than requested size, resize up to at least sz
+	 * and set cur_ to requested sz.
+	 */
+	void resize(size_t sz) {
+		if(sz > 0 && list_ == NULL) lazyInit();
+		if(sz <= cur_) {
+			cur_ = sz;
+			return;
+		}
+		if(sz_ < sz) {
+			expandCopy(sz);
+		}
+		cur_ = sz;
+	}
+	
+	/**
+	 * Zero out contents of vector.
+	 */
+	void zero() {
+		if(cur_ > 0) {
+			memset(list_, 0, cur_ * sizeof(__m128i));
+		}
+	}
+
+	/**
+	 * If size is less than requested size, resize up to at least sz
+	 * and set cur_ to requested sz.  Do not copy the elements over.
+	 */
+	void resizeNoCopy(size_t sz) {
+		if(sz > 0 && list_ == NULL) lazyInit();
+		if(sz <= cur_) {
+			cur_ = sz;
+			return;
+		}
+		if(sz_ < sz) {
+			expandNoCopy(sz);
+		}
+		cur_ = sz;
+	}
+
+	/**
+	 * If size is less than requested size, resize up to exactly sz and set
+	 * cur_ to requested sz.
+	 */
+	void resizeExact(size_t sz) {
+		if(sz > 0 && list_ == NULL) lazyInitExact(sz);
+		if(sz <= cur_) {
+			cur_ = sz;
+			return;
+		}
+		if(sz_ < sz) expandCopyExact(sz);
+		cur_ = sz;
+	}
+
+	/**
+	 * Make the stack empty.
+	 */
+	void clear() {
+		cur_ = 0; // re-use stack memory
+		// Don't clear heap; re-use it
+	}
+
+	/**
+	 * Return a reference to the ith element.
+	 */
+	inline __m128i& operator[](size_t i) {
+		assert_lt(i, cur_);
+		return list_[i];
+	}
+
+	/**
+	 * Return a reference to the ith element.
+	 */
+	inline __m128i operator[](size_t i) const {
+		assert_lt(i, cur_);
+		return list_[i];
+	}
+
+	/**
+	 * Return a reference to the ith element.
+	 */
+	inline __m128i& get(size_t i) {
+		return operator[](i);
+	}
+	
+	/**
+	 * Return a reference to the ith element.
+	 */
+	inline __m128i get(size_t i) const {
+		return operator[](i);
+	}
+
+	/**
+	 * Return a pointer to the beginning of the buffer.
+	 */
+	__m128i *ptr() { return list_; }
+
+	/**
+	 * Return a const pointer to the beginning of the buffer.
+	 */
+	const __m128i *ptr() const { return list_; }
+
+	/**
+	 * Return memory category.
+	 */
+	int cat() const { return cat_; }
+
+private:
+
+	/**
+	 * Initialize memory for EList.
+	 */
+	void lazyInit() {
+		assert(list_ == NULL);
+		list_ = alloc(sz_);
+	}
+
+	/**
+	 * Initialize exactly the prescribed number of elements for EList.
+	 */
+	void lazyInitExact(size_t sz) {
+		assert_gt(sz, 0);
+		assert(list_ == NULL);
+		sz_ = sz;
+		list_ = alloc(sz);
+	}
+
+	/**
+	 * Allocate a T array of length sz_ and store in list_.  Also,
+	 * tally into the global memory tally.
+	 */
+	__m128i *alloc(size_t sz) {
+		__m128i* last_alloc_;
+		try {
+			last_alloc_ = new __m128i[sz + 2];
+		} catch(std::bad_alloc& e) {
+			std::cerr << "Error: Out of memory allocating " << sz << " __m128i's for DP matrix: '" << e.what() << "'" << std::endl;
+			throw e;
+		}
+		__m128i* tmp = last_alloc_;
+		size_t tmpint = (size_t)tmp;
+		// Align it!
+		if((tmpint & 0xf) != 0) {
+			tmpint += 15;
+			tmpint &= (~0xf);
+			tmp = reinterpret_cast<__m128i*>(tmpint);
+		}
+		assert_eq(0, (tmpint & 0xf)); // should be 16-byte aligned
+		assert(tmp != NULL);
+		gMemTally.add(cat_, sz);
+		return tmp;
+	}
+
+	/**
+	 * Allocate a T array of length sz_ and store in list_.  Also,
+	 * tally into the global memory tally.
+	 */
+	void free() {
+		if(list_ != NULL) {
+			delete[] last_alloc_;
+			gMemTally.del(cat_, sz_);
+			list_ = NULL;
+			sz_ = cur_ = 0;
+		}
+	}
+
+	/**
+	 * Expand the list_ buffer until it has at least 'thresh' elements.  Size
+	 * increases quadratically with number of expansions.  Copy old contents
+	 * into new buffer using operator=.
+	 */
+	void expandCopy(size_t thresh) {
+		if(thresh <= sz_) return;
+		size_t newsz = (sz_ * 2)+1;
+		while(newsz < thresh) newsz *= 2;
+		expandCopyExact(newsz);
+	}
+
+	/**
+	 * Expand the list_ buffer until it has exactly 'newsz' elements.  Copy
+	 * old contents into new buffer using operator=.
+	 */
+	void expandCopyExact(size_t newsz) {
+		if(newsz <= sz_) return;
+		__m128i* tmp = alloc(newsz);
+		assert(tmp != NULL);
+		size_t cur = cur_;
+		if(list_ != NULL) {
+ 			for(size_t i = 0; i < cur_; i++) {
+				// Note: operator= is used
+				tmp[i] = list_[i];
+			}
+			free();
+		}
+		list_ = tmp;
+		sz_ = newsz;
+		cur_ = cur;
+	}
+
+	/**
+	 * Expand the list_ buffer until it has at least 'thresh' elements.
+	 * Size increases quadratically with number of expansions.  Don't copy old
+	 * contents into the new buffer.
+	 */
+	void expandNoCopy(size_t thresh) {
+		assert(list_ != NULL);
+		if(thresh <= sz_) return;
+		size_t newsz = (sz_ * 2)+1;
+		while(newsz < thresh) newsz *= 2;
+		expandNoCopyExact(newsz);
+	}
+
+	/**
+	 * Expand the list_ buffer until it has exactly 'newsz' elements.  Don't
+	 * copy old contents into the new buffer.
+	 */
+	void expandNoCopyExact(size_t newsz) {
+		assert(list_ != NULL);
+		assert_gt(newsz, 0);
+		free();
+		__m128i* tmp = alloc(newsz);
+		assert(tmp != NULL);
+		list_ = tmp;
+		sz_ = newsz;
+		assert_gt(sz_, 0);
+	}
+
+	int      cat_;        // memory category, for accounting purposes
+	__m128i* last_alloc_; // what new[] originally returns
+	__m128i *list_;       // list ptr, aligned version of what new[] returns
+	size_t   sz_;         // capacity
+	size_t   cur_;        // occupancy (AKA size)
+};
+
/**
 * A quartet of 16-bit scores used when checkpointing a single cell's
 * H/E/F values (plus one spare slot) along a saved row, column or
 * anti-diagonal.
 */
struct CpQuad {
	CpQuad() { reset(); }

	// Set all four score slots back to 0.
	void reset() {
		for(int i = 0; i < 4; i++) {
			sc[i] = 0;
		}
	}

	// Element-wise equality across all four slots.
	bool operator==(const CpQuad& o) const {
		for(int i = 0; i < 4; i++) {
			if(sc[i] != o.sc[i]) {
				return false;
			}
		}
		return true;
	}

	int16_t sc[4];
};
+
+/**
+ * Encapsulates a collection of checkpoints.  Assumes the scheme is to
+ * checkpoint adjacent pairs of anti-diagonals.
+ */
+class Checkpointer {
+
+public:
+
+	Checkpointer() { reset(); }
+	
+	/**
+	 * Set the checkpointer up for a new rectangle.
+	 */
+	void init(
+		size_t nrow,          // # of rows
+		size_t ncol,          // # of columns
+		size_t perpow2,       // checkpoint every 1 << perpow2 diags (& next)
+		int64_t perfectScore, // what is a perfect score?  for sanity checks
+		bool is8,             // 8-bit?
+		bool doTri,           // triangle shaped?
+		bool local,           // is alignment local?  for sanity checks
+		bool debug)           // gather debug checkpoints?
+	{
+		assert_gt(perpow2, 0);
+		nrow_ = nrow;
+		ncol_ = ncol;
+		perpow2_ = perpow2;
+		per_ = 1 << perpow2;
+		lomask_ = ~(0xffffffff << perpow2);
+		perf_ = perfectScore;
+		local_ = local;
+		ndiag_ = (ncol + nrow - 1 + 1) / per_;
+		locol_ = MAX_SIZE_T;
+		hicol_ = MIN_SIZE_T;
+//		debug_ = debug;
+		debug_ = true;
+		commitMap_.clear();
+		firstCommit_ = true;
+		size_t perword = (is8 ? 16 : 8);
+		is8_ = is8;
+		niter_ = ((nrow_ + perword - 1) / perword);
+		if(doTri) {
+			// Save a pair of anti-diagonals every per_ anti-diagonals for
+			// backtrace purposes
+			qdiag1s_.resize(ndiag_ * nrow_);
+			qdiag2s_.resize(ndiag_ * nrow_);
+		} else {
+			// Save every per_ columns and rows for backtrace purposes
+			qrows_.resize((nrow_ / per_) * ncol_);
+			qcols_.resize((ncol_ / per_) * (niter_ << 2));
+		}
+		if(debug_) {
+			// Save all columns for debug purposes
+			qcolsD_.resize(ncol_ * (niter_ << 2));
+		}
+	}
+	
+	/**
+	 * Return true iff we've been collecting debug cells.
+	 */
+	bool debug() const { return debug_; }
+	
+	/**
+	 * Check whether the given score matches the saved score at row, col, hef.
+	 */
+	int64_t debugCell(size_t row, size_t col, int hef) const {
+		assert(debug_);
+		const __m128i* ptr = qcolsD_.ptr() + hef;
+		// Fast forward to appropriate column
+		ptr += ((col * niter_) << 2);
+		size_t mod = row % niter_; // which m128i
+		size_t div = row / niter_; // offset into m128i
+		// Fast forward to appropriate word
+		ptr += (mod << 2);
+		// Extract score
+		int16_t sc = (is8_ ? ((uint8_t*)ptr)[div] : ((int16_t*)ptr)[div]);
+		int64_t asc = MIN_I64;
+		// Convert score
+		if(is8_) {
+			if(local_) {
+				asc = sc;
+			} else {
+				if(sc == 0) asc = MIN_I64;
+				else asc = sc - 0xff;
+			}
+		} else {
+			if(local_) {
+				asc = sc + 0x8000;
+			} else {
+				if(sc != MIN_I16) asc = sc - 0x7fff;
+			}
+		}
+		return asc;
+	}
+	
+	/**
+	 * Return true iff the given row/col is checkpointed.
+	 */
+	bool isCheckpointed(size_t row, size_t col) const {
+		assert_leq(col, hicol_);
+		assert_geq(col, locol_);
+		size_t mod = (row + col) & lomask_;
+		assert_lt(mod, per_);
+		return mod >= per_ - 2;
+	}
+
+	/**
+	 * Return the checkpointed H, E, or F score from the given cell.
+	 */
+	inline int64_t scoreTriangle(size_t row, size_t col, int hef) const {
+		assert(isCheckpointed(row, col));
+		bool diag1 = ((row + col) & lomask_) == per_ - 2;
+		size_t off = (row + col) >> perpow2_;
+		if(diag1) {
+			if(qdiag1s_[off * nrow_ + row].sc[hef] == MIN_I16) {
+				return MIN_I64;
+			} else {
+				return qdiag1s_[off * nrow_ + row].sc[hef];
+			}
+		} else {
+			if(qdiag2s_[off * nrow_ + row].sc[hef] == MIN_I16) {
+				return MIN_I64;
+			} else {
+				return qdiag2s_[off * nrow_ + row].sc[hef];
+			}
+		}
+	}
+
+	/**
+	 * Return the checkpointed H, E, or F score from the given cell.
+	 */
+	inline int64_t scoreSquare(size_t row, size_t col, int hef) const {
+		// Is it in a checkpointed row?  Note that checkpointed rows don't
+		// necessarily have the horizontal contributions calculated, so we want
+		// to use the column info in that case.
+		if((row & lomask_) == lomask_ && hef != 1) {
+			int64_t sc = qrows_[(row >> perpow2_) * ncol_ + col].sc[hef];
+			if(sc == MIN_I16) return MIN_I64;
+			return sc;
+		}
+		hef--;
+		if(hef == -1) hef = 2;
+		// It must be in a checkpointed column
+		assert_eq(lomask_, (col & lomask_));
+		// Fast forward to appropriate column
+		const __m128i* ptr = qcols_.ptr() + hef;
+		ptr += (((col >> perpow2_) * niter_) << 2);
+		size_t mod = row % niter_; // which m128i
+		size_t div = row / niter_; // offset into m128i
+		// Fast forward to appropriate word
+		ptr += (mod << 2);
+		// Extract score
+		int16_t sc = (is8_ ? ((uint8_t*)ptr)[div] : ((int16_t*)ptr)[div]);
+		int64_t asc = MIN_I64;
+		// Convert score
+		if(is8_) {
+			if(local_) {
+				asc = sc;
+			} else {
+				if(sc == 0) asc = MIN_I64;
+				else asc = sc - 0xff;
+			}
+		} else {
+			if(local_) {
+				asc = sc + 0x8000;
+			} else {
+				if(sc != MIN_I16) asc = sc - 0x7fff;
+			}
+		}
+		return asc;
+	}
+
+	/**
+	 * Given a column of filled-in cells, save the checkpointed cells in cs_.
+	 */
+	void commitCol(__m128i *pvH, __m128i *pvE, __m128i *pvF, size_t coli);
+	
+	/**
+	 * Reset the state of the Checkpointer.
+	 */
+	void reset() {
+		perpow2_ = per_ = lomask_ = nrow_ = ncol_ = 0;
+		local_ = false;
+		niter_ = ndiag_ = locol_ = hicol_ = 0;
+		perf_ = 0;
+		firstCommit_ = true;
+		is8_ = debug_ = false;
+	}
+	
+	/**
+	 * Return true iff the Checkpointer has been initialized.
+	 */
+	bool inited() const {
+		return nrow_ > 0;
+	}
+	
+	size_t per()     const { return per_;     }
+	size_t perpow2() const { return perpow2_; }
+	size_t lomask()  const { return lomask_;  }
+	size_t locol()   const { return locol_;   }
+	size_t hicol()   const { return hicol_;   }
+	size_t nrow()    const { return nrow_;    }
+	size_t ncol()    const { return ncol_;    }
+	
+	const CpQuad* qdiag1sPtr() const { return qdiag1s_.ptr(); }
+	const CpQuad* qdiag2sPtr() const { return qdiag2s_.ptr(); }
+
+	size_t   perpow2_;   // 1 << perpow2_ - 2 is the # of uncheckpointed
+	                     // anti-diags between checkpointed anti-diag pairs
+	size_t   per_;       // 1 << perpow2_
+	size_t   lomask_;    // mask for extracting low bits
+	size_t   nrow_;      // # rows in current rectangle
+	size_t   ncol_;      // # cols in current rectangle
+	int64_t  perf_;      // perfect score
+	bool     local_;     // local alignment?
+	
+	size_t   ndiag_;     // # of double-diags
+	
+	size_t   locol_;     // leftmost column committed
+	size_t   hicol_;     // rightmost column committed
+
+	// Map for committing scores from vector columns to checkpointed diagonals
+	EList<size_t> commitMap_;
+	bool          firstCommit_;
+	
+	EList<CpQuad> qdiag1s_; // checkpoint H/E/F values for diagonal 1
+	EList<CpQuad> qdiag2s_; // checkpoint H/E/F values for diagonal 2
+
+	EList<CpQuad> qrows_;   // checkpoint H/E/F values for rows
+	
+	// We store columns in this way to reduce overhead of populating them
+	bool          is8_;     // true -> fill used 8-bit cells
+	size_t        niter_;   // # __m128i words per column
+	EList_m128i   qcols_;   // checkpoint E/F/H values for select columns
+	
+	bool          debug_;   // get debug checkpoints? (i.e. fill qcolsD_?)
+	EList_m128i   qcolsD_;  // checkpoint E/F/H values for all columns (debug)
+};
+
+#endif
diff --git a/sstring.cpp b/sstring.cpp
new file mode 100644
index 0000000..3b26587
--- /dev/null
+++ b/sstring.cpp
@@ -0,0 +1,202 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifdef MAIN_SSTRING
+
+#include <string.h>
+#include <iostream>
+#include "ds.h"
+#include "sstring.h"
+
+using namespace std;
+
/**
 * Stand-alone smoke test for the sstring classes (compiled only when
 * MAIN_SSTRING is defined).  Exercises the inter-class comparison
 * operators, the endlt flag, toZBuf/toZBufXForm, S2bDnaString windows and
 * reversal, installChars, and conversions from std::string.
 */
int main(void) {
	cerr << "Test inter-class comparison operators...";
	{
		SString<int> s(2);
		s.set('a', 0);
		s.set('b', 1);
		assert(sstr_eq(s, (const char *)"ab"));
		assert(!sstr_neq(s, (const char *)"ab"));
		assert(!sstr_lt(s, (const char *)"ab"));
		assert(!sstr_gt(s, (const char *)"ab"));
		assert(sstr_leq(s, (const char *)"ab"));
		assert(sstr_geq(s, (const char *)"ab"));
		
		SStringExpandable<int> s2;
		s2.append('a');
		s2.append('b');
		assert(sstr_eq(s, s2));
		assert(sstr_eq(s2, (const char *)"ab"));
		assert(!sstr_neq(s, s2));
		assert(!sstr_neq(s2, (const char *)"ab"));
		assert(!sstr_lt(s, s2));
		assert(!sstr_lt(s2, (const char *)"ab"));
		assert(!sstr_gt(s, s2));
		assert(!sstr_gt(s2, (const char *)"ab"));
		assert(sstr_leq(s, s2));
		assert(sstr_leq(s2, (const char *)"ab"));
		assert(sstr_geq(s, s2));
		assert(sstr_geq(s2, (const char *)"ab"));

		SStringFixed<int, 12> s3;
		s3.append('a');
		s3.append('b');
		assert(sstr_eq(s, s3));
		assert(sstr_eq(s2, s3));
		assert(sstr_eq(s3, (const char *)"ab"));
		assert(!sstr_neq(s, s3));
		assert(!sstr_neq(s2, s3));
		assert(!sstr_neq(s3, (const char *)"ab"));
		assert(!sstr_lt(s, s3));
		assert(!sstr_lt(s2, s3));
		assert(!sstr_lt(s3, (const char *)"ab"));
		assert(!sstr_gt(s, s3));
		assert(!sstr_gt(s2, s3));
		assert(!sstr_gt(s3, (const char *)"ab"));
		assert(sstr_geq(s, s3));
		assert(sstr_geq(s2, s3));
		assert(sstr_geq(s3, (const char *)"ab"));
		assert(sstr_leq(s, s3));
		assert(sstr_leq(s2, s3));
		assert(sstr_leq(s3, (const char *)"ab"));
	}
	cerr << "PASSED" << endl;
	
	cerr << "Test flag for whether to consider end-of-word < other chars ...";
	{
		// "String" is a proper prefix of "String1": endlt=true (default)
		// sorts it first, endlt=false sorts it after
		SString<char> ss("String");
		SString<char> sl("String1");
		assert(sstr_lt(ss, sl));
		assert(sstr_gt(ss, sl, false));
		assert(sstr_leq(ss, sl));
		assert(sstr_geq(ss, sl, false));
	}
	cerr << "PASSED" << endl;
	
	cerr << "Test toZBuf and toZBufXForm ...";
	{
		SString<uint32_t> s(10);
		for(int i = 0; i < 10; i++) {
			s[i] = (uint32_t)i;
		}
		// Each stored value 0..9 indexes into the transform table
		assert(strcmp(s.toZBufXForm("0123456789"), "0123456789") == 0);
	}
	cerr << "PASSED" << endl;

	cerr << "Test S2bDnaString ...";
	{
		const char *str =
			"ACGTACGTAC" "ACGTACGTAC" "ACGTACGTAC"
			"ACGTACGTAC" "ACGTACGTAC" "ACGTACGTAC";
		const char *gs =
			"GGGGGGGGGG" "GGGGGGGGGG" "GGGGGGGGGG"
			"GGGGGGGGGG" "GGGGGGGGGG" "GGGGGGGGGG";
		// Check 2-bit-packed strings of every length 0..59 against the
		// reference BTDnaString implementation
		for(size_t i = 0; i < 60; i++) {
			S2bDnaString s(str, i, true);
			S2bDnaString sr;
			BTDnaString s2(str, i, true);
			assert(sstr_eq(s, s2));
			if(i >= 10) {
				BTDnaString s3;
				s.windowGetDna(s3, true, false, 3, 4);
				assert(sstr_eq(s3.toZBuf(), (const char*)"TACG"));
				s.windowGetDna(s3, false, false, 3, 4);
				assert(sstr_eq(s3.toZBuf(), (const char*)"CGTA"));
				assert_eq('A', s.toChar(0));
				assert_eq('G', s.toChar(2));
				assert_eq('A', s.toChar(4));
				assert_eq('G', s.toChar(6));
				assert_eq('A', s.toChar(8));
				
				s.reverseWindow(1, 8);
				s2.reverseWindow(1, 8);
				
				assert_eq('A', s.toChar(1));
				assert_eq('T', s.toChar(2));
				assert_eq('G', s.toChar(3));
				assert_eq('C', s.toChar(4));
				assert_eq('A', s.toChar(5));
				assert_eq('T', s.toChar(6));
				assert_eq('G', s.toChar(7));
				assert_eq('C', s.toChar(8));
				assert(sstr_eq(s, s2));

				// Reverse back; should match the original again
				s.reverseWindow(1, 8);
				s2.reverseWindow(1, 8);
				assert(sstr_eq(s, s2));
			}
			if(i > 1) {
				s.reverse();
				sr.installReverseChars(str, i);
				s2.reverse();
				assert(sstr_eq(s, s2));
				assert(sstr_eq(sr, s2));
				s.reverse();
				sr.reverse();
				assert(sstr_neq(s, s2));
				assert(sstr_neq(sr, s2));
				// fill(2) sets every position to G (code 2)
				s.fill(2);
				s2.reverse();
				assert(sstr_leq(s, gs));
				assert(sstr_gt(s, s2));
				assert(sstr_gt(s, sr));
				s2.fill(2);
				sr.fill(2);
				assert(sstr_eq(s, s2));
				assert(sstr_eq(s, sr));
			}
		}
		S2bDnaString s(str, true);
		S2bDnaString sr;
		BTDnaString s2(str, true);
		assert(sstr_eq(s2.toZBuf(), str));
		assert(sstr_eq(s, s2));
		s.reverse();
		sr.installReverseChars(str);
		s2.reverse();
		assert(sstr_eq(s, s2));
		assert(sstr_eq(sr, s2));
		s.reverse();
		sr.reverse();
		assert(sstr_neq(s, s2));
		assert(sstr_neq(sr, s2));
	}
	cerr << "PASSED" << endl;

	cerr << "Test operator=() ...";
	{
		S2bDnaString s;
		s.installChars(string("gtcagtca"));
		assert(sstr_eq(s.toZBuf(), (const char *)"GTCAGTCA"));
	}
	cerr << "PASSED" << endl;
	
	cerr << "Conversions from string ...";
	{
		SStringExpandable<char> se(string("hello"));
		EList<SStringExpandable<char> > sel;
		sel.push_back(SStringExpandable<char>(string("hello")));
	}
	cerr << "PASSED" << endl;
	
	// NOTE(review): this prints "PASSED" a second time with no preceding
	// test banner — likely a leftover from a removed test section.
	cerr << "PASSED" << endl;
}
+
+#endif /*def MAIN_SSTRING*/
diff --git a/sstring.h b/sstring.h
new file mode 100644
index 0000000..a4d7ded
--- /dev/null
+++ b/sstring.h
@@ -0,0 +1,3537 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef SSTRING_H_
+#define SSTRING_H_
+
+#include <string.h>
+#include <iostream>
+#include <stdlib.h>     /* exit, EXIT_FAILURE */
+#include <bitset>
+#include <vector>
+#include "assert_helpers.h"
+#include "alphabet.h"
+#include "random_source.h"
+
+/**
+ * Four kinds of strings defined here:
+ *
+ * SString:
+ *   A fixed-length string using heap memory with size set at construction time
+ *   or when install() member is called.
+ *
+ * S2bDnaString:
+ *   Like SString, but stores a list uint32_t words where each word is divided
+ *   into 16 2-bit slots interpreted as holding one A/C/G/T nucleotide each.
+ *
+ * TODO: S3bDnaString allowing N.  S4bDnaString allowing nucleotide masks.
+ *
+ * SStringExpandable:
+ *   A string using heap memory where the size of the backing store is
+ *   automatically resized as needed.  Supports operations like append, insert,
+ *   erase, etc.
+ *
+ * SStringFixed:
+ *   A fixed-length string using stack memory where size is set at compile
+ *   time.
+ *
+ * All string classes have some extra facilities that make it easy to print the
+ * string, including when the string uses an encoded alphabet.  See toZBuf()
+ * and toZBufXForm().
+ *
+ * Global lt, eq, and gt template functions are supplied.  They are capable of
+ * doing lexicographical comparisons between any of the three categories of
+ * strings defined here.
+ */
+
/**
 * Length dispatcher: Class_sstr_len<T>::sstr_len(s) yields the number of
 * characters in any supported string-like type.  The primary template
 * defers to the member length(); the specializations below cover plain C
 * strings (char arrays and char pointers) via strlen().
 */
template<typename T>
class Class_sstr_len {
public:
	static inline size_t sstr_len(const T& s) { return s.length(); }
};

template<unsigned N>
class Class_sstr_len<const char[N]> {
public:
	static inline size_t sstr_len(const char s[N]) { return strlen(s); }
};

template<>
class Class_sstr_len<const char *> {
public:
	static inline size_t sstr_len(const char *s) { return strlen(s); }
};

template<>
class Class_sstr_len<const unsigned char *> {
public:
	static inline size_t sstr_len(const unsigned char *s) {
		return strlen((const char *)s);
	}
};

/**
 * Return true iff s1 and s2 have the same length and agree at every
 * position.
 */
template<typename T1, typename T2>
static inline bool sstr_eq(const T1& s1, const T2& s2) {
	const size_t n1 = Class_sstr_len<T1>::sstr_len(s1);
	const size_t n2 = Class_sstr_len<T2>::sstr_len(s2);
	if(n1 != n2) return false;
	for(size_t k = 0; k < n1; k++) {
		if(s1[k] != s2[k]) return false;
	}
	return true;
}

/**
 * Return true iff s1 and s2 differ in length or at some position.
 */
template<typename T1, typename T2>
static inline bool sstr_neq(const T1& s1, const T2& s2) {
	return !sstr_eq(s1, s2);
}
+
+/**
+ * Return true iff the given suffix of s1 is equal to the given suffix of s2 up
+ * to upto characters.
+ */
+template<typename T1, typename T2>
+static inline bool sstr_suf_upto_eq(
+	const T1& s1, size_t suf1,
+	const T2& s2, size_t suf2,
+	size_t upto,
+	bool endlt = true)
+{
+	assert_leq(suf1, Class_sstr_len<T1>::sstr_len(s1));
+	assert_leq(suf2, Class_sstr_len<T2>::sstr_len(s2));
+	size_t len1 = Class_sstr_len<T1>::sstr_len(s1) - suf1;
+	size_t len2 = Class_sstr_len<T2>::sstr_len(s2) - suf2;
+	if(len1 > upto) len1 = upto;
+	if(len2 > upto) len2 = upto;
+	if(len1 != len2) return false;
+	for(size_t i = 0; i < len1; i++) {
+		if(s1[suf1+i] != s2[suf2+i]) {
+			return false;
+		}
+	}
+	return true;
+}
+
+/**
+ * Return true iff the given suffix of s1 is equal to the given suffix of s2 up
+ * to upto characters.
+ */
+template<typename T1, typename T2>
+static inline bool sstr_suf_upto_neq(
+	const T1& s1, size_t suf1,
+	const T2& s2, size_t suf2,
+	size_t upto,
+	bool endlt = true)
+{
+	return !sstr_suf_upto_eq(s1, suf1, s2, suf2, upto, endlt);
+}
+
+/**
+ * Return true iff s1 is less than s2.
+ */
+template<typename T1, typename T2>
+static inline bool sstr_lt(const T1& s1, const T2& s2, bool endlt = true) {
+	size_t len1 = Class_sstr_len<T1>::sstr_len(s1);
+	size_t len2 = Class_sstr_len<T2>::sstr_len(s2);
+	size_t minlen = (len1 < len2 ? len1 : len2);
+	for(size_t i = 0; i < minlen; i++) {
+		if(s1[i] < s2[i]) {
+			return true;
+		} else if(s1[i] > s2[i]) {
+			return false;
+		}
+	}
+	if(len1 == len2) return false;
+	return (len1 < len2) == endlt;
+}
+
+/**
+ * Return true iff the given suffix of s1 is less than the given suffix of s2.
+ */
+template<typename T1, typename T2>
+static inline bool sstr_suf_lt(
+	const T1& s1, size_t suf1,
+	const T2& s2, size_t suf2,
+	bool endlt = true)
+{
+	assert_leq(suf1, Class_sstr_len<T1>::sstr_len(s1));
+	assert_leq(suf2, Class_sstr_len<T2>::sstr_len(s2));
+	size_t len1 = Class_sstr_len<T1>::sstr_len(s1) - suf1;
+	size_t len2 = Class_sstr_len<T2>::sstr_len(s2) - suf2;
+	size_t minlen = (len1 < len2 ? len1 : len2);
+	for(size_t i = 0; i < minlen; i++) {
+		if(s1[suf1+i] < s2[suf2+i]) {
+			return true;
+		} else if(s1[suf1+i] > s2[suf2+i]) {
+			return false;
+		}
+	}
+	if(len1 == len2) return false;
+	return (len1 < len2) == endlt;
+}
+
+/**
+ * Return true iff the given suffix of s1 is less than the given suffix of s2.
+ * Treat s1 and s2 as though they have lengths len1/len2.
+ */
+template<typename T1, typename T2>
+static inline bool sstr_suf_lt(
+	const T1& s1, size_t suf1, size_t len1,
+	const T2& s2, size_t suf2, size_t len2,
+	bool endlt = true)
+{
+	assert_leq(suf1, len1);
+	assert_leq(suf2, len2);
+	size_t left1 = len1 - suf1;
+	size_t left2 = len2 - suf2;
+	size_t minleft = (left1 < left2 ? left1 : left2);
+	for(size_t i = 0; i < minleft; i++) {
+		if(s1[suf1+i] < s2[suf2+i]) {
+			return true;
+		} else if(s1[suf1+i] > s2[suf2+i]) {
+			return false;
+		}
+	}
+	if(left1 == left2) return false;
+	return (left1 < left2) == endlt;
+}
+
+/**
+ * Return true iff the given suffix of s1 is less than the given suffix of s2
+ * up to upto characters.
+ */
+template<typename T1, typename T2>
+static inline bool sstr_suf_upto_lt(
+	const T1& s1, size_t suf1,
+	const T2& s2, size_t suf2,
+	size_t upto,
+	bool endlt = true)
+{
+	assert_leq(suf1, Class_sstr_len<T1>::sstr_len(s1));
+	assert_leq(suf2, Class_sstr_len<T2>::sstr_len(s2));
+	size_t len1 = Class_sstr_len<T1>::sstr_len(s1) - suf1;
+	size_t len2 = Class_sstr_len<T2>::sstr_len(s2) - suf2;
+	if(len1 > upto) len1 = upto;
+	if(len2 > upto) len2 = upto;
+	size_t minlen = (len1 < len2 ? len1 : len2);
+	for(size_t i = 0; i < minlen; i++) {
+		if(s1[suf1+i] < s2[suf2+i]) {
+			return true;
+		} else if(s1[suf1+i] > s2[suf2+i]) {
+			return false;
+		}
+	}
+	if(len1 == len2) return false;
+	return (len1 < len2) == endlt;
+}
+
+/**
+ * Return true iff the given prefix of s1 is less than the given prefix of s2.
+ */
+template<typename T1, typename T2>
+static inline bool sstr_pre_lt(
+	const T1& s1, size_t pre1,
+	const T2& s2, size_t pre2,
+	bool endlt = true)
+{
+	assert_leq(pre1, Class_sstr_len<T1>::sstr_len(s1));
+	assert_leq(pre2, Class_sstr_len<T2>::sstr_len(s2));
+	size_t len1 = pre1;
+	size_t len2 = pre2;
+	size_t minlen = (len1 < len2 ? len1 : len2);
+	for(size_t i = 0; i < minlen; i++) {
+		if(s1[i] < s2[i]) {
+			return true;
+		} else if(s1[i] > s2[i]) {
+			return false;
+		}
+	}
+	if(len1 == len2) return false;
+	return (len1 < len2) == endlt;
+}
+
+/**
+ * Return true iff s1 is less than or equal to s2.
+ */
+template<typename T1, typename T2>
+static inline bool sstr_leq(const T1& s1, const T2& s2, bool endlt = true) {
+	size_t len1 = Class_sstr_len<T1>::sstr_len(s1);
+	size_t len2 = Class_sstr_len<T2>::sstr_len(s2);
+	size_t minlen = (len1 < len2 ? len1 : len2);
+	for(size_t i = 0; i < minlen; i++) {
+		if(s1[i] < s2[i]) {
+			return true;
+		} else if(s1[i] > s2[i]) {
+			return false;
+		}
+	}
+	if(len1 == len2) return true;
+	return (len1 < len2) == endlt;
+}
+
+/**
+ * Return true iff the given suffix of s1 is less than or equal to the given
+ * suffix of s2.
+ */
+template<typename T1, typename T2>
+static inline bool sstr_suf_leq(
+	const T1& s1, size_t suf1,
+	const T2& s2, size_t suf2,
+	bool endlt = true)
+{
+	assert_leq(suf1, Class_sstr_len<T1>::sstr_len(s1));
+	assert_leq(suf2, Class_sstr_len<T2>::sstr_len(s2));
+	size_t len1 = Class_sstr_len<T1>::sstr_len(s1) - suf1;
+	size_t len2 = Class_sstr_len<T2>::sstr_len(s2) - suf2;
+	size_t minlen = (len1 < len2 ? len1 : len2);
+	for(size_t i = 0; i < minlen; i++) {
+		if(s1[suf1+i] < s2[suf2+i]) {
+			return true;
+		} else if(s1[suf1+i] > s2[suf2+i]) {
+			return false;
+		}
+	}
+	if(len1 == len2) return true;
+	return (len1 < len2) == endlt;
+}
+
+/**
+ * Return true iff the given prefix of s1 is less than or equal to the given
+ * prefix of s2.
+ */
+template<typename T1, typename T2>
+static inline bool sstr_pre_leq(
+	const T1& s1, size_t pre1,
+	const T2& s2, size_t pre2,
+	bool endlt = true)
+{
+	assert_leq(pre1, Class_sstr_len<T1>::sstr_len(s1));
+	assert_leq(pre2, Class_sstr_len<T2>::sstr_len(s2));
+	size_t len1 = pre1;
+	size_t len2 = pre2;
+	size_t minlen = (len1 < len2 ? len1 : len2);
+	for(size_t i = 0; i < minlen; i++) {
+		if(s1[i] < s2[i]) {
+			return true;
+		} else if(s1[i] > s2[i]) {
+			return false;
+		}
+	}
+	if(len1 == len2) return true;
+	return (len1 < len2) == endlt;
+}
+
+/**
+ * Return true iff s1 is greater than s2.
+ */
+template<typename T1, typename T2>
+static inline bool sstr_gt(const T1& s1, const T2& s2, bool endlt = true) {
+	size_t len1 = Class_sstr_len<T1>::sstr_len(s1);
+	size_t len2 = Class_sstr_len<T2>::sstr_len(s2);
+	size_t minlen = (len1 < len2 ? len1 : len2);
+	for(size_t i = 0; i < minlen; i++) {
+		if(s1[i] > s2[i]) {
+			return true;
+		} else if(s1[i] < s2[i]) {
+			return false;
+		}
+	}
+	if(len1 == len2) return false;
+	return (len1 > len2) == endlt;
+}
+
+/**
+ * Return true iff the given suffix of s1 is greater than the given suffix of
+ * s2.
+ */
+template<typename T1, typename T2>
+static inline bool sstr_suf_gt(
+	const T1& s1, size_t suf1,
+	const T2& s2, size_t suf2,
+	bool endlt = true)
+{
+	assert_leq(suf1, Class_sstr_len<T1>::sstr_len(s1));
+	assert_leq(suf2, Class_sstr_len<T2>::sstr_len(s2));
+	size_t len1 = Class_sstr_len<T1>::sstr_len(s1) - suf1;
+	size_t len2 = Class_sstr_len<T2>::sstr_len(s2) - suf2;
+	size_t minlen = (len1 < len2 ? len1 : len2);
+	for(size_t i = 0; i < minlen; i++) {
+		if(s1[suf1+i] > s2[suf2+i]) {
+			return true;
+		} else if(s1[suf1+i] < s2[suf2+i]) {
+			return false;
+		}
+	}
+	if(len1 == len2) return false;
+	return (len1 > len2) == endlt;
+}
+
+/**
+ * Return true iff the given prefix of s1 is greater than the given prefix of
+ * s2.
+ */
+template<typename T1, typename T2>
+static inline bool sstr_pre_gt(
+	const T1& s1, size_t pre1,
+	const T2& s2, size_t pre2,
+	bool endlt = true)
+{
+	assert_leq(pre1, Class_sstr_len<T1>::sstr_len(s1));
+	assert_leq(pre2, Class_sstr_len<T2>::sstr_len(s2));
+	size_t len1 = pre1;
+	size_t len2 = pre2;
+	size_t minlen = (len1 < len2 ? len1 : len2);
+	for(size_t i = 0; i < minlen; i++) {
+		if(s1[i] > s2[i]) {
+			return true;
+		} else if(s1[i] < s2[i]) {
+			return false;
+		}
+	}
+	if(len1 == len2) return false;
+	return (len1 > len2) == endlt;
+}
+
+/**
+ * Return true iff s1 is greater than or equal to s2.
+ */
+template<typename T1, typename T2>
+static inline bool sstr_geq(const T1& s1, const T2& s2, bool endlt = true) {
+	size_t len1 = Class_sstr_len<T1>::sstr_len(s1);
+	size_t len2 = Class_sstr_len<T2>::sstr_len(s2);
+	size_t minlen = (len1 < len2 ? len1 : len2);
+	for(size_t i = 0; i < minlen; i++) {
+		if(s1[i] > s2[i]) {
+			return true;
+		} else if(s1[i] < s2[i]) {
+			return false;
+		}
+	}
+	if(len1 == len2) return true;
+	return (len1 > len2) == endlt;
+}
+
+/**
+ * Return true iff the given suffix of s1 is greater than or equal to the given
+ * suffix of s2.
+ */
+template<typename T1, typename T2>
+static inline bool sstr_suf_geq(
+	const T1& s1, size_t suf1,
+	const T2& s2, size_t suf2,
+	bool endlt = true)
+{
+	assert_leq(suf1, Class_sstr_len<T1>::sstr_len(s1));
+	assert_leq(suf2, Class_sstr_len<T2>::sstr_len(s2));
+	size_t len1 = Class_sstr_len<T1>::sstr_len(s1) - suf1;
+	size_t len2 = Class_sstr_len<T2>::sstr_len(s2) - suf2;
+	size_t minlen = (len1 < len2 ? len1 : len2);
+	for(size_t i = 0; i < minlen; i++) {
+		if(s1[suf1+i] > s2[suf2+i]) {
+			return true;
+		} else if(s1[suf1+i] < s2[suf2+i]) {
+			return false;
+		}
+	}
+	if(len1 == len2) return true;
+	return (len1 > len2) == endlt;
+}
+
+/**
+ * Return true iff the given prefix of s1 is greater than or equal to the given
+ * prefix of s2.
+ */
+template<typename T1, typename T2>
+static inline bool sstr_pre_geq(
+	const T1& s1, size_t pre1,
+	const T2& s2, size_t pre2,
+	bool endlt = true)
+{
+	assert_leq(pre1, Class_sstr_len<T1>::sstr_len(s1));
+	assert_leq(pre2, Class_sstr_len<T2>::sstr_len(s2));
+	size_t len1 = pre1;
+	size_t len2 = pre2;
+	size_t minlen = (len1 < len2 ? len1 : len2);
+	for(size_t i = 0; i < minlen; i++) {
+		if(s1[i] > s2[i]) {
+			return true;
+		} else if(s1[i] < s2[i]) {
+			return false;
+		}
+	}
+	if(len1 == len2) return true;
+	return (len1 > len2) == endlt;
+}
+
/**
 * Return a NUL-terminated C string view of 's'.  Generic version assumes the
 * string type exposes a toZBuf() member, as the SString family does.
 */
template<typename T>
static inline const char * sstr_to_cstr(const T& s) {
	return s.toZBuf();
}
+
/**
 * Specialization for std::string, which has no toZBuf(); use c_str() instead.
 */
template<>
inline const char * sstr_to_cstr<std::basic_string<char> >(
	const std::basic_string<char>& s)
{
	return s.c_str();
}
+
+/**
+ * Simple string class with backing memory whose size is managed by the user
+ * using the constructor and install() member function.  No behind-the-scenes
+ * reallocation or copying takes place.
+ */
+template<typename T>
+class SString {
+                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
+public:
+
+	explicit SString() :
+		cs_(NULL),
+		printcs_(NULL),
+		len_(0)
+	{ }
+
+	explicit SString(size_t sz) :
+		cs_(NULL),
+		printcs_(NULL),
+		len_(0)
+	{
+		resize(sz);
+	}
+
+	/**
+	 * Create an SStringExpandable from another SStringExpandable.
+	 */
+	SString(const SString<T>& o) :
+		cs_(NULL),
+		printcs_(NULL),
+		len_(0)
+	{
+		*this = o;
+	}
+
+	/**
+	 * Create an SStringExpandable from a std::basic_string of the
+	 * appropriate type.
+	 */
+	explicit SString(const std::basic_string<T>& str) :
+		cs_(NULL),
+		printcs_(NULL),
+		len_(0)
+	{
+		install(str.c_str(), str.length());
+	}
+
+	/**
+	 * Create an SStringExpandable from an array and size.
+	 */
+	explicit SString(const T* b, size_t sz) :
+		cs_(NULL),
+		printcs_(NULL),
+		len_(0)
+	{
+		install(b, sz);
+	}
+
+	/**
+	 * Create an SStringExpandable from a zero-terminated array.
+	 */
+	explicit SString(const T* b) :
+		cs_(NULL),
+		printcs_(NULL),
+		len_(0)
+	{
+		install(b, strlen(b));
+	}
+
+	/**
+	 * Destroy the expandable string object.
+	 */
+	virtual ~SString() {
+		if(cs_ != NULL) {
+			delete[] cs_;
+			cs_ = NULL;
+		}
+		if(printcs_ != NULL) {
+			delete[] printcs_;
+			printcs_ = NULL;
+		}
+		len_ = 0;
+	}
+
+	/**
+	 * Assignment to other SString.
+	 */
+	SString<T>& operator=(const SString<T>& o) {
+		install(o.cs_, o.len_);
+		return *this;
+	}
+
+	/**
+	 * Assignment to other SString.
+	 */
+	SString<T>& operator=(const std::basic_string<T>& o) {
+		install(o);
+		return *this;
+	}
+
+	/**
+	 * Resizes the string without preserving its contents.
+	 */
+	void resize(size_t sz) {
+		if(cs_ != NULL) {
+			delete cs_;
+			cs_ = NULL;
+		}
+		if(printcs_ != NULL) {
+			delete printcs_;
+			printcs_ = NULL;
+		}
+		if(sz != 0) {
+			cs_ = new T[sz+1];
+		}
+		len_ = sz;
+	}
+
+	/**
+	 * Return ith character from the left of either the forward or the
+	 * reverse version of the read.
+	 */
+	T windowGet(
+		size_t i,
+		bool   fw,
+		size_t depth = 0,
+		size_t len = 0) const
+	{
+		if(len == 0) len = len_;
+		assert_lt(i, len);
+		assert_leq(len, len_ - depth);
+		return fw ? cs_[depth+i] : cs_[depth+len-i-1];
+	}
+
+	/**
+	 * Return ith character from the left of either the forward or the
+	 * reverse-complement version of the read.
+	 */
+	void windowGet(
+		T& ret,
+		bool   fw,
+		size_t depth = 0,
+		size_t len = 0) const
+	{
+		if(len == 0) len = len_;
+		assert_leq(len, len_ - depth);
+		ret.resize(len);
+		for(size_t i = 0; i < len; i++) {
+			ret.set(fw ? cs_[depth+i] : cs_[depth+len-i-1], i);
+		}
+	}
+
+	/**
+	 * Set character at index 'idx' to 'c'.
+	 */
+	inline void set(int c, size_t idx) {
+		assert_lt(idx, len_);
+		cs_[idx] = c;
+	}
+
+	/**
+	 * Retrieve constant version of element i.
+	 */
+	inline const T& operator[](size_t i) const {
+		assert_lt(i, len_);
+		return cs_[i];
+	}
+
+	/**
+	 * Retrieve mutable version of element i.
+	 */
+	inline T& operator[](size_t i) {
+		assert_lt(i, len_);
+		return cs_[i];
+	}
+
+	/**
+	 * Retrieve constant version of element i.
+	 */
+	inline const T& get(size_t i) const {
+		assert_lt(i, len_);
+		return cs_[i];
+	}
+
+	/**
+	 * Copy 'sz' bytes from buffer 'b' into this string.  memcpy is used, not
+	 * operator=.
+	 */
+	virtual void install(const T* b, size_t sz) {
+		if(sz == 0) return;
+		resize(sz);
+		memcpy(cs_, b, sz * sizeof(T));
+	}
+
+	/**
+	 * Copy 'sz' bytes from buffer 'b' into this string.  memcpy is used, not
+	 * operator=.
+	 */
+	virtual void install(const std::basic_string<T>& b) {
+		size_t sz = b.length();
+		if(sz == 0) return;
+		resize(sz);
+		memcpy(cs_, b.c_str(), sz * sizeof(T));
+	}
+
+	/**
+	 * Copy all bytes from zero-terminated buffer 'b' into this string.
+	 */
+	void install(const T* b) {
+		install(b, strlen(b));
+	}
+
+	/**
+	 * Copy 'sz' bytes from buffer 'b' into this string, reversing them
+	 * in the process.
+	 */
+	void installReverse(const char* b, size_t sz) {
+		if(sz == 0) return;
+		resize(sz);
+		for(size_t i = 0; i < sz; i++) {
+			cs_[i] = b[sz-i-1];
+		}
+		len_ = sz;
+	}
+
+	/**
+	 * Copy 'sz' bytes from buffer 'b' into this string, reversing them
+	 * in the process.
+	 */
+	void installReverse(const SString<T>& b) {
+		installReverse(b.cs_, b.len_);
+	}
+	
+	/**
+	 * Return true iff the two strings are equal.
+	 */
+	bool operator==(const SString<T>& o) {
+		return sstr_eq(*this, o);
+	}
+
+	/**
+	 * Return true iff the two strings are not equal.
+	 */
+	bool operator!=(const SString<T>& o) {
+		return sstr_neq(*this, o);
+	}
+
+	/**
+	 * Return true iff this string is less than given string.
+	 */
+	bool operator<(const SString<T>& o) {
+		return sstr_lt(*this, o);
+	}
+
+	/**
+	 * Return true iff this string is greater than given string.
+	 */
+	bool operator>(const SString<T>& o) {
+		return sstr_gt(*this, o);
+	}
+
+	/**
+	 * Return true iff this string is less than or equal to given string.
+	 */
+	bool operator<=(const SString<T>& o) {
+		return sstr_leq(*this, o);
+	}
+
+	/**
+	 * Return true iff this string is greater than or equal to given string.
+	 */
+	bool operator>=(const SString<T>& o) {
+		return sstr_geq(*this, o);
+	}
+
+	/**
+	 * Reverse the buffer in place.
+	 */
+	void reverse() {
+		for(size_t i = 0; i < (len_ >> 1); i++) {
+			T tmp = get(i);
+			set(get(len_-i-1), i);
+			set(tmp, len_-i-1);
+		}
+	}
+
+	/**
+	 * Reverse a substring of the buffer in place.
+	 */
+	void reverseWindow(size_t off, size_t len) {
+		assert_leq(off, len_);
+		assert_leq(off + len, len_);
+		size_t mid = len >> 1;
+		for(size_t i = 0; i < mid; i++) {
+			T tmp = get(off+i);
+			set(get(off+len-i-1), off+i);
+			set(tmp, off+len-i-1);
+		}
+	}
+
+	/**
+	 * Set the first len elements of the buffer to el.
+	 */
+	void fill(size_t len, const T& el) {
+		assert_leq(len, len_);
+		for(size_t i = 0; i < len; i++) {
+			set(el, i);
+		}
+	}
+
+	/**
+	 * Set all elements of the buffer to el.
+	 */
+	void fill(const T& el) {
+		fill(len_, el);
+	}
+
+	/**
+	 * Return the length of the string.
+	 */
+	inline size_t length() const { return len_; }
+
+	/**
+	 * Clear the buffer.
+	 */
+	void clear() { len_ = 0; }
+
+	/**
+	 * Return true iff the buffer is empty.
+	 */
+	inline bool empty() const { return len_ == 0; }
+
+	/**
+	 * Put a terminator in the 'len_'th element and then return a
+	 * pointer to the buffer.  Useful for printing.
+	 */
+	const char* toZBufXForm(const char *xform) const {
+		ASSERT_ONLY(size_t xformElts = strlen(xform));
+		// Lazily allocate space for print buffer
+		if(printcs_ == NULL) {
+			const_cast<char*&>(printcs_) = new char[len_+1];
+		}
+		char* printcs = const_cast<char*>(printcs_);
+		assert(printcs != NULL);
+		for(size_t i = 0; i < len_; i++) {
+			assert_lt(cs_[i], (int)xformElts);
+			printcs[i] = xform[cs_[i]];
+		}
+		printcs[len_] = 0;
+		return printcs_;
+	}
+
+	/**
+	 * Put a terminator in the 'len_'th element and then return a
+	 * pointer to the buffer.  Useful for printing.
+	 */
+	virtual const T* toZBuf() const {
+		const_cast<T*>(cs_)[len_] = 0;
+		return cs_;
+	}
+
+	/**
+	 * Return a const version of the raw buffer.
+	 */
+	const T* buf() const { return cs_; }
+
+	/**
+	 * Return a writeable version of the raw buffer.
+	 */
+	T* wbuf() { return cs_; }
+
+protected:
+
+	T *cs_;      // +1 so that we have the option of dropping in a terminating "\0"
+	char *printcs_; // +1 so that we have the option of dropping in a terminating "\0"
+	size_t len_; // # elements
+};
+
+/**
+ * Simple string class with backing memory whose size is managed by the user
+ * using the constructor and install() member function.  No behind-the-scenes
+ * reallocation or copying takes place.
+ */
+class S2bDnaString {
+
+public:
+
+	explicit S2bDnaString() :
+		cs_(NULL),
+		printcs_(NULL),
+		len_(0)
+	{ }
+
+	explicit S2bDnaString(size_t sz) :
+		cs_(NULL),
+		printcs_(NULL),
+		len_(0)
+	{
+		resize(sz);
+	}
+
+	/**
+	 * Copy another object of the same class.
+	 */
+	S2bDnaString(const S2bDnaString& o) :
+		cs_(NULL),
+		printcs_(NULL),
+		len_(0)
+	{
+		*this = o;
+	}
+
+	/**
+	 * Create an SStringExpandable from a std::basic_string of the
+	 * appropriate type.
+	 */
+	explicit S2bDnaString(
+		const std::basic_string<char>& str,
+		bool chars = false,
+		bool colors = false) :
+		cs_(NULL),
+		printcs_(NULL),
+		len_(0)
+	{
+		if(chars) {
+			if(colors) {
+				installColors(str.c_str(), str.length());
+			} else {
+				installChars(str.c_str(), str.length());
+			}
+		} else {
+			install(str.c_str(), str.length());
+		}
+	}
+
+	/**
+	 * Create an SStringExpandable from an array and size.
+	 */
+	explicit S2bDnaString(
+		const char* b,
+		size_t sz,
+		bool chars = false,
+		bool colors = false) :
+		cs_(NULL),
+		printcs_(NULL),
+		len_(0)
+	{
+		if(chars) {
+			if(colors) {
+				installColors(b, sz);
+			} else {
+				installChars(b, sz);
+			}
+		} else {
+			install(b, sz);
+		}
+	}
+
+	/**
+	 * Create an SStringFixed from a zero-terminated string.
+	 */
+	explicit S2bDnaString(
+		const char* b,
+		bool chars = false,
+		bool colors = false) :
+		cs_(NULL),
+		printcs_(NULL),
+		len_(0)
+	{
+		if(chars) {
+			if(colors) {
+				installColors(b, strlen(b));
+			} else {
+				installChars(b, strlen(b));
+			}
+		} else {
+			install(b, strlen(b));
+		}
+	}
+
+	/**
+	 * Destroy the expandable string object.
+	 */
+	virtual ~S2bDnaString() {
+		if(cs_ != NULL) {
+			delete[] cs_;
+			cs_ = NULL;
+		}
+		if(printcs_ != NULL) {
+			delete[] printcs_;
+			printcs_ = NULL;
+		}
+		len_ = 0;
+	}
+
+	/**
+	 * Assignment to other SString.
+	 */
+	template<typename T>
+	S2bDnaString& operator=(const T& o) {
+		install(o.c_str(), o.length());
+		return *this;
+	}
+
+	/**
+	 * Assignment from a std::basic_string
+	 */
+	template<typename T>
+	S2bDnaString& operator=(const std::basic_string<char>& o) {
+		install(o);
+		return *this;
+	}
+
+	/**
+	 * Resizes the string without preserving its contents.
+	 */
+	void resize(size_t sz) {
+		if(cs_ != NULL) {
+			delete cs_;
+			cs_ = NULL;
+		}
+		if(printcs_ != NULL) {
+			delete printcs_;
+			printcs_ = NULL;
+		}
+		len_ = sz;
+		if(sz != 0) {
+			cs_ = new uint32_t[nwords()];
+		}
+	}
+
+	/**
+	 * Return DNA character corresponding to element 'idx'.
+	 */
+	char toChar(size_t idx) const {
+		int c = (int)get(idx);
+		assert_range(0, 3, c);
+		return "ACGT"[c];
+	}
+
+	/**
+	 * Return color character corresponding to element 'idx'.
+	 */
+	char toColor(size_t idx) const {
+		int c = (int)get(idx);
+		assert_range(0, 3, c);
+		return "0123"[c];
+	}
+
+	/**
+	 * Return ith character from the left of either the forward or the
+	 * reverse version of the read.
+	 */
+	char windowGet(
+		size_t i,
+		bool   fw,
+		size_t depth = 0,
+		size_t len = 0) const
+	{
+		if(len == 0) len = len_;
+		assert_lt(i, len);
+		assert_leq(len, len_ - depth);
+		return fw ? get(depth+i) : get(depth+len-i-1);
+	}
+
+	/**
+	 * Return ith character from the left of either the forward or the
+	 * reverse-complement version of the read.
+	 */
+	template<typename T>
+	void windowGet(
+		T& ret,
+		bool   fw,
+		size_t depth = 0,
+		size_t len = 0) const
+	{
+		if(len == 0) len = len_;
+		assert_leq(len, len_ - depth);
+		ret.resize(len);
+		for(size_t i = 0; i < len; i++) {
+			ret.set((fw ? get(depth+i) : get(depth+len-i-1)), i);
+		}
+	}
+	
+	/**
+	 * Return length in 32-bit words.
+	 */
+	size_t nwords() const {
+		return (len_ + 15) >> 4;
+	}
+
+	/**
+	 * Set character at index 'idx' to 'c'.
+	 */
+	void set(int c, size_t idx) {
+		assert_lt(idx, len_);
+		assert_range(0, 3, c);
+		size_t word = idx >> 4;
+		size_t bpoff = (idx & 15) << 1;
+		cs_[word] = cs_[word] & ~(uint32_t)(3 << bpoff);
+		cs_[word] = cs_[word] |  (uint32_t)(c << bpoff);
+	}
+
+	/**
+	 * Set character at index 'idx' to DNA char 'c'.
+	 */
+	void setChar(int c, size_t idx) {
+		assert_in(toupper(c), "ACGT");
+		int bp = asc2dna[c];
+		set(bp, idx);
+	}
+
+	/**
+	 * Set character at index 'idx' to color char 'c'.
+	 */
+	void setColor(int c, size_t idx) {
+		assert_in(toupper(c), "0123");
+		int co = asc2col[c];
+		set(co, idx);
+	}
+
+	/**
+	 * Set the ith 32-bit word to given word.
+	 */
+	void setWord(uint32_t w, size_t i) {
+		assert_lt(i, nwords());
+		cs_[i] = w;
+	}
+
+	/**
+	 * Retrieve constant version of element i.
+	 */
+	char operator[](size_t i) const {
+		assert_lt(i, len_);
+		return get(i);
+	}
+
+	/**
+	 * Retrieve constant version of element i.
+	 */
+	char get(size_t i) const {
+		assert_lt(i, len_);
+		size_t word = i >> 4;
+		size_t bpoff = (i & 15) << 1;
+		return (char)((cs_[word] >> bpoff) & 3);
+	}
+
+	/**
+	 * Copy packed words from string 'b' into this packed string.
+	 */
+	void install(const uint32_t* b, size_t sz) {
+		if(sz == 0) return;
+		resize(sz);
+		memcpy(cs_, b, sizeof(uint32_t)*nwords());
+	}
+
+	/**
+	 * Copy 'sz' DNA characters encoded as integers from buffer 'b' into this
+	 * packed string.
+	 */
+	void install(const char* b, size_t sz) {
+		if(sz == 0) return;
+		resize(sz);
+		size_t wordi = 0;
+		for(size_t i = 0; i < sz; i += 16) {
+			uint32_t word = 0;
+			for(int j = 0; j < 16 && (size_t)(i+j) < sz; j++) {
+				uint32_t bp = (int)b[i+j];
+				uint32_t shift = (uint32_t)j << 1;
+				assert_range(0, 3, (int)bp);
+				word |= (bp << shift);
+			}
+			cs_[wordi++] = word;
+		}
+	}
+
+	/**
+	 * Copy 'sz' DNA characters from buffer 'b' into this packed string.
+	 */
+	void installChars(const char* b, size_t sz) {
+		if(sz == 0) return;
+		resize(sz);
+		size_t wordi = 0;
+		for(size_t i = 0; i < sz; i += 16) {
+			uint32_t word = 0;
+			for(int j = 0; j < 16 && (size_t)(i+j) < sz; j++) {
+				char c = b[i+j];
+				assert_in(toupper(c), "ACGT");
+				int bp = asc2dna[(int)c];
+				assert_range(0, 3, (int)bp);
+				uint32_t shift = (uint32_t)j << 1;
+				word |= (bp << shift);
+			}
+			cs_[wordi++] = word;
+		}
+	}
+
+	/**
+	 * Copy 'sz' color characters from buffer 'b' into this packed string.
+	 */
+	void installColors(const char* b, size_t sz) {
+		if(sz == 0) return;
+		resize(sz);
+		size_t wordi = 0;
+		for(size_t i = 0; i < sz; i += 16) {
+			uint32_t word = 0;
+			for(int j = 0; j < 16 && (size_t)(i+j) < sz; j++) {
+				char c = b[i+j];
+				assert_in(c, "0123");
+				int bp = asc2col[(int)c];
+				assert_range(0, 3, (int)bp);
+				uint32_t shift = (uint32_t)j << 1;
+				word |= (bp << shift);
+			}
+			cs_[wordi++] = word;
+		}
+	}
+
+	/**
+	 * Copy 'sz' DNA characters from buffer 'b' into this packed string.
+	 */
+	void install(const char* b) {
+		install(b, strlen(b));
+	}
+
+	/**
+	 * Copy 'sz' DNA characters from buffer 'b' into this packed string.
+	 */
+	void installChars(const char* b) {
+		installChars(b, strlen(b));
+	}
+
+	/**
+	 * Copy 'sz' DNA characters from buffer 'b' into this packed string.
+	 */
+	void installColors(const char* b) {
+		installColors(b, strlen(b));
+	}
+
+	/**
+	 * Copy 'sz' DNA characters from buffer 'b' into this packed string.
+	 */
+	void install(const std::basic_string<char>& b) {
+		install(b.c_str(), b.length());
+	}
+
+	/**
+	 * Copy 'sz' DNA characters from buffer 'b' into this packed string.
+	 */
+	void installChars(const std::basic_string<char>& b) {
+		installChars(b.c_str(), b.length());
+	}
+
+	/**
+	 * Copy 'sz' DNA characters from buffer 'b' into this packed string.
+	 */
+	void installColors(const std::basic_string<char>& b) {
+		installColors(b.c_str(), b.length());
+	}
+
+	/**
+	 * Copy 'sz' bytes from buffer 'b' into this string, reversing them
+	 * in the process.
+	 */
+	void installReverse(const char* b, size_t sz) {
+		resize(sz);
+		if(sz == 0) return;
+		size_t wordi = 0;
+		size_t bpi   = 0;
+		cs_[0] = 0;
+		for(size_t i =sz; i > 0; i--) {
+			assert_range(0, 3, (int)b[i-1]);
+			cs_[wordi] |= ((int)b[i-1] << (bpi<<1));
+			if(bpi == 15) {
+				wordi++;
+				cs_[wordi] = 0;
+				bpi = 0;
+			} else bpi++;
+		}
+	}
+
+	/**
+	 * Copy all chars from buffer of DNA characters 'b' into this string,
+	 * reversing them in the process.
+	 */
+	void installReverse(const char* b) {
+		installReverse(b, strlen(b));
+	}
+
+	/**
+	 * Copy 'sz' bytes from buffer of DNA characters 'b' into this string,
+	 * reversing them in the process.
+	 */
+	void installReverseChars(const char* b, size_t sz) {
+		resize(sz);
+		if(sz == 0) return;
+		size_t wordi = 0;
+		size_t bpi   = 0;
+		cs_[0] = 0;
+		for(size_t i =sz; i > 0; i--) {
+			char c = b[i-1];
+			assert_in(toupper(c), "ACGT");
+			int bp = asc2dna[(int)c];
+			assert_range(0, 3, bp);
+			cs_[wordi] |= (bp << (bpi<<1));
+			if(bpi == 15) {
+				wordi++;
+				cs_[wordi] = 0;
+				bpi = 0;
+			} else bpi++;
+		}
+	}
+
+	/**
+	 * Copy all chars from buffer of DNA characters 'b' into this string,
+	 * reversing them in the process.
+	 */
+	void installReverseChars(const char* b) {
+		installReverseChars(b, strlen(b));
+	}
+
+	/**
+	 * Copy 'sz' bytes from buffer of color characters 'b' into this string,
+	 * reversing them in the process.
+	 */
+	void installReverseColors(const char* b, size_t sz) {
+		resize(sz);
+		if(sz == 0) return;
+		size_t wordi = 0;
+		size_t bpi   = 0;
+		cs_[0] = 0;
+		for(size_t i =sz; i > 0; i--) {
+			char c = b[i-1];
+			assert_in(c, "0123");
+			int bp = asc2col[(int)c];
+			assert_range(0, 3, bp);
+			cs_[wordi] |= (bp << (bpi<<1));
+			if(bpi == 15) {
+				wordi++;
+				cs_[wordi] = 0;
+				bpi = 0;
+			} else bpi++;
+		}
+	}
+
+	/**
+	 * Copy all chars from buffer of color characters 'b' into this string,
+	 * reversing them in the process.
+	 */
+	void installReverseColors(const char* b) {
+		installReverseColors(b, strlen(b));
+	}
+
+	/**
+	 * Copy 'sz' bytes from buffer 'b' into this string, reversing them
+	 * in the process.
+	 */
+	void installReverse(const S2bDnaString& b) {
+		resize(b.len_);
+		if(b.len_ == 0) return;
+		size_t wordi = 0;
+		size_t bpi   = 0;
+		size_t wordb = b.nwords()-1;
+		size_t bpb   = (b.len_-1) & 15;
+		cs_[0] = 0;
+		for(size_t i = b.len_; i > 0; i--) {
+			int bbp = (int)((b[wordb] >> (bpb << 1)) & 3);
+			assert_range(0, 3, bbp);
+			cs_[wordi] |= (bbp << (bpi << 1));
+			if(bpi == 15) {
+				wordi++;
+				cs_[wordi] = 0;
+				bpi = 0;
+			} else bpi++;
+			if(bpb == 0) {
+				wordb--;
+				bpi = 15;
+			} else bpi--;
+		}
+	}
+
+	/**
+	 * Return true iff the two strings are equal.
+	 */
+	bool operator==(const S2bDnaString& o) {
+		return sstr_eq(*this, o);
+	}
+
+	/**
+	 * Return true iff the two strings are not equal.
+	 */
+	bool operator!=(const S2bDnaString& o) {
+		return sstr_neq(*this, o);
+	}
+
+	/**
+	 * Return true iff this string is less than given string.
+	 */
+	bool operator<(const S2bDnaString& o) {
+		return sstr_lt(*this, o);
+	}
+
+	/**
+	 * Return true iff this string is greater than given string.
+	 */
+	bool operator>(const S2bDnaString& o) {
+		return sstr_gt(*this, o);
+	}
+
+	/**
+	 * Return true iff this string is less than or equal to given string.
+	 */
+	bool operator<=(const S2bDnaString& o) {
+		return sstr_leq(*this, o);
+	}
+
+	/**
+	 * Return true iff this string is greater than or equal to given string.
+	 */
+	bool operator>=(const S2bDnaString& o) {
+		return sstr_geq(*this, o);
+	}
+
+	/**
+	 * Reverse the 2-bit encoded DNA string in-place.
+	 */
+	void reverse() {
+		if(len_ <= 1) return;
+		size_t wordf = nwords()-1;
+		size_t bpf   = (len_-1) & 15;
+		size_t wordi = 0;
+		size_t bpi   = 0;
+		while(wordf > wordi || (wordf == wordi && bpf > bpi)) {
+			int f = (cs_[wordf] >> (bpf << 1)) & 3;
+			int i = (cs_[wordi] >> (bpi << 1)) & 3;
+			cs_[wordf] &= ~(uint32_t)(3 << (bpf << 1));
+			cs_[wordi] &= ~(uint32_t)(3 << (bpi << 1));
+			cs_[wordf] |=  (uint32_t)(i << (bpf << 1));
+			cs_[wordi] |=  (uint32_t)(f << (bpi << 1));
+			if(bpf == 0) {
+				bpf = 15;
+				wordf--;
+			} else bpf--;
+			if(bpi == 15) {
+				bpi = 0;
+				wordi++;
+			} else bpi++;
+		}
+	}
+	
+	/**
+	 * Reverse a substring of the buffer in place.
+	 */
+	void reverseWindow(size_t off, size_t len) {
+		assert_leq(off, len_);
+		assert_leq(off+len, len_);
+		if(len <= 1) return;
+		size_t wordf = (off+len-1) >> 4;
+		size_t bpf   = (off+len-1) & 15;
+		size_t wordi = (off      ) >> 4;
+		size_t bpi   = (off      ) & 15;
+		while(wordf > wordi || (wordf == wordi && bpf > bpi)) {
+			int f = (cs_[wordf] >> (bpf << 1)) & 3;
+			int i = (cs_[wordi] >> (bpi << 1)) & 3;
+			cs_[wordf] &= ~(uint32_t)(3 << (bpf << 1));
+			cs_[wordi] &= ~(uint32_t)(3 << (bpi << 1));
+			cs_[wordf] |=  (uint32_t)(i << (bpf << 1));
+			cs_[wordi] |=  (uint32_t)(f << (bpi << 1));
+			if(bpf == 0) {
+				bpf = 15;
+				wordf--;
+			} else bpf--;
+			if(bpi == 15) {
+				bpi = 0;
+				wordi++;
+			} else bpi++;
+		}
+	}
+
+
+	/**
+	 * Set the first len elements of the buffer to el.
+	 */
+	void fill(size_t len, char el) {
+		assert_leq(len, len_);
+		assert_range(0, 3, (int)el);
+		size_t word = 0;
+		if(len > 32) {
+			// Copy el throughout block
+			uint32_t bl = (uint32_t)el;
+			bl |= (bl << 2);
+			bl |= (bl << 4);
+			bl |= (bl << 8);
+			bl |= (bl << 16);
+			// Fill with blocks
+			size_t blen = len >> 4;
+			for(; word < blen; word++) {
+				cs_[word] = bl;
+			}
+			len = len & 15;
+		}
+		size_t bp = 0;
+		for(size_t i = 0; i < len; i++) {
+			cs_[word] &= ~(uint32_t)(3  << (bp << 1));
+			cs_[word] |=  (uint32_t)(el << (bp << 1));
+			if(bp == 15) {
+				word++;
+				bp = 0;
+			} else bp++;
+		}
+	}
+
+	/**
+	 * Set all elements of the buffer to el.
+	 */
+	void fill(char el) {
+		fill(len_, el);
+	}
+
+	/**
+	 * Return the ith character in the window defined by fw, color, depth and
+	 * len.
+	 */
+	char windowGetDna(
+		size_t i,
+		bool   fw,
+		bool   color,
+		size_t depth = 0,
+		size_t len = 0) const
+	{
+		if(len == 0) len = len_;
+		assert_lt(i, len);
+		assert_leq(len, len_ - depth);
+		if(fw) {
+			return get(depth+i);
+		} else {
+			return
+				color ?
+					get(depth+len-i-1) :
+					compDna(get(depth+len-i-1));
+		}
+	}
+
+	/**
+	 * Fill the given DNA buffer with the substring specified by fw,
+	 * color, depth and len.
+	 */
+	template<typename T>
+	void windowGetDna(
+		T&     buf,
+		bool   fw,
+		bool   color,
+		size_t depth = 0,
+		size_t len = 0) const
+	{
+		if(len == 0) len = len_;
+		assert_leq(len, len_ - depth);
+		buf.resize(len);
+		for(size_t i = 0; i < len; i++) {
+			buf.set(
+				(fw ?
+					get(depth+i) :
+					(color ?
+						get(depth+len-i-1) :
+						compDna(get(depth+len-i-1)))), i);
+		}
+	}
+
+	/**
+	 * Return the length of the string.
+	 */
+	inline size_t length() const { return len_; }
+
+	/**
+	 * Clear the buffer.
+	 */
+	void clear() { len_ = 0; }
+
+	/**
+	 * Return true iff the buffer is empty.
+	 */
+	inline bool empty() const { return len_ == 0; }
+
+	/**
+	 * Return a const version of the raw buffer.
+	 */
+	const uint32_t* buf() const { return cs_; }
+
+	/**
+	 * Return a writeable version of the raw buffer.
+	 */
+	uint32_t* wbuf() { return cs_; }
+
+	/**
+	 * Note: the size of the string once it's stored in the print buffer is 4
+	 * times as large as the string as stored in compact 2-bit-per-char words.
+	 */
+	const char* toZBuf() const {
+		if(printcs_ == NULL) {
+			const_cast<char*&>(printcs_) = new char[len_+1];
+		}
+		char *printcs = const_cast<char*>(printcs_);
+		size_t word = 0, bp = 0;
+		for(size_t i = 0; i < len_; i++) {
+			int c = (cs_[word] >> (bp << 1)) & 3;
+			printcs[i] = "ACGT"[c];
+			if(bp == 15) {
+				word++;
+				bp = 0;
+			} else bp++;
+		}
+		printcs[len_] = '\0';
+		return printcs_;
+	}
+
+protected:
+
+	uint32_t *cs_; // 2-bit packed words
+	char *printcs_;
+	size_t len_;   // # elements
+};
+
+/**
+ * Simple string class with backing memory that automatically expands as needed.
+ */
+template<typename T, int S = 1024, int M = 2>
+class SStringExpandable {
+
+public:
+
+	explicit SStringExpandable() :
+		cs_(NULL),
+		printcs_(NULL),
+		len_(0),
+		sz_(0)
+	{ }
+
+	explicit SStringExpandable(size_t sz) :
+		cs_(NULL),
+		printcs_(NULL),
+		len_(0),
+		sz_(0)
+	{
+		expandNoCopy(sz);
+	}
+
+	/**
+	 * Create an SStringExpandable from another SStringExpandable.
+	 */
+	SStringExpandable(const SStringExpandable<T, S>& o) :
+		cs_(NULL),
+		printcs_(NULL),
+		len_(0),
+		sz_(0)
+	{
+		*this = o;
+	}
+
+	/**
+	 * Create an SStringExpandable from a std::basic_string of the
+	 * appropriate type.
+	 */
+	explicit SStringExpandable(const std::basic_string<T>& str) :
+		cs_(NULL),
+		printcs_(NULL),
+		len_(0),
+		sz_(0)
+	{
+		install(str.c_str(), str.length());
+	}
+
+	/**
+	 * Create an SStringExpandable from an array and size.
+	 */
+	explicit SStringExpandable(const T* b, size_t sz) :
+		cs_(NULL),
+		printcs_(NULL),
+		len_(0),
+		sz_(0)
+	{
+		install(b, sz);
+	}
+
+	/**
+	 * Create an SStringExpandable from a zero-terminated array.
+	 */
+	explicit SStringExpandable(const T* b) :
+		cs_(NULL),
+		printcs_(NULL),
+		len_(0),
+		sz_(0)
+	{
+		install(b, strlen(b));
+	}
+
+	/**
+	 * Destroy the expandable string object.
+	 */
+	virtual ~SStringExpandable() {
+		if(cs_ != NULL) {
+			delete[] cs_;
+			cs_ = NULL;
+		}
+		if(printcs_ != NULL) {
+			delete[] printcs_;
+			printcs_ = NULL;
+		}
+		sz_ = len_ = 0;
+	}
+
+	/**
+	 * Return ith character from the left of either the forward or the
+	 * reverse-complement version of the read.
+	 */
+	T windowGet(
+		size_t i,
+		bool   fw,
+		size_t depth = 0,
+		size_t len = 0) const
+	{
+		if(len == 0) len = len_;
+		assert_lt(i, len);
+		assert_leq(len, len_ - depth);
+		return fw ? cs_[depth+i] : cs_[depth+len-i-1];
+	}
+
+	/**
+	 * Return ith character from the left of either the forward or the
+	 * reverse-complement version of the read.
+	 */
+	void windowGet(
+		T& ret,
+		bool   fw,
+		size_t depth = 0,
+		size_t len = 0) const
+	{
+		if(len == 0) len = len_;
+		assert_leq(len, len_ - depth);
+		for(size_t i = 0; i < len; i++) {
+			ret.append(fw ? cs_[depth+i] : cs_[depth+len-i-1]);
+		}
+	}
+
+	/**
+	 * Assignment to other SStringFixed.
+	 */
+	SStringExpandable<T,S>& operator=(const SStringExpandable<T,S>& o) {
+		install(o.cs_, o.len_);
+		return *this;
+	}
+
+	/**
+	 * Assignment from a std::basic_string
+	 */
+	SStringExpandable<T,S>& operator=(const std::basic_string<T>& o) {
+		install(o.c_str(), o.length());
+		return *this;
+	}
+
+	/**
+	 * Insert char c before position 'idx'; slide subsequent chars down.
+	 */
+	void insert(const T& c, size_t idx) {
+		assert_lt(idx, len_);
+		if(sz_ < len_ + 1) expandCopy((len_ + 1 + S) * M);
+		len_++;
+		// Move everyone down by 1
+		// len_ is the *new* length
+		for(size_t i = len_; i > idx+1; i--) {
+			cs_[i-1] = cs_[i-2];
+		}
+		cs_[idx] = c;
+	}
+
+	/**
+	 * Set character at index 'idx' to 'c'.
+	 */
+	void set(int c, size_t idx) {
+		assert_lt(idx, len_);
+		cs_[idx] = c;
+	}
+
+	/**
+	 * Append char c.
+	 */
+	void append(const T& c) {
+		if(sz_ < len_ + 1) expandCopy((len_ + 1 + S) * M);
+		cs_[len_++] = c;
+	}
+
+	/**
+	 * Delete char at position 'idx'; slide subsequent chars up.
+	 */
+	void remove(size_t idx) {
+		assert_lt(idx, len_);
+		assert_gt(len_, 0);
+		for(size_t i = idx; i < len_-1; i++) {
+			cs_[i] = cs_[i+1];
+		}
+		len_--;
+	}
+
+	/**
+	 * Retrieve constant version of element i.
+	 */
+	const T& operator[](size_t i) const {
+		assert_lt(i, len_);
+		return cs_[i];
+	}
+
+	/**
+	 * Retrieve mutable version of element i.
+	 */
+	T& operator[](size_t i) {
+		assert_lt(i, len_);
+		return cs_[i];
+	}
+
+	/**
+	 * Retrieve constant version of element i.
+	 */
+	const T& get(size_t i) const {
+		assert_lt(i, len_);
+		return cs_[i];
+	}
+
+	/**
+	 * Retrieve constant version of element i.
+	 */
+	const T* get_ptr(size_t i) const {
+		assert_lt(i, len_);
+		return cs_+i;
+	}
+
+	/**
+	 * Copy 'sz' bytes from buffer 'b' into this string.
+	 */
+	virtual void install(const T* b, size_t sz) {
+		if(sz_ < sz) expandNoCopy((sz + S) * M);
+		memcpy(cs_, b, sz * sizeof(T));
+		len_ = sz;
+	}
+
+
+	/**
+	 * Copy all bytes from zero-terminated buffer 'b' into this string.
+	 */
+	void install(const T* b) { install(b, strlen(b)); }
+
+	/**
+	 * Copy 'sz' bytes from buffer 'b' into this string, reversing them
+	 * in the process.
+	 */
+	void installReverse(const char* b, size_t sz) {
+		if(sz_ < sz) expandNoCopy((sz + S) * M);
+		for(size_t i = 0; i < sz; i++) {
+			cs_[i] = b[sz-i-1];
+		}
+		len_ = sz;
+	}
+
+	/**
+	 * Copy 'sz' bytes from buffer 'b' into this string, reversing them
+	 * in the process.
+	 */
+	void installReverse(const SStringExpandable<T, S>& b) {
+		if(sz_ < b.len_) expandNoCopy((b.len_ + S) * M);
+		for(size_t i = 0; i < b.len_; i++) {
+			cs_[i] = b.cs_[b.len_ - i - 1];
+		}
+		len_ = b.len_;
+	}
+
+	/**
+	 * Return true iff the two strings are equal.
+	 */
+	bool operator==(const SStringExpandable<T, S>& o) {
+		return sstr_eq(*this, o);
+	}
+
+	/**
+	 * Return true iff the two strings are not equal.
+	 */
+	bool operator!=(const SStringExpandable<T, S>& o) {
+		return sstr_neq(*this, o);
+	}
+
+	/**
+	 * Return true iff this string is less than given string.
+	 */
+	bool operator<(const SStringExpandable<T, S>& o) {
+		return sstr_lt(*this, o);
+	}
+
+	/**
+	 * Return true iff this string is greater than given string.
+	 */
+	bool operator>(const SStringExpandable<T, S>& o) {
+		return sstr_gt(*this, o);
+	}
+
+	/**
+	 * Return true iff this string is less than or equal to given string.
+	 */
+	bool operator<=(const SStringExpandable<T, S>& o) {
+		return sstr_leq(*this, o);
+	}
+
+	/**
+	 * Return true iff this string is greater than or equal to given string.
+	 */
+	bool operator>=(const SStringExpandable<T, S>& o) {
+		return sstr_geq(*this, o);
+	}
+
+	/**
+	 * Reverse the buffer in place by swapping symmetric pairs from the
+	 * two ends toward the middle.
+	 */
+	void reverse() {
+		if(len_ < 2) return;
+		size_t lo = 0, hi = len_ - 1;
+		while(lo < hi) {
+			T tmp = cs_[lo];
+			cs_[lo] = cs_[hi];
+			cs_[hi] = tmp;
+			lo++; hi--;
+		}
+	}
+
+	/**
+	 * Reverse a substring of the buffer in place.
+	 */
+	void reverseWindow(size_t off, size_t len) {
+		assert_leq(off, len_);
+		assert_leq(off + len, len_);
+		size_t mid = len >> 1;
+		for(size_t i = 0; i < mid; i++) {
+			T tmp = get(off+i);
+			set(get(off+len-i-1), off+i);
+			set(tmp, off+len-i-1);
+		}
+	}
+
+	/**
+	 * Simply resize the buffer.  If the buffer is resized to be
+	 * longer, the newly-added elements will contain garbage and should
+	 * be initialized immediately.
+	 */
+	void resize(size_t len) {
+		if(sz_ < len) expandCopy((len + S) * M);
+		len_ = len;
+	}
+
+	/**
+	 * Resize the buffer to 'len' elements; any newly exposed elements
+	 * are initialized to 'el'.  (The fill loop runs zero times when
+	 * shrinking, so no guard is needed.)
+	 */
+	void resize(size_t len, const T& el) {
+		if(sz_ < len) expandCopy((len + S) * M);
+		for(size_t i = len_; i < len; i++) {
+			cs_[i] = el;
+		}
+		len_ = len;
+	}
+
+	/**
+	 * Set the first len elements of the buffer to el.
+	 */
+	void fill(size_t len, const T& el) {
+		assert_leq(len, len_);
+		for(size_t i = 0; i < len; i++) {
+			cs_[i] = el;
+		}
+	}
+
+	/**
+	 * Set all elements of the buffer to el.
+	 */
+	void fill(const T& el) {
+		fill(len_, el);
+	}
+
+	/**
+	 * Drop the first 'len' elements, sliding the surviving suffix to
+	 * the front of the buffer.
+	 */
+	void trimBegin(size_t len) {
+		assert_leq(len, len_);
+		size_t keep = len_ - len;
+		for(size_t i = 0; i < keep; i++) {
+			cs_[i] = cs_[i + len];
+		}
+		len_ = keep;
+	}
+
+	/**
+	 * Drop the last 'len' elements, clamping at the empty string when
+	 * 'len' meets or exceeds the current length.
+	 */
+	void trimEnd(size_t len) {
+		len_ = (len < len_) ? (len_ - len) : 0;
+	}
+
+	/**
+	 * Copy 'sz' bytes from buffer 'b' into this string.
+	 */
+	void append(const T* b, size_t sz) {
+		if(sz_ < len_ + sz) expandCopy((len_ + sz + S) * M);
+		memcpy(cs_ + len_, b, sz * sizeof(T));
+		len_ += sz;
+	}
+
+	/**
+	 * Copy bytes from zero-terminated buffer 'b' into this string.
+	 */
+	void append(const T* b) {
+		append(b, strlen(b));
+	}
+
+	/**
+	 * Return the length of the string.
+	 */
+	size_t length() const { return len_; }
+
+	/**
+	 * Clear the buffer.
+	 */
+	void clear() { len_ = 0; }
+
+	/**
+	 * Return true iff the buffer is empty.
+	 */
+	bool empty() const { return len_ == 0; }
+
+	/**
+	 * Write a NUL-terminated printable rendering of the string into
+	 * 'printcs_', mapping each stored code through the lookup table
+	 * 'xform' (e.g. "ACGTN"), and return it.  For the empty string a
+	 * shared one-byte "" buffer is returned instead.
+	 */
+	const char* toZBufXForm(const char *xform) const {
+		ASSERT_ONLY(size_t xformElts = strlen(xform));
+		if(empty()) {
+			const_cast<char&>(zero_) = 0;
+			return &zero_;
+		}
+		char* printcs = const_cast<char*>(printcs_);
+		// printcs_ is (re)allocated alongside cs_ in expandCopy /
+		// expandNoCopy, so it holds len_ chars plus the terminator
+		for(size_t i = 0; i < len_; i++) {
+			assert_lt(cs_[i], (int)xformElts);
+			printcs[i] = xform[(int)cs_[i]];
+		}
+		printcs[len_] = 0;
+		return printcs_;
+	}
+
+	/**
+	 * Put a terminator in the 'len_'th element and then return a
+	 * pointer to the buffer.  Useful for printing.
+	 */
+	virtual const T* toZBuf() const {
+		if(empty()) {
+			const_cast<T&>(zeroT_) = 0;
+			return &zeroT_;
+		}
+		assert_leq(len_, sz_);
+		const_cast<T*>(cs_)[len_] = 0;
+		return cs_;
+	}
+
+	/**
+	 * Return true iff this string matches the NUL-terminated string
+	 * 'str'.  NOTE(review): only meaningful when T is char — toZBuf()
+	 * returns const T*, which is assigned to const char* here; for
+	 * other T this would not compile if instantiated.
+	 */
+	bool eq(const char *str) const {
+		const char *self = toZBuf();
+		return strcmp(str, self) == 0;
+	}
+
+	/**
+	 * Return a const version of the raw buffer.
+	 */
+	const T* buf() const { return cs_; }
+
+	/**
+	 * Return a writeable version of the raw buffer.
+	 */
+	T* wbuf() { return cs_; }
+
+protected:
+	/**
+	 * Grow both cs_ and printcs_ to hold at least 'sz' elements,
+	 * copying the first len_ elements of each old buffer into the new
+	 * one.  No-op if the current capacity already suffices.  One extra
+	 * slot is allocated so a terminator can always be appended.
+	 */
+	void expandCopy(size_t sz) {
+		if(sz_ >= sz) return; // done!
+		T *tmp  = new T[sz + 1];
+		char *ptmp = new char[sz + 1];
+		if(cs_ != NULL) {
+			memcpy(tmp, cs_, sizeof(T)*len_);
+			delete[] cs_;
+		}
+		if(printcs_ != NULL) {
+			memcpy(ptmp, printcs_, sizeof(char)*len_);
+			delete[] printcs_;
+		}
+		cs_ = tmp;
+		printcs_ = ptmp;
+		sz_ = sz;
+	}
+
+	/**
+	 * Grow cs_ and printcs_ to hold at least 'sz' elements, discarding
+	 * any previous contents.  No-op if the current capacity already
+	 * suffices.  One extra slot is allocated for a terminator.
+	 */
+	void expandNoCopy(size_t sz) {
+		if(sz_ >= sz) return; // done!
+		if(cs_      != NULL) delete[] cs_;
+		if(printcs_ != NULL) delete[] printcs_;
+		cs_ = new T[sz + 1];
+		printcs_ = new char[sz + 1];
+		sz_ = sz;
+	}
+
+	T *cs_;      // +1 so that we have the option of dropping in a terminating "\0"
+	char *printcs_; // +1 so that we have the option of dropping in a terminating "\0"
+	char zero_;  // 0 terminator for empty string
+	T zeroT_;    // 0 terminator for empty string
+	size_t len_; // # filled-in elements
+	size_t sz_;  // size capacity of cs_
+};
+
+/**
+ * Simple string class with in-object storage.
+ *
+ * All copies induced by, e.g., operator=, the copy constructor,
+ * install() and append(), are shallow (using memcpy/sizeof).  If deep
+ * copies are needed, use a different class.
+ *
+ * Reading from an uninitialized element results in an assert as long
+ * as NDEBUG is not defined.  If NDEBUG is defined, the result is
+ * undefined.
+ */
+template<typename T, int S>
+class SStringFixed {
+public:
+	explicit SStringFixed() : len_(0) { }
+
+	/**
+	 * Create an SStringFixed from another SStringFixed.
+	 */
+	SStringFixed(const SStringFixed<T, S>& o) {
+		*this = o;
+	}
+
+	/**
+	 * Create an SStringFixed from another SStringFixed.
+	 */
+	explicit SStringFixed(const std::basic_string<T>& str) {
+		install(str.c_str(), str.length());
+	}
+
+	/**
+	 * Create an SStringFixed from an array and size.
+	 */
+	explicit SStringFixed(const T* b, size_t sz) {
+		install(b, sz);
+	}
+
+	/**
+	 * Create an SStringFixed from a zero-terminated string.
+	 */
+	explicit SStringFixed(const T* b) {
+		install(b, strlen(b));
+	}
+
+	virtual ~SStringFixed() { } // C++ needs this
+
+	/**
+	 * Retrieve constant version of element i.
+	 */
+	inline const T& operator[](size_t i) const {
+		return get(i);
+	}
+
+	/**
+	 * Retrieve mutable version of element i.
+	 */
+	inline T& operator[](size_t i) {
+		return get(i);
+	}
+
+	/**
+	 * Retrieve constant version of element i.
+	 */
+	inline const T& get(size_t i) const {
+		assert_lt(i, len_);
+		return cs_[i];
+	}
+
+	/**
+	 * Retrieve mutable version of element i.
+	 */
+	inline T& get(size_t i) {
+		assert_lt(i, len_);
+		return cs_[i];
+	}
+
+	/**
+	 * Return ith character from the left of either the forward or the
+	 * reverse-complement version of the read.
+	 */
+	T windowGet(
+		size_t i,
+		bool   fw,
+		size_t depth = 0,
+		size_t len = 0) const
+	{
+		if(len == 0) len = len_;
+		assert_lt(i, len);
+		assert_leq(len, len_ - depth);
+		return fw ? cs_[depth+i] : cs_[depth+len-i-1];
+	}
+
+	/**
+	 * Return ith character from the left of either the forward or the
+	 * reverse-complement version of the read.
+	 */
+	void windowGet(
+		T& ret,
+		bool   fw,
+		size_t depth = 0,
+		size_t len = 0) const
+	{
+		if(len == 0) len = len_;
+		assert_leq(len, len_ - depth);
+		for(size_t i = 0; i < len; i++) {
+			ret.append(fw ? cs_[depth+i] : cs_[depth+len-i-1]);
+		}
+	}
+
+	/**
+	 * Assignment to other SStringFixed.
+	 */
+	SStringFixed<T,S>& operator=(const SStringFixed<T,S>& o) {
+		install(o.cs_, o.len_);
+		return *this;
+	}
+
+	/**
+	 * Assignment from a std::basic_string
+	 */
+	SStringFixed<T,S>& operator=(const std::basic_string<T>& o) {
+		install(o);
+		return *this;
+	}
+
+	/**
+	 * Insert char c before position 'idx'; slide subsequent chars down.
+	 */
+	void insert(const T& c, size_t idx) {
+		assert_lt(len_, S);
+		assert_lt(idx, len_);
+		// Move everyone down by 1.  Use size_t (not int) so the loop
+		// index matches idx's type: the old `int i = len_` both
+		// narrowed and forced a signed/unsigned comparison with idx.
+		for(size_t i = len_; i > idx; i--) {
+			cs_[i] = cs_[i-1];
+		}
+		cs_[idx] = c;
+		len_++;
+	}
+
+	/**
+	 * Set character at index 'idx' to 'c'.
+	 */
+	void set(int c, size_t idx) {
+		assert_lt(idx, len_);
+		cs_[idx] = c;
+	}
+
+	/**
+	 * Append char c.
+	 */
+	void append(const T& c) {
+		assert_lt(len_, S);
+		cs_[len_++] = c;
+	}
+
+	/**
+	 * Delete char at position 'idx'; slide subsequent chars up.
+	 */
+	void remove(size_t idx) {
+		assert_lt(idx, len_);
+		assert_gt(len_, 0);
+		for(size_t i = idx; i < len_-1; i++) {
+			cs_[i] = cs_[i+1];
+		}
+		len_--;
+	}
+
+	/**
+	 * Copy 'sz' bytes from buffer 'b' into this string.
+	 */
+	virtual void install(const T* b, size_t sz) {
+		assert_leq(sz, S);
+		memcpy(cs_, b, sz * sizeof(T));
+		len_ = sz;
+	}
+
+	/**
+	 * Copy all bytes from zero-terminated buffer 'b' into this string.
+	 */
+	void install(const T* b) { install(b, strlen(b)); }
+
+	/**
+	 * Copy 'sz' bytes from buffer 'b' into this string, reversing them
+	 * in the process.
+	 */
+	void installReverse(const char* b, size_t sz) {
+		assert_leq(sz, S);
+		for(size_t i = 0; i < sz; i++) {
+			cs_[i] = b[sz-i-1];
+		}
+		len_ = sz;
+	}
+
+	/**
+	 * Copy 'sz' bytes from buffer 'b' into this string, reversing them
+	 * in the process.
+	 */
+	void installReverse(const SStringFixed<T, S>& b) {
+		assert_leq(b.len_, S);
+		for(size_t i = 0; i < b.len_; i++) {
+			cs_[i] = b.cs_[b.len_ - i - 1];
+		}
+		len_ = b.len_;
+	}
+
+	/**
+	 * Return true iff the two strings are equal.
+	 */
+	bool operator==(const SStringFixed<T, S>& o) {
+		return sstr_eq(*this, o);
+	}
+
+	/**
+	 * Return true iff the two strings are not equal.
+	 */
+	bool operator!=(const SStringFixed<T, S>& o) {
+		return sstr_neq(*this, o);
+	}
+
+	/**
+	 * Return true iff this string is less than given string.
+	 */
+	bool operator<(const SStringFixed<T, S>& o) {
+		return sstr_lt(*this, o);
+	}
+
+	/**
+	 * Return true iff this string is greater than given string.
+	 */
+	bool operator>(const SStringFixed<T, S>& o) {
+		return sstr_gt(*this, o);
+	}
+
+	/**
+	 * Return true iff this string is less than or equal to given string.
+	 */
+	bool operator<=(const SStringFixed<T, S>& o) {
+		return sstr_leq(*this, o);
+	}
+
+	/**
+	 * Return true iff this string is greater than or equal to given string.
+	 */
+	bool operator>=(const SStringFixed<T, S>& o) {
+		return sstr_geq(*this, o);
+	}
+
+	/**
+	 * Reverse the buffer in place.
+	 */
+	void reverse() {
+		for(size_t i = 0; i < (len_ >> 1); i++) {
+			T tmp = get(i);
+			set(get(len_-i-1), i);
+			set(tmp, len_-i-1);
+		}
+	}
+
+	/**
+	 * Reverse a substring of the buffer in place.
+	 */
+	void reverseWindow(size_t off, size_t len) {
+		assert_leq(off, len_);
+		assert_leq(off + len, len_);
+		size_t mid = len >> 1;
+		for(size_t i = 0; i < mid; i++) {
+			T tmp = get(off+i);
+			set(get(off+len-i-1), off+i);
+			set(tmp, off+len-i-1);
+		}
+	}
+
+	/**
+	 * Simply resize the buffer.  If the buffer is resized to be
+	 * longer, the newly-added elements will contain garbage and should
+	 * be initialized immediately.
+	 */
+	void resize(size_t len) {
+		// Allow len == S, consistent with install()/append(): cs_
+		// holds S+1 elements, so the terminator still fits at cs_[S]
+		assert_leq(len, S);
+		len_ = len;
+	}
+
+	/**
+	 * Simply resize the buffer.  If the buffer is resized to be
+	 * longer, new elements will be initialized with 'el'.
+	 */
+	void resize(size_t len, const T& el) {
+		// Allow len == S, consistent with install()/append(); storage
+		// is S+1 so the terminator still fits
+		assert_leq(len, S);
+		for(size_t i = len_; i < len; i++) {
+			cs_[i] = el;
+		}
+		len_ = len;
+	}
+
+	/**
+	 * Set the first len elements of the buffer to el.
+	 */
+	void fill(size_t len, const T& el) {
+		assert_leq(len, len_);
+		for(size_t i = 0; i < len; i++) {
+			cs_[i] = el;
+		}
+	}
+
+	/**
+	 * Set all elements of the buffer to el.
+	 */
+	void fill(const T& el) {
+		fill(len_, el);
+	}
+
+	/**
+	 * Trim len characters from the beginning of the string.
+	 */
+	void trimBegin(size_t len) {
+		assert_leq(len, len_);
+		if(len == len_) {
+			len_ = 0; return;
+		}
+		for(size_t i = 0; i < len_-len; i++) {
+			cs_[i] = cs_[i+len];
+		}
+		len_ -= len;
+	}
+
+	/**
+	 * Trim len characters from the end of the string.
+	 */
+	void trimEnd(size_t len) {
+		if(len >= len_) len_ = 0;
+		else len_ -= len;
+	}
+
+	/**
+	 * Copy 'sz' bytes from buffer 'b' into this string.
+	 */
+	void append(const T* b, size_t sz) {
+		assert_leq(sz + len_, S);
+		memcpy(cs_ + len_, b, sz * sizeof(T));
+		len_ += sz;
+	}
+
+	/**
+	 * Copy bytes from zero-terminated buffer 'b' into this string.
+	 */
+	void append(const T* b) {
+		append(b, strlen(b));
+	}
+
+	/**
+	 * Return the length of the string.
+	 */
+	size_t length() const { return len_; }
+
+	/**
+	 * Clear the buffer.
+	 */
+	void clear() { len_ = 0; }
+
+	/**
+	 * Return true iff the buffer is empty.
+	 */
+	bool empty() const { return len_ == 0; }
+
+	/**
+	 * Put a terminator in the 'len_'th element and then return a
+	 * pointer to the buffer.  Safe because cs_ holds S+1 elements and
+	 * len_ never exceeds S.  Useful for printing.
+	 */
+	virtual const T* toZBuf() const {
+		const_cast<T*>(cs_)[len_] = 0;
+		return cs_;
+	}
+
+	/**
+	 * Return true iff this DNA string matches the given nucleotide
+	 * character string.
+	 */
+	bool eq(const char *str) const {
+		const char *self = toZBuf();
+		return strcmp(str, self) == 0;
+	}
+	
+	/**
+	 * Render the string into 'printcs_' by mapping each stored code
+	 * through the lookup table 'xform' (e.g. "ACGTN"), NUL-terminate
+	 * it, and return it.  Useful for printing.
+	 */
+	const char* toZBufXForm(const char *xform) const {
+		ASSERT_ONLY(size_t xformElts = strlen(xform));
+		char* printcs = const_cast<char*>(printcs_);
+		for(size_t i = 0; i < len_; i++) {
+			assert_lt(cs_[i], (int)xformElts);
+			printcs[i] = xform[cs_[i]];
+		}
+		printcs[len_] = 0;
+		return printcs_;
+	}
+
+	/**
+	 * Return a const version of the raw buffer.
+	 */
+	const T* buf() const { return cs_; }
+
+	/**
+	 * Return a writeable version of the raw buffer.
+	 */
+	T* wbuf() { return cs_; }
+
+protected:
+	T cs_[S+1]; // +1 so that we have the option of dropping in a terminating "\0"
+	char printcs_[S+1]; // +1 so that we have the option of dropping in a terminating "\0"
+	size_t len_;
+};
+
+//
+// Stream put operators
+//
+
+template <typename T, int S, int M>
+std::ostream& operator<< (std::ostream& os, const SStringExpandable<T, S, M>& str) {
+	// toZBuf() NUL-terminates the internal buffer before streaming it
+	return os << str.toZBuf();
+}
+
+template <typename T, int S>
+std::ostream& operator<< (std::ostream& os, const SStringFixed<T, S>& str) {
+	// toZBuf() NUL-terminates the internal buffer before streaming it
+	return os << str.toZBuf();
+}
+
+extern uint8_t asc2dna[];
+extern uint8_t asc2col[];
+
+/**
+ * Encapsulates a fixed-length DNA string with characters encoded as
+ * chars.  Only capable of encoding A, C, G, T and N.  The length is
+ * specified via the template parameter S.
+ */
+template<int S>
+class SDnaStringFixed : public SStringFixed<char, S> {
+public:
+
+	explicit SDnaStringFixed() : SStringFixed<char, S>() { }
+
+	/**
+	 * Create an SStringFixed from another SStringFixed.
+	 */
+	SDnaStringFixed(const SDnaStringFixed<S>& o) :
+		SStringFixed<char, S>(o) { }
+
+	/**
+	 * Create an SStringFixed from a C++ basic_string.
+	 */
+	explicit SDnaStringFixed(const std::basic_string<char>& str) :
+		SStringFixed<char, S>(str) { }
+
+	/**
+	 * Create an SStringFixed from an array and size.
+	 */
+	explicit SDnaStringFixed(const char* b, size_t sz) :
+		SStringFixed<char, S>(b, sz) { }
+
+	/**
+	 * Create an SStringFixed from a zero-terminated string.
+	 */
+	explicit SDnaStringFixed(
+		const char* b,
+		bool chars = false,
+		bool colors = false) :
+		SStringFixed<char, S>()
+	{
+		if(chars) {
+			if(colors) {
+				installColors(b, strlen(b));
+			} else {
+				installChars(b, strlen(b));
+			}
+		} else {
+			install(b, strlen(b));
+		}
+	}
+
+	virtual ~SDnaStringFixed() { } // C++ needs this
+
+	/**
+	 * Copy 'sz' bytes from buffer 'b' into this string, reverse-
+	 * complementing them in the process, assuming an encoding where
+	 * 0=A, 1=C, 2=G, 3=T, 4=N.  XOR with 3 swaps A<->T and C<->G;
+	 * N (4) is preserved unchanged.
+	 */
+	void installReverseComp(const char* b, size_t sz) {
+		assert_leq(sz, S);
+		for(size_t i = 0; i < sz; i++) {
+			this->cs_[i] = (b[sz-i-1] == 4 ? 4 : b[sz-i-1] ^ 3);
+		}
+		this->len_ = sz;
+	}
+
+	/**
+	 * Copy 'sz' bytes from buffer 'b' into this string, reverse-
+	 * complementing them in the process, assuming an encoding where
+	 * 0=A, 1=C, 2=G, 3=T, 4=N.
+	 */
+	void installReverseComp(const SDnaStringFixed<S>& b) {
+		assert_leq(b.len_, S);
+		for(size_t i = 0; i < b.len_; i++) {
+			this->cs_[i] = (b.cs_[b.len_-i-1] == 4 ? 4 : b.cs_[b.len_-i-1] ^ 3);
+		}
+		this->len_ = b.len_;
+	}
+
+	/**
+	 * Either reverse (when "color" is true, i.e. colorspace data is
+	 * only reversed, not complemented) or reverse-complement this DNA
+	 * buffer in-place.  Complementation is c^3, with N (4) unchanged.
+	 */
+	void reverseComp(bool color = false) {
+		if(color) {
+			this->reverse();
+		} else {
+			// Swap complemented symmetric pairs from the two ends inward
+			for(size_t i = 0; i < (this->len_ >> 1); i++) {
+				char tmp1 = (this->cs_[i] == 4 ? 4 : this->cs_[i] ^ 3);
+				char tmp2 = (this->cs_[this->len_-i-1] == 4 ? 4 : this->cs_[this->len_-i-1] ^ 3);
+				this->cs_[i] = tmp2;
+				this->cs_[this->len_-i-1] = tmp1;
+			}
+			// Do middle element iff there are an odd number
+			if((this->len_ & 1) != 0) {
+				char tmp = this->cs_[this->len_ >> 1];
+				tmp = (tmp == 4 ? 4 : tmp ^ 3);
+				this->cs_[this->len_ >> 1] = tmp;
+			}
+		}
+	}
+
+	/**
+	 * Copy 'sz' bytes from buffer 'b' into this string.
+	 */
+	virtual void install(const char* b, size_t sz) {
+		assert_leq(sz, S);
+		memcpy(this->cs_, b, sz);
+#ifndef NDEBUG
+		for(size_t i = 0; i < sz; i++) {
+			assert_leq(this->cs_[i], 4);
+			assert_geq(this->cs_[i], 0);
+		}
+#endif
+		this->len_ = sz;
+	}
+
+	/**
+	 * Copy buffer 'b' of ASCII DNA characters into normal DNA
+	 * characters.
+	 */
+	virtual void installChars(const char* b, size_t sz) {
+		assert_leq(sz, S);
+		for(size_t i = 0; i < sz; i++) {
+			assert_in(toupper(b[i]), "ACGTN-");
+			this->cs_[i] = asc2dna[(int)b[i]];
+			assert_geq(this->cs_[i], 0);
+			assert_leq(this->cs_[i], 4);
+		}
+		this->len_ = sz;
+	}
+
+	/**
+	 * Copy buffer 'b' of ASCII color characters into normal DNA
+	 * characters.
+	 */
+	virtual void installColors(const char* b, size_t sz) {
+		assert_leq(sz, S);
+		for(size_t i = 0; i < sz; i++) {
+			assert_in(b[i], "0123.");
+			this->cs_[i] = asc2col[(int)b[i]];
+			assert_geq(this->cs_[i], 0);
+			assert_leq(this->cs_[i], 4);
+		}
+		this->len_ = sz;
+	}
+
+	/**
+	 * Copy C++ string of ASCII DNA characters into normal DNA
+	 * characters.
+	 */
+	virtual void installChars(const std::basic_string<char>& str) {
+		installChars(str.c_str(), str.length());
+	}
+
+	/**
+	 * Copy C++ string of ASCII color characters into normal DNA
+	 * characters.
+	 */
+	virtual void installColors(const std::basic_string<char>& str) {
+		installColors(str.c_str(), str.length());
+	}
+
+	/**
+	 * Set DNA character at index 'idx' to 'c'.
+	 */
+	void set(int c, size_t idx) {
+		assert_lt(idx, this->len_);
+		assert_leq(c, 4);
+		assert_geq(c, 0);
+		this->cs_[idx] = c;
+	}
+
+	/**
+	 * Append DNA char c.
+	 */
+	void append(const char& c) {
+		assert_lt(this->len_, S);
+		assert_leq(c, 4);
+		assert_geq(c, 0);
+		this->cs_[this->len_++] = c;
+	}
+
+	/**
+	 * Set DNA character at index 'idx' to 'c'.
+	 */
+	void setChar(char c, size_t idx) {
+		assert_lt(idx, this->len_);
+		assert_in(toupper(c), "ACGTN");
+		this->cs_[idx] = asc2dna[(int)c];
+	}
+
+	/**
+	 * Append DNA character.
+	 */
+	void appendChar(char c) {
+		assert_lt(this->len_, S);
+		assert_in(toupper(c), "ACGTN");
+		this->cs_[this->len_++] = asc2dna[(int)c];
+	}
+
+	/**
+	 * Return DNA character corresponding to element 'idx'.
+	 */
+	char toChar(size_t idx) const {
+		assert_geq((int)this->cs_[idx], 0);
+		assert_leq((int)this->cs_[idx], 4);
+		return "ACGTN"[(int)this->cs_[idx]];
+	}
+
+	/**
+	 * Retrieve constant version of element i.
+	 */
+	const char& operator[](size_t i) const {
+		return this->get(i);
+	}
+
+	/**
+	 * Retrieve constant version of element i.
+	 */
+	const char& get(size_t i) const {
+		assert_lt(i, this->len_);
+		assert_leq(this->cs_[i], 4);
+		assert_geq(this->cs_[i], 0);
+		return this->cs_[i];
+	}
+
+	/**
+	 * Return the ith character in the window defined by fw, color,
+	 * depth and len.
+	 */
+	char windowGetDna(
+		size_t i,
+		bool   fw,
+		bool   color,
+		size_t depth = 0,
+		size_t len = 0) const
+	{
+		if(len == 0) len = this->len_;
+		assert_lt(i, len);
+		assert_leq(len, this->len_ - depth);
+		if(fw) return this->cs_[depth+i];
+		else   return color ? this->cs_[depth+len-i-1] :
+		                      compDna(this->cs_[depth+len-i-1]);
+	}
+
+	/**
+	 * Fill the given DNA buffer with the substring specified by fw,
+	 * color, depth and len.
+	 */
+	void windowGetDna(
+		SDnaStringFixed<S>& buf,
+		bool   fw,
+		bool   color,
+		size_t depth = 0,
+		size_t len = 0) const
+	{
+		if(len == 0) len = this->len_;
+		assert_leq(len, this->len_ - depth);
+		for(size_t i = 0; i < len; i++) {
+			buf.append(fw ? this->cs_[depth+i] :
+			                (color ? this->cs_[depth+len-i-1] :
+			                         compDna(this->cs_[depth+len-i-1])));
+		}
+	}
+
+	/**
+	 * Put a terminator in the 'len_'th element and then return a
+	 * pointer to the buffer.  Useful for printing.
+	 */
+	virtual const char* toZBuf() const { return this->toZBufXForm("ACGTN"); }
+};
+
+/**
+ * Encapsulates a fixed-length DNA string with characters encoded as
+ * chars.  Only capable of encoding A, C, G, T and N.  The length is
+ * specified via the template parameter S.
+ */
+
+template<int S = 1024, int M = 2>
+class SDnaStringExpandable : public SStringExpandable<char, S, M> {
+public:
+
+	explicit SDnaStringExpandable() : SStringExpandable<char, S, M>() { }
+
+	/**
+	 * Create an SStringFixed from another SStringFixed.
+	 */
+	SDnaStringExpandable(const SDnaStringExpandable<S, M>& o) :
+		SStringExpandable<char, S, M>(o) { }
+
+	/**
+	 * Create an SDnaStringExpandable from a C++ basic_string.  When
+	 * 'chars' is true the string holds ASCII characters (nucleotides
+	 * "ACGTN-" or, with 'colors', color codes "0123."); otherwise it
+	 * already holds raw 0-4 codes and is installed verbatim.
+	 */
+	explicit SDnaStringExpandable(
+		const std::basic_string<char>& str,
+		bool chars = false,
+		bool colors = false) :
+		SStringExpandable<char, S, M>()
+	{
+		if(chars) {
+			if(colors) {
+				installColors(str);
+			} else {
+				installChars(str);
+			}
+		} else {
+			// Resolves the old FIXME: there is no install(basic_string)
+			// overload, so forward explicitly to install(const char*,
+			// size_t) instead of throwing std::invalid_argument.
+			install(str.c_str(), str.length());
+		}
+	}
+
+	/**
+	 * Create an SStringFixed from an array and size.
+	 */
+	explicit SDnaStringExpandable(
+		const char* b,
+		size_t sz,
+		bool chars = false,
+		bool colors = false) :
+		SStringExpandable<char, S, M>()
+	{
+		if(chars) {
+			if(colors) {
+				installColors(b, sz);
+			} else {
+				installChars(b, sz);
+			}
+		} else {
+			install(b, sz);
+		}
+	}
+
+	/**
+	 * Create an SStringFixed from a zero-terminated string.
+	 */
+	explicit SDnaStringExpandable(
+		const char* b,
+		bool chars = false,
+		bool colors = false) :
+		SStringExpandable<char, S, M>()
+	{
+		install(b, chars, colors);
+	}
+
+	virtual ~SDnaStringExpandable() { } // C++ needs this
+
+	/**
+	 * Copy 'sz' bytes from buffer 'b' into this string, reverse-
+	 * complementing them in the process, assuming an encoding where
+	 * 0=A, 1=C, 2=G, 3=T, 4=N.
+	 */
+	void installReverseComp(const char* b, size_t sz) {
+		if(this->sz_ < sz) this->expandCopy((sz + S) * M);
+		for(size_t i = 0; i < sz; i++) {
+			this->cs_[i] = (b[sz-i-1] == 4 ? 4 : b[sz-i-1] ^ 3);
+		}
+		this->len_ = sz;
+	}
+
+	/**
+	 * Copy 'sz' bytes from buffer 'b' into this string, reverse-
+	 * complementing them in the process, assuming an encoding where
+	 * 0=A, 1=C, 2=G, 3=T, 4=N.
+	 */
+	void installReverseComp(const SDnaStringExpandable<S, M>& b) {
+		if(this->sz_ < b.len_) this->expandCopy((b.len_ + S) * M);
+		for(size_t i = 0; i < b.len_; i++) {
+			this->cs_[i] = (b.cs_[b.len_-i-1] == 4 ? 4 : b.cs_[b.len_-i-1] ^ 3);
+		}
+		this->len_ = b.len_;
+	}
+
+	/**
+	 * Either reverse or reverse-complement (depending on "color") this
+	 * DNA buffer in-place.
+	 */
+	void reverseComp(bool color = false) {
+		if(color) {
+			this->reverse();
+		} else {
+			for(size_t i = 0; i < (this->len_ >> 1); i++) {
+				char tmp1 = (this->cs_[i] == 4 ? 4 : this->cs_[i] ^ 3);
+				char tmp2 = (this->cs_[this->len_-i-1] == 4 ? 4 : this->cs_[this->len_-i-1] ^ 3);
+				this->cs_[i] = tmp2;
+				this->cs_[this->len_-i-1] = tmp1;
+			}
+			// Do middle element iff there are an odd number
+			if((this->len_ & 1) != 0) {
+				char tmp = this->cs_[this->len_ >> 1];
+				tmp = (tmp == 4 ? 4 : tmp ^ 3);
+				this->cs_[this->len_ >> 1] = tmp;
+			}
+		}
+	}
+
+	/**
+	 * Copy 'sz' bytes from buffer 'b' into this string.
+	 */
+	virtual void install(
+		const char* b,
+		bool chars = false,
+		bool colors = false)
+	{
+		if(chars) {
+			if(colors) {
+				installColors(b, strlen(b));
+			} else {
+				installChars(b, strlen(b));
+			}
+		} else {
+			install(b, strlen(b));
+		}
+	}
+
+	/**
+	 * Copy 'sz' bytes from buffer 'b' into this string.
+	 */
+	virtual void install(const char* b, size_t sz) {
+		if(this->sz_ < sz) this->expandCopy((sz + S) * M);
+		memcpy(this->cs_, b, sz);
+#ifndef NDEBUG
+		for(size_t i = 0; i < sz; i++) {
+			assert_range(0, 4, (int)this->cs_[i]);
+		}
+#endif
+		this->len_ = sz;
+	}
+
+	/**
+	 * Copy buffer 'b' of ASCII DNA characters into normal DNA
+	 * characters.
+	 */
+	virtual void installChars(const char* b, size_t sz) {
+		if(this->sz_ < sz) this->expandCopy((sz + S) * M);
+		for(size_t i = 0; i < sz; i++) {
+			assert_in(toupper(b[i]), "ACGTN-");
+			this->cs_[i] = asc2dna[(int)b[i]];
+			assert_range(0, 4, (int)this->cs_[i]);
+		}
+		this->len_ = sz;
+	}
+
+	/**
+	 * Copy buffer 'b' of ASCII color characters into normal DNA
+	 * characters.
+	 */
+	virtual void installColors(const char* b, size_t sz) {
+		if(this->sz_ < sz) this->expandCopy((sz + S) * M);
+		for(size_t i = 0; i < sz; i++) {
+			assert_in(b[i], "0123.");
+			this->cs_[i] = asc2col[(int)b[i]];
+			assert_range(0, 4, (int)this->cs_[i]);
+		}
+		this->len_ = sz;
+	}
+
+	/**
+	 * Copy C++ string of ASCII DNA characters into normal DNA
+	 * characters.
+	 */
+	virtual void installChars(const std::basic_string<char>& str) {
+		installChars(str.c_str(), str.length());
+	}
+
+	/**
+	 * Copy C++ string of ASCII color characters into normal DNA
+	 * characters.
+	 */
+	virtual void installColors(const std::basic_string<char>& str) {
+		installColors(str.c_str(), str.length());
+	}
+
+	/**
+	 * Set DNA character at index 'idx' to 'c'.
+	 */
+	void set(int c, size_t idx) {
+		assert_lt(idx, this->len_);
+		assert_range(0, 4, c);
+		this->cs_[idx] = c;
+	}
+
+	/**
+	 * Append DNA char c.
+	 */
+	void append(const char& c) {
+		if(this->sz_ < this->len_ + 1) {
+			this->expandCopy((this->len_ + 1 + S) * M);
+		}
+		assert_range(0, 4, (int)c);
+		this->cs_[this->len_++] = c;
+	}
+
+	/**
+	 * Set DNA character at index 'idx' to 'c'.
+	 */
+	void setChar(char c, size_t idx) {
+		assert_lt(idx, this->len_);
+		assert_in(toupper(c), "ACGTN");
+		this->cs_[idx] = asc2dna[(int)c];
+	}
+
+	/**
+	 * Append DNA character.
+	 */
+	void appendChar(char c) {
+		if(this->sz_ < this->len_ + 1) {
+			this->expandCopy((this->len_ + 1 + S) * M);
+		}
+		assert_in(toupper(c), "ACGTN");
+		this->cs_[this->len_++] = asc2dna[(int)c];
+	}
+
+	/**
+	 * Return DNA character corresponding to element 'idx'.
+	 */
+	char toChar(size_t idx) const {
+		assert_range(0, 4, (int)this->cs_[idx]);
+		return "ACGTN"[(int)this->cs_[idx]];
+	}
+//
+//	// call with uint32_t or uint64_t
+//	template<typename T>
+//	T* uint_kmers (size_t begin, size_t end) {
+//		size_t t_size = sizeof(T) * 8;
+//		assert_lt(end, this->len_);
+//		end = min(end, this->len_-1);
+//
+//		// number of kmers: ceiling of len / t_size
+//		size_t n_kmers = ((end-begin) % t_size) ? 
+//			(end-begin) / t_size + 1 : 
+//			(end-begin) / t_size;
+//		T kmers [n_kmers];
+//
+//		// go through _cs in steps of t_size (16 or 32 for uint32_t and uint64_t, resp)
+//		for(size_t i = 0; i <= end; i += t_size) {
+//
+//			// each step gives one word / kmer
+//			T word = 0;
+//			int bp = (int)this->cs_[begin+i+j];
+//			assert_range(0, 3, (int)bp);
+//
+//			// create bitmask, and combine word with new bitmask
+//			T shift = (T)j << 1;
+//			word |= (bp << shift);
+//			if (i % t_size == 0) {
+//				kmers[i] = word;
+//				word
+//			}
+//		}
+//	}
+//
+	//
+	/**
+	 * Advance 'word' to the next kmer: the top two bits fall off the
+	 * left and the 2-bit base code at position 'pos' enters on the
+	 * right.
+	 * @param word current packed kmer
+	 * @param pos  index in cs_ of the incoming base
+	 */
+	template<typename UINT>
+	UINT next_kmer(UINT word, size_t pos) const {
+		return (UINT)((word << 2) | (UINT)this->cs_[pos]);
+	}
+
+	/**
+	 * Pack the bases in [begin, end) into an unsigned integer, two
+	 * bits per base, up to sizeof(UINT)*4 bases.  Codes outside 0-3
+	 * (e.g. N=4) are skipped without shifting, so the resulting kmer
+	 * may effectively cover fewer than k_size positions.
+	 * @param begin start position of kmer
+	 * @param end end position of kmer
+	 */
+	template<typename UINT>
+	UINT int_kmer(size_t begin,size_t end) const {
+		const size_t k_size = sizeof(UINT) * 4;  // size of the kmer, two bits are used per nucleotide
+		assert_leq(end, this->len_);
+
+		UINT word = 0;
+		// go through _cs until end or kmer-size is reached
+		for (size_t j = 0; j < k_size && (size_t)(begin+j) < end; j++) {
+				int bp = (int)this->cs_[begin+j];
+				if (bp < 0 || bp > 3) {
+					// skip non-ACGT bases
+					continue;
+				}
+				// shift the previous bases two bits up the word...
+				word = word << 2;
+				// ...and put the new base code in the low two bits
+				word |= bp;
+		}
+		return (word);
+	}
+
+	/**
+	 * Return all packed 32-mers of the window [begin, begin+len): the
+	 * first via int_kmer, each subsequent one incrementally via
+	 * next_kmer.
+	 *
+	 * Fixes: int_kmer/next_kmer were called with a spurious third
+	 * 'rev' argument (both take two, so instantiation failed to
+	 * compile), the vector was sized 31-len instead of the number of
+	 * 32-mers, and the incremental base position was off.
+	 * @param rev unused; retained for interface compatibility
+	 */
+	vector<uint64_t> get_all_kmers(size_t begin,size_t len,bool rev=false) const {
+		// a window of len bases contains len-31 32-mers (at least one)
+		vector<uint64_t> kmers(len > 32 ? len - 31 : 1);
+		kmers[0] = this->int_kmer<uint64_t>(begin, begin+len);
+		size_t i = begin;
+		size_t j = 1;
+		while (i + 32 < begin + len) {
+			// kmer j starts one base later; its incoming base is at i+32
+			kmers[j] = this->next_kmer(kmers[j-1], i + 32);
+			++i; ++j;
+		}
+		return kmers;
+	}
+
+
+	/**
+	 * Retrieve constant version of element i.
+	 */
+	inline const char& operator[](size_t i) const {
+		return this->get(i);
+	}
+
+	/**
+	 * Retrieve constant version of element i.
+	 */
+	inline const char& get(size_t i) const {
+		assert_lt(i, this->len_);
+		assert_range(0, 4, (int)this->cs_[i]);
+		return this->cs_[i];
+	}
+
+	/**
+	 * Return the ith character in the window defined by fw, color,
+	 * depth and len.
+	 */
+	char windowGetDna(
+		size_t i,
+		bool   fw,
+		bool   color,
+		size_t depth = 0,
+		size_t len = 0) const
+	{
+		if(len == 0) len = this->len_;
+		assert_lt(i, len);
+		assert_leq(len, this->len_ - depth);
+		if(fw) return this->cs_[depth+i];
+		else   return color ? this->cs_[depth+len-i-1] :
+		                      compDna(this->cs_[depth+len-i-1]);
+	}
+
+	/**
+	 * Fill the given DNA buffer with the substring specified by fw,
+	 * color, depth and len.
+	 */
+	void windowGetDna(
+		SDnaStringExpandable<S, M>& buf,
+		bool   fw,
+		bool   color,
+		size_t depth = 0,
+		size_t len = 0) const
+	{
+		if(len == 0) len = this->len_;
+		assert_leq(len, this->len_ - depth);
+		for(size_t i = 0; i < len; i++) {
+			buf.append(fw ? this->cs_[depth+i] :
+			                (color ? this->cs_[depth+len-i-1] :
+			                         compDna(this->cs_[depth+len-i-1])));
+		}
+	}
+
+	/**
+	 * Put a terminator in the 'len_'th element and then return a
+	 * pointer to the buffer, with the 0-4 nucleotide codes rendered
+	 * via the "ACGTN" alphabet.  Useful for printing.
+	 */
+	virtual const char* toZBuf() const { return this->toZBufXForm("ACGTN"); }
+};
+
+/**
+ * Encapsulates an expandable DNA string with characters encoded as
+ * char-sized masks.  Encodes A, C, G, T, and all IUPAC, as well as the
+ * empty mask indicating "matches nothing."
+ */
+template<int S = 16, int M = 2>
+class SDnaMaskString : public SStringExpandable<char, S, M> {
+public:
+
+	explicit SDnaMaskString() : SStringExpandable<char, S, M>() { }
+
+	/**
+	 * Create an SDnaMaskString from another SDnaMaskString.
+	 */
+	SDnaMaskString(const SDnaMaskString<S, M>& o) :
+		SStringExpandable<char, S, M>(o) { }
+
+	/**
+	 * Create an SDnaMaskString from a C++ basic_string.
+	 */
+	explicit SDnaMaskString(const std::basic_string<char>& str) :
+		SStringExpandable<char, S, M>(str) { }
+
+	/**
+	 * Create an SDnaMaskString from an array and size.
+	 */
+	explicit SDnaMaskString(const char* b, size_t sz) :
+		SStringExpandable<char, S, M>(b, sz) { }
+
+	/**
+	 * Create an SDnaMaskString from a zero-terminated string.  When
+	 * 'chars' is true, 'b' holds ASCII IUPAC characters that are
+	 * converted to 4-bit masks; otherwise it already holds mask codes.
+	 */
+	explicit SDnaMaskString(const char* b, bool chars = false) :
+		SStringExpandable<char, S, M>()
+	{
+		if(chars) {
+			installChars(b, strlen(b));
+		} else {
+			install(b, strlen(b));
+		}
+	}
+
+	virtual ~SDnaMaskString() { }
+
+	/**
+	 * Copy 'sz' mask bytes from buffer 'b' into this string,
+	 * reverse-complementing them in the process via the maskcomp
+	 * lookup table.
+	 */
+	void installReverseComp(const char* b, size_t sz) {
+		while(this->sz_ < sz) {
+			this->expandNoCopy((sz + S) * M);
+		}
+		for(size_t i = 0; i < sz; i++) {
+			this->cs_[i] = maskcomp[(int)b[sz-i-1]];
+		}
+		this->len_ = sz;
+	}
+
+	/**
+	 * Copy mask string 'b' into this string, reverse-complementing it
+	 * in the process via the maskcomp lookup table.
+	 */
+	void installReverseComp(const SDnaMaskString<S, M>& b) {
+		while(this->sz_ < b.len_) {
+			this->expandNoCopy((b.len_ + S) * M);
+		}
+		for(size_t i = 0; i < b.len_; i++) {
+			this->cs_[i] = maskcomp[(int)b.cs_[b.len_-i-1]];
+		}
+		this->len_ = b.len_;
+	}
+
+	/**
+	 * Either reverse (color == true) or reverse-complement
+	 * (color == false) this mask string in-place.
+	 */
+	void reverseComp(bool color = false) {
+		if(color) {
+			this->reverse();
+		} else {
+			for(size_t i = 0; i < (this->len_ >> 1); i++) {
+				char tmp1 = maskcomp[(int)this->cs_[i]];
+				char tmp2 = maskcomp[(int)this->cs_[this->len_-i-1]];
+				this->cs_[i] = tmp2;
+				this->cs_[this->len_-i-1] = tmp1;
+			}
+			// Complement the middle element iff there are an odd number;
+			// the swap loop above never touches it.
+			if((this->len_ & 1) != 0) {
+				char tmp = this->cs_[this->len_ >> 1];
+				tmp = maskcomp[(int)tmp];
+				this->cs_[this->len_ >> 1] = tmp;
+			}
+		}
+	}
+
+	/**
+	 * Copy 'sz' bytes from buffer 'b' into this string.  Bytes must
+	 * already be 4-bit mask codes (0-15).
+	 */
+	virtual void install(const char* b, size_t sz) {
+		while(this->sz_ < sz) {
+			this->expandNoCopy((sz + S) * M);
+		}
+		memcpy(this->cs_, b, sz);
+#ifndef NDEBUG
+		for(size_t i = 0; i < sz; i++) {
+			assert_range((int)this->cs_[i], 0, 15);
+		}
+#endif
+		this->len_ = sz;
+	}
+
+	/**
+	 * Copy buffer 'b' of ASCII IUPAC characters into DNA masks.
+	 */
+	virtual void installChars(const char* b, size_t sz) {
+		while(this->sz_ < sz) {
+			this->expandNoCopy((sz + S) * M);
+		}
+		for(size_t i = 0; i < sz; i++) {
+			assert_in(b[i], iupacs);
+			this->cs_[i] = asc2dnamask[(int)b[i]];
+			assert_range((int)this->cs_[i], 0, 15);
+		}
+		this->len_ = sz;
+	}
+
+	/**
+	 * Copy C++ string of ASCII IUPAC characters into DNA masks.
+	 */
+	virtual void installChars(const std::basic_string<char>& str) {
+		installChars(str.c_str(), str.length());
+	}
+
+	/**
+	 * Set mask at index 'idx' to 'c' (must be a 4-bit mask code).
+	 */
+	void set(int c, size_t idx) {
+		assert_lt(idx, this->len_);
+		assert_range(c, 0, 15);
+		this->cs_[idx] = c;
+	}
+
+	/**
+	 * Append mask code c, growing the buffer if needed.
+	 */
+	void append(const char& c) {
+		while(this->sz_ < this->len_+1) {
+			this->expandNoCopy((this->len_ + 1 + S) * M);
+		}
+		assert_range((int)c, 0, 15);
+		this->cs_[this->len_++] = c;
+	}
+
+	/**
+	 * Set element 'idx' from ASCII IUPAC character 'c'.
+	 */
+	void setChar(char c, size_t idx) {
+		assert_lt(idx, this->len_);
+		assert_in(toupper(c), iupacs);
+		this->cs_[idx] = asc2dnamask[(int)c];
+	}
+
+	/**
+	 * Append ASCII IUPAC character 'c' as a mask, growing the buffer
+	 * if needed.
+	 */
+	void appendChar(char c) {
+		while(this->sz_ < this->len_+1) {
+			// 'this->' is required: expandNoCopy is a member of the
+			// dependent base class, so unqualified lookup cannot find it
+			// in a template (every sibling method already qualifies it).
+			this->expandNoCopy((this->len_ + 1 + S) * M);
+		}
+		assert_in(toupper(c), iupacs);
+		this->cs_[this->len_++] = asc2dnamask[(int)c];
+	}
+
+	/**
+	 * Return the ASCII IUPAC character corresponding to element 'idx'.
+	 */
+	char toChar(size_t idx) const {
+		assert_range((int)this->cs_[idx], 0, 15);
+		return mask2iupac[(int)this->cs_[idx]];
+	}
+
+	/**
+	 * Retrieve constant version of element i.
+	 */
+	const char& operator[](size_t i) const {
+		return this->get(i);
+	}
+
+	/**
+	 * Retrieve mutable version of element i.
+	 */
+	char& operator[](size_t i) {
+		return this->get(i);
+	}
+
+	/**
+	 * Retrieve constant version of element i.
+	 */
+	const char& get(size_t i) const {
+		assert_lt(i, this->len_);
+		assert_range((int)this->cs_[i], 0, 15);
+		return this->cs_[i];
+	}
+
+	/**
+	 * Retrieve mutable version of element i.
+	 */
+	char& get(size_t i) {
+		assert_lt(i, this->len_);
+		assert_range((int)this->cs_[i], 0, 15);
+		return this->cs_[i];
+	}
+
+	/**
+	 * Return the ith character in the window defined by fw, color,
+	 * depth and len (len == 0 means "through the end of the string").
+	 */
+	char windowGetDna(
+		size_t i,
+		bool   fw,
+		bool   color,
+		size_t depth = 0,
+		size_t len = 0) const
+	{
+		if(len == 0) len = this->len_;
+		assert_lt(i, len);
+		assert_leq(len, this->len_ - depth);
+		if(fw) return this->cs_[depth+i];
+		else   return color ? this->cs_[depth+len-i-1] :
+		                      maskcomp[(int)this->cs_[depth+len-i-1]];
+	}
+
+	/**
+	 * Fill the given DNA buffer with the substring specified by fw,
+	 * color, depth and len.
+	 * NOTE(review): 'buf' is an SDnaStringFixed whose elements are 0-4
+	 * nucleotide codes, while this method appends 0-15 mask codes into
+	 * it; confirm callers only pass unambiguous masks here.
+	 */
+	void windowGetDna(
+		SDnaStringFixed<S>& buf,
+		bool   fw,
+		bool   color,
+		size_t depth = 0,
+		size_t len = 0) const
+	{
+		if(len == 0) len = this->len_;
+		assert_leq(len, this->len_ - depth);
+		for(size_t i = 0; i < len; i++) {
+			buf.append(fw ? this->cs_[depth+i] :
+			                (color ? this->cs_[depth+len-i-1] :
+			                         maskcomp[(int)this->cs_[depth+len-i-1]]));
+		}
+	}
+
+	/**
+	 * Sample a random substring of the given length from this DNA
+	 * string and install the result in 'dst'.
+	 */
+	template<typename T>
+	void randSubstr(
+		RandomSource& rnd,  // pseudo-random generator
+		T& dst,             // put sampled substring here
+		size_t len,         // length of substring to extract
+		bool watson = true, // true -> possibly extract from Watson strand
+		bool crick = true)  // true -> possibly extract from Crick strand
+	{
+		assert(watson || crick);
+		assert_geq(this->len_, len);
+		size_t poss = this->len_ - len + 1;
+		assert_gt(poss, 0);
+		uint32_t rndoff = (uint32_t)(rnd.nextU32() % poss);
+		bool fw;
+		if     (watson && !crick) fw = true;
+		else if(!watson && crick) fw = false;
+		else {
+			fw = rnd.nextBool();
+		}
+		if(fw) {
+			// Install Watson substring
+			for(size_t i = 0; i < len; i++) {
+				dst[i] = this->cs_[i + rndoff];
+			}
+		} else {
+			// Install Crick substring: walk the sampled window backward,
+			// complementing each mask.  (Fixed: the index previously had a
+			// stray 'i +' term, collapsing it to the constant rndoff+len-1
+			// so every dst[i] received the same base.)
+			for(size_t i = 0; i < len; i++) {
+				dst[i] = maskcomp[(int)this->cs_[rndoff + (len - i - 1)]];
+			}
+		}
+	}
+
+	/**
+	 * Put a terminator in the 'len_'th element and then return a
+	 * pointer to the buffer.  Useful for printing.
+	 */
+	virtual const char* toZBuf() const { return this->toZBufXForm(iupacs); }
+};
+
+typedef SStringExpandable<char, 1024, 2> BTString;    // general character string
+typedef SDnaStringExpandable<1024, 2>    BTDnaString; // DNA string, codes 0=A..4=N
+typedef SDnaMaskString<32, 2>            BTDnaMask;   // DNA string of 4-bit IUPAC masks
+
+#endif /* SSTRING_H_ */
diff --git a/str_util.h b/str_util.h
new file mode 100644
index 0000000..48dae17
--- /dev/null
+++ b/str_util.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef STR_UTIL_H_
+#define STR_UTIL_H_
+
+#include <cstdint>
+#include <string>
+
+/**
+ * Given a string, return an int hash for it.
+ */
+/**
+ * Given a string, return an int hash for it (RS-style multiplicative
+ * hash).  Arithmetic is done on uint32_t so the intended modular
+ * wrap-around is well defined; the previous plain-int multiplications
+ * overflowed, which is undefined behavior in C++.  Results are
+ * bit-identical to the old implementation on two's-complement
+ * platforms.
+ */
+static inline int
+hash_string(const std::string& s) {
+	uint32_t ret = 0;
+	uint32_t a = 63689;
+	const uint32_t b = 378551;
+	for(size_t i = 0; i < s.length(); i++) {
+		ret = (ret * a) + (uint32_t)(int)s[i];
+		if(a == 0) {
+			a += b;
+		} else {
+			a *= b;
+		}
+		// 'a' can wrap to 0 after the multiply; re-seed so later
+		// characters keep contributing to the hash.
+		if(a == 0) {
+			a += b;
+		}
+	}
+	return (int)ret;
+}
+
+#endif /* STR_UTIL_H_ */
diff --git a/taxonomy.h b/taxonomy.h
new file mode 100644
index 0000000..8fe2545
--- /dev/null
+++ b/taxonomy.h
@@ -0,0 +1,338 @@
+/*
+ * taxonomy.h
+ *
+ *  Created on: Feb 10, 2016
+ *      Author: fbreitwieser
+ */
+
+#ifndef TAXONOMY_H_
+#define TAXONOMY_H_
+
+#include<map>
+#include<utility>
+#include<string>
+
+// Taxonomic rank codes.  The first block (RANK_STRAIN..RANK_DOMAIN) is
+// ordered from most specific to least specific; the remaining ranks do
+// not participate in that simple linear ordering.
+enum {
+    RANK_UNKNOWN = 0,
+    RANK_STRAIN,
+    RANK_SPECIES,
+    RANK_GENUS,
+    RANK_FAMILY,
+    RANK_ORDER,
+    RANK_CLASS,
+    RANK_PHYLUM,
+    RANK_KINGDOM,
+    RANK_DOMAIN,
+    // Ranks below are auxiliary NCBI ranks (sub-/super-/infra- levels
+    // and miscellaneous ranks such as tribe and varietas).
+    RANK_FORMA,
+    RANK_INFRA_CLASS,
+    RANK_INFRA_ORDER,
+    RANK_PARV_ORDER,
+    RANK_SUB_CLASS,
+    RANK_SUB_FAMILY,
+    RANK_SUB_GENUS,
+    RANK_SUB_KINGDOM,
+    RANK_SUB_ORDER,
+    RANK_SUB_PHYLUM,
+    RANK_SUB_SPECIES,
+    RANK_SUB_TRIBE,
+    RANK_SUPER_CLASS,
+    RANK_SUPER_FAMILY,
+    RANK_SUPER_KINGDOM,
+    RANK_SUPER_ORDER,
+    RANK_SUPER_PHYLUM,
+    RANK_TRIBE,
+    RANK_VARIETAS,
+    RANK_MAX
+};
+
+extern uint8_t tax_rank_num[RANK_MAX];
+
+// One node of the taxonomy tree: a link to its parent plus its rank.
+struct TaxonomyNode {
+    uint64_t parent_tid; // taxonomic ID of the parent node
+    uint8_t  rank;       // one of the RANK_* constants
+    uint8_t  leaf;       // nonzero iff the node is a leaf (stored as 0/1)
+
+    TaxonomyNode(uint64_t _parent_tid, uint8_t  _rank, uint8_t _leaf):
+    	parent_tid(_parent_tid), rank(_rank), leaf(_leaf) {};
+
+    // Default: orphan node (parent 0) of unknown rank, not a leaf.
+    TaxonomyNode(): parent_tid(0), rank(RANK_UNKNOWN), leaf(false) {};
+};
+
+struct TaxonomyPathTable {
+    // Number of canonical rank slots stored per path:
+    // strain, species, genus, family, order, class, phylum.
+    static const size_t nranks = 7;
+
+    map<uint64_t, uint32_t> tid_to_pid;  // from taxonomic ID to path ID
+    ELList<uint64_t> paths;              // one rank-indexed path per path ID
+
+    /**
+     * Map a taxonomic rank to its slot within a path (0..6), or
+     * uint8_t max if the rank has no dedicated slot.
+     */
+    static uint8_t rank_to_pathID(uint8_t rank) {
+        switch(rank) {
+            case RANK_STRAIN:
+            case RANK_SUB_SPECIES:
+                return 0;
+            case RANK_SPECIES:
+                return 1;
+            case RANK_GENUS:
+                return 2;
+            case RANK_FAMILY:
+                return 3;
+            case RANK_ORDER:
+                return 4;
+            case RANK_CLASS:
+                return 5;
+            case RANK_PHYLUM:
+                return 6;
+            default:
+                return std::numeric_limits<uint8_t>::max();
+        }
+    }
+
+    /**
+     * Build one rank path per distinct taxonomic ID in 'uid_to_tid' by
+     * walking up 'tree' from the ID toward the root, recording the
+     * first taxon seen at each canonical rank.  (Deduplicated: reuses
+     * rank_to_pathID() instead of rebuilding the same mapping in a
+     * local rank_map.)
+     */
+    void buildPaths(const EList<pair<string, uint64_t> >& uid_to_tid,
+                    const std::map<uint64_t, TaxonomyNode>& tree)
+    {
+        tid_to_pid.clear();
+        paths.clear();
+        for(size_t i = 0; i < uid_to_tid.size(); i++) {
+            uint64_t tid = uid_to_tid[i].second;
+            if(tid_to_pid.find(tid) != tid_to_pid.end())
+                continue;  // path already built for this taxon
+            if(tree.find(tid) == tree.end())
+                continue;  // taxon absent from the tree
+            tid_to_pid[tid] = (uint32_t)paths.size();
+            paths.expand();
+            EList<uint64_t>& path = paths.back();
+            path.resizeExact(nranks);
+            path.fillZero();
+            bool first = true;
+            while(true) {
+                std::map<uint64_t, TaxonomyNode>::const_iterator itr = tree.find(tid);
+                if(itr == tree.end()) {
+                    break;
+                }
+                const TaxonomyNode& node = itr->second;
+                // An unranked starting taxon is treated as a strain;
+                // ranks without a slot leave 'rank' at max (no store).
+                uint32_t rank = std::numeric_limits<uint32_t>::max();
+                if(first && node.rank == RANK_UNKNOWN) {
+                    rank = rank_to_pathID(RANK_STRAIN);
+                } else {
+                    uint8_t pid = rank_to_pathID(node.rank);
+                    if(pid != std::numeric_limits<uint8_t>::max()) {
+                        rank = pid;
+                    }
+                }
+                if(rank < path.size() && path[rank] == 0) {
+                    path[rank] = tid;
+                }
+
+                first = false;
+                if(node.parent_tid == tid) {
+                    break;  // reached the root (self-parenting node)
+                }
+                tid = node.parent_tid;
+            }
+        }
+    }
+
+    /**
+     * Copy the precomputed path for taxon 'tid' into 'path'; clear
+     * 'path' if no path was built for that taxon.
+     */
+    void getPath(uint64_t tid, EList<uint64_t>& path) const {
+        map<uint64_t, uint32_t>::const_iterator itr = tid_to_pid.find(tid);
+        if(itr != tid_to_pid.end()) {
+            uint32_t pid = itr->second;
+            assert_lt(pid, paths.size());
+            path = paths[pid];
+        } else {
+            path.clear();
+        }
+    }
+};
+
+typedef std::map<uint64_t, TaxonomyNode> TaxonomyTree;
+
+/**
+ * Populate the global tax_rank_num table, which collapses every RANK_*
+ * constant onto a small "level" number (0 = most specific).  Ranks on
+ * the same line share a level; e.g. sub-, regular and super-family all
+ * count as the family level.
+ */
+inline static void initial_tax_rank_num() {
+    // Level 0: strain / below species
+    tax_rank_num[RANK_SUB_SPECIES]   = 0;
+    tax_rank_num[RANK_STRAIN]        = 0;
+    // Level 1: species
+    tax_rank_num[RANK_SPECIES]       = 1;
+    // Level 2: genus
+    tax_rank_num[RANK_SUB_GENUS]     = 2;
+    tax_rank_num[RANK_GENUS]         = 2;
+    // Level 3: family
+    tax_rank_num[RANK_SUB_FAMILY]    = 3;
+    tax_rank_num[RANK_FAMILY]        = 3;
+    tax_rank_num[RANK_SUPER_FAMILY]  = 3;
+    // Level 4: order
+    tax_rank_num[RANK_SUB_ORDER]     = 4;
+    tax_rank_num[RANK_INFRA_ORDER]   = 4;
+    tax_rank_num[RANK_PARV_ORDER]    = 4;
+    tax_rank_num[RANK_ORDER]         = 4;
+    tax_rank_num[RANK_SUPER_ORDER]   = 4;
+    // Level 5: class
+    tax_rank_num[RANK_INFRA_CLASS]   = 5;
+    tax_rank_num[RANK_SUB_CLASS]     = 5;
+    tax_rank_num[RANK_CLASS]         = 5;
+    tax_rank_num[RANK_SUPER_CLASS]   = 5;
+    // Level 6: phylum
+    tax_rank_num[RANK_SUB_PHYLUM]    = 6;
+    tax_rank_num[RANK_PHYLUM]        = 6;
+    tax_rank_num[RANK_SUPER_PHYLUM]  = 6;
+    // Level 7: kingdom
+    tax_rank_num[RANK_SUB_KINGDOM]   = 7;
+    tax_rank_num[RANK_KINGDOM]       = 7;
+    tax_rank_num[RANK_SUPER_KINGDOM] = 7;
+    // Level 8: domain and everything outside the main lineage
+    tax_rank_num[RANK_DOMAIN]        = 8;
+    tax_rank_num[RANK_FORMA]         = 8;
+    tax_rank_num[RANK_SUB_TRIBE]     = 8;
+    tax_rank_num[RANK_TRIBE]         = 8;
+    tax_rank_num[RANK_VARIETAS]      = 8;
+    tax_rank_num[RANK_UNKNOWN]       = 8;
+}
+
+/**
+ * Translate a RANK_* constant into its lowercase display name.
+ * Ranks without a dedicated name — including RANK_UNKNOWN and
+ * RANK_DOMAIN — are reported as "no rank".  Cases are grouped by
+ * lineage level, from most to least specific.
+ */
+inline static const char* get_tax_rank_string(uint8_t rank) {
+    switch(rank) {
+        case RANK_STRAIN:        return "strain";
+        case RANK_SUB_SPECIES:   return "subspecies";
+        case RANK_SPECIES:       return "species";
+        case RANK_SUB_GENUS:     return "subgenus";
+        case RANK_GENUS:         return "genus";
+        case RANK_SUB_FAMILY:    return "subfamily";
+        case RANK_FAMILY:        return "family";
+        case RANK_SUPER_FAMILY:  return "superfamily";
+        case RANK_SUB_ORDER:     return "suborder";
+        case RANK_INFRA_ORDER:   return "infraorder";
+        case RANK_PARV_ORDER:    return "parvorder";
+        case RANK_ORDER:         return "order";
+        case RANK_SUPER_ORDER:   return "superorder";
+        case RANK_INFRA_CLASS:   return "infraclass";
+        case RANK_SUB_CLASS:     return "subclass";
+        case RANK_CLASS:         return "class";
+        case RANK_SUPER_CLASS:   return "superclass";
+        case RANK_SUB_PHYLUM:    return "subphylum";
+        case RANK_PHYLUM:        return "phylum";
+        case RANK_SUPER_PHYLUM:  return "superphylum";
+        case RANK_SUB_KINGDOM:   return "subkingdom";
+        case RANK_KINGDOM:       return "kingdom";
+        case RANK_SUPER_KINGDOM: return "superkingdom";
+        case RANK_FORMA:         return "forma";
+        case RANK_SUB_TRIBE:     return "subtribe";
+        case RANK_TRIBE:         return "tribe";
+        case RANK_VARIETAS:      return "varietas";
+        default:                 return "no rank";
+    };
+}
+
+/**
+ * Inverse of get_tax_rank_string(): translate a rank name into its
+ * RANK_* constant.  Unrecognized names map to RANK_UNKNOWN.
+ */
+inline static uint8_t get_tax_rank_id(const char* rank) {
+    static const struct {
+        const char* name;
+        uint8_t     id;
+    } rank_table[] = {
+        { "strain",       RANK_STRAIN },
+        { "species",      RANK_SPECIES },
+        { "genus",        RANK_GENUS },
+        { "family",       RANK_FAMILY },
+        { "order",        RANK_ORDER },
+        { "class",        RANK_CLASS },
+        { "phylum",       RANK_PHYLUM },
+        { "kingdom",      RANK_KINGDOM },
+        { "forma",        RANK_FORMA },
+        { "infraclass",   RANK_INFRA_CLASS },
+        { "infraorder",   RANK_INFRA_ORDER },
+        { "parvorder",    RANK_PARV_ORDER },
+        { "subclass",     RANK_SUB_CLASS },
+        { "subfamily",    RANK_SUB_FAMILY },
+        { "subgenus",     RANK_SUB_GENUS },
+        { "subkingdom",   RANK_SUB_KINGDOM },
+        { "suborder",     RANK_SUB_ORDER },
+        { "subphylum",    RANK_SUB_PHYLUM },
+        { "subspecies",   RANK_SUB_SPECIES },
+        { "subtribe",     RANK_SUB_TRIBE },
+        { "superclass",   RANK_SUPER_CLASS },
+        { "superfamily",  RANK_SUPER_FAMILY },
+        { "superkingdom", RANK_SUPER_KINGDOM },
+        { "superorder",   RANK_SUPER_ORDER },
+        { "superphylum",  RANK_SUPER_PHYLUM },
+        { "tribe",        RANK_TRIBE },
+        { "varietas",     RANK_VARIETAS },
+    };
+    for(size_t i = 0; i < sizeof(rank_table) / sizeof(rank_table[0]); i++) {
+        if(strcmp(rank, rank_table[i].name) == 0) {
+            return rank_table[i].id;
+        }
+    }
+    return RANK_UNKNOWN;
+}
+
+/**
+ * Walk up the taxonomy tree from 'taxid' and return the ancestor whose
+ * rank equals 'at_rank', or 0 if none is found before the walk passes
+ * that rank, hits the root, or leaves the tree.
+ * NOTE: the early-out 'node.rank > at_rank' relies on the numeric
+ * ordering of the RANK_* constants for the major lineage ranks.
+ */
+inline static uint64_t get_taxid_at_parent_rank(const TaxonomyTree& tree, uint64_t taxid, uint8_t at_rank) {
+	TaxonomyTree::const_iterator itr = tree.find(taxid);
+	while(itr != tree.end()) {
+		const TaxonomyNode& node = itr->second;
+		if(node.rank == at_rank) {
+			return taxid;               // found the requested rank
+		}
+		if(node.rank > at_rank || node.parent_tid == taxid) {
+			return 0;                   // overshot the rank, or at the root
+		}
+		taxid = node.parent_tid;
+		itr = tree.find(taxid);
+	}
+	return 0;
+}
+
+/**
+ * Parse a taxonomy file of "tid | parent_tid | rank" lines into a
+ * TaxonomyTree.  Lines that are empty or start with '#' are skipped;
+ * a duplicate tid keeps its first entry and emits a warning.  Throws 1
+ * if the file cannot be opened.
+ *
+ * Fixed: the previous 'while(!eof())' loop spun forever when getline()
+ * set failbit (e.g. a line of 1024+ characters), because failbit blocks
+ * all further reads while eof is never reached.  Using the stream state
+ * as the loop condition terminates cleanly in that case.
+ */
+inline static TaxonomyTree read_taxonomy_tree(string taxonomy_fname) {
+	TaxonomyTree tree;
+	ifstream taxonomy_file(taxonomy_fname.c_str(), ios::in);
+	if(!taxonomy_file.is_open()) {
+		cerr << "Error: " << taxonomy_fname << " doesn't exist!" << endl;
+		throw 1;
+	}
+	char line[1024];
+	while(taxonomy_file.getline(line, sizeof(line))) {
+		if(line[0] == 0 || line[0] == '#') continue;
+		istringstream cline(line);
+		uint64_t tid, parent_tid;
+		char dummy; string rank_string;
+		cline >> tid >> dummy >> parent_tid >> dummy >> rank_string;
+		if(tree.find(tid) != tree.end()) {
+			cerr << "Warning: " << tid << " already has a parent!" << endl;
+			continue;
+		}
+
+		tree[tid] = TaxonomyNode(parent_tid, get_tax_rank_id(rank_string.c_str()), false);
+	}
+	taxonomy_file.close();
+	return tree;
+}
+
+
+#endif /* TAXONOMY_H_ */
diff --git a/third_party/MurmurHash3.cpp b/third_party/MurmurHash3.cpp
new file mode 100644
index 0000000..aa7982d
--- /dev/null
+++ b/third_party/MurmurHash3.cpp
@@ -0,0 +1,335 @@
+//-----------------------------------------------------------------------------
+// MurmurHash3 was written by Austin Appleby, and is placed in the public
+// domain. The author hereby disclaims copyright to this source code.
+
+// Note - The x86 and x64 versions do _not_ produce the same results, as the
+// algorithms are optimized for their respective platforms. You can still
+// compile and run any of them on any platform, but your performance with the
+// non-native version will be less than optimal.
+
+#include "MurmurHash3.h"
+
+//-----------------------------------------------------------------------------
+// Platform-specific functions and macros
+
+// Microsoft Visual Studio
+
+#if defined(_MSC_VER)
+
+#define FORCE_INLINE	__forceinline
+
+#include <stdlib.h>
+
+#define ROTL32(x,y)	_rotl(x,y)
+#define ROTL64(x,y)	_rotl64(x,y)
+
+#define BIG_CONSTANT(x) (x)
+
+// Other compilers
+
+#else	// defined(_MSC_VER)
+
+#define	FORCE_INLINE inline __attribute__((always_inline))
+
+// Portable rotate-left of a 32-bit value; compilers recognize this
+// pattern and emit a single rotate instruction.
+inline uint32_t rotl32 ( uint32_t x, int8_t r )
+{
+  return (x << r) | (x >> (32 - r));
+}
+
+// Portable rotate-left of a 64-bit value; compilers recognize this
+// pattern and emit a single rotate instruction.
+inline uint64_t rotl64 ( uint64_t x, int8_t r )
+{
+  return (x << r) | (x >> (64 - r));
+}
+
+#define	ROTL32(x,y)	rotl32(x,y)
+#define ROTL64(x,y)	rotl64(x,y)
+
+#define BIG_CONSTANT(x) (x##LLU)
+
+#endif // !defined(_MSC_VER)
+
+//-----------------------------------------------------------------------------
+// Block read - if your platform needs to do endian-swapping or can only
+// handle aligned reads, do the conversion here
+
+// Read 32-bit block i relative to p.  This is the hook where
+// endian-swapping or unaligned-read handling would be added on
+// platforms that require it.
+FORCE_INLINE uint32_t getblock32 ( const uint32_t * p, int i )
+{
+  return p[i];
+}
+
+// Read 64-bit block i relative to p (see getblock32 for the
+// endianness/alignment note).
+FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, int i )
+{
+  return p[i];
+}
+
+//-----------------------------------------------------------------------------
+// Finalization mix - force all bits of a hash block to avalanche
+
+// 32-bit finalization mix: xorshift/multiply rounds that force every
+// input bit to avalanche across the whole word.
+FORCE_INLINE uint32_t fmix32 ( uint32_t h )
+{
+  h ^= h >> 16;
+  h *= 0x85ebca6b;
+  h ^= h >> 13;
+  h *= 0xc2b2ae35;
+  h ^= h >> 16;
+
+  return h;
+}
+
+//----------
+
+// 64-bit finalization mix; same avalanche construction as fmix32 with
+// 64-bit constants.
+FORCE_INLINE uint64_t fmix64 ( uint64_t k )
+{
+  k ^= k >> 33;
+  k *= BIG_CONSTANT(0xff51afd7ed558ccd);
+  k ^= k >> 33;
+  k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53);
+  k ^= k >> 33;
+
+  return k;
+}
+
+//-----------------------------------------------------------------------------
+
+// 32-bit MurmurHash3 tuned for x86.  Hashes 'len' bytes at 'key' with
+// the given seed and writes one uint32_t to 'out'.
+void MurmurHash3_x86_32 ( const void * key, int len,
+                          uint32_t seed, void * out )
+{
+  const uint8_t * data = (const uint8_t*)key;
+  const int nblocks = len / 4;
+
+  uint32_t h1 = seed;
+
+  const uint32_t c1 = 0xcc9e2d51;
+  const uint32_t c2 = 0x1b873593;
+
+  //----------
+  // body: process the input four bytes at a time
+
+  const uint32_t * blocks = (const uint32_t *)(data + nblocks*4);
+
+  for(int i = -nblocks; i; i++)
+  {
+    uint32_t k1 = getblock32(blocks,i);
+
+    k1 *= c1;
+    k1 = ROTL32(k1,15);
+    k1 *= c2;
+    
+    h1 ^= k1;
+    h1 = ROTL32(h1,13); 
+    h1 = h1*5+0xe6546b64;
+  }
+
+  //----------
+  // tail: mix in the last 1-3 bytes (case fall-through is intentional)
+
+  const uint8_t * tail = (const uint8_t*)(data + nblocks*4);
+
+  uint32_t k1 = 0;
+
+  switch(len & 3)
+  {
+  case 3: k1 ^= tail[2] << 16;
+  case 2: k1 ^= tail[1] << 8;
+  case 1: k1 ^= tail[0];
+          k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
+  };
+
+  //----------
+  // finalization
+
+  h1 ^= len;
+
+  h1 = fmix32(h1);
+
+  *(uint32_t*)out = h1;
+} 
+
+//-----------------------------------------------------------------------------
+
+// 128-bit MurmurHash3 tuned for x86: four interleaved 32-bit lanes.
+// Hashes 'len' bytes at 'key' with the given seed and writes four
+// uint32_t values (16 bytes) to 'out'.
+void MurmurHash3_x86_128 ( const void * key, const int len,
+                           uint32_t seed, void * out )
+{
+  const uint8_t * data = (const uint8_t*)key;
+  const int nblocks = len / 16;
+
+  uint32_t h1 = seed;
+  uint32_t h2 = seed;
+  uint32_t h3 = seed;
+  uint32_t h4 = seed;
+
+  const uint32_t c1 = 0x239b961b; 
+  const uint32_t c2 = 0xab0e9789;
+  const uint32_t c3 = 0x38b34ae5; 
+  const uint32_t c4 = 0xa1e38b93;
+
+  //----------
+  // body: process the input sixteen bytes (four lanes) at a time
+
+  const uint32_t * blocks = (const uint32_t *)(data + nblocks*16);
+
+  for(int i = -nblocks; i; i++)
+  {
+    uint32_t k1 = getblock32(blocks,i*4+0);
+    uint32_t k2 = getblock32(blocks,i*4+1);
+    uint32_t k3 = getblock32(blocks,i*4+2);
+    uint32_t k4 = getblock32(blocks,i*4+3);
+
+    k1 *= c1; k1  = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
+
+    h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b;
+
+    k2 *= c2; k2  = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
+
+    h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747;
+
+    k3 *= c3; k3  = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
+
+    h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35;
+
+    k4 *= c4; k4  = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
+
+    h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17;
+  }
+
+  //----------
+  // tail: mix in the last 1-15 bytes (case fall-through is intentional)
+
+  const uint8_t * tail = (const uint8_t*)(data + nblocks*16);
+
+  uint32_t k1 = 0;
+  uint32_t k2 = 0;
+  uint32_t k3 = 0;
+  uint32_t k4 = 0;
+
+  switch(len & 15)
+  {
+  case 15: k4 ^= tail[14] << 16;
+  case 14: k4 ^= tail[13] << 8;
+  case 13: k4 ^= tail[12] << 0;
+           k4 *= c4; k4  = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
+
+  case 12: k3 ^= tail[11] << 24;
+  case 11: k3 ^= tail[10] << 16;
+  case 10: k3 ^= tail[ 9] << 8;
+  case  9: k3 ^= tail[ 8] << 0;
+           k3 *= c3; k3  = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
+
+  case  8: k2 ^= tail[ 7] << 24;
+  case  7: k2 ^= tail[ 6] << 16;
+  case  6: k2 ^= tail[ 5] << 8;
+  case  5: k2 ^= tail[ 4] << 0;
+           k2 *= c2; k2  = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
+
+  case  4: k1 ^= tail[ 3] << 24;
+  case  3: k1 ^= tail[ 2] << 16;
+  case  2: k1 ^= tail[ 1] << 8;
+  case  1: k1 ^= tail[ 0] << 0;
+           k1 *= c1; k1  = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
+  };
+
+  //----------
+  // finalization: cross-mix the four lanes, avalanche, then cross-mix again
+
+  h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len;
+
+  h1 += h2; h1 += h3; h1 += h4;
+  h2 += h1; h3 += h1; h4 += h1;
+
+  h1 = fmix32(h1);
+  h2 = fmix32(h2);
+  h3 = fmix32(h3);
+  h4 = fmix32(h4);
+
+  h1 += h2; h1 += h3; h1 += h4;
+  h2 += h1; h3 += h1; h4 += h1;
+
+  ((uint32_t*)out)[0] = h1;
+  ((uint32_t*)out)[1] = h2;
+  ((uint32_t*)out)[2] = h3;
+  ((uint32_t*)out)[3] = h4;
+}
+
+//-----------------------------------------------------------------------------
+
+// 128-bit MurmurHash3 tuned for x64: two interleaved 64-bit lanes.
+// Hashes 'len' bytes at 'key' with the given seed and writes two
+// uint64_t values (16 bytes) to 'out'.  Note: produces different
+// output than the x86 variants by design.
+void MurmurHash3_x64_128 ( const void * key, const int len,
+                           const uint32_t seed, void * out )
+{
+  const uint8_t * data = (const uint8_t*)key;
+  const int nblocks = len / 16;
+
+  uint64_t h1 = seed;
+  uint64_t h2 = seed;
+
+  const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
+  const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);
+
+  //----------
+  // body: process the input sixteen bytes (two lanes) at a time
+
+  const uint64_t * blocks = (const uint64_t *)(data);
+
+  for(int i = 0; i < nblocks; i++)
+  {
+    uint64_t k1 = getblock64(blocks,i*2+0);
+    uint64_t k2 = getblock64(blocks,i*2+1);
+
+    k1 *= c1; k1  = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
+
+    h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729;
+
+    k2 *= c2; k2  = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
+
+    h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
+  }
+
+  //----------
+  // tail: mix in the last 1-15 bytes (case fall-through is intentional)
+
+  const uint8_t * tail = (const uint8_t*)(data + nblocks*16);
+
+  uint64_t k1 = 0;
+  uint64_t k2 = 0;
+
+  switch(len & 15)
+  {
+  case 15: k2 ^= ((uint64_t)tail[14]) << 48;
+  case 14: k2 ^= ((uint64_t)tail[13]) << 40;
+  case 13: k2 ^= ((uint64_t)tail[12]) << 32;
+  case 12: k2 ^= ((uint64_t)tail[11]) << 24;
+  case 11: k2 ^= ((uint64_t)tail[10]) << 16;
+  case 10: k2 ^= ((uint64_t)tail[ 9]) << 8;
+  case  9: k2 ^= ((uint64_t)tail[ 8]) << 0;
+           k2 *= c2; k2  = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
+
+  case  8: k1 ^= ((uint64_t)tail[ 7]) << 56;
+  case  7: k1 ^= ((uint64_t)tail[ 6]) << 48;
+  case  6: k1 ^= ((uint64_t)tail[ 5]) << 40;
+  case  5: k1 ^= ((uint64_t)tail[ 4]) << 32;
+  case  4: k1 ^= ((uint64_t)tail[ 3]) << 24;
+  case  3: k1 ^= ((uint64_t)tail[ 2]) << 16;
+  case  2: k1 ^= ((uint64_t)tail[ 1]) << 8;
+  case  1: k1 ^= ((uint64_t)tail[ 0]) << 0;
+           k1 *= c1; k1  = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
+  };
+
+  //----------
+  // finalization: cross-mix the two lanes, avalanche, cross-mix again
+
+  h1 ^= len; h2 ^= len;
+
+  h1 += h2;
+  h2 += h1;
+
+  h1 = fmix64(h1);
+  h2 = fmix64(h2);
+
+  h1 += h2;
+  h2 += h1;
+
+  ((uint64_t*)out)[0] = h1;
+  ((uint64_t*)out)[1] = h2;
+}
+
+//-----------------------------------------------------------------------------
+
diff --git a/third_party/MurmurHash3.h b/third_party/MurmurHash3.h
new file mode 100644
index 0000000..e1c6d34
--- /dev/null
+++ b/third_party/MurmurHash3.h
@@ -0,0 +1,37 @@
+//-----------------------------------------------------------------------------
+// MurmurHash3 was written by Austin Appleby, and is placed in the public
+// domain. The author hereby disclaims copyright to this source code.
+
+#ifndef _MURMURHASH3_H_
+#define _MURMURHASH3_H_
+
+//-----------------------------------------------------------------------------
+// Platform-specific functions and macros
+
+// Microsoft Visual Studio
+
+#if defined(_MSC_VER) && (_MSC_VER < 1600)
+
+typedef unsigned char uint8_t;
+typedef unsigned int uint32_t;
+typedef unsigned __int64 uint64_t;
+
+// Other compilers
+
+#else	// defined(_MSC_VER)
+
+#include <stdint.h>
+
+#endif // !defined(_MSC_VER)
+
+//-----------------------------------------------------------------------------
+
+void MurmurHash3_x86_32  ( const void * key, int len, uint32_t seed, void * out );
+
+void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out );
+
+void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out );
+
+//-----------------------------------------------------------------------------
+
+#endif // _MURMURHASH3_H_
diff --git a/third_party/cpuid.h b/third_party/cpuid.h
new file mode 100644
index 0000000..6a9688f
--- /dev/null
+++ b/third_party/cpuid.h
@@ -0,0 +1,187 @@
+/*
+ * Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc.
+ *
+ * This file is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 3, or (at your option) any
+ * later version.
+ * 
+ * This file is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * Under Section 7 of GPL version 3, you are granted additional
+ * permissions described in the GCC Runtime Library Exception, version
+ * 3.1, as published by the Free Software Foundation.
+ * 
+ * You should have received a copy of the GNU General Public License and
+ * a copy of the GCC Runtime Library Exception along with this program;
+ * see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+
+/* %ecx */
+#define bit_SSE3	(1 << 0)
+#define bit_PCLMUL	(1 << 1)
+#define bit_SSSE3	(1 << 9)
+#define bit_FMA		(1 << 12)
+#define bit_CMPXCHG16B	(1 << 13)
+#define bit_SSE4_1	(1 << 19)
+#define bit_SSE4_2	(1 << 20)
+#define bit_MOVBE	(1 << 22)
+#define bit_POPCNT	(1 << 23)
+#define bit_AES		(1 << 25)
+#define bit_XSAVE	(1 << 26)
+#define bit_OSXSAVE	(1 << 27)
+#define bit_AVX		(1 << 28)
+#define bit_F16C	(1 << 29)
+#define bit_RDRND	(1 << 30)
+
+/* %edx */
+#define bit_CMPXCHG8B	(1 << 8)
+#define bit_CMOV	(1 << 15)
+#define bit_MMX		(1 << 23)
+#define bit_FXSAVE	(1 << 24)
+#define bit_SSE		(1 << 25)
+#define bit_SSE2	(1 << 26)
+
+/* Extended Features */
+/* %ecx */
+#define bit_LAHF_LM	(1 << 0)
+#define bit_ABM		(1 << 5)
+#define bit_SSE4a	(1 << 6)
+#define bit_XOP         (1 << 11)
+#define bit_LWP 	(1 << 15)
+#define bit_FMA4        (1 << 16)
+#define bit_TBM         (1 << 21)
+
+/* %edx */
+#define bit_LM		(1 << 29)
+#define bit_3DNOWP	(1 << 30)
+#define bit_3DNOW	(1 << 31)
+
+/* Extended Features (%eax == 7) */
+#define bit_FSGSBASE	(1 << 0)
+#define bit_BMI		(1 << 3)
+
+#if defined(__i386__) && defined(__PIC__)
+/* %ebx may be the PIC register.  */
+#if __GNUC__ >= 3
+#define __cpuid(level, a, b, c, d)			\
+  __asm__ ("xchg{l}\t{%%}ebx, %1\n\t"			\
+	   "cpuid\n\t"					\
+	   "xchg{l}\t{%%}ebx, %1\n\t"			\
+	   : "=a" (a), "=r" (b), "=c" (c), "=d" (d)	\
+	   : "0" (level))
+
+#define __cpuid_count(level, count, a, b, c, d)		\
+  __asm__ ("xchg{l}\t{%%}ebx, %1\n\t"			\
+	   "cpuid\n\t"					\
+	   "xchg{l}\t{%%}ebx, %1\n\t"			\
+	   : "=a" (a), "=r" (b), "=c" (c), "=d" (d)	\
+	   : "0" (level), "2" (count))
+#else
+/* Host GCCs older than 3.0 weren't supporting Intel asm syntax
+   nor alternatives in i386 code.  */
+#define __cpuid(level, a, b, c, d)			\
+  __asm__ ("xchgl\t%%ebx, %1\n\t"			\
+	   "cpuid\n\t"					\
+	   "xchgl\t%%ebx, %1\n\t"			\
+	   : "=a" (a), "=r" (b), "=c" (c), "=d" (d)	\
+	   : "0" (level))
+
+#define __cpuid_count(level, count, a, b, c, d)		\
+  __asm__ ("xchgl\t%%ebx, %1\n\t"			\
+	   "cpuid\n\t"					\
+	   "xchgl\t%%ebx, %1\n\t"			\
+	   : "=a" (a), "=r" (b), "=c" (c), "=d" (d)	\
+	   : "0" (level), "2" (count))
+#endif
+#else
+#define __cpuid(level, a, b, c, d)			\
+  __asm__ ("cpuid\n\t"					\
+	   : "=a" (a), "=b" (b), "=c" (c), "=d" (d)	\
+	   : "0" (level))
+
+#define __cpuid_count(level, count, a, b, c, d)		\
+  __asm__ ("cpuid\n\t"					\
+	   : "=a" (a), "=b" (b), "=c" (c), "=d" (d)	\
+	   : "0" (level), "2" (count))
+#endif
+
+/* Return highest supported input value for cpuid instruction.  ext can
+   be either 0x0 or 0x8000000 to return highest supported value for
+   basic or extended cpuid information.  Function returns 0 if cpuid
+   is not supported or whatever cpuid returns in eax register.  If sig
+   pointer is non-null, then first four bytes of the signature
+   (as found in ebx register) are returned in location pointed by sig.  */
+
+static __inline unsigned int
+__get_cpuid_max (unsigned int __ext, unsigned int *__sig)
+{
+  unsigned int __eax, __ebx, __ecx, __edx;
+
+#ifndef __x86_64__
+#if __GNUC__ >= 3
+  /* See if we can use cpuid.  On AMD64 we always can.  */
+  __asm__ ("pushf{l|d}\n\t"
+	   "pushf{l|d}\n\t"
+	   "pop{l}\t%0\n\t"
+	   "mov{l}\t{%0, %1|%1, %0}\n\t"
+	   "xor{l}\t{%2, %0|%0, %2}\n\t"
+	   "push{l}\t%0\n\t"
+	   "popf{l|d}\n\t"
+	   "pushf{l|d}\n\t"
+	   "pop{l}\t%0\n\t"
+	   "popf{l|d}\n\t"
+	   : "=&r" (__eax), "=&r" (__ebx)
+	   : "i" (0x00200000));
+#else
+/* Host GCCs older than 3.0 weren't supporting Intel asm syntax
+   nor alternatives in i386 code.  */
+  __asm__ ("pushfl\n\t"
+	   "pushfl\n\t"
+	   "popl\t%0\n\t"
+	   "movl\t%0, %1\n\t"
+	   "xorl\t%2, %0\n\t"
+	   "pushl\t%0\n\t"
+	   "popfl\n\t"
+	   "pushfl\n\t"
+	   "popl\t%0\n\t"
+	   "popfl\n\t"
+	   : "=&r" (__eax), "=&r" (__ebx)
+	   : "i" (0x00200000));
+#endif
+
+  if (!((__eax ^ __ebx) & 0x00200000))
+    return 0;
+#endif
+
+  /* Host supports cpuid.  Return highest supported cpuid input value.  */
+  __cpuid (__ext, __eax, __ebx, __ecx, __edx);
+
+  if (__sig)
+    *__sig = __ebx;
+
+  return __eax;
+}
+
+/* Return cpuid data for requested cpuid level, as found in returned
+   eax, ebx, ecx and edx registers.  The function checks if cpuid is
+   supported and returns 1 for valid cpuid information or 0 for
+   unsupported cpuid level.  All pointers are required to be non-null.  */
+
+static __inline int
+__get_cpuid (unsigned int __level,
+	     unsigned int *__eax, unsigned int *__ebx,
+	     unsigned int *__ecx, unsigned int *__edx)
+{
+  unsigned int __ext = __level & 0x80000000;
+
+  if (__get_cpuid_max (__ext, 0) < __level)
+    return 0;
+
+  __cpuid (__level, *__eax, *__ebx, *__ecx, *__edx);
+  return 1;
+}
diff --git a/threading.h b/threading.h
new file mode 100644
index 0000000..fca4086
--- /dev/null
+++ b/threading.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef THREADING_H_
+#define THREADING_H_
+
+#include <iostream>
+#include "tinythread.h"
+#include "fast_mutex.h"
+
+#ifdef NO_SPINLOCK
+#   define MUTEX_T tthread::mutex
+#else
+#  	define MUTEX_T tthread::fast_mutex
+#endif /* NO_SPINLOCK */
+
+
+/**
+ * Wrap a lock; obtain lock upon construction, release upon destruction.
+ */
+class ThreadSafe {
+public:
+    ThreadSafe(MUTEX_T* ptr_mutex, bool locked = true) {
+		if(locked) {
+		    this->ptr_mutex = ptr_mutex;
+		    ptr_mutex->lock();
+		}
+		else
+		    this->ptr_mutex = NULL;
+	}
+
+	~ThreadSafe() {
+	    if (ptr_mutex != NULL)
+	        ptr_mutex->unlock();
+	}
+    
+private:
+	MUTEX_T *ptr_mutex;
+};
+
+#endif
diff --git a/timer.h b/timer.h
new file mode 100644
index 0000000..5d0c844
--- /dev/null
+++ b/timer.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef TIMER_H_
+#define TIMER_H_
+
+#include <ctime>
+#include <iostream>
+#include <sstream>
+#include <iomanip>
+
+using namespace std;
+
+/**
+ * Use time() call to keep track of elapsed time between creation and
+ * destruction.  If verbose is true, Timer will print a message showing
+ * elapsed time to the given output stream upon destruction.
+ */
+class Timer {
+public:
+	Timer(ostream& out = cout, const char *msg = "", bool verbose = true) :
+		_t(time(0)), _out(out), _msg(msg), _verbose(verbose) { }
+
+	/// Optionally print message
+	~Timer() {
+		if(_verbose) write(_out);
+	}
+	
+	/// Return elapsed time since Timer object was created
+	time_t elapsed() const {
+		return time(0) - _t;
+	}
+	
+	void write(ostream& out) {
+		time_t passed = elapsed();
+		// Print the message supplied at construction time followed
+		// by time elapsed formatted HH:MM:SS 
+		time_t hours   = (passed / 60) / 60;
+		time_t minutes = (passed / 60) % 60;
+		time_t seconds = (passed % 60);
+		std::ostringstream oss;
+		oss << _msg << setfill ('0') << setw (2) << hours << ":"
+		           << setfill ('0') << setw (2) << minutes << ":"
+		           << setfill ('0') << setw (2) << seconds << endl;
+		out << oss.str().c_str();
+	}
+	
+private:
+	time_t      _t;
+	ostream&    _out;
+	const char *_msg;
+	bool        _verbose;
+};
+
+static inline void logTime(std::ostream& os, bool nl = true) {
+	struct tm *current;
+	time_t now;
+	time(&now);
+	current = localtime(&now);
+	std::ostringstream oss;
+	oss << setfill('0') << setw(2)
+	    << current->tm_hour << ":"
+	    << setfill('0') << setw(2)
+	    << current->tm_min << ":"
+	    << setfill('0') << setw(2)
+	    << current->tm_sec;
+	if(nl) oss << std::endl;
+	os << oss.str().c_str();
+}
+
+#endif /*TIMER_H_*/
diff --git a/tinythread.cpp b/tinythread.cpp
new file mode 100755
index 0000000..690ecee
--- /dev/null
+++ b/tinythread.cpp
@@ -0,0 +1,303 @@
+/* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; -*-
+Copyright (c) 2010-2012 Marcus Geelnard
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+    1. The origin of this software must not be misrepresented; you must not
+    claim that you wrote the original software. If you use this software
+    in a product, an acknowledgment in the product documentation would be
+    appreciated but is not required.
+
+    2. Altered source versions must be plainly marked as such, and must not be
+    misrepresented as being the original software.
+
+    3. This notice may not be removed or altered from any source
+    distribution.
+*/
+
+#include <exception>
+#include "tinythread.h"
+
+#if defined(_TTHREAD_POSIX_)
+  #include <unistd.h>
+  #include <map>
+#elif defined(_TTHREAD_WIN32_)
+  #include <process.h>
+#endif
+
+
+namespace tthread {
+
+//------------------------------------------------------------------------------
+// condition_variable
+//------------------------------------------------------------------------------
+// NOTE 1: The Win32 implementation of the condition_variable class is based on
+// the corresponding implementation in GLFW, which in turn is based on a
+// description by Douglas C. Schmidt and Irfan Pyarali:
+// http://www.cs.wustl.edu/~schmidt/win32-cv-1.html
+//
+// NOTE 2: Windows Vista actually has native support for condition variables
+// (InitializeConditionVariable, WakeConditionVariable, etc), but we want to
+// be portable with pre-Vista Windows versions, so TinyThread++ does not use
+// Vista condition variables.
+//------------------------------------------------------------------------------
+
+#if defined(_TTHREAD_WIN32_)
+  #define _CONDITION_EVENT_ONE 0
+  #define _CONDITION_EVENT_ALL 1
+#endif
+
+#if defined(_TTHREAD_WIN32_)
+condition_variable::condition_variable() : mWaitersCount(0)
+{
+  mEvents[_CONDITION_EVENT_ONE] = CreateEvent(NULL, FALSE, FALSE, NULL);
+  mEvents[_CONDITION_EVENT_ALL] = CreateEvent(NULL, TRUE, FALSE, NULL);
+  InitializeCriticalSection(&mWaitersCountLock);
+}
+#endif
+
+#if defined(_TTHREAD_WIN32_)
+condition_variable::~condition_variable()
+{
+  CloseHandle(mEvents[_CONDITION_EVENT_ONE]);
+  CloseHandle(mEvents[_CONDITION_EVENT_ALL]);
+  DeleteCriticalSection(&mWaitersCountLock);
+}
+#endif
+
+#if defined(_TTHREAD_WIN32_)
+void condition_variable::_wait()
+{
+  // Wait for either event to become signaled due to notify_one() or
+  // notify_all() being called
+  int result = WaitForMultipleObjects(2, mEvents, FALSE, INFINITE);
+
+  // Check if we are the last waiter
+  EnterCriticalSection(&mWaitersCountLock);
+  -- mWaitersCount;
+  bool lastWaiter = (result == (WAIT_OBJECT_0 + _CONDITION_EVENT_ALL)) &&
+                    (mWaitersCount == 0);
+  LeaveCriticalSection(&mWaitersCountLock);
+
+  // If we are the last waiter to be notified to stop waiting, reset the event
+  if(lastWaiter)
+    ResetEvent(mEvents[_CONDITION_EVENT_ALL]);
+}
+#endif
+
+#if defined(_TTHREAD_WIN32_)
+void condition_variable::notify_one()
+{
+  // Are there any waiters?
+  EnterCriticalSection(&mWaitersCountLock);
+  bool haveWaiters = (mWaitersCount > 0);
+  LeaveCriticalSection(&mWaitersCountLock);
+
+  // If we have any waiting threads, send them a signal
+  if(haveWaiters)
+    SetEvent(mEvents[_CONDITION_EVENT_ONE]);
+}
+#endif
+
+#if defined(_TTHREAD_WIN32_)
+void condition_variable::notify_all()
+{
+  // Are there any waiters?
+  EnterCriticalSection(&mWaitersCountLock);
+  bool haveWaiters = (mWaitersCount > 0);
+  LeaveCriticalSection(&mWaitersCountLock);
+
+  // If we have any waiting threads, send them a signal
+  if(haveWaiters)
+    SetEvent(mEvents[_CONDITION_EVENT_ALL]);
+}
+#endif
+
+
+//------------------------------------------------------------------------------
+// POSIX pthread_t to unique thread::id mapping logic.
+// Note: Here we use a global thread safe std::map to convert instances of
+// pthread_t to small thread identifier numbers (unique within one process).
+// This method should be portable across different POSIX implementations.
+//------------------------------------------------------------------------------
+
+#if defined(_TTHREAD_POSIX_)
+static thread::id _pthread_t_to_ID(const pthread_t &aHandle)
+{
+  static mutex idMapLock;
+  static std::map<pthread_t, unsigned long int> idMap;
+  static unsigned long int idCount(1);
+
+  lock_guard<mutex> guard(idMapLock);
+  if(idMap.find(aHandle) == idMap.end())
+    idMap[aHandle] = idCount ++;
+  return thread::id(idMap[aHandle]);
+}
+#endif // _TTHREAD_POSIX_
+
+
+//------------------------------------------------------------------------------
+// thread
+//------------------------------------------------------------------------------
+
+/// Information to pass to the new thread (what to run).
+struct _thread_start_info {
+  void (*mFunction)(void *); ///< Pointer to the function to be executed.
+  void * mArg;               ///< Function argument for the thread function.
+  thread * mThread;          ///< Pointer to the thread object.
+};
+
+// Thread wrapper function.
+#if defined(_TTHREAD_WIN32_)
+unsigned WINAPI thread::wrapper_function(void * aArg)
+#elif defined(_TTHREAD_POSIX_)
+void * thread::wrapper_function(void * aArg)
+#endif
+{
+  // Get thread startup information
+  _thread_start_info * ti = (_thread_start_info *) aArg;
+
+  try
+  {
+    // Call the actual client thread function
+    ti->mFunction(ti->mArg);
+  }
+  catch(...)
+  {
+    // Uncaught exceptions will terminate the application (default behavior
+    // according to C++11)
+    std::terminate();
+  }
+
+  // The thread is no longer executing
+  lock_guard<mutex> guard(ti->mThread->mDataMutex);
+  ti->mThread->mNotAThread = true;
+
+  // The thread is responsible for freeing the startup information
+  delete ti;
+
+  return 0;
+}
+
+thread::thread(void (*aFunction)(void *), void * aArg)
+{
+  // Serialize access to this thread structure
+  lock_guard<mutex> guard(mDataMutex);
+
+  // Fill out the thread startup information (passed to the thread wrapper,
+  // which will eventually free it)
+  _thread_start_info * ti = new _thread_start_info;
+  ti->mFunction = aFunction;
+  ti->mArg = aArg;
+  ti->mThread = this;
+
+  // The thread is now alive
+  mNotAThread = false;
+
+  // Create the thread
+#if defined(_TTHREAD_WIN32_)
+  mHandle = (HANDLE) _beginthreadex(0, 0, wrapper_function, (void *) ti, 0, &mWin32ThreadID);
+#elif defined(_TTHREAD_POSIX_)
+  if(pthread_create(&mHandle, NULL, wrapper_function, (void *) ti) != 0)
+    mHandle = 0;
+#endif
+
+  // Did we fail to create the thread?
+  if(!mHandle)
+  {
+    mNotAThread = true;
+    delete ti;
+  }
+}
+
+thread::~thread()
+{
+  if(joinable())
+    std::terminate();
+}
+
+void thread::join()
+{
+  if(joinable())
+  {
+#if defined(_TTHREAD_WIN32_)
+    WaitForSingleObject(mHandle, INFINITE);
+    CloseHandle(mHandle);
+#elif defined(_TTHREAD_POSIX_)
+    pthread_join(mHandle, NULL);
+#endif
+  }
+}
+
+bool thread::joinable() const
+{
+  mDataMutex.lock();
+  bool result = !mNotAThread;
+  mDataMutex.unlock();
+  return result;
+}
+
+void thread::detach()
+{
+  mDataMutex.lock();
+  if(!mNotAThread)
+  {
+#if defined(_TTHREAD_WIN32_)
+    CloseHandle(mHandle);
+#elif defined(_TTHREAD_POSIX_)
+    pthread_detach(mHandle);
+#endif
+    mNotAThread = true;
+  }
+  mDataMutex.unlock();
+}
+
+thread::id thread::get_id() const
+{
+  if(!joinable())
+    return id();
+#if defined(_TTHREAD_WIN32_)
+  return id((unsigned long int) mWin32ThreadID);
+#elif defined(_TTHREAD_POSIX_)
+  return _pthread_t_to_ID(mHandle);
+#endif
+}
+
+unsigned thread::hardware_concurrency()
+{
+#if defined(_TTHREAD_WIN32_)
+  SYSTEM_INFO si;
+  GetSystemInfo(&si);
+  return (int) si.dwNumberOfProcessors;
+#elif defined(_SC_NPROCESSORS_ONLN)
+  return (int) sysconf(_SC_NPROCESSORS_ONLN);
+#elif defined(_SC_NPROC_ONLN)
+  return (int) sysconf(_SC_NPROC_ONLN);
+#else
+  // The standard requires this function to return zero if the number of
+  // hardware cores could not be determined.
+  return 0;
+#endif
+}
+
+
+//------------------------------------------------------------------------------
+// this_thread
+//------------------------------------------------------------------------------
+
+thread::id this_thread::get_id()
+{
+#if defined(_TTHREAD_WIN32_)
+  return thread::id((unsigned long int) GetCurrentThreadId());
+#elif defined(_TTHREAD_POSIX_)
+  return _pthread_t_to_ID(pthread_self());
+#endif
+}
+
+}
diff --git a/tinythread.h b/tinythread.h
new file mode 100755
index 0000000..aed7b58
--- /dev/null
+++ b/tinythread.h
@@ -0,0 +1,714 @@
+/* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; -*-
+Copyright (c) 2010-2012 Marcus Geelnard
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+    1. The origin of this software must not be misrepresented; you must not
+    claim that you wrote the original software. If you use this software
+    in a product, an acknowledgment in the product documentation would be
+    appreciated but is not required.
+
+    2. Altered source versions must be plainly marked as such, and must not be
+    misrepresented as being the original software.
+
+    3. This notice may not be removed or altered from any source
+    distribution.
+*/
+
+#ifndef _TINYTHREAD_H_
+#define _TINYTHREAD_H_
+
+/// @file
+/// @mainpage TinyThread++ API Reference
+///
+/// @section intro_sec Introduction
+/// TinyThread++ is a minimal, portable implementation of basic threading
+/// classes for C++.
+///
+/// They closely mimic the functionality and naming of the C++11 standard, and
+/// should be easily replaceable with the corresponding std:: variants.
+///
+/// @section port_sec Portability
+/// The Win32 variant uses the native Win32 API for implementing the thread
+/// classes, while for other systems, the POSIX threads API (pthread) is used.
+///
+/// @section class_sec Classes
+/// In order to mimic the threading API of the C++11 standard, subsets of
+/// several classes are provided. The fundamental classes are:
+/// @li tthread::thread
+/// @li tthread::mutex
+/// @li tthread::recursive_mutex
+/// @li tthread::condition_variable
+/// @li tthread::lock_guard
+/// @li tthread::fast_mutex
+///
+/// @section misc_sec Miscellaneous
+/// The following special keywords are available: #thread_local.
+///
+/// For more detailed information (including additional classes), browse the
+/// different sections of this documentation. A good place to start is:
+/// tinythread.h.
+
+// Which platform are we on?
+#if !defined(_TTHREAD_PLATFORM_DEFINED_)
+  #if defined(_WIN32) || defined(__WIN32__) || defined(__WINDOWS__)
+    #define _TTHREAD_WIN32_
+  #else
+    #define _TTHREAD_POSIX_
+  #endif
+  #define _TTHREAD_PLATFORM_DEFINED_
+#endif
+
+// Platform specific includes
+#if defined(_TTHREAD_WIN32_)
+  #ifndef WIN32_LEAN_AND_MEAN
+    #define WIN32_LEAN_AND_MEAN
+    #define __UNDEF_LEAN_AND_MEAN
+  #endif
+  #include <windows.h>
+  #ifdef __UNDEF_LEAN_AND_MEAN
+    #undef WIN32_LEAN_AND_MEAN
+    #undef __UNDEF_LEAN_AND_MEAN
+  #endif
+#else
+  #include <pthread.h>
+  #include <signal.h>
+  #include <sched.h>
+  #include <unistd.h>
+#endif
+
+// Generic includes
+#include <ostream>
+
+/// TinyThread++ version (major number).
+#define TINYTHREAD_VERSION_MAJOR 1
+/// TinyThread++ version (minor number).
+#define TINYTHREAD_VERSION_MINOR 1
+/// TinyThread++ version (full version).
+#define TINYTHREAD_VERSION (TINYTHREAD_VERSION_MAJOR * 100 + TINYTHREAD_VERSION_MINOR)
+
+// Do we have a fully featured C++11 compiler?
+#if (__cplusplus > 199711L) || (defined(__STDCXX_VERSION__) && (__STDCXX_VERSION__ >= 201001L))
+  #define _TTHREAD_CPP11_
+#endif
+
+// ...at least partial C++11?
+#if defined(_TTHREAD_CPP11_) || defined(__GXX_EXPERIMENTAL_CXX0X__) || defined(__GXX_EXPERIMENTAL_CPP0X__)
+  #define _TTHREAD_CPP11_PARTIAL_
+#endif
+
+// Macro for disabling assignments of objects.
+#ifdef _TTHREAD_CPP11_PARTIAL_
+  #define _TTHREAD_DISABLE_ASSIGNMENT(name) \
+      name(const name&) = delete; \
+      name& operator=(const name&) = delete;
+#else
+  #define _TTHREAD_DISABLE_ASSIGNMENT(name) \
+      name(const name&); \
+      name& operator=(const name&);
+#endif
+
+/// @def thread_local
+/// Thread local storage keyword.
+/// A variable that is declared with the @c thread_local keyword makes the
+/// value of the variable local to each thread (known as thread-local storage,
+/// or TLS). Example usage:
+/// @code
+/// // This variable is local to each thread.
+/// thread_local int variable;
+/// @endcode
+/// @note The @c thread_local keyword is a macro that maps to the corresponding
+/// compiler directive (e.g. @c __declspec(thread)). While the C++11 standard
+/// allows for non-trivial types (e.g. classes with constructors and
+/// destructors) to be declared with the @c thread_local keyword, most pre-C++11
+/// compilers only allow for trivial types (e.g. @c int). So, to guarantee
+/// portable code, only use trivial types for thread local storage.
+/// @note This directive is currently not supported on Mac OS X (it will give
+/// a compiler error), since compile-time TLS is not supported in the Mac OS X
+/// executable format. Also, some older versions of MinGW (before GCC 4.x) do
+/// not support this directive.
+/// @hideinitializer
+
+#if !defined(_TTHREAD_CPP11_) && !defined(thread_local)
+ #if defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_CC) || defined(__IBMCPP__)
+  #define thread_local __thread
+ #else
+  #define thread_local __declspec(thread)
+ #endif
+#endif
+
+
+/// Main name space for TinyThread++.
+/// This namespace is more or less equivalent to the @c std namespace for the
+/// C++11 thread classes. For instance, the tthread::mutex class corresponds to
+/// the std::mutex class.
+namespace tthread {
+
+/// Mutex class.
+/// This is a mutual exclusion object for synchronizing access to shared
+/// memory areas for several threads. The mutex is non-recursive (i.e. a
+/// program may deadlock if the thread that owns a mutex object calls lock()
+/// on that object).
+/// @see recursive_mutex
+class mutex {
+  public:
+    /// Constructor.
+    mutex()
+#if defined(_TTHREAD_WIN32_)
+      : mAlreadyLocked(false)
+#endif
+    {
+#if defined(_TTHREAD_WIN32_)
+      InitializeCriticalSection(&mHandle);
+#else
+      pthread_mutex_init(&mHandle, NULL);
+#endif
+    }
+
+    /// Destructor.
+    ~mutex()
+    {
+#if defined(_TTHREAD_WIN32_)
+      DeleteCriticalSection(&mHandle);
+#else
+      pthread_mutex_destroy(&mHandle);
+#endif
+    }
+
+    /// Lock the mutex.
+    /// The method will block the calling thread until a lock on the mutex can
+    /// be obtained. The mutex remains locked until @c unlock() is called.
+    /// @see lock_guard
+    inline void lock()
+    {
+#if defined(_TTHREAD_WIN32_)
+      EnterCriticalSection(&mHandle);
+      while(mAlreadyLocked) Sleep(1000); // Simulate deadlock...
+      mAlreadyLocked = true;
+#else
+      pthread_mutex_lock(&mHandle);
+#endif
+    }
+
+    /// Try to lock the mutex.
+    /// The method will try to lock the mutex. If it fails, the function will
+    /// return immediately (non-blocking).
+    /// @return @c true if the lock was acquired, or @c false if the lock could
+    /// not be acquired.
+    inline bool try_lock()
+    {
+#if defined(_TTHREAD_WIN32_)
+      bool ret = (TryEnterCriticalSection(&mHandle) ? true : false);
+      if(ret && mAlreadyLocked)
+      {
+        LeaveCriticalSection(&mHandle);
+        ret = false;
+      }
+      return ret;
+#else
+      return (pthread_mutex_trylock(&mHandle) == 0) ? true : false;
+#endif
+    }
+
+    /// Unlock the mutex.
+    /// If any threads are waiting for the lock on this mutex, one of them will
+    /// be unblocked.
+    inline void unlock()
+    {
+#if defined(_TTHREAD_WIN32_)
+      mAlreadyLocked = false;
+      LeaveCriticalSection(&mHandle);
+#else
+      pthread_mutex_unlock(&mHandle);
+#endif
+    }
+
+    _TTHREAD_DISABLE_ASSIGNMENT(mutex)
+
+  private:
+#if defined(_TTHREAD_WIN32_)
+    CRITICAL_SECTION mHandle;
+    bool mAlreadyLocked;
+#else
+    pthread_mutex_t mHandle;
+#endif
+
+    friend class condition_variable;
+};
+
+/// Recursive mutex class.
+/// This is a mutual exclusion object for synchronizing access to shared
+/// memory areas for several threads. The mutex is recursive (i.e. a thread
+/// may lock the mutex several times, as long as it unlocks the mutex the same
+/// number of times).
+/// @see mutex
+class recursive_mutex {
+  public:
+    /// Constructor.
+    recursive_mutex()
+    {
+#if defined(_TTHREAD_WIN32_)
+      InitializeCriticalSection(&mHandle);
+#else
+      pthread_mutexattr_t attr;
+      pthread_mutexattr_init(&attr);
+      pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
+      pthread_mutex_init(&mHandle, &attr);
+#endif
+    }
+
+    /// Destructor.
+    ~recursive_mutex()
+    {
+#if defined(_TTHREAD_WIN32_)
+      DeleteCriticalSection(&mHandle);
+#else
+      pthread_mutex_destroy(&mHandle);
+#endif
+    }
+
+    /// Lock the mutex.
+    /// The method will block the calling thread until a lock on the mutex can
+    /// be obtained. The mutex remains locked until @c unlock() is called.
+    /// @see lock_guard
+    inline void lock()
+    {
+#if defined(_TTHREAD_WIN32_)
+      EnterCriticalSection(&mHandle);
+#else
+      pthread_mutex_lock(&mHandle);
+#endif
+    }
+
+    /// Try to lock the mutex.
+    /// The method will try to lock the mutex. If it fails, the function will
+    /// return immediately (non-blocking).
+    /// @return @c true if the lock was acquired, or @c false if the lock could
+    /// not be acquired.
+    inline bool try_lock()
+    {
+#if defined(_TTHREAD_WIN32_)
+      return TryEnterCriticalSection(&mHandle) ? true : false;
+#else
+      return (pthread_mutex_trylock(&mHandle) == 0) ? true : false;
+#endif
+    }
+
+    /// Unlock the mutex.
+    /// If any threads are waiting for the lock on this mutex, one of them will
+    /// be unblocked.
+    inline void unlock()
+    {
+#if defined(_TTHREAD_WIN32_)
+      LeaveCriticalSection(&mHandle);
+#else
+      pthread_mutex_unlock(&mHandle);
+#endif
+    }
+
+    _TTHREAD_DISABLE_ASSIGNMENT(recursive_mutex)
+
+  private:
+#if defined(_TTHREAD_WIN32_)
+    CRITICAL_SECTION mHandle;
+#else
+    pthread_mutex_t mHandle;
+#endif
+
+    friend class condition_variable;
+};
+
+/// Lock guard class.
+/// The constructor locks the mutex, and the destructor unlocks the mutex, so
+/// the mutex will automatically be unlocked when the lock guard goes out of
+/// scope. Example usage:
+/// @code
+/// mutex m;
+/// int counter;
+///
+/// void increment()
+/// {
+///   lock_guard<mutex> guard(m);
+///   ++ counter;
+/// }
+/// @endcode
+
+template <class T>
+class lock_guard {
+  public:
+    typedef T mutex_type;
+
+    lock_guard() : mMutex(0) {}
+
+    /// The constructor locks the mutex.
+    explicit lock_guard(mutex_type &aMutex)
+    {
+      mMutex = &aMutex;
+      mMutex->lock();
+    }
+
+    /// The destructor unlocks the mutex.
+    ~lock_guard()
+    {
+      if(mMutex)
+        mMutex->unlock();
+    }
+
+  private:
+    mutex_type * mMutex;
+};
+
+/// Condition variable class.
+/// This is a signalling object for synchronizing the execution flow for
+/// several threads. Example usage:
+/// @code
+/// // Shared data and associated mutex and condition variable objects
+/// int count;
+/// mutex m;
+/// condition_variable cond;
+///
+/// // Wait for the counter to reach a certain number
+/// void wait_counter(int targetCount)
+/// {
+///   lock_guard<mutex> guard(m);
+///   while(count < targetCount)
+///     cond.wait(m);
+/// }
+///
+/// // Increment the counter, and notify waiting threads
+/// void increment()
+/// {
+///   lock_guard<mutex> guard(m);
+///   ++ count;
+///   cond.notify_all();
+/// }
+/// @endcode
+class condition_variable {
+  public:
+    /// Constructor.
+#if defined(_TTHREAD_WIN32_)
+    condition_variable();
+#else
+    condition_variable()
+    {
+      pthread_cond_init(&mHandle, NULL);
+    }
+#endif
+
+    /// Destructor.
+#if defined(_TTHREAD_WIN32_)
+    ~condition_variable();
+#else
+    ~condition_variable()
+    {
+      pthread_cond_destroy(&mHandle);
+    }
+#endif
+
+    /// Wait for the condition.
+    /// The function will block the calling thread until the condition variable
+    /// is woken by @c notify_one(), @c notify_all() or a spurious wake up.
+    /// Because spurious wake ups are possible, callers should re-test their
+    /// predicate in a loop around this call (see the class-level example).
+    /// @param[in] aMutex A mutex that will be unlocked when the wait operation
+    ///   starts, and locked again as soon as the wait operation is finished.
+    template <class _mutexT>
+    inline void wait(_mutexT &aMutex)
+    {
+#if defined(_TTHREAD_WIN32_)
+      // Increment number of waiters
+      EnterCriticalSection(&mWaitersCountLock);
+      ++ mWaitersCount;
+      LeaveCriticalSection(&mWaitersCountLock);
+
+      // Release the mutex while waiting for the condition (will decrease
+      // the number of waiters when done)...
+      aMutex.unlock();
+      _wait();
+      aMutex.lock();
+#else
+      // POSIX: atomically releases aMutex.mHandle and blocks until signalled.
+      pthread_cond_wait(&mHandle, &aMutex.mHandle);
+#endif
+    }
+
+    /// Notify one thread that is waiting for the condition.
+    /// If at least one thread is blocked waiting for this condition variable,
+    /// one will be woken up.
+    /// @note Only threads that started waiting prior to this call will be
+    /// woken up.
+#if defined(_TTHREAD_WIN32_)
+    void notify_one();
+#else
+    inline void notify_one()
+    {
+      pthread_cond_signal(&mHandle);
+    }
+#endif
+
+    /// Notify all threads that are waiting for the condition.
+    /// All threads that are blocked waiting for this condition variable will
+    /// be woken up.
+    /// @note Only threads that started waiting prior to this call will be
+    /// woken up.
+#if defined(_TTHREAD_WIN32_)
+    void notify_all();
+#else
+    inline void notify_all()
+    {
+      pthread_cond_broadcast(&mHandle);
+    }
+#endif
+
+    _TTHREAD_DISABLE_ASSIGNMENT(condition_variable)
+
+  private:
+#if defined(_TTHREAD_WIN32_)
+    void _wait();
+    HANDLE mEvents[2];                  ///< Signal and broadcast event HANDLEs.
+    unsigned int mWaitersCount;         ///< Count of the number of waiters.
+    CRITICAL_SECTION mWaitersCountLock; ///< Serialize access to mWaitersCount.
+#else
+    pthread_cond_t mHandle;
+#endif
+};
+
+
+/// Thread class.
+class thread {
+  public:
+#if defined(_TTHREAD_WIN32_)
+    typedef HANDLE native_handle_type;
+#else
+    typedef pthread_t native_handle_type;
+#endif
+
+    /// Forward declaration of the thread ID type (defined after this class).
+    class id;
+
+    /// Default constructor.
+    /// Construct a @c thread object without an associated thread of execution
+    /// (i.e. non-joinable).
+    thread() : mHandle(0), mNotAThread(true)
+#if defined(_TTHREAD_WIN32_)
+    , mWin32ThreadID(0)
+#endif
+    {}
+
+    /// Thread starting constructor.
+    /// Construct a @c thread object with a new thread of execution.
+    /// @param[in] aFunction A function pointer to a function of type:
+    ///          <tt>void fun(void * arg)</tt>
+    /// @param[in] aArg Argument to the thread function.
+    /// @note This constructor is not fully compatible with the standard C++
+    /// thread class. It is more similar to the pthread_create() (POSIX) and
+    /// CreateThread() (Windows) functions.
+    thread(void (*aFunction)(void *), void * aArg);
+
+    /// Destructor.
+    /// @note If the thread is joinable upon destruction, @c std::terminate()
+    /// will be called, which terminates the process. It is always wise to do
+    /// @c join() before deleting a thread object.
+    ~thread();
+
+    /// Wait for the thread to finish (join execution flows).
+    /// After calling @c join(), the thread object is no longer associated with
+    /// a thread of execution (i.e. it is not joinable, and you may not join
+    /// with it nor detach from it).
+    void join();
+
+    /// Check if the thread is joinable.
+    /// A thread object is joinable if it has an associated thread of execution.
+    bool joinable() const;
+
+    /// Detach from the thread.
+    /// After calling @c detach(), the thread object is no longer associated with
+    /// a thread of execution (i.e. it is not joinable). The thread continues
+    /// execution without the calling thread blocking, and when the thread
+    /// ends execution, any owned resources are released.
+    void detach();
+
+    /// Return the thread ID of a thread object.
+    id get_id() const;
+
+    /// Get the native handle for this thread.
+    /// @note Under Windows, this is a @c HANDLE, and under POSIX systems, this
+    /// is a @c pthread_t.
+    inline native_handle_type native_handle()
+    {
+      return mHandle;
+    }
+
+    /// Determine the number of threads which can possibly execute concurrently.
+    /// This function is useful for determining the optimal number of threads to
+    /// use for a task.
+    /// @return The number of hardware thread contexts in the system.
+    /// @note If this value is not defined, the function returns zero (0).
+    static unsigned hardware_concurrency();
+
+    _TTHREAD_DISABLE_ASSIGNMENT(thread)
+
+  private:
+    native_handle_type mHandle;   ///< Thread handle.
+    mutable mutex mDataMutex;     ///< Serializer for access to the thread private data.
+    bool mNotAThread;             ///< True if this object is not a thread of execution.
+#if defined(_TTHREAD_WIN32_)
+    unsigned int mWin32ThreadID;  ///< Unique thread ID (filled out by _beginthreadex).
+#endif
+
+    // This is the internal thread wrapper function.
+#if defined(_TTHREAD_WIN32_)
+    static unsigned WINAPI wrapper_function(void * aArg);
+#else
+    static void * wrapper_function(void * aArg);
+#endif
+};
+
+/// Thread ID.
+/// The thread ID is a unique identifier for each thread.
+/// @see thread::get_id()
+class thread::id {
+  public:
+    /// Default constructor.
+    /// The default constructed ID is that of thread without a thread of
+    /// execution.
+    id() : mId(0) {};
+
+    /// Construct an ID from a raw numeric identifier.
+    /// @note Non-explicit, so an @c unsigned @c long converts implicitly
+    /// to @c id (e.g. in comparisons).
+    id(unsigned long int aId) : mId(aId) {};
+
+    /// Copy constructor.
+    id(const id& aId) : mId(aId.mId) {};
+
+    /// Copy assignment.
+    inline id & operator=(const id &aId)
+    {
+      mId = aId.mId;
+      return *this;
+    }
+
+    // Full set of relational operators; IDs compare by their numeric value.
+    inline friend bool operator==(const id &aId1, const id &aId2)
+    {
+      return (aId1.mId == aId2.mId);
+    }
+
+    inline friend bool operator!=(const id &aId1, const id &aId2)
+    {
+      return (aId1.mId != aId2.mId);
+    }
+
+    inline friend bool operator<=(const id &aId1, const id &aId2)
+    {
+      return (aId1.mId <= aId2.mId);
+    }
+
+    inline friend bool operator<(const id &aId1, const id &aId2)
+    {
+      return (aId1.mId < aId2.mId);
+    }
+
+    inline friend bool operator>=(const id &aId1, const id &aId2)
+    {
+      return (aId1.mId >= aId2.mId);
+    }
+
+    inline friend bool operator>(const id &aId1, const id &aId2)
+    {
+      return (aId1.mId > aId2.mId);
+    }
+
+    /// Stream the numeric ID to an output stream.
+    inline friend std::ostream& operator <<(std::ostream &os, const id &obj)
+    {
+      os << obj.mId;
+      return os;
+    }
+
+  private:
+    unsigned long int mId;
+};
+
+
+// Related to <ratio> - minimal to be able to support chrono.
+// NOTE(review): identifiers beginning with a double underscore are reserved
+// for the implementation; __intmax_t risks colliding with a compiler-provided
+// name.  Renaming it would also touch the chrono typedefs below.
+typedef long long __intmax_t;
+
+/// Minimal implementation of the @c ratio class. This class provides enough
+/// functionality to implement some basic @c chrono classes.
+/// The N/D fraction is not reduced; _as_double() simply evaluates it in
+/// double precision.
+template <__intmax_t N, __intmax_t D = 1> class ratio {
+  public:
+    static double _as_double() { return double(N) / double(D); }
+};
+
+/// Minimal implementation of the @c chrono namespace.
+/// The @c chrono namespace provides types for specifying time intervals.
+namespace chrono {
+  /// Duration template class. This class provides enough functionality to
+  /// implement @c this_thread::sleep_for().
+  template <class _Rep, class _Period = ratio<1> > class duration {
+    private:
+      _Rep rep_; ///< Tick count, in units of _Period.
+    public:
+      typedef _Rep rep;
+      typedef _Period period;
+
+      /// Construct a duration object with the given duration.
+      /// Explicit: a bare number does not silently convert to a duration.
+      template <class _Rep2>
+        explicit duration(const _Rep2& r) : rep_(r) {};
+
+      /// Return the value of the duration object.
+      rep count() const
+      {
+        return rep_;
+      }
+  };
+
+  // Standard duration types.
+  typedef duration<__intmax_t, ratio<1, 1000000000> > nanoseconds; ///< Duration with the unit nanoseconds.
+  typedef duration<__intmax_t, ratio<1, 1000000> > microseconds;   ///< Duration with the unit microseconds.
+  typedef duration<__intmax_t, ratio<1, 1000> > milliseconds;      ///< Duration with the unit milliseconds.
+  typedef duration<__intmax_t> seconds;                            ///< Duration with the unit seconds.
+  typedef duration<__intmax_t, ratio<60> > minutes;                ///< Duration with the unit minutes.
+  typedef duration<__intmax_t, ratio<3600> > hours;                ///< Duration with the unit hours.
+}
+
+/// The namespace @c this_thread provides methods for dealing with the
+/// calling thread.
+namespace this_thread {
+  /// Return the thread ID of the calling thread.
+  thread::id get_id();
+
+  /// Yield execution to another thread.
+  /// Offers the operating system the opportunity to schedule another thread
+  /// that is ready to run on the current processor.
+  inline void yield()
+  {
+#if defined(_TTHREAD_WIN32_)
+    Sleep(0);
+#else
+    sched_yield();
+#endif
+  }
+
+  /// Blocks the calling thread for a period of time.
+  /// @param[in] aTime Minimum time to put the thread to sleep.
+  /// Example usage:
+  /// @code
+  /// // Sleep for 100 milliseconds
+  /// this_thread::sleep_for(chrono::milliseconds(100));
+  /// @endcode
+  /// @note Supported duration types are: nanoseconds, microseconds,
+  /// milliseconds, seconds, minutes and hours.
+  /// @note The duration is converted through double to an @c int tick count
+  /// here, so very long durations may overflow — TODO confirm this is
+  /// acceptable for the sleep lengths used in this codebase.
+  template <class _Rep, class _Period> void sleep_for(const chrono::duration<_Rep, _Period>& aTime)
+  {
+#if defined(_TTHREAD_WIN32_)
+    Sleep(int(double(aTime.count()) * (1000.0 * _Period::_as_double()) + 0.5));
+#else
+    usleep(int(double(aTime.count()) * (1000000.0 * _Period::_as_double()) + 0.5));
+#endif
+  }
+}
+
+}
+
+// Define/macro cleanup
+#undef _TTHREAD_DISABLE_ASSIGNMENT
+
+#endif // _TINYTHREAD_H_
diff --git a/tokenize.h b/tokenize.h
new file mode 100644
index 0000000..a5e521c
--- /dev/null
+++ b/tokenize.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef TOKENIZE_H_
+#define TOKENIZE_H_
+
+#include <string>
+#include <sstream>
+#include <limits>
+
+using namespace std;
+
+/**
+ * Split string s according to given delimiters.  Mostly borrowed
+ * from C++ Programming HOWTO 7.3.
+ *
+ * Tokens are appended to 'ss' via push_back().  Runs of delimiter
+ * characters between tokens act as a single separator, so interior empty
+ * tokens are never produced (a leading delimiter does yield one empty
+ * first token; a trailing delimiter yields nothing).  Once max-1 tokens
+ * have been emitted, the entire remainder of the string — including any
+ * delimiters it contains — becomes the final token.
+ */
+template<typename T>
+static inline void tokenize(
+	const string& s,
+	const string& delims,
+	T& ss,
+	size_t max = std::numeric_limits<size_t>::max())
+{
+	//string::size_type lastPos = s.find_first_not_of(delims, 0);
+	string::size_type lastPos = 0;
+	string::size_type pos = s.find_first_of(delims, lastPos);
+	// Continue while there is another delimiter ahead or a final token
+	// (starting at lastPos) still to emit.
+	while (string::npos != pos || string::npos != lastPos) {
+		ss.push_back(s.substr(lastPos, pos - lastPos));
+		lastPos = s.find_first_not_of(delims, pos);
+		pos = s.find_first_of(delims, lastPos);
+		if(ss.size() == (max - 1)) {
+			// Token cap reached: force the next (final) token to run to
+			// the end of the string.
+			pos = string::npos;
+		}
+	}
+}
+
+/**
+ * Split string s at every occurrence of 'delim', appending each piece
+ * (possibly empty, for consecutive delimiters) to 'ss'.  A trailing
+ * delimiter does not produce a trailing empty token.
+ */
+template<typename T>
+static inline void tokenize(const std::string& s, char delim, T& ss) {
+	std::istringstream stream(s);
+	for(std::string piece; getline(stream, piece, delim); ) {
+		ss.push_back(piece);
+	}
+}
+
+#endif /*TOKENIZE_H_*/
diff --git a/util.h b/util.h
new file mode 100644
index 0000000..3f45f85
--- /dev/null
+++ b/util.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef UTIL_H_
+#define UTIL_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <algorithm>
+#include <limits>
+#include <map>
+#include <sstream>
+#include <string>
+
+/**
+ * C++ version char* style "itoa": Convert integer to a base-10 string.
+ * Writes the digits (and a leading '-' for negative values) into 'result',
+ * which the caller must size for digits + sign + terminator.
+ * @return pointer to the NUL terminator written at the end of the string.
+ * @note For a signed T equal to its minimum value, the negation below
+ *       overflows; callers should avoid std::numeric_limits<T>::min().
+ */
+template<typename T>
+char* itoa10(const T& value, char* result) {
+	char* out = result;
+	// Work on the magnitude of 'value'.
+	T quotient = value;
+	if(std::numeric_limits<T>::is_signed) {
+		if(quotient <= 0) quotient = -quotient;
+	}
+	// Emit digits least-significant first; always at least one digit.
+	do {
+		*out = "0123456789"[quotient % 10];
+		++out;
+		quotient /= 10;
+	} while (quotient > 0);
+	if(std::numeric_limits<T>::is_signed) {
+		// '<= 0 && != 0' rather than '< 0' avoids a tautological-comparison
+		// warning in cases where T is unsigned
+		if (value <= 0 && value != 0) *out++ = '-';
+	}
+	// Digits were emitted in reverse; fix that in place.  Qualified as
+	// std::reverse so this header does not rely on a 'using namespace std'
+	// pulled in by whoever includes it (requires <algorithm>).
+	std::reverse( result, out );
+	*out = 0; // NUL terminator
+	return out;
+}
+
+// Extract the numeric ID from the beginning of a reference name, e.g.
+// "123|chr1" -> 123.  Parsing stops at the first non-digit character; a
+// name with no leading digits yields 0.  No overflow check is performed.
+// (std::string is qualified so the header does not depend on a
+// 'using namespace std' from another header.)
+inline
+uint64_t extractIDFromRefName(const std::string& refName) {
+    uint64_t id = 0;
+    for (size_t ni = 0; ni < refName.length(); ni++) {
+        if (refName[ni] < '0' || refName[ni] > '9')
+            break;
+
+        id *= 10;
+        id += (refName[ni] - '0');
+    }
+    return id;
+}
+
+// Converts a numeric value to std::string (equivalent of C++11
+// std::to_string for streamable types).
+// NOTE(review): in a translation unit that also has 'using namespace std'
+// and compiles as C++11 or later, unqualified calls may be ambiguous with
+// std::to_string — confirm call sites.
+template <typename T>
+std::string to_string(T value) {
+	std::ostringstream ss;   // qualified: no reliance on 'using namespace std'
+	ss << value;
+	return ss.str();
+}
+
+/**
+ * Look up 'query' in 'my_map'.  Returns a copy of the mapped value when
+ * present, otherwise returns 'default_value'.  The map is never modified.
+ */
+template<typename K,typename V>
+inline
+V find_or_use_default(const std::map<K, V>& my_map, const K& query, const V default_value) {
+	typename std::map<K,V>::const_iterator hit = my_map.find(query);
+	return (hit == my_map.end()) ? default_value : hit->second;
+}
+
+#endif /*ifndef UTIL_H_*/
diff --git a/word_io.h b/word_io.h
new file mode 100644
index 0000000..2b1c6c3
--- /dev/null
+++ b/word_io.h
@@ -0,0 +1,376 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef WORD_IO_H_
+#define WORD_IO_H_
+
+#include <stdint.h>
+#include <unistd.h>
+#include <iostream>
+#include <fstream>
+#include "assert_helpers.h"
+#include "endian_swap.h"
+
+/**
+ * Write a 32-bit unsigned to an output stream, re-endianizing first when
+ * the requested endianness differs from the host's.
+ */
+static inline void writeU32(std::ostream& out, uint32_t x, bool toBigEndian) {
+	const uint32_t ordered = endianizeU32(x, toBigEndian);
+	out.write(reinterpret_cast<const char*>(&ordered), sizeof(ordered));
+}
+
+/**
+ * Write a 32-bit unsigned to an output stream in host byte order.
+ */
+static inline void writeU32(std::ostream& out, uint32_t x) {
+	out.write(reinterpret_cast<const char*>(&x), sizeof(x));
+}
+
+/**
+ * Write a 32-bit signed int to an output stream, re-endianizing first when
+ * the requested endianness differs from the host's.
+ */
+static inline void writeI32(std::ostream& out, int32_t x, bool toBigEndian) {
+	const int32_t ordered = endianizeI32(x, toBigEndian);
+	out.write(reinterpret_cast<const char*>(&ordered), sizeof(ordered));
+}
+
+/**
+ * Write a 32-bit signed int to an output stream in host byte order.
+ * (Doc fixed: previous comment said "unsigned".)
+ */
+static inline void writeI32(std::ostream& out, int32_t x) {
+	out.write(reinterpret_cast<const char*>(&x), sizeof(x));
+}
+
+/**
+ * Write a 16-bit unsigned to an output stream, re-endianizing first when
+ * the requested endianness differs from the host's.
+ */
+static inline void writeU16(std::ostream& out, uint16_t x, bool toBigEndian) {
+	const uint16_t ordered = endianizeU16(x, toBigEndian);
+	out.write(reinterpret_cast<const char*>(&ordered), sizeof(ordered));
+}
+
+/**
+ * Write a 16-bit unsigned to an output stream in host byte order.
+ */
+static inline void writeU16(std::ostream& out, uint16_t x) {
+	out.write(reinterpret_cast<const char*>(&x), sizeof(x));
+}
+
+/**
+ * Write a 16-bit signed int to an output stream, re-endianizing first when
+ * the requested endianness differs from the host's.
+ */
+static inline void writeI16(std::ostream& out, int16_t x, bool toBigEndian) {
+	const int16_t ordered = endianizeI16(x, toBigEndian);
+	out.write(reinterpret_cast<const char*>(&ordered), sizeof(ordered));
+}
+
+/**
+ * Write a 16-bit signed int to an output stream in host byte order.
+ * (Doc fixed: previous comment said "unsigned".)
+ */
+static inline void writeI16(std::ostream& out, int16_t x) {
+	out.write(reinterpret_cast<const char*>(&x), sizeof(x));
+}
+
+/**
+ * Read a 32-bit unsigned from an input stream, byte-swapping the result
+ * when 'swap' is true.
+ */
+static inline uint32_t readU32(std::istream& in, bool swap) {
+	uint32_t x;
+	in.read(reinterpret_cast<char*>(&x), sizeof(x));
+	assert_eq(4, in.gcount());
+	return swap ? endianSwapU32(x) : x;
+}
+
+/**
+ * Read a 32-bit unsigned from a file descriptor, byte-swapping the result
+ * when 'swap' is true.  A short read only triggers an assert (debug builds).
+ */
+#ifdef BOWTIE_MM
+static inline uint32_t readU32(int in, bool swap) {
+	uint32_t x;
+	const ssize_t got = read(in, reinterpret_cast<void*>(&x), 4);
+	if(got != 4) {
+		assert(false);
+	}
+	return swap ? endianSwapU32(x) : x;
+}
+#endif
+
+/**
+ * Read a 32-bit unsigned from a FILE*, byte-swapping the result when
+ * 'swap' is true.  A short read only triggers an assert (debug builds).
+ */
+static inline uint32_t readU32(FILE* in, bool swap) {
+	uint32_t x;
+	const size_t got = fread(reinterpret_cast<void*>(&x), 1, 4, in);
+	if(got != 4) {
+		assert(false);
+	}
+	return swap ? endianSwapU32(x) : x;
+}
+
+
+/**
+ * Read a 32-bit signed from an input stream, byte-swapping the result
+ * when 'swap' is true.
+ */
+static inline int32_t readI32(std::istream& in, bool swap) {
+	int32_t x;
+	in.read(reinterpret_cast<char*>(&x), sizeof(x));
+	assert_eq(4, in.gcount());
+	return swap ? endianSwapI32(x) : x;
+}
+
+/**
+ * Read a 32-bit signed from a file descriptor, optionally inverting
+ * endianness.
+ * (Fix: return type was uint32_t although an int32_t is read and the
+ * sibling istream overload returns int32_t; comment said "unsigned".)
+ */
+#ifdef BOWTIE_MM
+static inline int32_t readI32(int in, bool swap) {
+	int32_t x;
+	if(read(in, (void *)&x, 4) != 4) {
+		assert(false);
+	}
+	if(swap) {
+		return endianSwapI32(x);
+	} else {
+		return x;
+	}
+}
+#endif
+
+/**
+ * Read a 32-bit signed from a FILE*, optionally inverting endianness.
+ * (Fix: return type was uint32_t although an int32_t is read and the
+ * sibling istream overload returns int32_t; comment said "unsigned".)
+ */
+static inline int32_t readI32(FILE* in, bool swap) {
+	int32_t x;
+	if(fread((void *)&x, 1, 4, in) != 4) {
+		assert(false);
+	}
+	if(swap) {
+		return endianSwapI32(x);
+	} else {
+		return x;
+	}
+}
+
+
+/**
+ * Read a 16-bit unsigned from an input stream, byte-swapping the result
+ * when 'swap' is true.
+ */
+static inline uint16_t readU16(std::istream& in, bool swap) {
+	uint16_t x;
+	in.read(reinterpret_cast<char*>(&x), sizeof(x));
+	assert_eq(2, in.gcount());
+	return swap ? endianSwapU16(x) : x;
+}
+
+/**
+ * Read a 16-bit unsigned from a file descriptor, byte-swapping the result
+ * when 'swap' is true.  A short read only triggers an assert (debug builds).
+ */
+#ifdef BOWTIE_MM
+static inline uint16_t readU16(int in, bool swap) {
+	uint16_t x;
+	const ssize_t got = read(in, reinterpret_cast<void*>(&x), 2);
+	if(got != 2) {
+		assert(false);
+	}
+	return swap ? endianSwapU16(x) : x;
+}
+#endif
+
+/**
+ * Read a 16-bit unsigned from a FILE*, optionally inverting
+ * endianness.
+ */
+static inline uint16_t readU16(FILE* in, bool swap) {
+	uint16_t x;
+	if(fread((void *)&x, 1, 2, in) != 2) {
+		assert(false);
+	}
+	if(swap) {
+		// BUGFIX: was endianSwapU32(x), which swaps the value as a 32-bit
+		// word and then truncates the (now high-byte) result back to 16
+		// bits, yielding 0 for any input.  Swap as 16-bit, like the other
+		// readU16 overloads.
+		return endianSwapU16(x);
+	} else {
+		return x;
+	}
+}
+
+
+/**
+ * Read a 16-bit signed from an input stream, inverting endianness
+ * if necessary.
+ * (Fix: return type was int32_t; return the int16_t actually read, matching
+ * the other readI16 overloads.  Widening by callers is value-preserving.)
+ */
+static inline int16_t readI16(std::istream& in, bool swap) {
+	int16_t x;
+	in.read((char *)&x, 2);
+	assert_eq(2, in.gcount());
+	if(swap) {
+		return endianSwapI16(x);
+	} else {
+		return x;
+	}
+}
+
+/**
+ * Read a 16-bit signed from a file descriptor, optionally inverting
+ * endianness.
+ * (Fix: return type was uint16_t although an int16_t is read — a negative
+ * value would lose its sign when widened by a caller; comment said
+ * "unsigned".)
+ */
+#ifdef BOWTIE_MM
+static inline int16_t readI16(int in, bool swap) {
+	int16_t x;
+	if(read(in, (void *)&x, 2) != 2) {
+		assert(false);
+	}
+	if(swap) {
+		return endianSwapI16(x);
+	} else {
+		return x;
+	}
+}
+#endif
+
+/**
+ * Read a 16-bit signed from a FILE*, optionally inverting endianness.
+ * (Fix: return type was uint16_t although an int16_t is read — a negative
+ * value would lose its sign when widened by a caller; comment said
+ * "unsigned".)
+ */
+static inline int16_t readI16(FILE* in, bool swap) {
+	int16_t x;
+	if(fread((void *)&x, 1, 2, in) != 2) {
+		assert(false);
+	}
+	if(swap) {
+		return endianSwapI16(x);
+	} else {
+		return x;
+	}
+}
+
+/**
+ * Write an index_t-sized word to an output stream, re-endianizing first
+ * when the requested endianness differs from the host's.
+ */
+template <typename index_t>
+void writeIndex(std::ostream& out, index_t x, bool toBigEndian) {
+	index_t y = endianizeIndex(x, toBigEndian);
+	out.write((const char*)&y, sizeof(index_t));
+}
+
+/**
+ * Read an index_t-sized unsigned word from an input stream, byte-swapping
+ * the result when 'swap' is true.
+ */
+template <typename index_t>
+static inline index_t readIndex(std::istream& in, bool swap) {
+	index_t x;
+	in.read(reinterpret_cast<char *>(&x), sizeof(index_t));
+	assert_eq(sizeof(index_t), in.gcount());
+	return swap ? endianSwapIndex(x) : x;
+}
+
+/**
+ * Read an index_t-sized unsigned word from a file descriptor, optionally
+ * inverting endianness.
+ * (Fix: previous hand-rolled sizeof switch asserted on 8-byte index_t;
+ * delegate to endianSwapIndex, exactly as the istream overload does, so
+ * all index widths are handled consistently.)
+ */
+#ifdef BOWTIE_MM
+template <typename index_t>
+static inline index_t readIndex(int in, bool swap) {
+	index_t x;
+	if(read(in, (void *)&x, sizeof(index_t)) != sizeof(index_t)) {
+		assert(false);
+	}
+	if(swap) {
+		return endianSwapIndex(x);
+	} else {
+		return x;
+	}
+}
+#endif
+
+/**
+ * Read an index_t-sized unsigned word from a FILE*, optionally inverting
+ * endianness.
+ * (Fix: previous hand-rolled sizeof switch asserted on 8-byte index_t;
+ * delegate to endianSwapIndex, exactly as the istream overload does, so
+ * all index widths are handled consistently.)
+ */
+template <typename index_t>
+static inline index_t readIndex(FILE* in, bool swap) {
+	index_t x;
+	if(fread((void *)&x, 1, sizeof(index_t), in) != sizeof(index_t)) {
+		assert(false);
+	}
+	if(swap) {
+		return endianSwapIndex(x);
+	} else {
+		return x;
+	}
+}
+
+
+#endif /*WORD_IO_H_*/
diff --git a/zbox.h b/zbox.h
new file mode 100644
index 0000000..6ef1456
--- /dev/null
+++ b/zbox.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright 2011, Ben Langmead <langmea at cs.jhu.edu>
+ *
+ * This file is part of Bowtie 2.
+ *
+ * Bowtie 2 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Bowtie 2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef ZBOX_H_
+#define ZBOX_H_
+
+#include "btypes.h"
+
+/**
+ * Fill z with Z-box information for s.  String z will not be resized
+ * and will only be filled up to its size cap.  This is the linear-time
+ * algorithm from Gusfield.  An optional sanity-check uses a naive
+ * algorithm to double-check results.
+ *
+ * z[k] receives the length of the longest substring of s starting at
+ * off+k that matches a prefix of s+off.  z[0] must be 0 on entry (and,
+ * for the sanity check, entries should be pre-zeroed by the caller —
+ * TODO confirm the caller's contract).  'verbose' is currently unused.
+ */
+template<typename T>
+void calcZ(const T& s,
+           TIndexOffU off,
+           EList<TIndexOffU>& z,
+           bool verbose = false,
+           bool sanityCheck = false)
+{
+	// [lCur, rCur] is the rightmost prefix-matching interval (Z-box)
+	// discovered so far.
+	size_t lCur = 0, rCur = 0;
+	size_t zlen = z.size();
+	size_t slen = s.length();
+	assert_gt(zlen, 0);
+	assert_eq(z[0], 0);
+	//assert_leq(zlen, slen);
+	for (size_t k = 1; k < zlen && k+off < slen; k++) {
+		assert_lt(lCur, k);
+		assert(z[lCur] == 0 || z[lCur] == rCur - lCur + 1);
+		if(k > rCur) {
+			// Case 1: k lies outside the rightmost Z-box, so no previous
+			// information applies; match against the prefix explicitly.
+			// compare starting at k with prefix starting at 0
+			size_t ki = k;
+			while(off+ki < s.length() && s[off+ki] == s[off+ki-k]) ki++;
+			z[k] = (TIndexOffU)(ki - k);
+			assert_lt(off+z[k], slen);
+			if(z[k] > 0) {
+				// Non-empty match: this becomes the new rightmost Z-box.
+				lCur = k;
+				rCur = k + z[k] - 1;
+			}
+		} else {
+			// position k is contained in a Z-box
+			size_t betaLen = rCur - k + 1; // chars remaining inside the box
+			size_t kPrime = k - lCur;      // mirror position in the prefix
+			assert_eq(s[off+k], s[off+kPrime]);
+			if(z[kPrime] < betaLen) {
+				// Case 2a: the mirror's Z-value ends strictly inside the
+				// box, so it transfers to k unchanged.
+				z[k] = z[kPrime];
+				assert_lt(off+z[k], slen);
+				// lCur, rCur unchanged
+			} else if (z[kPrime] > 0) {
+				// Case 2b: the match may extend past the box; extend by
+				// explicit comparison beyond rCur.
+				// NOTE(review): q is an int added to size_t offsets; fine
+				// while extensions stay below INT_MAX — confirm for very
+				// long references.
+				int q = 0;
+				while (off+q+rCur+1 < s.length() && s[off+q+rCur+1] == s[off+betaLen+q]) q++;
+				z[k] = (TIndexOffU)(betaLen + q);
+				assert_lt(off+z[k], slen);
+				rCur = rCur + q;
+				assert_geq(k, lCur);
+				lCur = k;
+			} else {
+				// Defensive branch: with k <= rCur we have betaLen >= 1, so
+				// z[kPrime] >= betaLen and z[kPrime] == 0 cannot both hold;
+				// appears unreachable but is kept as a safeguard.
+				z[k] = 0;
+				assert_lt(off+z[k], slen);
+				// lCur, rCur unchanged
+			}
+		}
+	}
+#ifndef NDEBUG
+	if(sanityCheck) {
+		// Recalculate Z-boxes using naive quadratic-time algorithm and
+		// compare to linear-time result
+		assert_eq(0, z[0]);
+		for(size_t i = 1; i < z.size(); i++) {
+			size_t j;
+			for(j = i; off+j < s.length(); j++) {
+				if(s[off+j] != s[off+j-i]) break;
+			}
+			assert_eq(j-i, z[i]);
+		}
+	}
+#endif
+}
+
+#endif /*ZBOX_H_*/

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/centrifuge.git



More information about the debian-med-commit mailing list