[med-svn] [rapmap] 01/02: Imported Upstream version 0.3.0+dfsg

Tue Sep 6 19:10:19 UTC 2016

This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository rapmap.

commit b8d17258a110f3a4727ad3afcd039bacbe26d1a0
Author: Andreas Tille <tille at debian.org>
Date:   Tue Sep 6 19:43:55 2016 +0200

    Imported Upstream version 0.3.0+dfsg
---
 .clang-format                                  |   94 +
 .gitignore                                     |   28 +
 CMakeLists.txt                                 |  297 +++
 README.md                                      |   53 +
 TestingScripts/python/ComputeSyntheticStats.py |  100 +
 cmake/Modules/FindCereal.cmake                 |   22 +
 cmake/Modules/FindJellyfish.cmake              |   26 +
 cmake/Modules/FindJemalloc.cmake               |   46 +
 cmake/Modules/FindSSE.cmake                    |  141 ++
 cmake/Modules/FindTcmalloc.cmake               |   47 +
 include/BooMap.hpp                             |  193 ++
 include/BooPHF.hpp                             | 1221 +++++++++
 include/Const.hpp                              |   36 +
 include/EnumCoder.hpp                          |   53 +
 include/HitManager.hpp                         |  109 +
 include/IndexHeader.hpp                        |   77 +
 include/JFRaw.hpp                              |   30 +
 include/PairAlignmentFormatter.hpp             |   29 +
 include/PairSequenceParser.hpp                 |  193 ++
 include/RapMapConfig.hpp                       |   14 +
 include/RapMapFileSystem.hpp                   |   15 +
 include/RapMapIndex.hpp                        |   52 +
 include/RapMapSAIndex.hpp                      |   63 +
 include/RapMapUtils.hpp                        |  811 ++++++
 include/SACollector.hpp                        |  580 +++++
 include/SASearcher.hpp                         |  631 +++++
 include/ScopedTimer.hpp                        |   22 +
 include/SingleAlignmentFormatter.hpp           |   22 +
 include/SpinLock.hpp                           |   25 +
 include/Type.hpp                               |   30 +
 include/Util.hpp                               |   69 +
 include/bar.h                                  |  148 ++
 include/bit_array.h                            |  552 +++++
 include/bit_macros.h                           |  205 ++
 include/btree/btree.h                          | 2394 ++++++++++++++++++
 include/btree/btree_container.h                |  349 +++
 include/btree/btree_map.h                      |  130 +
 include/kseq.h                                 |  235 ++
 include/macros.h                               |   59 +
 include/rank9b.h                               |   42 +
 include/stringpiece.h                          |  181 ++
 include/xxhash.h                               |  192 ++
 scripts/compile.sh                             |   78 +
 scripts/make-release.sh                        |   59 +
 src/CMakeLists.txt                             |  143 ++
 src/EnumCoder.cpp                              |  264 ++
 src/EnumCoderTest.cpp                          |   46 +
 src/HitManager.cpp                             |  700 ++++++
 src/RapMap.cpp                                 |   74 +
 src/RapMapFileSystem.cpp                       |   37 +
 src/RapMapIndex.cpp                            |  139 ++
 src/RapMapIndexer.cpp                          |  765 ++++++
 src/RapMapMapper.cpp                           | 1427 +++++++++++
 src/RapMapSAIndex.cpp                          |  177 ++
 src/RapMapSAIndexer.cpp                        |  731 ++++++
 src/RapMapSAMapper.cpp                         |  638 +++++
 src/RapMapUtils.cpp                            |  562 +++++
 src/UtilTest.cpp                               |   55 +
 src/bit_array.c                                | 3160 ++++++++++++++++++++++++
 src/rank9b.cpp                                 |   67 +
 src/stringpiece.cc                             |   90 +
 src/xxhash.c                                   |  915 +++++++
 62 files changed, 19743 insertions(+)

diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..df7597a
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,94 @@
+---
+Language:        Cpp
+# BasedOnStyle:  LLVM
+AccessModifierOffset: -2
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlinesLeft: false
+AlignOperands:   true
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: false
+BinPackArguments: true
+BinPackParameters: true
+BraceWrapping:   
+  AfterClass:      false
+  AfterControlStatement: false
+  AfterEnum:       false
+  AfterFunction:   false
+  AfterNamespace:  false
+  AfterObjCDeclaration: false
+  AfterStruct:     false
+  AfterUnion:      false
+  BeforeCatch:     false
+  BeforeElse:      false
+  IndentBraces:    false
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Attach
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: true
+ColumnLimit:     80
+CommentPragmas:  '^ IWYU pragma:'
+ConstructorInitializerAllOnOneLineOrOnePerLine: false
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false
+DisableFormat:   false
+ExperimentalAutoDetectBinPacking: false
+ForEachMacros:   [ foreach, Q_FOREACH, BOOST_FOREACH ]
+IncludeCategories: 
+  - Regex:           '^"(llvm|llvm-c|clang|clang-c)/'
+    Priority:        2
+  - Regex:           '^(<|"(gtest|isl|json)/)'
+    Priority:        3
+  - Regex:           '.*'
+    Priority:        1
+IncludeIsMainRegex: '$'
+IndentCaseLabels: false
+IndentWidth:     2
+IndentWrappedFunctionNames: false
+KeepEmptyLinesAtTheStartOfBlocks: true
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBlockIndentWidth: 2
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: true
+PenaltyBreakBeforeFirstCallParameter: 19
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 60
+PointerAlignment: Left 
+ReflowComments:  true
+SortIncludes:    true
+SpaceAfterCStyleCast: false
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeParens: ControlStatements
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles:  false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard:        Cpp11
+TabWidth:        4
+UseTab:          Never
+JavaScriptQuotes: Leave
+...
+
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b8bd026
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,28 @@
+# Compiled Object files
+*.slo
+*.lo
+*.o
+*.obj
+
+# Precompiled Headers
+*.gch
+*.pch
+
+# Compiled Dynamic libraries
+*.so
+*.dylib
+*.dll
+
+# Fortran module files
+*.mod
+
+# Compiled Static libraries
+*.lai
+*.la
+*.a
+*.lib
+
+# Executables
+*.exe
+*.out
+*.app
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100755
index 0000000..17487b8
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,297 @@
+cmake_minimum_required (VERSION 2.8)
+
+enable_testing()
+
+project (RapMap)
+
+set(CPACK_PACKAGE_VERSION "0.3.0")
+SET(CPACK_PACKAGE_VERSION_MAJOR "0")
+set(CPACK_PACKAGE_VERSION_MINOR "3")
+set(CPACK_PACKAGE_VERSION_PATCH "0")
+set(CPACK_GENERATOR "TGZ")
+set(CPACK_SOURCE_GENERATOR "TGZ")
+set(CPACK_PACKAGE_VENDOR "Stony Brook University")
+set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "RapMap - Wicked-fast quasi/pseudo/lightweight alignment")
+set(CPACK_PACKAGE_NAME
+  "${CMAKE_PROJECT_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}")
+set(CPACK_SOURCE_PACKAGE_FILE_NAME
+  "${CMAKE_PROJECT_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}-Source")
+
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
+#include(FindSSE)
+#FindSSE ()
+#if(SSE4_2_FOUND)
+#    message("Enabling popcount")
+#    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2 -DEMPHF_USE_POPCOUNT")
+#endif(SSE4_2_FOUND)
+
+set (WARNING_IGNORE_FLAGS "-Wno-deprecated-register -Wno-c++11-narrowing -Wno-unknown-pragmas")
+set (BOOST_CXX_FLAGS "-Wno-deprecated-register -std=c++11")
+## Prefer static to dynamic libraries
+SET(CMAKE_FIND_LIBRARY_SUFFIXES .a ${CMAKE_FIND_LIBRARY_SUFFIXES})
+
+## Set the standard required compile flags
+if (NO_NATIVE_ARCH)
+  message (STATUS "DISABLING NATIVE ARCH.")
+else()
+  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
+endif()
+
+set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -funroll-loops -fPIC -fomit-frame-pointer -O4 -DHAVE_ANSI_TERM -Wall -std=c++11 -Wreturn-type -Werror=return-type")
+
+##
+# OSX is strange (some might say, stupid in this regard).  Deal with its quirkiness here.
+##
+if (APPLE)
+    # To allow ourselves to build a dynamic library, we have to tell the compiler
+    # that, yes, the symbols will be around at runtime.
+    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -undefined dynamic_lookup")
+    set (LIBSALMON_LINKER_FLAGS "-all_load")
+    # In order to "think different", we also have to use non-standard suffixes
+    # for our shared libraries
+    set(SHARED_LIB_EXTENSION "dylib")
+else()
+    # We're in sane linux world
+   set (SHARED_LIB_EXTENSION "so")
+   set (LIBSALMON_LINKER_FLAGS "")
+endif()
+
+set( BOOST_EXTRA_FLAGS "--layout=tagged" )
+## this gets set differently below if we
+## are on clang & apple
+set (NON_APPLECLANG_LIBS gomp rt)
+set (PTHREAD_LIB)
+
+##
+# Compiler-specific C++11 activation.
+# http://stackoverflow.com/questions/10984442/how-to-detect-c11-support-of-a-compiler-with-cmake
+##
+##
+# First take care of what to do if we have gcc
+##
+if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
+    execute_process(
+        COMMAND ${CMAKE_CXX_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
+    # If we're on OSX
+    if (APPLE AND NOT (GCC_VERSION VERSION_GREATER 4.8.2 OR GCC_VERSION VERSION_EQUAL 4.8.2))
+        message(FATAL_ERROR "When building under OSX, ${PROJECT_NAME} requires "
+                            "either clang or g++ >= 4.8.2")
+    elseif (NOT (GCC_VERSION VERSION_GREATER 4.7 OR GCC_VERSION VERSION_EQUAL 4.7))
+        message(FATAL_ERROR "${PROJECT_NAME} requires g++ 4.7 or greater.")
+    endif ()
+
+    set (GCC TRUE)
+
+    # Put complete static linking on hold for the time-being
+    # If we're not on OSX, make an attempt to compile everything statically
+    #if (NOT APPLE)
+    #set (CMAKE_CXX_FLAGS "-static ${CMAKE_CXX_FLAGS}")
+    #set (CMAKE_EXE_LINK_FLAGS "-static")
+    set (PTHREAD_LIB "pthread")
+    #endif()
+
+    # If we're on Linux (i.e. not OSX) and we're using
+    # gcc, then set the -static-libstdc++ flag
+    if (NOT APPLE)
+        set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -static-libstdc++")
+    endif()
+
+    set (WARNING_IGNORE_FLAGS "${WARNING_IGNORE_FLAGS} -Wno-unused-local-typedefs")
+    set (BOOST_TOOLSET "gcc")
+    set (BOOST_CONFIGURE_TOOLSET "--with-toolset=gcc")
+	set (BCXX_FLAGS "-std=c++11")
+    set (BOOST_EXTRA_FLAGS toolset=gcc cxxflags=${BCXX_FLAGS})
+# Tentatively, we support clang now
+elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
+    set(CLANG TRUE)
+    # If we have libc++, then try and use it
+    include(CheckCXXCompilerFlag)
+    check_cxx_compiler_flag(-stdlib=libc++ HAVE_LIBCPP)
+    if (HAVE_LIBCPP)
+        message ("It appears that you're compiling with clang and that libc++ is available, so I'll use that")
+        set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++")
+	    set (BOOST_TOOLSET "clang")
+        set (BOOST_CONFIGURE_TOOLSET "--with-toolset=clang")
+	    set (BCXX_FLAGS "-stdlib=libc++ -DBOOST_HAS_INT128")
+	    set (BOOST_EXTRA_FLAGS toolset=clang cxxflags=${BCXX_FLAGS} linkflags="-stdlib=libc++")
+        set (JELLYFISH_CXX_FLAGS "-stdlib=libc++")
+    # Otherwise, use libstdc++ (and make it static)
+    else()
+        set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -static-libstdc++")
+    endif()
+
+    if (APPLE)
+        set (NON_APPLECLANG_LIBS "")
+    else()
+        set (PTHREAD_LIB "pthread")
+    endif()
+else ()
+    message(FATAL_ERROR "Your C++ compiler does not support C++11.")
+endif ()
+
+include(ExternalProject)
+
+##
+#  Update the CXX flags according to the system and compiler
+##
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${WARNING_IGNORE_FLAGS}")
+
+if (CMAKE_BUILD_TYPE MATCHES Debug)
+    message ("Making Debug build")
+    set (CMAKE_CXX_FLAGS_DEBUG "-g ${CMAKE_CXX_FLAGS}")
+elseif (CMAKE_BUILD_TYPE MATCHES Release)
+    message ("Making Release build")
+    set (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS}")
+else ()
+    message ("Making Default build type")
+endif ()
+
+##
+# Record this top-level path
+##
+set (GAT_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+
+# Have CMake tell us what it's doing
+set (CMAKE_VERBOSE_MAKEFILE true)
+
+find_package (ZLIB)
+if (NOT ZLIB_FOUND)
+	message (FATAL_ERROR "zlib must be installed before configuration & building can proceed")
+endif()
+
+
+message("Build system will build libdivsufsort")
+message("==================================================================")
+include(ExternalProject)
+ExternalProject_Add(libdivsufsort
+    DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
+    URL ${CMAKE_CURRENT_SOURCE_DIR}/external/libdivsufsort.tar.gz
+    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/libdivsufsort-master
+    INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
+    UPDATE_COMMAND sh -c "mkdir -p <SOURCE_DIR>/build"
+    BINARY_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/libdivsufsort-master/build
+    CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR> -DBUILD_DIVSUFSORT64=TRUE -DUSE_OPENMP=TRUE -DBUILD_SHARED_LIBS=FALSE
+)
+set(SUFFARRAY_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/external/install/include)
+
+message("Build system will fetch and build SparseHash")
+message("==================================================================")
+ExternalProject_Add(libsparsehash
+    DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
+    DOWNLOAD_COMMAND curl -k -L https://github.com/COMBINE-lab/sparsehash/archive/sparsehash-2.0.2.tar.gz -o sparsehash-2.0.2.tar.gz &&
+        tar -xzf sparsehash-2.0.2.tar.gz
+    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/sparsehash-sparsehash-2.0.2
+    BUILD_IN_SOURCE TRUE
+    INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
+    CONFIGURE_COMMAND sh -c "CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} ./configure --prefix=<INSTALL_DIR>"
+    INSTALL_COMMAND make install
+)
+
+
+if (NOT CEREAL_ROOT)
+	set(CEREAL_ROOT ${GAT_SOURCE_DIR}/external/install)
+endif()
+
+find_package (Cereal 1.1.2)
+if (NOT CEREAL_FOUND)
+
+	message("Build system will fetch and build the Cereal serialization library")
+	message("==================================================================")
+	include(ExternalProject)
+	ExternalProject_Add(libcereal
+	    DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
+	    DOWNLOAD_COMMAND curl -k -L https://github.com/USCiLab/cereal/archive/v1.1.2.tar.gz -o cereal-v1.1.2.tar.gz &&
+		tar -xzvf cereal-v1.1.2.tar.gz
+	    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/cereal-1.1.2
+	    INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
+	    UPDATE_COMMAND sh -c "mkdir -p <SOURCE_DIR>/build"
+	    BINARY_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/cereal-1.1.2/build
+	    CONFIGURE_COMMAND ""
+	    BUILD_COMMAND ""
+	    INSTALL_COMMAND sh -c "mkdir -p <INSTALL_DIR>/include && cp -r <SOURCE_DIR>/include/cereal <INSTALL_DIR>/include"
+	)
+
+	set(CEREAL_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/external/install/include)
+
+endif()
+
+
+if (NOT JELLYFISH_ROOT)
+	set(JELLYFISH_ROOT ${GAT_SOURCE_DIR}/external/install)
+endif()
+
+find_package(Jellyfish 2.2.5)
+
+if (NOT JELLYFISH_FOUND)
+message("Build system will fetch and build Jellyfish")
+message("==================================================================")
+ExternalProject_Add(libjellyfish
+    DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
+    DOWNLOAD_COMMAND curl -k -L https://github.com/gmarcais/Jellyfish/releases/download/v2.2.5/jellyfish-2.2.5.tar.gz -o jellyfish-2.2.5.tgz &&
+    	rm -fr jellyfish-2.2.5 &&
+     	tar -xzvf jellyfish-2.2.5.tgz
+    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/jellyfish-2.2.5
+    INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
+    CONFIGURE_COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/external/jellyfish-2.2.5/configure --prefix=<INSTALL_DIR> CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} CXXFLAGS=${JELLYFISH_CXX_FLAGS}
+    BUILD_COMMAND ${MAKE} CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} CXXFLAGS=${JELLYFISH_CXX_FLAGS}
+    BUILD_IN_SOURCE 1
+    INSTALL_COMMAND make install
+)
+endif()
+
+set (FAST_MALLOC_LIB "")
+set (HAVE_FAST_MALLOC FALSE)
+
+# Prefer a system Jemalloc; depending on the CMake version the find module sets JEMALLOC_FOUND, Jemalloc_FOUND, or both
+find_package(Jemalloc)
+if (JEMALLOC_FOUND OR Jemalloc_FOUND)
+    message("Found Jemalloc library --- using this memory allocator")
+    set (FAST_MALLOC_LIB ${JEMALLOC_LIBRARIES})
+    set (HAVE_FAST_MALLOC TRUE)
+endif()
+
+if (NOT HAVE_FAST_MALLOC)
+    # No allocator has been selected yet, so fall back to a system TCMalloc if one exists
+    find_package(Tcmalloc)
+    if (Tcmalloc_FOUND)
+        message("Found TCMalloc library --- using this memory allocator")
+        set (TCMALLOC_LIB ${Tcmalloc_LIBRARIES})
+        set (FAST_MALLOC_LIB ${TCMALLOC_LIB})
+        set (HAVE_FAST_MALLOC TRUE)
+    endif()
+endif()
+
+if (NOT HAVE_FAST_MALLOC)
+    message("Build system will fetch and use JEMalloc")
+    message("==================================================================")
+    ExternalProject_Add(libjemalloc
+        DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
+        DOWNLOAD_COMMAND curl -k -L https://github.com/COMBINE-lab/jemalloc/archive/3.6.0.tar.gz -o jemalloc-3.6.0.tar.gz &&
+        tar -xzf jemalloc-3.6.0.tar.gz
+        SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/jemalloc-3.6.0
+        BUILD_IN_SOURCE TRUE
+        INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
+        CONFIGURE_COMMAND sh -c "CC=${CMAKE_C_COMPILER} ./autogen.sh --prefix=<INSTALL_DIR>"
+        INSTALL_COMMAND cp -r lib <INSTALL_DIR>/ && cp -r include <INSTALL_DIR>/
+        )
+
+    set (FAST_MALLOC_LIB ${CMAKE_CURRENT_SOURCE_DIR}/external/install/lib/libjemalloc.a)
+    set (HAVE_FAST_MALLOC TRUE)
+endif ()
+
+###
+#
+# Done building external dependencies.
+#
+###
+
+set (CPACK_SOURCE_IGNORE_FILES
+".git/")
+
+message("CPACK_SOURCE_IGNORE_FILES = ${CPACK_SOURCE_IGNORE_FILES}")
+
+# Recurse into the RapMap source directory
+add_subdirectory ( src )
+
+# build a CPack driven installer package
+include (CPack)
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..473d5f8
--- /dev/null
+++ b/README.md
@@ -0,0 +1,53 @@
+# What is RapMap?
+
+RapMap is a testing ground for ideas in quasi-mapping / (lightweight / pseudo) transcriptome alignment.  That means that, at this point, it is somewhat experimental.  The `develop` branch will have the latest improvements and additions, but is not guaranteed to be stable between commits.  Breaking changes to the master branch will be accompanied by a tag to the version before the breaking change.  Currently, RapMap is a stand-alone quasi-mapper that can be used with other tools.  It is a [...]
+
+Quasi-mapping / (lightweight / pseudo)-alignment is the term we are using here for the type of information required for certain tasks (e.g. 
+transcript quantification) that is less "heavyweight" than what is provided by traditional alignment. For example, one may
+only need to know the transcripts / contigs to which a read aligns and, perhaps, the position within those transcripts rather
+than the optimal alignment and base-for-base `CIGAR` string that aligns the read and substring of the transcript.  For details on RapMap (quasi-mapping in particular), please check out the [associated paper](http://bioinformatics.oxfordjournals.org/content/32/12/i192.full.pdf). Note: RapMap implements both quasi-mapping and pseudo-alignment (originally introduced in [Bray et al. 2016](http://www.nature.com/nbt/journal/v34/n5/full/nbt.3519.html)), these two are not the same thing. They ar [...]
+
+There are a number of different ways to collect such information, and the idea of RapMap (as the repository grows) will be to explore multiple different strategies in how to most rapidly determine all *feasible* / *compatible* locations for a read within the transcriptome.  In this sense, it is like an *all-mapper*; the mappings it outputs are intended to be (eventually) disambiguated (*Really, it's more like an "all-best" mapper, since it returns all hits in the top "stratum" of quasi-m [...]
+
+# Building RapMap
+
+To build RapMap, you need a C++11 compliant compiler (g++ >= 4.7 and clang >= 3.4) and CMake.  RapMap is built with the following steps (assuming that `path_to_rapmap` is the toplevel directory where you have cloned this repository):
+
+```
+[path_to_rapmap] > mkdir build && cd build
+[path_to_rapmap/build] > cmake ..
+[path_to_rapmap/build] > make
+[path_to_rapmap/build] > make install
+[path_to_rapmap/build] > cd ../bin
+[path_to_rapmap/bin] > ./rapmap -h
+```
+This should output the standard help message for rapmap.
+
+# Can I use RapMap for genomic alignment?
+
+No, at least not right now.  The index and mapping strategy employed by RapMap are highly geared toward mapping to transcriptomes.  It may be the case that some of these ideas can be successfully applied to genomic alignment, but 
+this functionality is not currently supported (and is not a high priority right now).
+
+# How fast is RapMap?
+
+Speed is relative, but we think it's very fast: On a synthetic test dataset comprised of 75 million 76bp paired-end reads, mapping to a human transcriptome with ~213,000 transcripts, RapMap takes ~ 10 minutes to align all of the reads *on a single core* (on an Intel Xeon E5-2690 @ 3.00 GHz) --- if you actually want to write out the alignments --- it depends on your disk speed, but for us it's ~15 minutes. Again, these mapping times are *on a single core* --- but RapMap is trivially parall [...]
+
+# OK, that's fast, but is it accurate?
+
+Yes; quasi-mapping seems to provide accurate mapping results. In the above mentioned synthetic dataset (generated *with* sequencing errors), the true location of origin of the read appears in the hits returned by RapMap > 97% of the time. For more details, please refer to [the paper](http://bioinformatics.oxfordjournals.org/content/32/12/i192.full.pdf).
+
+# Caveats
+
+RapMap is experimental, and the code, at this point, is subject to me testing out new ideas (see the description above about the master vs. develop branch). This also means that limited effort has been put into size or speed optimization.  There are numerous ways that the code can be sped up and the memory footprint reduced, but that hasn't been the focus yet --- it will be eventually.  All of this being said --- RapMap is open to the community because I'd like feedback / help / thoughts. [...]
+
+# External dependencies
+
+[tclap](http://tclap.sourceforge.net/)
+
+[cereal](https://github.com/USCiLab/cereal)
+
+[jellyfish](https://github.com/gmarcais/Jellyfish)
+
+# License 
+
+Since RapMap uses Jellyfish, it must be released under the GPL.  However, this is currently the only GPL dependency.  If it can be replaced, I'd like to re-license RapMap under the BSD license.  I'd be happy to accept pull-requests that replace the Jellyfish components with a library released under a more liberal license (BSD-compatible), but note that I will *not* accept such pull requests if they reduce the speed or increase the memory consumption over the Jellyfish-based version.
diff --git a/TestingScripts/python/ComputeSyntheticStats.py b/TestingScripts/python/ComputeSyntheticStats.py
new file mode 100644
index 0000000..ad38694
--- /dev/null
+++ b/TestingScripts/python/ComputeSyntheticStats.py
@@ -0,0 +1,100 @@
+from __future__ import print_function
+import argparse
+
+def main(args):
+    """Score synthetic-read alignments against each read's true transcript; print stats to stderr."""
+    import pysam
+    import sys
+
+    printFP = args.printFalsePositives
+    alnFile = pysam.AlignmentFile(args.input, 'r')
+    totalNumReads = args.totalNumReads
+    currQueryName = ''
+    skipToNextAlignment = False
+    readsSeen = 0
+
+    truePos = 0
+    falsePos = 0
+    foundTrueAlignment = False
+    ## True Neg are somewhat ill-defined in our context
+    prevRec = None
+    useNHTag = args.useNHTag
+    NHTagSum = 0
+    nhData = []
+    for rec in alnFile:
+        if rec.is_unmapped:
+            continue
+        qname = rec.qname[:-2]
+        # A new read name starts a new read: bump the counter and settle the
+        # previous read (if any) as a false positive when it had no true hit
+        if qname != currQueryName:
+            readsSeen += 1
+            if useNHTag:
+                tagVal = rec.get_tag('NH')
+                NHTagSum += tagVal
+                nhData.append(tagVal)
+            if readsSeen % 1000000 == 0:
+                print("\r\rSaw {} reads --- thp = {:.2%}".format(readsSeen, \
+                       float(truePos) / readsSeen), file=sys.stderr, end='')
+
+            currQueryName = qname
+            if prevRec is not None and not foundTrueAlignment:
+                falsePos += 1
+                if (printFP):
+                    print(prevRec)
+            skipToNextAlignment = False
+            foundTrueAlignment = False
+
+        prevRec = rec
+        # If we already found the true hit
+        # for this read, don't bother processing
+        # this record
+        if not skipToNextAlignment:
+            # The true transcript name is the third ':'-separated field of the
+            # read name; compare it with the reference this record aligned to
+            trueTxpName = qname.split(':')[2]
+            alignedTxpName = alnFile.getrname(rec.rname)
+            if (trueTxpName == alignedTxpName):
+                truePos += 1
+                skipToNextAlignment = True
+                foundTrueAlignment = True
+
+    # The last read is never followed by a "new read" record, so settle it here.
+    if prevRec is not None and not foundTrueAlignment:
+        falsePos += 1
+        if printFP:
+            print(prevRec)
+    falseNeg = totalNumReads - readsSeen
+    print('\n'.join(["Total Reads = {}" ,
+          "Reads Aligned  = {}" ,
+          "True Pos = {}" ,
+          "False Pos = {}" ,
+          "False Neg = {}" ,
+          "Precision = {:.2%}",
+          "Recall = {:.2%}" ,
+          "TPH = {:.2%}",
+          "FDR = {:.2%}",
+          "F1 Score = {:.2%}"]).format(totalNumReads, readsSeen, truePos, falsePos,
+                                  falseNeg, truePos / float(truePos + falsePos),
+                                  truePos / float(truePos + falseNeg),
+                                  truePos / float(readsSeen),
+                                  falsePos / float(falsePos + truePos),
+                                  (2*truePos) / float(2*truePos + falsePos + falseNeg)),
+          file=sys.stderr)
+
+    if useNHTag:
+        print("Average hits-per-read = {:.2}".format(
+              NHTagSum / float(totalNumReads)), file=sys.stderr)
+        if args.nhFreqFile:
+            import pickle
+            pickle.dump(nhData, args.nhFreqFile)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Compute statistics from synthetic SAM file.')
+    parser.add_argument('--input', type=str)
+    parser.add_argument('--totalNumReads', type=int)
+    parser.add_argument('--useNHTag', action='store_true')
+    parser.add_argument('--nhFreqFile', nargs='?', type=argparse.FileType('wb'))  # binary mode: main() writes it with pickle.dump
+    parser.add_argument('--printFalsePositives', action='store_true')
+    args = parser.parse_args()
+    main(args)
diff --git a/cmake/Modules/FindCereal.cmake b/cmake/Modules/FindCereal.cmake
new file mode 100644
index 0000000..e340764
--- /dev/null
+++ b/cmake/Modules/FindCereal.cmake
@@ -0,0 +1,22 @@
+###############################################################################
+# Find Cereal
+#
+# This sets the following variables:
+# CEREAL_FOUND - True if Cereal was found.
+# CEREAL_INCLUDE_DIRS - Directories containing the Cereal include files.
+# CEREAL_DEFINITIONS - Compiler flags for Cereal (documented, but never set by this module).
+
+find_path(CEREAL_INCLUDE_DIR cereal
+	HINTS "${CEREAL_ROOT}/include" "$ENV{CEREAL_ROOT}/include" "/usr/include" "$ENV{PROGRAMFILES}/cereal/include")
+
+set(CEREAL_INCLUDE_DIRS ${CEREAL_INCLUDE_DIR})
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(Cereal DEFAULT_MSG CEREAL_INCLUDE_DIR)
+
+mark_as_advanced(CEREAL_INCLUDE_DIR)
+
+if(CEREAL_FOUND)
+  message(STATUS "Cereal found (include: ${CEREAL_INCLUDE_DIRS})")
+endif(CEREAL_FOUND)
+
diff --git a/cmake/Modules/FindJellyfish.cmake b/cmake/Modules/FindJellyfish.cmake
new file mode 100644
index 0000000..e12b30a
--- /dev/null
+++ b/cmake/Modules/FindJellyfish.cmake
@@ -0,0 +1,26 @@
+###############################################################################
+# Find Jellyfish 
+#
+# This sets the following variables:
+# JELLYFISH_FOUND - True if Jellyfish was found.
+# JELLYFISH_INCLUDE_DIRS - Directories containing the Jellyfish include files.
+# JELLYFISH_DEFINITIONS - Compiler flags for Jellyfish (documented, but never set by this module).
+
+find_path(JELLYFISH_INCLUDE_DIR jellyfish
+	HINTS "${JELLYFISH_ROOT}/include" "$ENV{JELLYFISH_ROOT}/include" "/usr/include" "$ENV{PROGRAMFILES}/jellyfish/include")
+
+set(JELLYFISH_INCLUDE_DIRS ${JELLYFISH_INCLUDE_DIR})
+
+include(FindPackageHandleStandardArgs)
+#message("Required Jellyfish version ${Jellyfish_FIND_VERSION}")
+find_package_handle_standard_args(Jellyfish 
+                                 DEFAULT_MSG 
+                                 FOUND_VAR JELLYFISH_FOUND 
+                                  REQUIRED_VARS JELLYFISH_INCLUDE_DIR 
+                                  VERSION_VAR Jellyfish_FOUND_VERSION)
+
+mark_as_advanced(JELLYFISH_INCLUDE_DIR)
+
+if(JELLYFISH_FOUND)
+    message(STATUS "Jellyfish found (include: ${JELLYFISH_INCLUDE_DIRS})")
+endif(JELLYFISH_FOUND)
diff --git a/cmake/Modules/FindJemalloc.cmake b/cmake/Modules/FindJemalloc.cmake
new file mode 100644
index 0000000..6eaf214
--- /dev/null
+++ b/cmake/Modules/FindJemalloc.cmake
@@ -0,0 +1,46 @@
+# From: https://raw.githubusercontent.com/STEllAR-GROUP/hpx/master/cmake/FindJemalloc.cmake
+# Copyright (c)      2014 Thomas Heller
+# Copyright (c) 2007-2012 Hartmut Kaiser
+# Copyright (c) 2010-2011 Matt Anderson
+# Copyright (c) 2011      Bryce Lelbach
+#
+# Distributed under the Boost Software License, Version 1.0. (See accompanying
+# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+find_package(PkgConfig)
+pkg_check_modules(PC_JEMALLOC QUIET libjemalloc)
+
+find_path(JEMALLOC_INCLUDE_DIR jemalloc/jemalloc.h
+  HINTS
+    ${JEMALLOC_ROOT} ENV JEMALLOC_ROOT
+    ${PC_JEMALLOC_MINIMAL_INCLUDEDIR}
+    ${PC_JEMALLOC_MINIMAL_INCLUDE_DIRS}
+    ${PC_JEMALLOC_INCLUDEDIR}
+    ${PC_JEMALLOC_INCLUDE_DIRS}
+  PATH_SUFFIXES include)
+
+find_library(JEMALLOC_LIBRARY NAMES jemalloc libjemalloc
+  HINTS
+    ${JEMALLOC_ROOT} ENV JEMALLOC_ROOT
+    ${PC_JEMALLOC_MINIMAL_LIBDIR}
+    ${PC_JEMALLOC_MINIMAL_LIBRARY_DIRS}
+    ${PC_JEMALLOC_LIBDIR}
+    ${PC_JEMALLOC_LIBRARY_DIRS}
+  PATH_SUFFIXES lib lib64)
+
+set(JEMALLOC_LIBRARIES ${JEMALLOC_LIBRARY})
+set(JEMALLOC_INCLUDE_DIRS ${JEMALLOC_INCLUDE_DIR})
+
+find_package_handle_standard_args(Jemalloc DEFAULT_MSG
+  JEMALLOC_LIBRARY JEMALLOC_INCLUDE_DIR)
+
+get_property(_type CACHE JEMALLOC_ROOT PROPERTY TYPE)
+if(_type)
+  set_property(CACHE JEMALLOC_ROOT PROPERTY ADVANCED 1)
+  if("x${_type}" STREQUAL "xUNINITIALIZED")
+    set_property(CACHE JEMALLOC_ROOT PROPERTY TYPE PATH)
+  endif()
+endif()
+
+mark_as_advanced(JEMALLOC_ROOT JEMALLOC_LIBRARY JEMALLOC_INCLUDE_DIR)
+
diff --git a/cmake/Modules/FindSSE.cmake b/cmake/Modules/FindSSE.cmake
new file mode 100644
index 0000000..5500a26
--- /dev/null
+++ b/cmake/Modules/FindSSE.cmake
@@ -0,0 +1,141 @@
+# Check if SSE instructions are available on the machine where 
+# the project is compiled.
+# from https://raw.githubusercontent.com/hideo55/CMake-FindSSE/master/FindSSE.cmake
+
+# FindSSE()
+#
+# Probes the *build host* CPU for SSE support and records the result in the
+# cache variables SSE2_FOUND, SSE3_FOUND, SSSE3_FOUND, SSE4_1_FOUND and
+# SSE4_2_FOUND. Linux reads /proc/cpuinfo, Darwin queries sysctl; every other
+# platform falls back to "SSE2 only". Because the variables are cached without
+# FORCE, a value already in the cache is kept on later configure runs (the only
+# exception is the old-GCC override near the end).
+MACRO (FindSSE)
+
+IF(CMAKE_SYSTEM_NAME MATCHES "Linux")
+   EXECUTE_PROCESS(COMMAND cat /proc/cpuinfo OUTPUT_VARIABLE CPUINFO)
+
+   # Each probe rewrites the whole text to the captured flag name when the
+   # regex matches; otherwise the text is left untouched and the comparison
+   # fails. CPUINFO is quoted so an empty result does not break the command.
+   STRING(REGEX REPLACE "^.*(sse2).*$" "\\1" SSE_THERE "${CPUINFO}")
+   STRING(COMPARE EQUAL "sse2" "${SSE_THERE}" SSE2_TRUE)
+   IF (SSE2_TRUE)
+      set(SSE2_FOUND true CACHE BOOL "SSE2 available on host")
+   ELSE (SSE2_TRUE)
+      set(SSE2_FOUND false CACHE BOOL "SSE2 available on host")
+   ENDIF (SSE2_TRUE)
+
+   # /proc/cpuinfo does not list "sse3": the kernel reports that feature under
+   # its original name "pni" (Prescott New Instructions).
+   STRING(REGEX REPLACE "^.*[^s](sse3).*$" "\\1" SSE_THERE "${CPUINFO}")
+   STRING(COMPARE EQUAL "sse3" "${SSE_THERE}" SSE3_TRUE)
+   IF (NOT SSE3_TRUE)
+      IF ("${CPUINFO}" MATCHES "[ \t]pni[ \t\n]")
+         set(SSE3_TRUE TRUE)
+      ENDIF ()
+   ENDIF (NOT SSE3_TRUE)
+   IF (NOT SSE3_TRUE)
+      # Legacy fallback kept from the original module: recognise the CPU by
+      # its model string.
+      STRING(REGEX REPLACE "^.*(T2300).*$" "\\1" SSE_THERE "${CPUINFO}")
+      STRING(COMPARE EQUAL "T2300" "${SSE_THERE}" SSE3_TRUE)
+   ENDIF (NOT SSE3_TRUE)
+
+   STRING(REGEX REPLACE "^.*(ssse3).*$" "\\1" SSE_THERE "${CPUINFO}")
+   STRING(COMPARE EQUAL "ssse3" "${SSE_THERE}" SSSE3_TRUE)
+   # SSSE3 implies SSE3.
+   IF (SSE3_TRUE OR SSSE3_TRUE)
+      set(SSE3_FOUND true CACHE BOOL "SSE3 available on host")
+   ELSE (SSE3_TRUE OR SSSE3_TRUE)
+      set(SSE3_FOUND false CACHE BOOL "SSE3 available on host")
+   ENDIF (SSE3_TRUE OR SSSE3_TRUE)
+   IF (SSSE3_TRUE)
+      set(SSSE3_FOUND true CACHE BOOL "SSSE3 available on host")
+   ELSE (SSSE3_TRUE)
+      set(SSSE3_FOUND false CACHE BOOL "SSSE3 available on host")
+   ENDIF (SSSE3_TRUE)
+
+   STRING(REGEX REPLACE "^.*(sse4_1).*$" "\\1" SSE_THERE "${CPUINFO}")
+   STRING(COMPARE EQUAL "sse4_1" "${SSE_THERE}" SSE41_TRUE)
+   IF (SSE41_TRUE)
+      set(SSE4_1_FOUND true CACHE BOOL "SSE4.1 available on host")
+   ELSE (SSE41_TRUE)
+      set(SSE4_1_FOUND false CACHE BOOL "SSE4.1 available on host")
+   ENDIF (SSE41_TRUE)
+
+   STRING(REGEX REPLACE "^.*(sse4_2).*$" "\\1" SSE_THERE "${CPUINFO}")
+   STRING(COMPARE EQUAL "sse4_2" "${SSE_THERE}" SSE42_TRUE)
+   IF (SSE42_TRUE)
+      set(SSE4_2_FOUND true CACHE BOOL "SSE4.2 available on host")
+   ELSE (SSE42_TRUE)
+      set(SSE4_2_FOUND false CACHE BOOL "SSE4.2 available on host")
+   ENDIF (SSE42_TRUE)
+
+ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Darwin")
+   EXECUTE_PROCESS(COMMAND /usr/sbin/sysctl -n machdep.cpu.features
+      OUTPUT_VARIABLE CPUINFO)
+
+   # The [^S] guard keeps "SSE3" from matching inside "SSSE3" (and likewise
+   # for SSE2).
+   STRING(REGEX REPLACE "^.*[^S](SSE2).*$" "\\1" SSE_THERE "${CPUINFO}")
+   STRING(COMPARE EQUAL "SSE2" "${SSE_THERE}" SSE2_TRUE)
+   IF (SSE2_TRUE)
+      set(SSE2_FOUND true CACHE BOOL "SSE2 available on host")
+   ELSE (SSE2_TRUE)
+      set(SSE2_FOUND false CACHE BOOL "SSE2 available on host")
+   ENDIF (SSE2_TRUE)
+
+   STRING(REGEX REPLACE "^.*[^S](SSE3).*$" "\\1" SSE_THERE "${CPUINFO}")
+   STRING(COMPARE EQUAL "SSE3" "${SSE_THERE}" SSE3_TRUE)
+   IF (SSE3_TRUE)
+      set(SSE3_FOUND true CACHE BOOL "SSE3 available on host")
+   ELSE (SSE3_TRUE)
+      set(SSE3_FOUND false CACHE BOOL "SSE3 available on host")
+   ENDIF (SSE3_TRUE)
+
+   STRING(REGEX REPLACE "^.*(SSSE3).*$" "\\1" SSE_THERE "${CPUINFO}")
+   STRING(COMPARE EQUAL "SSSE3" "${SSE_THERE}" SSSE3_TRUE)
+   IF (SSSE3_TRUE)
+      set(SSSE3_FOUND true CACHE BOOL "SSSE3 available on host")
+   ELSE (SSSE3_TRUE)
+      set(SSSE3_FOUND false CACHE BOOL "SSSE3 available on host")
+   ENDIF (SSSE3_TRUE)
+
+   # The dot is escaped so it matches only a literal '.'.
+   STRING(REGEX REPLACE "^.*(SSE4\\.1).*$" "\\1" SSE_THERE "${CPUINFO}")
+   STRING(COMPARE EQUAL "SSE4.1" "${SSE_THERE}" SSE41_TRUE)
+   IF (SSE41_TRUE)
+      set(SSE4_1_FOUND true CACHE BOOL "SSE4.1 available on host")
+   ELSE (SSE41_TRUE)
+      set(SSE4_1_FOUND false CACHE BOOL "SSE4.1 available on host")
+   ENDIF (SSE41_TRUE)
+
+   STRING(REGEX REPLACE "^.*(SSE4\\.2).*$" "\\1" SSE_THERE "${CPUINFO}")
+   STRING(COMPARE EQUAL "SSE4.2" "${SSE_THERE}" SSE42_TRUE)
+   IF (SSE42_TRUE)
+      set(SSE4_2_FOUND true CACHE BOOL "SSE4.2 available on host")
+   ELSE (SSE42_TRUE)
+      set(SSE4_2_FOUND false CACHE BOOL "SSE4.2 available on host")
+   ENDIF (SSE42_TRUE)
+
+ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Windows")
+   # TODO: no real detection on Windows yet; assume SSE2 only.
+   set(SSE2_FOUND   true  CACHE BOOL "SSE2 available on host")
+   set(SSE3_FOUND   false CACHE BOOL "SSE3 available on host")
+   set(SSSE3_FOUND  false CACHE BOOL "SSSE3 available on host")
+   set(SSE4_1_FOUND false CACHE BOOL "SSE4.1 available on host")
+   set(SSE4_2_FOUND false CACHE BOOL "SSE4.2 available on host")
+ELSE(CMAKE_SYSTEM_NAME MATCHES "Linux")
+   # Unknown platform: same conservative defaults.
+   set(SSE2_FOUND   true  CACHE BOOL "SSE2 available on host")
+   set(SSE3_FOUND   false CACHE BOOL "SSE3 available on host")
+   set(SSSE3_FOUND  false CACHE BOOL "SSSE3 available on host")
+   set(SSE4_1_FOUND false CACHE BOOL "SSE4.1 available on host")
+   set(SSE4_2_FOUND false CACHE BOOL "SSE4.2 available on host")
+ENDIF(CMAKE_SYSTEM_NAME MATCHES "Linux")
+
+# GCC older than 4.2 is treated as unable to target SSE4.x, whatever the CPU
+# reports, so those two results are forced off.
+IF(CMAKE_COMPILER_IS_GNUCXX)
+    EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} -dumpversion
+                    OUTPUT_VARIABLE GCC_VERSION
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
+    IF(GCC_VERSION VERSION_LESS 4.2)
+        set(SSE4_1_FOUND false CACHE BOOL "SSE4.1 available on host" FORCE)
+        set(SSE4_2_FOUND false CACHE BOOL "SSE4.2 available on host" FORCE)
+    ENDIF()
+ENDIF(CMAKE_COMPILER_IS_GNUCXX)
+
+if(NOT SSE2_FOUND)
+      MESSAGE(STATUS "Could not find support for SSE2 on this machine.")
+endif(NOT SSE2_FOUND)
+if(NOT SSE3_FOUND)
+      MESSAGE(STATUS "Could not find support for SSE3 on this machine.")
+endif(NOT SSE3_FOUND)
+if(NOT SSSE3_FOUND)
+      MESSAGE(STATUS "Could not find support for SSSE3 on this machine.")
+endif(NOT SSSE3_FOUND)
+if(NOT SSE4_1_FOUND)
+      MESSAGE(STATUS "Could not find support for SSE4.1 on this machine.")
+endif(NOT SSE4_1_FOUND)
+if(NOT SSE4_2_FOUND)
+      MESSAGE(STATUS "Could not find support for SSE4.2 on this machine.")
+endif(NOT SSE4_2_FOUND)
+
+mark_as_advanced(SSE2_FOUND SSE3_FOUND SSSE3_FOUND SSE4_1_FOUND SSE4_2_FOUND)
+
+ENDMACRO(FindSSE)
+
diff --git a/cmake/Modules/FindTcmalloc.cmake b/cmake/Modules/FindTcmalloc.cmake
new file mode 100644
index 0000000..96727ef
--- /dev/null
+++ b/cmake/Modules/FindTcmalloc.cmake
@@ -0,0 +1,47 @@
+# - Find Tcmalloc
+# Find the native Tcmalloc includes and library
+#
+#  Tcmalloc_INCLUDE_DIR - directory that contains google/tcmalloc.h.
+#  Tcmalloc_LIBRARIES   - List of libraries when using Tcmalloc.
+#  Tcmalloc_FOUND       - True if Tcmalloc found.
+#
+# Inputs read by this module:
+#  HT_DEPENDENCY_INCLUDE_DIR / HT_DEPENDENCY_LIB_DIR - extra search locations.
+#  USE_TCMALLOC - when true, only the full "tcmalloc" library is accepted;
+#                 otherwise "tcmalloc_minimal" is tried first.
+
+# Only the directories listed here are searched (NO_DEFAULT_PATH disables
+# CMake's built-in search locations).
+find_path(Tcmalloc_INCLUDE_DIR google/tcmalloc.h NO_DEFAULT_PATH PATHS
+  ${HT_DEPENDENCY_INCLUDE_DIR}
+  /usr/include
+  /opt/local/include
+  /usr/local/include
+)
+
+# Candidate library names, in order of preference.
+if (USE_TCMALLOC)
+  set(Tcmalloc_NAMES tcmalloc)
+else ()
+  set(Tcmalloc_NAMES tcmalloc_minimal tcmalloc)
+endif ()
+
+find_library(Tcmalloc_LIBRARY NO_DEFAULT_PATH
+  NAMES ${Tcmalloc_NAMES}
+  PATHS ${HT_DEPENDENCY_LIB_DIR} /lib /usr/lib /usr/local/lib /opt/local/lib
+)
+
+# Both the header directory and the library must be present.
+if (Tcmalloc_INCLUDE_DIR AND Tcmalloc_LIBRARY)
+  set(Tcmalloc_FOUND TRUE)
+  set( Tcmalloc_LIBRARIES ${Tcmalloc_LIBRARY} )
+else ()
+  set(Tcmalloc_FOUND FALSE)
+  set( Tcmalloc_LIBRARIES )
+endif ()
+
+# Report the outcome; fail the configure step only when the caller asked for
+# the package with REQUIRED (which is what sets Tcmalloc_FIND_REQUIRED).
+if (Tcmalloc_FOUND)
+  message(STATUS "Found Tcmalloc: ${Tcmalloc_LIBRARY}")
+else ()
+  message(STATUS "Not Found Tcmalloc: ${Tcmalloc_LIBRARY}")
+  if (Tcmalloc_FIND_REQUIRED)
+    message(STATUS "Looked for Tcmalloc libraries named ${Tcmalloc_NAMES}.")
+    message(FATAL_ERROR "Could NOT find Tcmalloc library")
+  endif ()
+endif ()
+
+# Keep the raw search results out of the basic cache view.
+mark_as_advanced(
+  Tcmalloc_LIBRARY
+  Tcmalloc_INCLUDE_DIR
+  )
diff --git a/include/BooMap.hpp b/include/BooMap.hpp
new file mode 100644
index 0000000..e2056c6
--- /dev/null
+++ b/include/BooMap.hpp
@@ -0,0 +1,193 @@
+#ifndef __BOO_MAP__
+#define __BOO_MAP__
+
+#include "BooPHF.hpp"
+
+#include "cereal/types/vector.hpp"
+#include "cereal/types/utility.hpp"
+#include "cereal/archives/binary.hpp"
+
+#include <fstream>
+#include <vector>
+#include <iterator>
+#include <type_traits>
+
+#include <sys/stat.h>
+
+// adapted from :
+// http://stackoverflow.com/questions/34875315/implementation-my-own-list-and-iterator-stl-c
+/**
+ * Iterator adaptor that walks a sequence of pairs and exposes only the
+ * `first` member (the key) of each element.
+ *
+ * Iter is the wrapped iterator type; its value_type must declare `first_type`
+ * and have a `first` member. The ordering operators additionally require Iter
+ * itself to support `<` and `<=`.
+ */
+template <typename Iter>
+class KeyIterator {
+public:
+    typedef KeyIterator<Iter> self_type;
+    typedef typename std::iterator_traits<Iter>::value_type::first_type value_type;
+    typedef value_type& reference;
+    typedef value_type* pointer;
+    typedef std::forward_iterator_tag iterator_category;
+    typedef int64_t difference_type;
+
+    KeyIterator(Iter first) : curr_(first) {}
+
+    // Pre-increment: advance, then yield this (already advanced) iterator.
+    self_type& operator++() { ++curr_; return *this; }
+    // Post-increment: advance, but yield a copy of the position held before.
+    self_type operator++(int) { self_type previous = *this; ++curr_; return previous; }
+
+    reference operator*() { return curr_->first; }
+    pointer operator->() { return &(curr_->first); }
+
+    // Comparisons only inspect the wrapped iterator, so they are const.
+    bool operator==(const self_type& rhs) const { return curr_ == rhs.curr_; }
+    bool operator!=(const self_type& rhs) const { return curr_ != rhs.curr_; }
+    bool operator<(const self_type& rhs) const { return curr_ < rhs.curr_; }
+    bool operator<=(const self_type& rhs) const { return curr_ <= rhs.curr_; }
+
+private:
+    Iter curr_;
+};
+
+/**
+ * Static key/value map built on a BooPHF minimal perfect hash function.
+ *
+ * Usage: add() every (key, value) pair, call build() once, then query with
+ * find() / operator[]. After build() the pairs are stored so that the pair for
+ * key k sits at index boophf_->lookup(k). save()/load() persist the hash
+ * function (".bph") and the pair vector (".val") next to a common base name.
+ */
+template <typename KeyT, typename ValueT>
+class BooMap {
+public:
+    using HasherT = boomphf::SingleHashFunctor<KeyT>;
+    using BooPHFT = boomphf::mphf<KeyT, HasherT>;
+    using IteratorT = typename std::vector<std::pair<KeyT, ValueT>>::iterator;
+    using ConstIteratorT = typename std::vector<std::pair<KeyT, ValueT>>::const_iterator;
+
+    BooMap() : built_(false) {}
+
+    /** Queue a pair for the next build(); both arguments are moved into the map. */
+    void add(KeyT&& k, ValueT&& v) {
+        // Named rvalue references are lvalues: without std::move these would be copied.
+        data_.emplace_back(std::move(k), std::move(v));
+    }
+
+    /**
+     * Build the perfect hash over all queued keys using `nthreads` threads and
+     * permute the stored pairs so each one sits at its hash index.
+     * Always returns true.
+     */
+    bool build(int nthreads=1) {
+        size_t numElem = data_.size();
+        KeyIterator<decltype(data_.begin())> kb(data_.begin());
+        KeyIterator<decltype(data_.begin())> ke(data_.end());
+        auto keyIt = boomphf::range(kb, ke);
+        BooPHFT* ph = new BooPHFT(numElem, keyIt, nthreads);
+        boophf_.reset(ph);
+        std::cerr << "reordering keys and values to coincide with phf ... ";
+        // inds[i] is the slot the i-th pair has to move to.
+        std::vector<size_t> inds; inds.reserve(data_.size());
+        for (size_t i = 0; i < data_.size(); ++i) {
+            inds.push_back(ph->lookup(data_[i].first));
+        }
+        reorder_destructive_(inds.begin(), inds.end(), data_.begin());
+        std::cerr << "done\n";
+        built_ = true;
+        return built_;
+    }
+
+    /**
+     * Look up k. Returns an iterator to its pair, or end() when the hash index
+     * is out of range or the stored key differs from k.
+     * The map must have been built or loaded first.
+     */
+    inline IteratorT find(const KeyT& k) {
+        auto ind = boophf_->lookup(k);
+        return (ind < data_.size()) ? (data_[ind].first == k ? data_.begin() + ind : data_.end()) : data_.end();
+    }
+    
+    /**
+     * NOTE: This function *assumes* that the key is in the hash.
+     * If it isn't, you'll get back a random element!
+     * (An out-of-range index falls back to element 0, so the map must not be empty.)
+     */
+    inline ValueT& operator[](const KeyT& k) {
+        auto ind = boophf_->lookup(k);
+        return (ind < data_.size() ? data_[ind].second : data_[0].second);
+    }
+    
+    inline IteratorT begin() { return data_.begin(); }
+    inline IteratorT end() { return data_.end(); }
+    // Const traversal has to hand out const iterators: a const vector cannot
+    // produce the mutable IteratorT.
+    inline ConstIteratorT cend() const { return data_.cend(); }
+    inline ConstIteratorT cbegin() const { return data_.cbegin(); }
+    
+    /**
+     * Write the hash function to "<ofileBase>.bph" and the pairs to
+     * "<ofileBase>.val". Does nothing if the map has not been built; exits the
+     * process if an output file cannot be opened.
+     */
+    void save(const std::string& ofileBase) {
+        if (built_) {
+            std::string hashFN = ofileBase + ".bph";
+            // save the perfect hash function
+            {
+                std::ofstream os(hashFN, std::ios::binary);
+                if (!os.is_open()) {
+                    std::cerr << "BooM: unable to open output file [" << hashFN << "]; exiting!\n";
+                    std::exit(1);
+                }
+                boophf_->save(os);
+                os.close();
+            }
+            // and the values
+            std::string dataFN = ofileBase + ".val";
+            {
+                std::ofstream valStream(dataFN, std::ios::binary);
+                if (!valStream.is_open()) {
+                    std::cerr << "BooM: unable to open output file [" << dataFN << "]; exiting!\n";
+                    std::exit(1);
+                }
+                {
+                    cereal::BinaryOutputArchive outArchive(valStream);
+                    outArchive(data_);
+                }
+                valStream.close();
+            }
+        }
+    }
+    
+    /**
+     * Read back a map written by save(). Exits the process if either file is
+     * missing, is not a regular file, or cannot be opened.
+     */
+    void load(const std::string& ofileBase) {
+        std::string hashFN = ofileBase + ".bph";
+        std::string dataFN = ofileBase + ".val";
+
+        if ( !FileExists_(hashFN.c_str()) ) {
+            std::cerr << "BooM: Looking for perfect hash function file [" << hashFN << "], which doesn't exist! exiting.\n";
+            std::exit(1);
+        }
+        if ( !FileExists_(dataFN.c_str()) ) {
+            std::cerr << "BooM: Looking for key-value file [" << dataFN << "], which doesn't exist! exiting.\n";
+            std::exit(1);
+        }
+
+        // load the perfect hash function
+        {
+            std::ifstream is(hashFN, std::ios::binary);
+            // The file can exist and still be unreadable (e.g. permissions).
+            if (!is.is_open()) {
+                std::cerr << "BooM: unable to open input file [" << hashFN << "]; exiting!\n";
+                std::exit(1);
+            }
+            boophf_.reset(new BooPHFT);
+            boophf_->load(is);
+            is.close();
+        }
+        // and the values
+        {
+            std::ifstream dataStream(dataFN, std::ios::binary);
+            if (!dataStream.is_open()) {
+                std::cerr << "BooM: unable to open input file [" << dataFN << "]; exiting!\n";
+                std::exit(1);
+            }
+            {
+                cereal::BinaryInputArchive inArchive(dataStream);
+                inArchive(data_);
+            }
+            dataStream.close();
+        }
+        built_ = true;
+    }
+
+private:
+    // True when `path` names an existing regular file.
+    // Taken from http://stackoverflow.com/questions/12774207/fastest-way-to-check-if-a-file-exist-using-standard-c-c11-c
+    bool FileExists_(const char *path) {
+        struct stat fileStat;
+        if ( stat(path, &fileStat) ) {
+            return false;
+        }
+        if ( !S_ISREG(fileStat.st_mode) ) {
+            return false;
+        }
+        return true;
+    }
+
+    // Applies the permutation [order_begin, order_end) to the sequence at v in
+    // place: element i ends up at position order_begin[i]. The order array is
+    // consumed in the process (visited entries are overwritten with -1).
+    // From : http://stackoverflow.com/questions/838384/reorder-vector-using-a-vector-of-indices
+    template< typename order_iterator, typename value_iterator >
+    void reorder_destructive_( order_iterator order_begin, order_iterator order_end, value_iterator v )  {
+        using value_t = typename std::iterator_traits< value_iterator >::value_type;
+        using index_t = typename std::iterator_traits< order_iterator >::value_type;
+        using diff_t = typename std::iterator_traits< order_iterator >::difference_type;
+
+        diff_t remaining = order_end - 1 - order_begin;
+        for ( index_t s = index_t(); remaining > 0; ++ s ) {
+            index_t d = order_begin[s];
+            if ( d == (diff_t) -1 ) continue;
+            -- remaining;
+            value_t temp = v[s];
+            // Follow the cycle that starts at s, dropping each carried value
+            // into its destination and picking up the one it displaces.
+            for ( index_t d2; d != s; d = d2 ) {
+                std::swap( temp, v[d] );
+                std::swap( order_begin[d], d2 = (diff_t) -1 );
+                -- remaining;
+            }
+            v[s] = temp;
+        }
+    }
+
+    bool built_;
+    std::vector<std::pair<KeyT, ValueT>> data_;
+    std::unique_ptr<BooPHFT> boophf_{nullptr};
+};
+#endif // __BOO_MAP__ 
diff --git a/include/BooPHF.hpp b/include/BooPHF.hpp
new file mode 100644
index 0000000..64b11c7
--- /dev/null
+++ b/include/BooPHF.hpp
@@ -0,0 +1,1221 @@
+// BooPHF library
+// Intended to be a minimal perfect hash function with fast, low-memory construction, at the cost of (slightly) higher bits/elem than other state-of-the-art libraries once built.
+// Should work with an arbitrarily large number of elements; based on a cascade of "collision-free" bit arrays.
+
+#ifndef __BOO_PHF__
+#define __BOO_PHF__
+
+#include <stdio.h>
+#include <climits>
+#include <stdlib.h>
+#include <iostream>
+#include <math.h>
+
+#include <array>
+#include <unordered_map>
+#include <vector>
+#include <assert.h>
+#include <sys/time.h>
+#include <string.h>
+#include <memory> // for make_shared
+
+
+namespace boomphf {
+
+////////////////////////////////////////////////////////////////
+#pragma mark -
+#pragma mark utils
+////////////////////////////////////////////////////////////////
+
+	// Number of set bits in a 32-bit word (parallel bit-count).
+	inline unsigned int popcount_32(unsigned int x)
+	{
+		const unsigned int pairs   = 0x55555555;  // every other bit
+		const unsigned int nibbles = 0x33333333;  // low two bits of each 4-bit group
+		const unsigned int bytes   = 0x0f0f0f0f;  // low four bits of each byte
+		const unsigned int ones    = 0x01010101;  // 1 in every byte
+
+		// each 2-bit field now holds the count of its own two bits
+		x -= (x >> 1) & pairs;
+		// each 4-bit field holds the count of its four bits
+		x = (x & nibbles) + ((x >> 2) & nibbles);
+		// each byte holds the count of its eight bits (in its low nibble)
+		x = (x + (x >> 4)) & bytes;
+		// the multiply sums all four bytes into the top byte
+		return (x * ones) >> 24;
+	}
+
+
+	// Number of set bits in a 64-bit word: counts each 32-bit half separately.
+	inline unsigned int popcount_64(uint64_t x)
+	{
+		unsigned int upper_half = ( x >> 32LL) & 0xffffffff ;
+		unsigned int lower_half = x & 0xffffffff ;
+
+		return (popcount_32(lower_half) + popcount_32(upper_half));
+	}
+
+
+	///// progress bar
+	/**
+	 * Console progress reporter writing to stderr.
+	 *
+	 * With timer_mode == 0 it draws "[----...]", one '-' per completed step;
+	 * otherwise it rewrites a single status line with percentage, elapsed and
+	 * estimated remaining time. A "step" is todo/subdiv tasks.
+	 *
+	 * Call init() first, then inc()/set() as work completes, then finish()
+	 * (or finish_threaded() when the per-thread inc(n, tid) overload was used).
+	 */
+	class Progress
+	{
+	public:
+		int timer_mode;                      // 0: dash bar, non-zero: timed status line
+		struct timeval timestamp;            // scratch for the most recent gettimeofday()
+		double heure_debut, heure_actuelle ; // start time / latest sample time, in seconds
+		std::string   message;               // label shown in timer mode
+
+		uint64_t done;
+		uint64_t todo;
+		int subdiv ; // progress printed every 1/subdiv of total to do
+		double partial; // tasks completed since the last printed step
+		int _nthreads;
+		std::vector<double > partial_threaded;  // per-thread counterpart of `partial`
+		std::vector<uint64_t > done_threaded;   // per-thread counterpart of `done`
+
+		double steps ; //steps = todo/subdiv
+
+		/** Start a run of `ntasks` tasks labelled `msg`, reported on by `nthreads` threads. */
+		void init(uint64_t ntasks, const char * msg,int nthreads =1)
+		{
+			_nthreads = nthreads;
+			message = std::string(msg);
+			gettimeofday(&timestamp, NULL);
+			heure_debut = timestamp.tv_sec +(timestamp.tv_usec/1000000.0);
+
+			todo= ntasks;
+			done = 0;
+			partial =0;
+
+			partial_threaded.assign(_nthreads, 0.0);
+			done_threaded.assign(_nthreads, 0);
+
+			subdiv= 1000;
+			steps = (double)todo / (double)subdiv;
+
+			if(!timer_mode)
+			{
+				fprintf(stderr,"[");fflush(stderr);
+			}
+		}
+
+		/** Report completion, terminate the output line and reset the counters. */
+		void finish()
+		{
+			set(todo);
+			if(timer_mode)
+				fprintf(stderr,"\n");
+			else
+				fprintf(stderr,"]\n");
+
+			fflush(stderr);
+			todo= 0;
+			done = 0;
+			partial =0;
+		}
+
+		/** Fold the per-thread counters into the shared ones, then finish(). */
+		void finish_threaded()// called by only one of the threads
+		{
+			done = 0;
+			for (int ii=0; ii<_nthreads;ii++) done += (done_threaded[ii] );
+			for (int ii=0; ii<_nthreads;ii++) partial += (partial_threaded[ii] );
+
+			finish();
+		}
+
+		/** Record `ntasks_done` more finished tasks and print any steps now due. */
+		void inc(uint64_t ntasks_done)
+		{
+			done += ntasks_done;
+			partial += ntasks_done;
+
+			// steps is 0 when there is nothing to do (todo == 0); without the
+			// guard `partial -= steps` would never end the loop.
+			while(steps > 0 && partial >= steps)
+			{
+				if(timer_mode)
+				{
+					gettimeofday(&timestamp, NULL);
+					heure_actuelle = timestamp.tv_sec +(timestamp.tv_usec/1000000.0);
+					print_timed_status(heure_actuelle, done);
+				}
+				else
+				{
+					fprintf(stderr,"-");fflush(stderr);
+				}
+				partial -= steps;
+			}
+		}
+
+		/**
+		 * Per-thread variant: thread `tid` records `ntasks_done` more tasks.
+		 * Each thread only writes its own slot; the timed report sums all slots.
+		 */
+		void inc(uint64_t ntasks_done, int tid) //threads collaborate to this same progress bar
+		{
+			partial_threaded[tid] += ntasks_done;
+			done_threaded[tid] += ntasks_done;
+			while(steps > 0 && partial_threaded[tid] >= steps)
+			{
+				if(timer_mode)
+				{
+					struct timeval timet;
+					gettimeofday(&timet, NULL);
+					double now = timet.tv_sec +(timet.tv_usec/1000000.0);
+					uint64_t total_done  = 0;
+					for (int ii=0; ii<_nthreads;ii++) total_done += (done_threaded[ii] );
+					print_timed_status(now, total_done);
+				}
+				else
+				{
+					fprintf(stderr,"-");fflush(stderr);
+				}
+				partial_threaded[tid] -= steps;
+			}
+		}
+
+		/** Move progress forward to `ntasks_done`; never moves it backwards. */
+		void set(uint64_t ntasks_done)
+		{
+			if(ntasks_done > done)
+				inc(ntasks_done-done);
+		}
+
+		// Every member gets a defined value so that calling inc()/finish() on
+		// an object that was never init()ed is harmless (steps == 0 prints nothing).
+		Progress ()
+			: timer_mode(0), timestamp(), heure_debut(0), heure_actuelle(0),
+			  done(0), todo(0), subdiv(1000), partial(0), _nthreads(1), steps(0) {}
+		//include timer, to print ETA ?
+
+	private:
+		// Rewrites the status line (percentage, elapsed and remaining time) for
+		// `total_done` finished tasks, with `now` as the current time in seconds.
+		void print_timed_status(double now, uint64_t total_done)
+		{
+			double elapsed = now - heure_debut;
+			double speed = total_done / elapsed;
+			double rem = (todo-total_done) / speed;
+			if(total_done > todo) rem =0; // overshoot: the unsigned difference above wrapped
+			int min_e  =  (int)(elapsed / 60) ;
+			elapsed -= min_e*60;
+			int min_r  =  (int)(rem / 60) ;
+			rem -= min_r*60;
+
+			fprintf(stderr,"%c[%s]  %-5.3g%%   elapsed: %3i min %-2.0f sec   remaining: %3i min %-2.0f sec",13,
+					message.c_str(),
+					100*(double)total_done/todo,
+					min_e,elapsed,min_r,rem);
+		}
+	};
+
+
+
+////////////////////////////////////////////////////////////////
+#pragma mark -
+#pragma mark hasher
+////////////////////////////////////////////////////////////////
+
+	typedef std::array<uint64_t,10> hash_set_t;
+	typedef std::array<uint64_t,2> hash_pair_t;
+
+
+
+	/**
+	 * Family of MAXNBFUNC (10) seeded 64-bit hash functions over Item.
+	 * Item must support the arithmetic used in hash64 (`*` and `^` against
+	 * a uint64_t).
+	 */
+	template <typename Item> class HashFunctors
+	{
+	public:
+
+		/** Sets up the seed table from the built-in constants (user seed 0). */
+		HashFunctors () : _nbFct(7), _user_seed(0)
+		{
+			generate_hash_seed ();
+		}
+
+		/** Hash of `key` under the idx-th table seed; idx is not range-checked. */
+		uint64_t operator ()  (const Item& key, size_t idx)  const
+		{
+			return hash64 (key, _seed_tab[idx]);
+		}
+
+		/** Hash of `key` under a caller-supplied seed. */
+		uint64_t hashWithSeed(const Item& key, uint64_t seed)  const
+		{
+			return hash64 (key, seed);
+		}
+
+		/** All hashes of `key` at once, one per table seed, in table order. */
+		//maybe use xorshift instead, for faster hash compute
+		hash_set_t operator ()  (const Item& key)
+		{
+			hash_set_t	 all_hashes;
+
+			for(size_t slot=0; slot<all_hashes.size(); slot++)
+			{
+				all_hashes[slot] =  hash64 (key, _seed_tab[slot]);
+			}
+			return all_hashes;
+		}
+
+	private:
+
+		// Mixes `key` into `seed`, then runs a fixed shift/add/xor finalizer.
+		inline static uint64_t hash64 (Item key, uint64_t seed)
+		{
+			uint64_t h = seed;
+			h ^= (h <<  7) ^  key * (h >> 3) ^ (~((h << 11) + (key ^ (h >> 5))));
+			h = (~h) + (h << 21);
+			h ^= (h >> 24);
+			h += (h << 3) + (h << 8);
+			h ^= (h >> 14);
+			h += (h << 2) + (h << 4);
+			h ^= (h >> 28);
+			h += (h << 31);
+
+			return h;
+		}
+
+		// Fills _seed_tab: copy the base constants, then multiply each entry
+		// by the entry three places further on (wrapping) and add the user
+		// seed. The second pass runs in place, so entries 7..9 pick up the
+		// already-updated entries 0..2.
+		void generate_hash_seed ()
+		{
+			static const uint64_t rbase[MAXNBFUNC] =
+			{
+				0xAAAAAAAA55555555ULL,  0x33333333CCCCCCCCULL,  0x6666666699999999ULL,  0xB5B5B5B54B4B4B4BULL,
+				0xAA55AA5555335533ULL,  0x33CC33CCCC66CC66ULL,  0x6699669999B599B5ULL,  0xB54BB54B4BAA4BAAULL,
+				0xAA33AA3355CC55CCULL,  0x33663366CC99CC99ULL
+			};
+
+			for (size_t n=0; n<MAXNBFUNC; ++n)
+			{
+				_seed_tab[n] = rbase[n];
+			}
+			for (size_t n=0; n<MAXNBFUNC; ++n)
+			{
+				const size_t partner = (n+3) % MAXNBFUNC;
+				_seed_tab[n] = _seed_tab[n] * _seed_tab[partner] + _user_seed ;
+			}
+		}
+
+		size_t _nbFct; // set to 7 by the constructor; not read anywhere in this class
+
+		static const size_t MAXNBFUNC = 10;
+		uint64_t _seed_tab[MAXNBFUNC];
+		uint64_t _user_seed;
+	};
+
+/* alternative hash functor based on xorshift, taking a single hash functor as input.
+we need this 2-functors scheme because HashFunctors won't work with unordered_map.
+(rayan)
+*/
+
+    // wrapper around HashFunctors to return only one value instead of 7
+    /**
+     * Adapter exposing HashFunctors as a "one key, one seed -> one hash"
+     * callable. The seed defaults to 0xAAAAAAAA55555555.
+     */
+    template <typename Item> class SingleHashFunctor
+	{
+	public:
+		uint64_t operator ()  (const Item& key, uint64_t seed=0xAAAAAAAA55555555ULL) const
+		{
+			return hashFunctors.hashWithSeed(key, seed);
+		}
+
+	private:
+		HashFunctors<Item> hashFunctors;
+	};
+
+
+
+    /**
+     * Derives a sequence of hashes for one key from just two real hash
+     * evaluations: SingleHasher_t is called with two fixed seeds, and the two
+     * results seed an xorshift128 generator that produces every further value.
+     */
+    template <typename Item, class SingleHasher_t> class XorshiftHashFunctors
+    {
+        /*  Xorshift128*
+            Written in 2014 by Sebastiano Vigna (vigna at acm.org)
+
+            To the extent possible under law, the author has dedicated all copyright
+            and related and neighboring rights to this software to the public domain
+            worldwide. This software is distributed without any warranty.
+
+            See <http://creativecommons.org/publicdomain/zero/1.0/>. */
+        /* This is the fastest generator passing BigCrush without
+           systematic failures, but due to the relatively short period it is
+           acceptable only for applications with a mild amount of parallelism;
+           otherwise, use a xorshift1024* generator.
+
+           The state must be seeded so that it is not everywhere zero. If you have
+           a nonzero 64-bit seed, we suggest to pass it twice through
+           MurmurHash3's avalanching function. */
+
+        // One generator step on the two-word state at `s`; returns the next value.
+        uint64_t next(uint64_t * s) {
+            const uint64_t newer = s[ 1 ];
+            uint64_t older = s[ 0 ];
+            s[ 0 ] = newer;
+            older ^= older << 23; // a
+            s[ 1 ] = older ^ newer ^ ( older >> 17 ) ^ ( newer >> 26 ); // b, c
+            return s[ 1 ] + newer;
+        }
+
+        public:
+
+		// First base hash of `key`; also stored in s[0].
+		uint64_t h0(hash_pair_t  & s, const Item& key )
+		{
+			const uint64_t first = singleHasher (key, 0xAAAAAAAA55555555ULL);
+			s[0] = first;
+			return first;
+		}
+
+		// Second base hash of `key`; also stored in s[1].
+		uint64_t h1(hash_pair_t  & s, const Item& key )
+		{
+			const uint64_t second = singleHasher (key, 0x33333333CCCCCCCCULL);
+			s[1] = second;
+			return second;
+		}
+
+		// Advance the generator state `s` and return the next hash.
+		uint64_t next(hash_pair_t  & s ) {
+			return next(s.data());
+		}
+
+        // All hashes for `key`: the two base hashes followed by generator output.
+        hash_set_t operator ()  (const Item& key)
+        {
+            hash_set_t   hset;
+
+            hset[0] =  singleHasher (key, 0xAAAAAAAA55555555ULL);
+            hset[1] =  singleHasher (key, 0x33333333CCCCCCCCULL);
+
+            uint64_t state[ 2 ] = { hset[0], hset[1] };
+
+            // The bound stays a literal on purpose: this loop is performance
+            // critical and a constant helps inlining.
+            for(size_t slot=2; slot< 10; slot++)
+            {
+                hset[slot] = next(state);
+            }
+
+            return hset;
+        }
+    private:
+        SingleHasher_t singleHasher;
+    };
+
+
+////////////////////////////////////////////////////////////////
+#pragma mark -
+#pragma mark iterators
+////////////////////////////////////////////////////////////////
+
+	// Pair of iterators with begin()/end(), so it can be used wherever a
+	// range-like object is expected.
+	template <typename Iterator>
+	struct iter_range
+	{
+		iter_range(Iterator b, Iterator e) : m_begin(b), m_end(e) {}
+
+		Iterator begin() const { return m_begin; }
+		Iterator end() const { return m_end; }
+
+		Iterator m_begin, m_end;
+	};
+
+	// Convenience factory: deduces Iterator and wraps [begin, end) in an iter_range.
+	template <typename Iterator>
+	iter_range<Iterator> range(Iterator begin, Iterator end)
+	{
+		iter_range<Iterator> wrapped(begin, end);
+		return wrapped;
+	}
+
+////////////////////////////////////////////////////////////////
+#pragma mark -
+#pragma mark BitVector
+////////////////////////////////////////////////////////////////
+
+	/**
+	 * Bit array stored in 64-bit words, with an optional sampled rank index
+	 * (one cumulative popcount per _nb_bits_per_rank_sample bits) built by
+	 * build_ranks() and queried by rank().
+	 *
+	 * Storage always holds 1 + size/64 words, i.e. at least one spare word.
+	 * set(), reset() and atomic_test_and_set() use the __sync atomic builtins;
+	 * all other members are plain, unsynchronized accesses.
+	 */
+	class bitVector {
+
+	public:
+
+		// Empty vector: no storage and zero bits.
+		bitVector() : _bitArray(nullptr), _size(0), _nchar(0)
+		{
+		}
+
+		// Vector of n bits, all cleared.
+		bitVector(uint64_t n) : _bitArray(nullptr), _size(n), _nchar(1ULL+n/64ULL)
+		{
+			_bitArray =  (uint64_t *) calloc (_nchar,sizeof(uint64_t));
+		}
+
+		~bitVector()
+		{
+			free(_bitArray); // free(nullptr) is a no-op
+		}
+
+		 //copy constructor
+		 bitVector(bitVector const &r)
+		 : _bitArray(nullptr), _size(r._size), _nchar(r._nchar), _ranks(r._ranks)
+		 {
+			 copyWordsFrom(r);
+		 }
+		
+		// Copy assignment operator
+		bitVector &operator=(bitVector const &r)
+		{
+			if (&r != this)
+			{
+				free(_bitArray);
+				_bitArray = nullptr;
+				_size =  r._size;
+				_nchar = r._nchar;
+				_ranks = r._ranks;
+				copyWordsFrom(r);
+			}
+			return *this;
+		}
+	
+		// Move assignment operator: takes over r's storage and leaves r as an
+		// empty vector.
+		bitVector &operator=(bitVector &&r)
+		{
+			if (&r != this)
+			{
+				free(_bitArray);
+				
+				_size =  r._size;
+				_nchar = r._nchar;
+				_ranks = std::move (r._ranks);
+				_bitArray = r._bitArray;
+
+				// Keep r's word count consistent with its (now absent) storage.
+				r._bitArray = nullptr;
+				r._size = 0;
+				r._nchar = 0;
+			}
+			return *this;
+		}
+		// Move constructor
+		bitVector(bitVector &&r) : _bitArray ( nullptr),_size(0),_nchar(0)
+		{
+			*this = std::move(r);
+		}
+		
+		
+		// Change the capacity to `newsize` bits. Existing words are preserved;
+		// words gained by growing are zeroed. Aborts the process if memory
+		// cannot be obtained.
+		void resize(uint64_t newsize)
+		{
+			const uint64_t oldNchar = (_bitArray != nullptr) ? _nchar : 0;
+			const uint64_t newNchar = (1ULL+newsize/64ULL);
+
+			// Keep the old pointer until realloc is known to have succeeded.
+			uint64_t * grown = (uint64_t *) realloc(_bitArray,newNchar*sizeof(uint64_t));
+			if (grown == nullptr)
+			{
+				fprintf(stderr,"bitVector: out of memory while resizing\n");
+				abort();
+			}
+			// realloc does not initialize the added tail.
+			if (newNchar > oldNchar)
+				memset(grown + oldNchar, 0, (newNchar - oldNchar)*sizeof(uint64_t));
+
+			_bitArray = grown;
+			_nchar = newNchar;
+			_size = newsize;
+		}
+
+		size_t size() const
+		{
+			return _size;
+		}
+
+		// Footprint in bits: the word array plus the rank vector's capacity.
+		uint64_t bitSize() const {return (_nchar*64ULL + _ranks.capacity()*64ULL );}
+
+		//clear whole array
+		void clear()
+		{
+			if (_bitArray != nullptr)
+				memset(_bitArray,0,_nchar*sizeof(uint64_t));
+		}
+
+		//clear collisions in interval, only works with start and size multiple of 64
+		// Every bit set in *cc (indexed from 0) is cleared in this vector at
+		// offset `start`; *cc is then cleared entirely.
+		void clearCollisions(uint64_t start, size_t size, bitVector * cc)
+		{
+			assert( (start & 63) ==0);
+			assert( (size & 63) ==0);
+			uint64_t ids = (start/64ULL);
+			for(uint64_t ii =0;  ii< (size/64ULL); ii++ )
+			{
+				_bitArray[ids+ii] =  _bitArray[ids+ii] & (~ (cc->get64(ii)) );
+			}
+
+			cc->clear();
+		}
+
+
+		//clear interval, only works with start and size multiple of 64
+		void clear(uint64_t start, size_t size)
+		{
+			assert( (start & 63) ==0);
+			assert( (size & 63) ==0);
+			memset(_bitArray + (start/64ULL),0,(size/64ULL)*sizeof(uint64_t));
+		}
+
+		//for debug purposes
+		void print() const
+		{
+			// uint64_t is not `unsigned long long` on every platform, so the
+			// values are cast to match the %llu conversions.
+			printf("bit array of size %llu: \n", (unsigned long long) _size);
+			for(uint64_t ii = 0; ii< _size; ii++)
+			{
+				if(ii%10==0)
+					printf(" (%llu) ",(unsigned long long) ii);
+				int val = (_bitArray[ii >> 6] >> (ii & 63 ) ) & 1;
+				printf("%i",val);
+			}
+			printf("\n");
+
+			printf("rank array : size %zu \n",_ranks.size());
+			for (uint64_t ii = 0; ii< _ranks.size(); ii++)
+			{
+				printf("%llu:  %llu,  ",(unsigned long long) ii,(unsigned long long) _ranks[ii]);
+			}
+			printf("\n");
+		}
+
+		//return value at pos
+		uint64_t operator[](uint64_t pos) const
+		{
+			return (_bitArray[pos >> 6ULL] >> (pos & 63 ) ) & 1;
+		}
+
+		//atomically   return old val and set to 1
+		uint64_t atomic_test_and_set(uint64_t pos)
+		{
+			uint64_t oldval = 	__sync_fetch_and_or (_bitArray + (pos >> 6), (uint64_t) (1ULL << (pos & 63)) );
+
+			return  ( oldval >> (pos & 63 ) ) & 1;
+		}
+
+
+		uint64_t get(uint64_t pos) const
+		{
+			return (*this)[pos];
+		}
+
+		// Raw access to the cell64-th 64-bit word.
+		uint64_t get64(uint64_t cell64) const
+		{
+			return _bitArray[cell64];
+		}
+
+		//set bit pos to 1
+		void set(uint64_t pos)
+		{
+			assert(pos<_size);
+			__sync_fetch_and_or (_bitArray + (pos >> 6ULL), (1ULL << (pos & 63)) );
+		}
+
+		//set bit pos to 0
+		void reset(uint64_t pos)
+		{
+			__sync_fetch_and_and (_bitArray + (pos >> 6ULL), ~(1ULL << (pos & 63) ));
+		}
+
+		//return value of  last rank
+		// add offset to  all ranks  computed
+		// Samples are appended to _ranks, so this is meant to be called once
+		// per vector.
+		uint64_t build_ranks(uint64_t offset =0)
+		{
+			_ranks.reserve(2+ _size/_nb_bits_per_rank_sample);
+
+			uint64_t curent_rank = offset;
+			for (size_t ii = 0; ii < _nchar; ii++) {
+				if (((ii*64)  % _nb_bits_per_rank_sample) == 0) {
+					_ranks.push_back(curent_rank);
+				}
+				curent_rank +=  popcount_64(_bitArray[ii]);
+			}
+
+			return curent_rank;
+		}
+
+		// Number of set bits strictly before `pos`, plus the offset passed to
+		// build_ranks(). Requires build_ranks() to have been called.
+		uint64_t rank(uint64_t pos) const
+		{
+			uint64_t word_idx = pos / 64ULL;
+			uint64_t word_offset = pos % 64;
+			uint64_t block = pos / _nb_bits_per_rank_sample;
+			uint64_t r = _ranks[block];
+			// whole words between the sample point and the word holding pos
+			for (uint64_t w = block * _nb_bits_per_rank_sample / 64; w < word_idx; ++w) {
+				r += popcount_64( _bitArray[w] );
+			}
+			// bits below pos inside its own word
+			uint64_t mask = (uint64_t(1) << word_offset ) - 1;
+			r += popcount_64( _bitArray[word_idx] & mask);
+
+			return r;
+		}
+
+
+		// Binary layout: _size, _nchar, the words, rank count (size_t), the ranks.
+		void save(std::ostream& os) const
+		{
+			os.write(reinterpret_cast<char const*>(&_size), sizeof(_size));
+			os.write(reinterpret_cast<char const*>(&_nchar), sizeof(_nchar));
+			os.write(reinterpret_cast<char const*>(_bitArray), (std::streamsize)(sizeof(uint64_t) * _nchar));
+			size_t sizer = _ranks.size();
+			os.write(reinterpret_cast<char const*>(&sizer),  sizeof(size_t));
+			os.write(reinterpret_cast<char const*>(_ranks.data()), (std::streamsize)(sizeof(_ranks[0]) * _ranks.size()));
+		}
+
+		// Reads what save() wrote. The stored word count is consumed but not
+		// trusted: resize() recomputes it from the stored bit count.
+		void load(std::istream& is)
+		{
+			uint64_t storedSize = 0;
+			uint64_t storedNchar = 0;
+			is.read(reinterpret_cast<char*>(&storedSize), sizeof(storedSize));
+			is.read(reinterpret_cast<char*>(&storedNchar), sizeof(storedNchar));
+			this->resize(storedSize);
+			is.read(reinterpret_cast<char *>(_bitArray), (std::streamsize)(sizeof(uint64_t) * _nchar));
+
+			size_t sizer = 0;
+			is.read(reinterpret_cast<char *>(&sizer),  sizeof(size_t));
+			_ranks.resize(sizer);
+			is.read(reinterpret_cast<char*>(_ranks.data()), (std::streamsize)(sizeof(_ranks[0]) * _ranks.size()));
+		}
+
+
+	protected:
+		// Allocates _nchar words and copies r's words into them. A source
+		// without storage leaves this vector without storage too.
+		// _bitArray must not own memory when this is called.
+		void copyWordsFrom(bitVector const &r)
+		{
+			if (r._bitArray == nullptr)
+			{
+				_bitArray = nullptr;
+				return;
+			}
+			_bitArray = (uint64_t *) calloc (_nchar,sizeof(uint64_t));
+			memcpy(_bitArray, r._bitArray, _nchar*sizeof(uint64_t) );
+		}
+
+		uint64_t*  _bitArray;
+		uint64_t _size;   // number of addressable bits
+		uint64_t _nchar;  // number of 64-bit words in _bitArray
+
+		 // epsilon =  64 / _nb_bits_per_rank_sample   bits
+		// additional size for rank is epsilon * _size
+		static const uint64_t _nb_bits_per_rank_sample = 512; //512 seems ok
+		std::vector<uint64_t> _ranks;
+	};
+
+////////////////////////////////////////////////////////////////
+#pragma mark -
+#pragma mark level
+////////////////////////////////////////////////////////////////
+
+	// One level of the cascade: a bit array of hash_domain slots, plus the offset of its first slot.
+	class level{
+	public:
+		level() {}
+		~level() {}
+
+		// value returned by the bit array for the slot (hash_raw modulo hash_domain)
+		uint64_t get(uint64_t hash_raw)
+		{
+			// reduce the raw hash to a slot of this level before probing the bit array
+			return bitset.get(hash_raw %  hash_domain);
+		}
+
+		uint64_t idx_begin;   // sum of the hash_domain of the preceding levels
+		uint64_t hash_domain; // number of slots of this level
+		bitVector  bitset;
+	};
+
+
+////////////////////////////////////////////////////////////////
+#pragma mark -
+#pragma mark mphf
+////////////////////////////////////////////////////////////////
+
+
+#define NBBUFF 10000
+
+	template<typename Range,typename Iterator> // arguments handed to thread_processLevel() through its void* parameter
+	struct thread_args
+	{
+		void * boophf; /* type-erased pointer to the mphf being built */
+		Range const * range; /* the input key range */
+		std::shared_ptr<void> it_p; /* type-erased shared iterator; used to be "Iterator it" but because of fastmode, iterator is polymorphic; TODO: think about whether it should be a unique_ptr actually */
+		std::shared_ptr<void> until_p; /* type-erased end iterator, to cache the "until" variable */
+		int level; /* index of the level being processed */
+	};
+
+	//forward declaration
+
+    template <typename elem_t, typename Hasher_t, typename Range, typename it_type>
+	void * thread_processLevel(void * args);
+
+
+    /* Hasher_t returns a single hash when operator()(elem_t key) is called.
+       if used with XorshiftHashFunctors, it must have the following operator: operator()(elem_t key, uint64_t seed) */
+    template <typename elem_t, typename Hasher_t>
+	class mphf {
+
+        /* this mechanisms gets P hashes out of Hasher_t */
+        typedef XorshiftHashFunctors<elem_t,Hasher_t> MultiHasher_t ;
+       // typedef HashFunctors<elem_t> MultiHasher_t; // original code (but only works for int64 keys)  (seems to be as fast as the current xorshift)
+		//typedef IndepHashFunctors<elem_t,Hasher_t> MultiHasher_t; //faster than xorshift
+
+	public:
+		mphf() : _built(false)
+		{}
+
+
+		~mphf()
+		{
+
+		}
+
+		
+		// Builds the function over the n keys of input_range. Up to perc_elem_loaded (a fraction, default 3%) of the keys are kept in RAM for faster construction; set it to 0 to deactivate.
+		template <typename Range>
+		mphf( size_t n, Range const& input_range,int num_thread = 1,  double gamma = 2.0 , bool progress =true, float perc_elem_loaded = 0.03) :
+		_nb_levels(0), _gamma(gamma), _hash_domain(size_t(ceil(double(n) * gamma))), _nelem(n), _num_thread(num_thread), _lastbitsetrank(0), _percent_elem_loaded_for_fastMode (perc_elem_loaded), _fastmode(perc_elem_loaded > 0.0), _fastModeLevel(-1), _withprogress(progress), _built(false)
+		{
+			// empty key set: nothing is built; every flag read later (_built, _nb_levels, _fastmode) already has a defined value
+			if(n ==0) return;
+
+			setup();
+			// setup() leaves _fastModeLevel untouched when no level is small enough for the RAM budget: build in slow mode then
+			if(_fastModeLevel < 0)
+				_fastmode = false;
+
+			if(_withprogress)
+			{
+			_progressBar.timer_mode=1;
+			if(_fastmode)
+				_progressBar.init( _nelem * (_fastModeLevel+1) +  ( _nelem * pow(_proba_collision,_fastModeLevel)) * (_nb_levels-(_fastModeLevel+1))    ,"Building BooPHF",num_thread);
+			else
+				_progressBar.init( _nelem * _nb_levels ,"Building BooPHF");
+			}
+
+			// levels are built one after the other; `offset` carries the rank reached at the end of the previous level
+			uint64_t offset = 0;
+			for(int ii = 0; ii< _nb_levels; ii++)
+			{
+				_tempBitset =  new bitVector(_levels[ii].hash_domain); // temp collision bitarray for this level
+
+				processLevel(input_range,ii);
+
+				// NOTE(review): clearCollisions is defined outside this view; presumably it clears the slots flagged in _tempBitset -- confirm
+				_levels[ii].bitset.clearCollisions(0 , _levels[ii].hash_domain , _tempBitset);
+
+				// ranks of this level continue where the previous level stopped
+				offset = _levels[ii].bitset.build_ranks(offset);
+
+				delete _tempBitset;
+			}
+
+			if(_withprogress)
+			_progressBar.finish_threaded();
+
+			_lastbitsetrank = offset ;
+
+			std::vector<elem_t>().swap(setLevelFastmode);   // clear setLevelFastmode reallocating
+
+			// the mutex initialised by setup() is only used during construction
+			pthread_mutex_destroy(&_mutex);
+
+			_built = true;
+		}
+
+
+		// Returns the hash value assigned to elem, or ULLONG_MAX when the function was not built
+		// or when elem falls through every bit level and is absent from the final hash.
+		// A key outside the original set that stops at a bit level still gets a value: no membership test is done there.
+		uint64_t lookup(elem_t elem)
+		{
+			if(! _built)
+				return ULLONG_MAX;
+
+			// walk the cascade: `level` is where elem stopped, level_hash the hash computed last
+			hash_pair_t bbhash;
+			int level;
+			const uint64_t level_hash = getLevel(bbhash,elem,&level);
+
+			const bool stopped_in_bit_level = ( level != (_nb_levels-1));
+			if(stopped_in_bit_level)
+			{
+				// position inside the level; in fact non minimal hp would be  + _levels[level].idx_begin
+				const uint64_t non_minimal_hp =  level_hash %  _levels[level].hash_domain;
+
+				// the rank of that position is the minimal value
+				const uint64_t minimal_hp = _levels[level].bitset.rank(non_minimal_hp );
+				return minimal_hp;
+			}
+
+			// cascade exhausted: only the exact map can know this key
+			auto in_final_map  = _final_hash.find (elem);
+			if ( in_final_map == _final_hash.end() )
+			{
+				//elem was not in original set of keys
+				return ULLONG_MAX; //  means elem not in set
+			}
+
+			// values of the final hash are numbered after the ranks of all the bit levels
+			const uint64_t minimal_hp =  in_final_map->second + _lastbitsetrank;
+			return minimal_hp;
+		}
+
+		uint64_t nbKeys() const
+		{
+            return _nelem;
+        }
+
+		// Prints the estimated memory footprint (bit arrays and final hash) on stdout and returns the total, in bits.
+		uint64_t totalBitSize()
+		{
+			uint64_t totalsizeBitset = 0;
+			for(int ii=0; ii<_nb_levels; ii++)
+			{
+				totalsizeBitset += _levels[ii].bitset.bitSize();
+			}
+
+			// unordered map takes approx 42B per elem [personal test] (42B with uint64_t key, would be larger for other type of elem)
+			const size_t finalHashBits = _final_hash.size()*42*8;
+			uint64_t totalsize =  totalsizeBitset +  finalHashBits ;
+
+			// uint64_t is not "unsigned long long" everywhere and size_t is not "unsigned long" everywhere: cast for %llu, print size_t with %zu
+			printf("Bitarray    %12llu  bits (%.2f %%)   (array + ranks )\n", (unsigned long long) totalsizeBitset, 100*(float)totalsizeBitset/totalsize);
+			printf("final hash  %12zu  bits (%.2f %%) (nb in final hash %zu)\n", finalHashBits, 100*(float)(finalHashBits)/totalsize, _final_hash.size() );
+			return totalsize;
+		}
+
+		template <typename Iterator>  // body run by every worker thread while level i is being built
+        void pthread_processLevel( std::vector<elem_t>  & buffer , std::shared_ptr<Iterator> shared_it, std::shared_ptr<Iterator> until_p, int i)
+		{
+			uint64_t nb_done =0;
+			int tid =  __sync_fetch_and_add (&_nb_living, 1);
+			auto until = *until_p;
+			uint64_t inbuff =0;
+
+			// each round: take up to NBBUFF keys from the shared iterator under the mutex, then process them without the lock
+			
+			for (bool isRunning=true;  isRunning ; )
+			{
+
+				//safely copy n items into buffer
+				pthread_mutex_lock(&_mutex);
+                for(; inbuff<NBBUFF && (*shared_it)!=until;  ++(*shared_it))
+				{
+                    buffer[inbuff]= *(*shared_it); inbuff++;
+				}
+                if((*shared_it)==until) isRunning =false;
+				pthread_mutex_unlock(&_mutex);
+
+
+				//do work on the n elems of the buffer
+                for(uint64_t ii=0; ii<inbuff ; ii++)
+				{
+					elem_t val = buffer[ii];
+
+					//auto hashes = _hasher(val);
+					hash_pair_t bbhash;  int level;
+					uint64_t level_hash = getLevel(bbhash,val,&level, i);
+
+					if(level == i) //val stopped at none of the levels probed by getLevel: insert into lvl i
+					{
+							__sync_fetch_and_add(& _cptLevel,1);
+
+						if(i == _fastModeLevel && _fastmode)
+						{
+							uint64_t idxl2 = __sync_fetch_and_add(& _idxLevelsetLevelFastmode,1);
+							//if the expected size of setLevelFastmode is exceeded, fall back to slow mode; should not happen if the hash is good and probability is on our side
+							if(idxl2>= setLevelFastmode.size())
+								_fastmode = false;
+							else
+								setLevelFastmode[idxl2] = val; // create set for fast mode
+						}
+
+						//insert to level i+1 : either next level of the cascade or final hash if last level reached
+						if(i == _nb_levels-1) //stop cascade here, insert into exact hash
+						{
+							uint64_t hashidx =  __sync_fetch_and_add (& _hashidx, 1);
+
+							pthread_mutex_lock(&_mutex); //see later if possible to avoid this, but few items get here
+							// idea: compute the end rank of the previous level somewhere, then init hashidx with that rank: directly minimal, no need to insert into a bitset and rank
+							_final_hash[val] = hashidx;
+							pthread_mutex_unlock(&_mutex);
+						}
+						else
+						{
+							//computes next hash
+
+							if ( level == 0)
+								level_hash = _hasher.h0(bbhash,val);
+							else if ( level == 1)
+								level_hash = _hasher.h1(bbhash,val);
+							else
+							{
+								level_hash = _hasher.next(bbhash);
+							}
+							insertIntoLevel(level_hash,i); //should be safe
+						}
+					}
+
+					nb_done++;
+					if((nb_done&1023) ==0  && _withprogress) {_progressBar.inc(nb_done,tid);nb_done=0; }
+
+				}
+
+				inbuff = 0;
+			}
+
+		}
+
+
+		// Serializes the function to os as raw bytes (in-memory layout of each field):
+		// _gamma, _nb_levels, _lastbitsetrank, _nelem, every level's bit vector, then the final hash.
+		void save(std::ostream& os) const
+		{
+			os.write(reinterpret_cast<char const*>(&_gamma), sizeof(_gamma));
+			os.write(reinterpret_cast<char const*>(&_nb_levels), sizeof(_nb_levels));
+			os.write(reinterpret_cast<char const*>(&_lastbitsetrank), sizeof(_lastbitsetrank));
+			os.write(reinterpret_cast<char const*>(&_nelem), sizeof(_nelem));
+
+			// one bit vector per level, in level order
+			int lvl = 0;
+			while(lvl < _nb_levels)
+			{
+				_levels[lvl].bitset.save(os);
+				++lvl;
+			}
+
+			// final hash: entry count first, then the (key, value) pairs in the map's iteration order
+			const size_t entry_count = _final_hash.size();
+			os.write(reinterpret_cast<char const*>(&entry_count), sizeof(size_t));
+			for (auto const& entry : _final_hash)
+			{
+				os.write(reinterpret_cast<char const*>(&(entry.first)), sizeof(elem_t));
+				os.write(reinterpret_cast<char const*>(&(entry.second)), sizeof(uint64_t));
+			}
+		}
+
+		// Restores a function previously written by save(). Level sizes are not stored: they are recomputed from _gamma and _nelem.
+		void load(std::istream& is)
+		{
+			// scalar header, in the order save() wrote it
+			is.read(reinterpret_cast<char*>(&_gamma), sizeof(_gamma));
+			is.read(reinterpret_cast<char*>(&_nb_levels), sizeof(_nb_levels));
+			is.read(reinterpret_cast<char*>(&_lastbitsetrank), sizeof(_lastbitsetrank));
+			is.read(reinterpret_cast<char*>(&_nelem), sizeof(_nelem));
+
+			_levels.resize(_nb_levels);
+
+			for(int ii=0; ii<_nb_levels; ii++)
+			{
+				_levels[ii].bitset.load(is);
+			}
+
+			//mini setup, recompute size of each level
+			_proba_collision = 1.0 -  pow(((_gamma*(double)_nelem -1 ) / (_gamma*(double)_nelem)),_nelem-1);
+			uint64_t previous_idx =0;
+			_hash_domain = (size_t)  (ceil(double(_nelem) * _gamma)) ;
+			for(int ii=0; ii<_nb_levels; ii++)
+			{
+				_levels[ii].idx_begin = previous_idx;
+				// same rounding as in setup(): next multiple of 64, never 0
+				_levels[ii].hash_domain =  (( (uint64_t) (_hash_domain * pow(_proba_collision,ii)) + 63) / 64 ) * 64;
+				if(_levels[ii].hash_domain == 0 )
+					_levels[ii].hash_domain  = 64 ;
+				previous_idx += _levels[ii].hash_domain;
+			}
+
+			//restore final hash
+			_final_hash.clear();
+
+			// zero-initialised so that a read that extracts nothing leaves an empty map instead of looping on an indeterminate count
+			size_t final_hash_size = 0;
+			is.read(reinterpret_cast<char *>(&final_hash_size), sizeof(size_t));
+
+			// size_t counter: an unsigned int one wraps around and never reaches a count above UINT_MAX
+			for(size_t ii=0; ii<final_hash_size; ii++)
+			{
+				elem_t key;
+				uint64_t value;
+
+				is.read(reinterpret_cast<char *>(&key), sizeof(elem_t));
+				is.read(reinterpret_cast<char *>(&value), sizeof(uint64_t));
+
+				_final_hash[key] = value;
+			}
+
+			// NOTE(review): the stream state is never checked; a truncated input still ends up marked as built
+			_built = true;
+		}
+
+
+		private :
+
+		void setup()
+		{
+			pthread_mutex_init(&_mutex, NULL);
+
+			// fast mode: reserve room for the fraction of the keys that may be kept in RAM
+			if(_fastmode)
+				setLevelFastmode.resize(_percent_elem_loaded_for_fastMode * (double)_nelem );
+
+			_proba_collision = 1.0 -  pow(((_gamma*(double)_nelem -1 ) / (_gamma*(double)_nelem)),_nelem-1);
+			// sum_geom is only referenced by the commented-out printf below
+			double sum_geom =_gamma * ( 1.0 +  _proba_collision / (1.0 - _proba_collision));
+			// printf("proba collision %f  sum_geom  %f   \n",_proba_collision,sum_geom);
+
+			_nb_levels = 25;
+			_levels.resize(_nb_levels);
+
+			//build levels
+			uint64_t previous_idx =0;
+			for(int ii=0; ii<_nb_levels; ii++)
+			{
+
+				_levels[ii].idx_begin = previous_idx;
+
+				// round size to nearest superior multiple of 64, makes it easier to clear a level
+				_levels[ii].hash_domain =  (( (uint64_t) (_hash_domain * pow(_proba_collision,ii)) + 63) / 64 ) * 64;
+				if(_levels[ii].hash_domain == 0 ) _levels[ii].hash_domain  = 64 ;
+				previous_idx += _levels[ii].hash_domain;
+
+				//printf("build level %i bit array : start %12llu, size %12llu  ",ii,_levels[ii]->idx_begin,_levels[ii]->hash_domain );
+				//printf(" expected elems : %.2f %% total \n",100.0*pow(_proba_collision,ii));
+
+			}
+			// fast-mode level: first level whose pow(_proba_collision, level) is below the RAM fraction
+			for(int ii=0; ii<_nb_levels; ii++)
+			{
+				 if(pow(_proba_collision,ii) < _percent_elem_loaded_for_fastMode)
+				 {
+				 	_fastModeLevel = ii;
+				 	// printf("fast mode level :  %i \n",ii);
+				 	break;
+				 }
+			}
+			// NOTE: when no level satisfies the test above, _fastModeLevel is not assigned by this function
+
+		}
+
+
+		//finds the first level whose bit is set for val: its index is stored in *res_level, the hash computed last is returned
+		uint64_t getLevel(hash_pair_t & bbhash, elem_t val,int * res_level, int maxlevel = 100)
+		{
+			int level = 0;
+			uint64_t hash_raw=0;
+			//at most min(_nb_levels-1, maxlevel) levels are probed; when none matches, *res_level equals that bound
+			for (int ii=0; ii<(_nb_levels-1) &&  ii < maxlevel ; ii++ )
+			{
+
+				//compute the next hash
+				 if ( ii == 0)
+					hash_raw = _hasher.h0(bbhash,val);
+				else if ( ii == 1)
+					hash_raw = _hasher.h1(bbhash,val);
+				else
+				{
+					hash_raw = _hasher.next(bbhash);
+				}
+
+				//the slot of val is occupied in level ii: stop here
+				if( _levels[ii].get(hash_raw) )
+				{
+					break;
+				}
+
+				level++;
+			}
+
+			*res_level = level;
+			return hash_raw;
+		}
+
+
+		//set the slot of level_hash in the bit array of level i
+		void insertIntoLevel(uint64_t level_hash, int i)
+		{
+			uint64_t hashl =  level_hash % _levels[i].hash_domain;
+			//NOTE(review): atomic_test_and_set is defined outside this view; presumably it returns the previous bit, so a true result is a collision recorded in _tempBitset -- confirm
+			if( _levels[i].bitset.atomic_test_and_set(hashl) )
+			{
+				_tempBitset->atomic_test_and_set(hashl);
+			}
+
+		}
+
+
+		//build level i: reset the per-level counters, run _num_thread workers over the keys and wait for them
+		template <typename Range>
+		void processLevel(Range const& input_range,int i)
+		{
+			//alloc the bitset for this level
+			_levels[i].bitset =  bitVector(_levels[i].hash_domain);
+
+			_cptLevel = 0;
+			_hashidx = 0;
+			_idxLevelsetLevelFastmode =0;
+			_nb_living =0;
+			//thread handles live in a vector, so they are released on every exit path (a throwing make_shared included)
+			std::vector<pthread_t> tab_threads(_num_thread);
+			typedef decltype(input_range.begin()) it_type;
+			thread_args<Range, it_type> t_arg; // same argument for all threads
+			t_arg.boophf = this;
+			t_arg.range = &input_range;
+			t_arg.it_p =  std::static_pointer_cast<void>(std::make_shared<it_type>(input_range.begin()));
+			t_arg.until_p =  std::static_pointer_cast<void>(std::make_shared<it_type>(input_range.end()));
+
+			t_arg.level = i;
+			//NOTE(review): the return value of pthread_create is not checked below
+			if(i >= (_fastModeLevel+1) && _fastmode)
+			{
+				// levels after the fast-mode level iterate over the in-RAM key set instead of the input range;
+				// data() is used rather than &v[0], which is undefined on an empty vector
+				auto data_iterator = boomphf::range(static_cast<const elem_t*>( setLevelFastmode.data()), static_cast<const elem_t*>( setLevelFastmode.data() +setLevelFastmode.size()));
+                typedef decltype(data_iterator.begin()) fastmode_it_type;
+				t_arg.it_p =  std::static_pointer_cast<void>(std::make_shared<fastmode_it_type>(data_iterator.begin()));
+				t_arg.until_p =  std::static_pointer_cast<void>(std::make_shared<fastmode_it_type>(data_iterator.end()));
+
+                /* we'd like to do t_arg.it = data_iterator.begin() but types are different;
+                    so, casting to (void*) because of that; and we remember the type in the template */
+
+                for(int ii=0;ii<_num_thread;ii++)
+                    pthread_create (&tab_threads[ii], NULL,  thread_processLevel<elem_t, Hasher_t, Range, fastmode_it_type>, &t_arg); //&t_arg[ii]
+			}
+			else
+			{
+			    for(int ii=0;ii<_num_thread;ii++)
+                    pthread_create (&tab_threads[ii], NULL,  thread_processLevel<elem_t, Hasher_t, Range, it_type>, &t_arg); //&t_arg[ii]
+			}
+			//joining: t_arg lives on this stack frame, so every worker must be done before returning
+			for(int ii=0;ii<_num_thread;ii++)
+			{
+				pthread_join(tab_threads[ii], NULL);
+			}
+
+			if(i == _fastModeLevel) //shrink to actual number of elements in set
+			{
+				setLevelFastmode.resize(_idxLevelsetLevelFastmode);
+			}
+		}
+
+	private:
+		//level ** _levels;
+		std::vector<level> _levels;
+		int _nb_levels;
+        MultiHasher_t _hasher;
+		bitVector * _tempBitset;
+
+		double _gamma;
+		uint64_t _hash_domain;
+		uint64_t _nelem;
+        std::unordered_map<elem_t,uint64_t,Hasher_t> _final_hash;
+		Progress _progressBar;
+		int _nb_living;
+		int _num_thread;
+		uint64_t _hashidx;
+		double _proba_collision;
+		uint64_t _lastbitsetrank;
+		uint64_t _idxLevelsetLevelFastmode;
+		uint64_t _cptLevel;
+
+		// fast build mode , requires  that _percent_elem_loaded_for_fastMode %   elems are loaded in ram
+		float _percent_elem_loaded_for_fastMode ;
+		bool _fastmode;
+		std::vector< elem_t > setLevelFastmode;
+		int _fastModeLevel;
+		bool _withprogress;
+		bool _built;
+	public:
+		pthread_mutex_t _mutex;
+	};
+
+////////////////////////////////////////////////////////////////
+#pragma mark -
+#pragma mark threading
+////////////////////////////////////////////////////////////////
+
+
+	// pthread entry point: unpacks the thread_args, allocates this worker's key buffer and runs mphf::pthread_processLevel(); always returns NULL.
+    template <typename elem_t, typename Hasher_t, typename Range, typename it_type>
+	void * thread_processLevel(void * args)
+	{
+		if(args ==NULL)
+			return NULL;
+
+		typedef thread_args<Range,it_type> args_t;
+		typedef mphf<elem_t, Hasher_t> mphf_t;
+		args_t * params = static_cast<args_t*>(args);
+		mphf_t * builder = static_cast<mphf_t*>(params->boophf);
+		const int lvl = params->level;
+		std::vector<elem_t> keys(NBBUFF);
+
+		// the shared iterators are copied under the mutex, so that no other worker is using them to fill its buffer meanwhile
+		pthread_mutex_lock(& builder->_mutex);
+        std::shared_ptr<it_type> cursor = std::static_pointer_cast<it_type>(params->it_p);
+        std::shared_ptr<it_type> last = std::static_pointer_cast<it_type>(params->until_p);
+		pthread_mutex_unlock(& builder->_mutex);
+
+		builder->pthread_processLevel(keys, cursor, last, lvl);
+		return NULL;
+	}
+}
+
+#endif //__BOO_PHF__
diff --git a/include/Const.hpp b/include/Const.hpp
new file mode 100644
index 0000000..39b4c21
--- /dev/null
+++ b/include/Const.hpp
@@ -0,0 +1,36 @@
+/* 
+ *  Copyright (c) 2012 Daisuke Okanohara
+ * 
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ * 
+ *   1. Redistributions of source code must retain the above Copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *   2. Redistributions in binary form must reproduce the above Copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *
+ *   3. Neither the name of the authors nor the names of its contributors
+ *      may be used to endorse or promote products derived from this
+ *      software without specific prior written permission.
+ */
+
+#ifndef RSDIC_CONST_HPP_
+#define RSDIC_CONST_HPP_
+
+#include <stdint.h>
+
+namespace rsdic{
+// Size parameters of the rsdic structure. NOTE(review): the units (presumably bits) are not stated in this header -- confirm against the implementation.
+static const uint64_t kLargeBlockSize = 1024;
+static const uint64_t kSmallBlockSize = 64;
+static const uint64_t kSelectBlockSize = 2048;
+static const uint64_t kUseRawLen = 48;
+static const uint64_t kSmallBlockPerLargeBlock = kLargeBlockSize / kSmallBlockSize; // 1024 / 64 = 16
+
+
+}
+
+#endif // RSDIC_CONST_HPP_
diff --git a/include/EnumCoder.hpp b/include/EnumCoder.hpp
new file mode 100644
index 0000000..675cf70
--- /dev/null
+++ b/include/EnumCoder.hpp
@@ -0,0 +1,53 @@
+/* 
+ *  Copyright (c) 2012 Daisuke Okanohara
+ * 
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ * 
+ *   1. Redistributions of source code must retain the above Copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *   2. Redistributions in binary form must reproduce the above Copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *
+ *   3. Neither the name of the authors nor the names of its contributors
+ *      may be used to endorse or promote products derived from this
+ *      software without specific prior written permission.
+ */
+
+#ifndef RSDIC_ENUM_CODER_HPP_
+#define RSDIC_ENUM_CODER_HPP_
+
+#include <stdint.h>
+#include "Const.hpp"
+
+namespace rsdic{
+
+class EnumCoder{ // static-only helper: every member function takes the code together with rank_sb
+public:
+  static uint64_t Encode(uint64_t val, uint64_t rank_sb);
+  static uint64_t Decode(uint64_t code, uint64_t rank_sb);
+  static bool GetBit(uint64_t code, uint64_t rank_sb, uint64_t pos);
+  static uint64_t Rank(uint64_t code, uint64_t rank_sb, uint64_t pos);
+  static uint64_t Select(uint64_t code, uint64_t rank_sb, uint64_t num, bool bit);
+  // Table lookup without bounds check: kEnumCodeLength_ has 65 entries, so rank_sb must be in [0, 64].
+  static uint64_t Len(uint64_t rank_sb){
+    return kEnumCodeLength_[rank_sb];
+  }
+  // NOTE(review): presumably the bit == false / bit == true cases of Select -- definitions are outside this view, confirm.
+  static uint64_t Select0(uint64_t code, uint64_t rank_sb, uint64_t num);
+  static uint64_t Select1(uint64_t code, uint64_t rank_sb, uint64_t num);
+  // Helpers and lookup tables; all of them are defined outside this header.
+private:
+  static uint64_t PopCount(uint64_t code);
+  static uint64_t SelectRaw(uint64_t code, uint64_t num);
+  static const uint8_t kPopCount_[256];
+  static const uint64_t kCombinationTable64_[65][65];
+  static const uint8_t kEnumCodeLength_[65];
+};
+
+}
+
+#endif // RSDIC_ENUM_CODER_HPP_
diff --git a/include/HitManager.hpp b/include/HitManager.hpp
new file mode 100644
index 0000000..24a288e
--- /dev/null
+++ b/include/HitManager.hpp
@@ -0,0 +1,109 @@
+#ifndef __HIT_MANAGER_HPP__
+#define __HIT_MANAGER_HPP__
+
+#include "RapMapUtils.hpp"
+#include "RapMapIndex.hpp"
+#include "RapMapSAIndex.hpp"
+
+//#include "eytzinger_array.h"
+
+#include <tuple>
+#include <vector>
+#include <algorithm>
+#include <map>
+#include <unordered_map>
+
+namespace rapmap {
+    namespace hit_manager {
+        using HitInfo = rapmap::utils::HitInfo;
+        using ProcessedHit = rapmap::utils::ProcessedHit;
+        using MateStatus = rapmap::utils::MateStatus;
+        using PositionListHelper = rapmap::utils::PositionListHelper;
+        using QuasiAlignment = rapmap::utils::QuasiAlignment;
+        using TxpQueryPos = rapmap::utils::TxpQueryPos;
+        using SATxpQueryPos = rapmap::utils::SATxpQueryPos;
+
+        template <typename T>
+        using SAIntervalHit = rapmap::utils::SAIntervalHit<T>;
+        using SAHitMap = std::map<int, rapmap::utils::ProcessedSAHit>;
+        using ProcessedSAHit = rapmap::utils::ProcessedSAHit;
+
+        class SAProcessedHitVec {
+            public:
+                std::vector<ProcessedSAHit> hits;
+                std::vector<uint32_t> txps;
+        };
+        /*
+        using SAProcessedHitVec = std::tuple<std::vector<ProcessedSAHit>, std::vector<uint32_t>>;
+        */
+
+        // Return hits from processedHits where position constraints
+        // match maxDist
+        bool collectHitsSimple(std::vector<ProcessedHit>& processedHits,
+                uint32_t readLen,
+                uint32_t maxDist,
+                std::vector<QuasiAlignment>& hits,
+                MateStatus mateStatus);
+
+        // Return hits from processedHits (an SAHitMap, i.e. std::map<int, ProcessedSAHit>)
+        // where position constraints match maxDist
+        bool collectHitsSimpleSA(SAHitMap& processedHits,
+                uint32_t readLen,
+                uint32_t maxDist,
+                std::vector<QuasiAlignment>& hits,
+                MateStatus mateStatus);
+
+        // Return hits from processedHits (a std::vector<ProcessedSAHit>)
+        // where position constraints match maxDist
+        bool collectHitsSimpleSA2(std::vector<ProcessedSAHit>& processedHits,
+                uint32_t readLen,
+                uint32_t maxDist,
+                std::vector<QuasiAlignment>& hits,
+                MateStatus mateStatus);
+
+
+        // Intersects the hit h2 with outHits.
+        // This will modify outHits so that the tqvec field of the
+        // entries in outHits that are labeled by the transcripts in
+        // which h2 appears will have an iterator to the beginning of
+        // the position list for h2.
+        void intersectWithOutput(HitInfo& h2, RapMapIndex& rmi,
+                std::vector<ProcessedHit>& outHits);
+
+        template <typename RapMapIndexT>
+        void intersectSAIntervalWithOutput(SAIntervalHit<typename RapMapIndexT::IndexType>& h,
+                                           RapMapIndexT& rmi,
+                                           uint32_t intervalCounter,
+                                           SAHitMap& outHits);
+                                           
+
+        template <typename RapMapIndexT>
+        void intersectSAIntervalWithOutput2(SAIntervalHit<typename RapMapIndexT::IndexType>& h,
+                RapMapIndexT& rmi,
+                SAProcessedHitVec& outStructs);
+
+        /*
+        void intersectSAIntervalWithOutput3(SAIntervalHit& h,
+                RapMapSAIndex& rmi,
+                SAProcessedHitVec& outHits);
+                */
+
+        std::vector<ProcessedHit> intersectHits(
+                std::vector<HitInfo>& inHits,
+                RapMapIndex& rmi);
+
+        template <typename RapMapIndexT>
+        SAHitMap intersectSAHits(
+                                 std::vector<SAIntervalHit<typename RapMapIndexT::IndexType>>& inHits,
+                                 RapMapIndexT& rmi, 
+                                 bool strictFilter=false);
+
+        template <typename RapMapIndexT>
+        std::vector<ProcessedSAHit> intersectSAHits2(
+                std::vector<SAIntervalHit<typename RapMapIndexT::IndexType>>& inHits,
+                RapMapIndexT& rmi);
+    }
+}
+
+
+#endif // __HIT_MANAGER_HPP__
diff --git a/include/IndexHeader.hpp b/include/IndexHeader.hpp
new file mode 100644
index 0000000..87eba2d
--- /dev/null
+++ b/include/IndexHeader.hpp
@@ -0,0 +1,77 @@
+#ifndef __INDEX_HEADER_HPP__
+#define __INDEX_HEADER_HPP__
+
+#include "spdlog/spdlog.h"
+#include <cereal/types/string.hpp>
+
+// The different types of indices supported
+enum class IndexType : uint8_t {
+    PSEUDO = 0,
+    QUASI,
+    INVALID
+};
+
+// Metadata stored with an index: its type, version string, k-mer usage and length, suffix-array width and perfect-hash flag.
+class IndexHeader {
+    public:
+        // Invalid header. Every member is initialised (bigSA_ included), so save() and bigSA() never read an indeterminate value.
+        IndexHeader () : type_(IndexType::INVALID), versionString_("invalid"), usesKmers_(false), kmerLen_(0), bigSA_(false), perfectHash_(false) {}
+        IndexHeader(IndexType typeIn, const std::string& versionStringIn,
+                    bool usesKmersIn, uint32_t kmerLenIn, bool bigSA = false, bool perfectHash = false):
+                    type_(typeIn), versionString_(versionStringIn),
+                    usesKmers_(usesKmersIn), kmerLen_(kmerLenIn), bigSA_(bigSA),
+                    perfectHash_(perfectHash) {}
+        // Writes the six fields to the archive as name/value pairs.
+        template <typename Archive>
+            void save(Archive& ar) const {
+                ar( cereal::make_nvp("IndexType", type_) );
+                ar( cereal::make_nvp("IndexVersion", versionString_) );
+                ar( cereal::make_nvp("UsesKmers", usesKmers_) );
+                ar( cereal::make_nvp("KmerLen", kmerLen_) );
+                ar( cereal::make_nvp("BigSA", bigSA_) );
+                ar( cereal::make_nvp("PerfectHash", perfectHash_) );
+            }
+        // Reads the six fields back; on a cereal exception the error is logged to "stderrLog" and the process exits with status 1.
+        template <typename Archive>
+        void load(Archive& ar) {
+            try {
+                ar( cereal::make_nvp("IndexType", type_) );
+                ar( cereal::make_nvp("IndexVersion", versionString_) );
+                ar( cereal::make_nvp("UsesKmers", usesKmers_) );
+                ar( cereal::make_nvp("KmerLen", kmerLen_) );
+                ar( cereal::make_nvp("BigSA", bigSA_) );
+                ar( cereal::make_nvp("PerfectHash", perfectHash_) );
+            } catch (const cereal::Exception& e) {
+                auto cerrLog = spdlog::get("stderrLog");
+                cerrLog->error("Encountered exception [{}] when loading index.", e.what());
+                cerrLog->error("The index was likely build with an older (and incompatible) "
+                               "version of RapMap.  Please re-build the index with a compatible version.");
+                cerrLog->flush(); 
+                std::exit(1);
+            }
+        }
+
+        IndexType indexType() const { return type_; }
+        std::string version() const { return versionString_; }
+        bool usesKmers() const { return usesKmers_; }
+        uint32_t kmerLen() const { return kmerLen_; }
+        bool bigSA() const { return bigSA_; }
+        bool perfectHash() const { return perfectHash_; }
+
+    private:
+        // The type of index we have
+        IndexType type_;
+        // The version string for the index
+        std::string versionString_;
+        // True if this index makes use of k-mers false otherwise
+        // (currently, all supported indices use k-mers in some form)
+        bool usesKmers_;
+        // The length of k-mer used by the index
+        uint32_t kmerLen_;
+        // Do we have a 64-bit suffix array or not
+        bool bigSA_;
+        // Are we using a perfect hash in the index or not?
+        bool perfectHash_;
+};
+
+
+#endif // __INDEX_HEADER_HPP__
diff --git a/include/JFRaw.hpp b/include/JFRaw.hpp
new file mode 100644
index 0000000..4efa052
--- /dev/null
+++ b/include/JFRaw.hpp
@@ -0,0 +1,30 @@
+#ifndef __JF_RAW_H__
+#define __JF_RAW_H__
+
+#include "jellyfish/file_header.hpp"
+// Type for values
+/*
+struct value_type {
+  char foo;
+  int  bar;
+  bool baz;
+};
+*/
+
+// Special header type. Just like the jellyfish header type, but it also
+// stores the size in bytes of the hash array under the "size_bytes" key.
+class SpecialHeader : public jellyfish::file_header {
+public:
+  SpecialHeader() = default;
+  SpecialHeader(std::istream& is) : jellyfish::file_header(is) { } // forwards the stream to the jellyfish header constructor
+  // Runs the base-class update, then records ary.size_bytes() in the header.
+  template<typename storage>
+  void update_from_ary(const storage& ary) {
+    jellyfish::file_header::update_from_ary(ary);
+    root_["size_bytes"] = (Json::UInt64)ary.size_bytes();
+  }
+  // Value found under "size_bytes", as recorded by update_from_ary().
+  size_t size_bytes() const { return root_["size_bytes"].asLargestUInt(); }
+};
+
+#endif /* __JF_RAW_H__ */
diff --git a/include/PairAlignmentFormatter.hpp b/include/PairAlignmentFormatter.hpp
new file mode 100644
index 0000000..a3e90fe
--- /dev/null
+++ b/include/PairAlignmentFormatter.hpp
@@ -0,0 +1,29 @@
+#ifndef __PAIR_ALIGNMENT_FORMATTER_HPP__
+#define __PAIR_ALIGNMENT_FORMATTER_HPP__
+
+#include "RapMapUtils.hpp"
+
+template <typename IndexPtrT>
+struct PairAlignmentFormatter {
+    PairAlignmentFormatter(IndexPtrT indexIn) : index(indexIn),
+    read1Temp(1000, 'A'),
+    qual1Temp(1000, '~'),
+    read2Temp(1000, 'A'),
+    qual2Temp(1000, '~'),
+    cigarStr1(buff1, 1000),
+    cigarStr2(buff2, 1000) {
+    }
+    // NOTE(review): the writers are given buff1/buff2 of this very object; whether a copy keeps referring to the source buffers depends on FixedWriter -- confirm
+    // Data members: the index handle, then per-mate scratch storage (1000 characters each)
+    IndexPtrT index;
+    std::string read1Temp;
+    std::string qual1Temp;
+    std::string read2Temp;
+    std::string qual2Temp;
+    char buff1[1000];
+    char buff2[1000];
+    rapmap::utils::FixedWriter cigarStr1;
+    rapmap::utils::FixedWriter cigarStr2;
+};
+
+#endif //__PAIR_ALIGNMENT_FORMATTER_HPP__
diff --git a/include/PairSequenceParser.hpp b/include/PairSequenceParser.hpp
new file mode 100644
index 0000000..98e29b3
--- /dev/null
+++ b/include/PairSequenceParser.hpp
@@ -0,0 +1,193 @@
+#ifndef __PAIR_SEQUENCE_PARSER_HPP__
+#define __PAIR_SEQUENCE_PARSER_HPP__
+
+#include <string>
+#include <memory>
+#include <utility>
+#include <vector>
+#include <thread>
+#include <mutex>
+#include <fstream>
+
+#include <jellyfish/err.hpp>
+#include <jellyfish/cooperative_pool2.hpp>
+#include <jellyfish/cpp_array.hpp>
+
+// One parsed record: the header line (without its leading '>' or '@'),
+// the concatenated sequence lines, and the concatenated quality lines.
+// The parser below only writes `qual` when reading FASTQ.
+struct header_sequence_qual {
+  std::string header;
+  std::string seq;
+  std::string qual;
+};
+// A batch of read pairs. `data` is a fixed-size pool of (first, second)
+// records; `nb_filled` is how many leading entries the parser filled on
+// the most recent read.
+struct sequence_list {
+  size_t nb_filled;
+  std::vector<std::pair<header_sequence_qual, header_sequence_qual> > data;
+};
+
+/// Producer side of a jellyfish::cooperative_pool2 that reads paired
+/// FASTA/FASTQ files in lockstep and fills sequence_list buffers with
+/// (first mate, second mate) records.
+///
+/// Paths are taken two at a time from [path_begin, path_end): the first
+/// becomes stream1 and the second stream2. If a second path is missing,
+/// or either file cannot be opened, the producer slot is marked done.
+template<typename PathIterator>
+class pair_sequence_parser : public jellyfish::cooperative_pool2<pair_sequence_parser<PathIterator>, sequence_list> {
+  typedef jellyfish::cooperative_pool2<pair_sequence_parser<PathIterator>, sequence_list> super;
+  typedef std::unique_ptr<std::istream> stream_type;
+  enum file_type { DONE_TYPE, FASTA_TYPE, FASTQ_TYPE, ERROR_TYPE };
+
+  // State of one producer slot: the detected format, a scratch line
+  // buffer, and the two open streams of the current file pair.
+  struct stream_status {
+    file_type   type;
+    std::string buffer;
+    stream_type stream1;
+    stream_type stream2;
+
+    stream_status() : type(DONE_TYPE) { }
+  };
+  jellyfish::cpp_array<stream_status> streams_;
+  PathIterator                        path_begin_, path_end_;
+  // Guards path_begin_ while producers pull the next pair of paths.
+  std::mutex                          path_mutex_;
+
+public:
+  /// Size is the number of buffers to keep around. It should be
+  /// larger than the number of threads expected to read from this
+  /// class. nb_sequences is the number of sequence pairs to read into a
+  /// buffer. max_producers is the number of producer slots; each one
+  /// opens its own pair of files. 'path_begin' and 'path_end' delimit
+  /// the range of file paths.
+  pair_sequence_parser(uint32_t size, uint32_t nb_sequences,
+                       uint32_t max_producers,
+                       PathIterator path_begin, PathIterator path_end) :
+    super(max_producers, size),
+    streams_(max_producers),
+    path_begin_(path_begin), path_end_(path_end)
+  {
+    for(auto it = super::element_begin(); it != super::element_end(); ++it) {
+      it->nb_filled = 0;
+      it->data.resize(nb_sequences);
+    }
+    for(uint32_t i = 0; i < max_producers; ++i) {
+      streams_.init(i);
+      open_next_files(streams_[i]);
+    }
+  }
+
+  /// Fill `buff` from the file pair owned by producer slot `i`.
+  /// Returns true when the slot has nothing left to read (DONE_TYPE or
+  /// ERROR_TYPE) and false after a batch was read. When either stream is
+  /// exhausted after the read, the next pair of files is opened so the
+  /// following call continues from there.
+  inline bool produce(uint32_t i, sequence_list& buff) {
+    stream_status& st = streams_[i];
+
+    switch(st.type) {
+    case FASTA_TYPE:
+      read_fasta(st, buff);
+      break;
+    case FASTQ_TYPE:
+      read_fastq(st, buff);
+      break;
+    case DONE_TYPE:
+    case ERROR_TYPE:
+      return true;
+    }
+
+    if(st.stream1->good() && st.stream2->good())
+      return false;
+
+    // Reach the end of file, close current and try to open the next one
+    open_next_files(st);
+    return false;
+  }
+
+protected:
+  /// Classify a stream by its first character without consuming it:
+  /// EOF -> DONE_TYPE, '>' -> FASTA_TYPE, '@' -> FASTQ_TYPE, anything
+  /// else -> ERROR_TYPE.
+  file_type peek_file_type(std::istream& is) {
+    switch(is.peek()) {
+    case EOF: return DONE_TYPE;
+    case '>': return FASTA_TYPE;
+    case '@': return FASTQ_TYPE;
+    default: return ERROR_TYPE;
+    }
+  }
+
+  /// Close the current streams of `st` and open the next two paths.
+  /// Sets st.type to DONE_TYPE when fewer than two paths remain or when
+  /// either file fails to open. Empty files are skipped by recursing to
+  /// the next pair. Throws std::runtime_error when the two files have
+  /// different formats or an unrecognized format.
+  void open_next_files(stream_status& st) {
+    st.stream1.reset();
+    st.stream2.reset();
+    const char *p1 = 0, *p2 = 0;
+    {
+      std::lock_guard<std::mutex> lck(path_mutex_);
+      if(path_begin_ < path_end_) {
+        p1 = *path_begin_;
+        ++path_begin_;
+      }
+      if(path_begin_ < path_end_) {
+        p2 = *path_begin_;
+        ++path_begin_;
+      }
+    }
+
+    if(!p1 || !p2) {
+      st.type = DONE_TYPE;
+      return;
+    }
+    st.stream1.reset(new std::ifstream(p1));
+    st.stream2.reset(new std::ifstream(p2));
+    if(!*st.stream1 || !*st.stream2) {
+      st.type = DONE_TYPE;
+      return;
+    }
+
+    // Detect the format of both files from their first character; the
+    // streams themselves are not advanced here.
+    file_type type1 = peek_file_type(*st.stream1);
+    file_type type2 = peek_file_type(*st.stream2);
+    if(type1 == DONE_TYPE || type2 == DONE_TYPE)
+      return open_next_files(st);
+    if(type1 != type2)
+       throw std::runtime_error("Paired files are of different format");
+    if(type1 == ERROR_TYPE || type2 == ERROR_TYPE)
+       throw std::runtime_error("Unsupported format");
+    st.type = type1;
+  }
+
+  /// Read one FASTA record from `is` into `hsq`: the header line (minus
+  /// the leading '>') and all sequence lines up to the next '>' or EOF,
+  /// concatenated. `tmp` is scratch space for a single line.
+  void read_fasta_one_sequence(std::istream& is, std::string& tmp, header_sequence_qual& hsq) {
+    is.get(); // Skip '>'
+    std::getline(is, hsq.header);
+    hsq.seq.clear();
+    while(is.peek() != '>' && is.peek() != EOF) {
+      std::getline(is, tmp); // Wish there was an easy way to combine the
+      hsq.seq.append(tmp);             // two lines avoiding copying
+    }
+  }
+
+  /// Fill `buff` with FASTA record pairs until the buffer is full or
+  /// either stream reaches EOF; buff.nb_filled receives the pair count.
+  void read_fasta(stream_status& st, sequence_list& buff) {
+    size_t&      nb_filled = buff.nb_filled;
+    const size_t data_size = buff.data.size();
+
+    for(nb_filled = 0; nb_filled < data_size && st.stream1->peek() != EOF && st.stream2->peek() != EOF; ++nb_filled) {
+      read_fasta_one_sequence(*st.stream1, st.buffer, buff.data[nb_filled].first);
+      read_fasta_one_sequence(*st.stream2, st.buffer, buff.data[nb_filled].second);
+    }
+  }
+
+  /// Read one FASTQ record from `is` into `hsq`.
+  /// Throws std::runtime_error when the file ends before the '+' line,
+  /// when the quality string length differs from the sequence length, or
+  /// when the record is followed by something other than '@' or EOF.
+  void read_fastq_one_sequence(std::istream& is, std::string& tmp, header_sequence_qual& hsq) {
+    is.get(); // Skip '@'
+    std::getline(is, hsq.header);
+    hsq.seq.clear();
+    while(is.peek() != '+' && is.peek() != EOF) {
+      std::getline(is, tmp); // Wish there was an easy way to combine the
+      hsq.seq.append(tmp);             // two lines avoiding copying
+    }
+    if(!is.good())
+      throw std::runtime_error("Truncated fastq file");
+    // Skip the '+' separator line.
+    is.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
+    hsq.qual.clear();
+    while(hsq.qual.size() < hsq.seq.size() && is.good()) {
+      std::getline(is, tmp);
+      hsq.qual.append(tmp);
+    }
+    // These two checks previously constructed the exception object without
+    // throwing it, so malformed records were silently accepted.
+    if(hsq.qual.size() != hsq.seq.size())
+      throw std::runtime_error("Invalid fastq file: wrong number of quals");
+    if(is.peek() != EOF && is.peek() != '@')
+      throw std::runtime_error("Invalid fastq file: header missing");
+
+  }
+
+  /// Fill `buff` with FASTQ record pairs until the buffer is full or
+  /// either stream reaches EOF; buff.nb_filled receives the pair count.
+  void read_fastq(stream_status& st, sequence_list& buff) {
+    size_t&      nb_filled = buff.nb_filled;
+    const size_t data_size = buff.data.size();
+
+    for(nb_filled = 0; nb_filled < data_size && st.stream1->peek() != EOF && st.stream2->peek() != EOF; ++nb_filled) {
+      read_fastq_one_sequence(*st.stream1, st.buffer, buff.data[nb_filled].first);
+      read_fastq_one_sequence(*st.stream2, st.buffer, buff.data[nb_filled].second);
+    }
+  }
+};
+
+#endif /* __PAIR_SEQUENCE_PARSER_HPP__ */
diff --git a/include/RapMapConfig.hpp b/include/RapMapConfig.hpp
new file mode 100644
index 0000000..df7d935
--- /dev/null
+++ b/include/RapMapConfig.hpp
@@ -0,0 +1,14 @@
+#ifndef __RAPMAP_CONFIG_HPP__
+#define __RAPMAP_CONFIG_HPP__
+
+#include <string>
+
+// Compile-time version information for RapMap.
+// NOTE(review): uint32_t is used below but only <string> is included in
+// this header; <cstdint> presumably arrives transitively -- confirm.
+namespace rapmap {
+    // Individual version components as strings; `version` is the same
+    // triple joined with dots.
+    constexpr char majorVersion[] = "0";
+    constexpr char minorVersion[] = "3";
+    constexpr char patchVersion[] = "0";
+    constexpr char version [] = "0.3.0";
+    // Numeric version of the index; presumably bumped when the on-disk
+    // index layout changes (verify against the index loader).
+    constexpr uint32_t indexVersion = 2;
+}
+
+#endif //__RAPMAP_CONFIG_HPP__
diff --git a/include/RapMapFileSystem.hpp b/include/RapMapFileSystem.hpp
new file mode 100644
index 0000000..0292128
--- /dev/null
+++ b/include/RapMapFileSystem.hpp
@@ -0,0 +1,15 @@
+#ifndef __RAPMAP_FILESYSTEM_HPP__
+#define __RAPMAP_FILESYSTEM_HPP__
+
+// Small filesystem helpers. Only the declarations are visible here; the
+// behaviour notes below are inferred from the names -- confirm against
+// the definitions.
+namespace rapmap {
+    namespace fs {
+        // Taken from http://stackoverflow.com/questions/12774207/fastest-way-to-check-if-a-file-exist-using-standard-c-c11-c
+        // Presumably reports whether `path` names an existing file.
+        bool FileExists(const char *path);
+        // Taken from http://stackoverflow.com/questions/12774207/fastest-way-to-check-if-a-file-exist-using-standard-c-c11-c
+        // Presumably reports whether `path` names an existing directory.
+        bool DirExists(const char *path);
+        // Presumably creates the directory `path`; no status is returned.
+        void MakeDir(const char* path);
+    }
+}
+
+
+#endif //__RAPMAP_FILESYSTEM_HPP__
diff --git a/include/RapMapIndex.hpp b/include/RapMapIndex.hpp
new file mode 100644
index 0000000..3994d8d
--- /dev/null
+++ b/include/RapMapIndex.hpp
@@ -0,0 +1,52 @@
+#ifndef __RAP_MAP_INDEX_HPP__
+#define __RAP_MAP_INDEX_HPP__
+
+#include <fstream>
+#include <memory>
+
+//#include "jellyfish/jellyfish.hpp"
+#include "jellyfish/file_header.hpp"
+#include "jellyfish/binary_dumper.hpp"
+#include "jellyfish/hash_counter.hpp"
+#include "jellyfish/mapped_file.hpp"
+#include "JFRaw.hpp"
+
+#include "spdlog/spdlog.h"
+
+#include <cereal/types/unordered_map.hpp>
+#include <cereal/types/vector.hpp>
+#include <cereal/types/string.hpp>
+#include <cereal/archives/binary.hpp>
+
+#include "RapMapUtils.hpp"
+#include "ScopedTimer.hpp"
+
+// Container for the k-mer hash based index. All data members are public
+// and are filled by load(), whose definition is not visible in this header.
+class RapMapIndex {
+    using PositionList = std::vector<uint32_t>;
+    using KmerInfoList = std::vector<rapmap::utils::KmerInfo>;
+    using EqClassList = std::vector<rapmap::utils::EqClass>;
+    //using MerMapT = jellyfish::cooperative::hash_counter<rapmap::utils::my_mer>;
+    using FileMerArray = jellyfish::large_hash::array_raw<rapmap::utils::my_mer>;
+    using EqClassLabelVec = std::vector<uint32_t>;
+
+    //using KmerIndex = std::unordered_map<uint64_t, TranscriptList, rapmap::utils::KmerKeyHasher>;
+    //using IntervalIndex = std::unordered_map<uint64_t, rapmap::utils::KmerInterval, rapmap::utils::KmerKeyHasher>;
+
+    public:
+    RapMapIndex();
+
+    // Loads the index identified by indexPrefix; presumably returns true
+    // on success (definition not shown here).
+    bool load(std::string& indexPrefix);
+
+    KmerInfoList kmerInfos;
+    // NOTE(review): std::unique_ptr<char> releases with `delete`, not
+    // `delete[]`. If load() allocates this with `new char[n]` the type
+    // should be std::unique_ptr<char[]> -- confirm against the definition.
+    std::unique_ptr<char> rawHashMem{nullptr};
+    std::unique_ptr<FileMerArray> merHash{nullptr};
+    EqClassList eqClassList;
+    EqClassLabelVec eqLabelList;
+    PositionList posList;
+    // Reference (transcript) names and lengths; rapmap::utils::writeSAMHeader
+    // reads members with these names to emit @SQ lines.
+    std::vector<std::string> txpNames;
+    std::vector<uint32_t> txpLens;
+    std::vector<uint8_t> fwdJumpTable;
+    std::vector<uint8_t> revJumpTable;
+};
+
+#endif //__RAP_MAP_INDEX_HPP__
diff --git a/include/RapMapSAIndex.hpp b/include/RapMapSAIndex.hpp
new file mode 100644
index 0000000..4bb5f83
--- /dev/null
+++ b/include/RapMapSAIndex.hpp
@@ -0,0 +1,63 @@
+#ifndef __RAPMAP_SA_INDEX_HPP__
+#define __RAPMAP_SA_INDEX_HPP__
+
+#include <cereal/types/unordered_map.hpp>
+#include <cereal/types/vector.hpp>
+#include <cereal/types/string.hpp>
+#include <cereal/archives/binary.hpp>
+
+#include "spdlog/spdlog.h"
+#include "spdlog/details/format.h"
+
+#include "google/dense_hash_map"
+#include "bit_array.h"
+//#include "bitmap.h"
+//#include "shared.h"
+#include "rank9b.h"
+
+#include <cstdio>
+#include <vector>
+#include <memory>
+
+#include <fstream>
+#include "RapMapUtils.hpp"
+
+// Suffix-array based index, parameterized on the integer type used for
+// text offsets (IndexT) and on the k-mer hash container (HashT). All data
+// members are public; load() and the constructor are defined elsewhere.
+template <typename IndexT, typename HashT>
+class RapMapSAIndex {
+    public:
+    using IndexType = IndexT;
+    using HashType = HashT;
+
+      // Deleter that releases a BIT_ARRAY with bit_array_free(); a null
+      // pointer is ignored.
+      struct BitArrayDeleter {
+        void operator()(BIT_ARRAY* b) {
+          if(b != nullptr) {
+            bit_array_free(b);
+          }
+        }
+      };
+
+	  using BitArrayPointer = std::unique_ptr<BIT_ARRAY, BitArrayDeleter>;
+
+    RapMapSAIndex();
+
+  	// Given a position, p, in the concatenated text,
+  	// return the corresponding transcript
+  	IndexT transcriptAtPosition(IndexT p);
+
+    // Loads the index stored in directory indDir; presumably returns true
+    // on success (definition not shown here).
+    bool load(const std::string& indDir);
+
+    // The suffix array itself.
+    std::vector<IndexT> SA;
+
+    // Bit array owned through BitArrayDeleter, plus a rank structure;
+    // presumably used by transcriptAtPosition (verify in the definition).
+    BitArrayPointer bitArray{nullptr};
+    std::unique_ptr<rank9b> rankDict{nullptr};
+
+    // NOTE(review): presumably the concatenated transcript text that SA
+    // indexes -- confirm against load().
+    std::string seq;
+    std::vector<std::string> txpNames;
+    std::vector<IndexT> txpOffsets;
+    std::vector<IndexT> txpLens;
+    std::vector<IndexT> positionIDs;
+    std::vector<rapmap::utils::SAIntervalWithKey<IndexT>> kintervals;
+    HashT khash;
+};
+
+#endif //__RAPMAP_SA_INDEX_HPP__
diff --git a/include/RapMapUtils.hpp b/include/RapMapUtils.hpp
new file mode 100644
index 0000000..6239b4c
--- /dev/null
+++ b/include/RapMapUtils.hpp
@@ -0,0 +1,811 @@
+#ifndef __RAP_MAP_UTILS_HPP__
+#define __RAP_MAP_UTILS_HPP__
+
+#include <atomic>
+#include <cmath>
+#include <memory>
+#include "xxhash.h"
+#include <cereal/archives/binary.hpp>
+#include "jellyfish/mer_dna.hpp"
+#include "spdlog/spdlog.h"
+#include "spdlog/details/format.h"
+#include "PairSequenceParser.hpp"
+
+#ifdef RAPMAP_SALMON_SUPPORT
+#include "LibraryFormat.hpp"
+#endif
+
+// Branch-prediction hints. When __GNUC__ is defined they expand to
+// __builtin_expect with an expected value of 1 (LIKELY) or 0 (UNLIKELY);
+// otherwise they expand to the bare expression.
+#ifdef __GNUC__
+#define LIKELY(x) __builtin_expect((x),1)
+#define UNLIKELY(x) __builtin_expect((x),0)
+#else
+#define LIKELY(x) (x)
+#define UNLIKELY(x) (x)
+#endif
+
+// Must be forward-declared
+template <typename IndexT>
+class PairAlignmentFormatter;
+template <typename IndexT>
+class SingleAlignmentFormatter;
+
+// Forward-declare because the C++ compiler is dumb
+class RapMapIndex;
+
+namespace rapmap {
+    namespace utils {
+
+    using my_mer = jellyfish::mer_dna_ns::mer_base_static<uint64_t, 1>;
+
+    constexpr uint32_t newTxpSetMask = 0x80000000;
+    constexpr uint32_t rcSetMask = 0x40000000;
+
+    // Positions are stored in a packed format, where the highest
+    // 2-bits encode if this position refers to a new transcript
+    // and whether or not the k-mer from the hash matches this txp
+    // in the forward or RC direction.
+    void decodePosition(uint32_t p, uint32_t& pout, bool& newTxp, bool& isRC);
+
+    // Writes a SAM header (one @HD line followed by one @SQ line per
+    // reference in rmi.txpNames / rmi.txpLens) to a spdlog logger.
+    template <typename IndexT>
+        void writeSAMHeader(IndexT& rmi, std::shared_ptr<spdlog::logger> out) {
+            fmt::MemoryWriter header;
+            header.write("@HD\tVN:0.1\tSO:unknown\n");
+
+            const auto refCount = rmi.txpNames.size();
+            size_t refIdx = 0;
+            while (refIdx < refCount) {
+                header.write("@SQ\tSN:{}\tLN:{:d}\n",
+                             rmi.txpNames[refIdx], rmi.txpLens[refIdx]);
+                ++refIdx;
+            }
+            // Eventually output a @PG line
+            //header.format("@PG\t");
+
+            // The logger appends its own newline, so drop the trailing '\n'.
+            std::string text = header.str();
+            text.pop_back();
+            out->info() << text;
+        }
+
+    // Writes a SAM header (one @HD line followed by one @SQ line per
+    // reference in rmi.txpNames / rmi.txpLens) to an output stream,
+    // including the final newline.
+    template <typename IndexT>
+        void writeSAMHeader(IndexT& rmi, std::ostream& outStream) {
+            fmt::MemoryWriter header;
+            header.write("@HD\tVN:0.1\tSO:unknown\n");
+
+            const auto refCount = rmi.txpNames.size();
+            size_t refIdx = 0;
+            while (refIdx < refCount) {
+                header.write("@SQ\tSN:{}\tLN:{:d}\n",
+                             rmi.txpNames[refIdx], rmi.txpLens[refIdx]);
+                ++refIdx;
+            }
+            // Eventually output a @PG line
+            //header.format("@PG\t");
+            outStream << header.str();
+        }
+
+    // from http://stackoverflow.com/questions/9435385/split-a-string-using-c11
+    std::vector<std::string> tokenize(const std::string &s, char delim);
+
+    // from https://github.com/cppformat/cppformat/issues/105
+    // fmt buffer backed by a caller-supplied array. It refuses to grow:
+    // grow() throws std::runtime_error instead of enlarging the storage.
+    class FixedBuffer : public fmt::Buffer<char> {
+        public:
+            // array/size describe the caller-owned storage to write into.
+            FixedBuffer(char *array, std::size_t size)
+                : fmt::Buffer<char>(array, size) {}
+
+        protected:
+            // The requested size is ignored; this always throws.
+            void grow(std::size_t size) {
+                throw std::runtime_error("buffer overflow");
+            }
+    };
+
+    // fmt writer that formats into a caller-supplied array through a
+    // FixedBuffer, so a write that does not fit throws
+    // std::runtime_error("buffer overflow").
+    class FixedWriter : public fmt::Writer {
+        private:
+            FixedBuffer buffer_;
+        public:
+            // The base class receives a reference to buffer_ before buffer_
+            // itself is constructed (bases are initialized before members).
+            // NOTE(review): safe only if fmt::Writer's constructor merely
+            // stores the reference -- confirm against the bundled fmt.
+            FixedWriter(char *array, std::size_t size)
+                : fmt::Writer(buffer_), buffer_(array, size) {}
+    };
+
+    /**
+     * Stores both the key (k-mer)
+     * and the interval to which it corresponds.
+     * This is useful if the hash itself doesn't validate
+     * the key (e.g. a minimum perfect hash).
+     *
+     * load/save pass the fields to the archive in the order
+     * kmer, begin, end.
+     **/
+    template <typename IndexT>
+    struct SAIntervalWithKey {
+        uint64_t kmer;
+      //  SAInterval<IndexT> second;
+        IndexT begin;
+        IndexT end;
+        template <typename Archive>
+            void load(Archive& ar) { ar(kmer, begin, end); }
+
+        template <typename Archive>
+            void save(Archive& ar) const { ar(kmer, begin, end); }
+    };
+
+    // A [begin, end) style pair of suffix-array offsets (the half-open
+    // reading is an assumption -- confirm with the code that fills it).
+    // load/save pass the fields to the archive in the order begin, end.
+    template <typename IndexT>
+    struct SAInterval {
+      /*
+        Disabled constructors, kept for reference. Note that the
+        initializer_list version assigns *(il.begin()) to `end`, i.e. the
+        first element rather than the second.
+
+        SAInterval(IndexT beginIn, IndexT endIn) : begin(beginIn), end(endIn) {}
+	SAInterval(std::initializer_list<IndexT> il) {
+	  auto it = il.begin();
+	  begin = *(it);
+	  ++it;
+	  end = *(il.begin());
+	}
+	*/
+
+        IndexT begin;
+        IndexT end;
+        template <typename Archive>
+            void load(Archive& ar) { ar(begin, end); }
+
+        template <typename Archive>
+            void save(Archive& ar) const { ar(begin, end); }
+    };
+
+
+    // Set of atomic 64-bit counters, all starting at zero. The meaning of
+    // each counter is suggested by its name; the code that updates them is
+    // not visible in this part of the file.
+    struct HitCounters {
+        std::atomic<uint64_t> peHits{0};
+        std::atomic<uint64_t> seHits{0};
+        std::atomic<uint64_t> trueHits{0};
+        std::atomic<uint64_t> totHits{0};
+        std::atomic<uint64_t> numReads{0};
+        std::atomic<uint64_t> tooManyHits{0};
+        std::atomic<uint64_t> lastPrint{0};
+    };
+
+    // Hash functor for jellyfish k-mers: extracts the low 2*k bits of the
+    // mer and hashes those 8 bytes with XXH64 (seed 0).
+    class JFMerKeyHasher{
+        public:
+            size_t operator()(const my_mer& m) const {
+                auto merLen = rapmap::utils::my_mer::k();
+                auto bits = m.get_bits(0, 2*merLen);
+                void* raw = static_cast<void*>(&bits);
+                return XXH64(raw, 8, 0);
+            }
+    };
+
+    // Hash functor for k-mers already packed into a 64-bit integer:
+    // hashes the 8 bytes of the key with XXH64 (seed 0).
+    class KmerKeyHasher {
+        public:
+            size_t operator()(const uint64_t& m) const {
+                // XXH64 takes a pointer, so work on a local copy of the key.
+                uint64_t key = m;
+                void* raw = static_cast<void*>(&key);
+                return XXH64(raw, 8, 0);
+            }
+    };
+
+    // An (offset, length) pair. save/load pass the fields to the archive
+    // in the order offset, length.
+    struct KmerInterval {
+        uint64_t offset;
+        uint32_t length;
+
+        template <typename Archive>
+            void save(Archive& arch) const {
+                arch(offset, length);
+            }
+
+        template <typename Archive>
+            void load(Archive& arch) {
+                arch(offset, length);
+            }
+    };
+
+    // Per-k-mer record of three 32-bit fields (eqId, offset, count),
+    // serialized in that order.
+    struct KmerInfo {
+        // All-zero record; the in-class initializers below supply the zeros.
+        KmerInfo() {}
+
+        KmerInfo(uint32_t eqIdIn, uint32_t offsetIn, uint32_t countIn)
+            : eqId(eqIdIn),
+              offset(offsetIn),
+              count(countIn) {}
+
+        template <typename Archive>
+        void save(Archive& ar) const {
+            ar(eqId, offset, count);
+        }
+
+        template <typename Archive>
+        void load(Archive& ar) {
+            ar(eqId, offset, count);
+        }
+
+        uint32_t eqId = 0;
+        uint32_t offset = 0;
+        uint32_t count = 0;
+    };
+
+
+    // Folds the std::hash of `v` into `seed` in place, using the
+    // 0x9e3779b9 constant together with shifted copies of the old seed.
+    template <class T>
+    inline void hashCombine(std::size_t& seed, const T& v)
+    {
+            const std::size_t mixed =
+                std::hash<T>{}(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+            seed ^= mixed;
+    }
+
+    constexpr uint32_t uint32Invalid = std::numeric_limits<uint32_t>::max();
+    using PositionList = std::vector<uint32_t>;
+    using KmerInfoList = std::vector<KmerInfo>;
+
+    // Which mate(s) of a read an alignment covers. The code further down
+    // treats PAIRED_END_LEFT as "only the left read mapped" and
+    // PAIRED_END_RIGHT as "only the right read mapped".
+    enum class MateStatus : uint8_t {
+        SINGLE_END = 0,
+        PAIRED_END_LEFT = 1,
+        PAIRED_END_RIGHT = 2,
+        PAIRED_END_PAIRED = 3 };
+
+    // Wraps the standard iterator of the Position list to provide
+    // some convenient functionality.  In the future, maybe this
+    // should be a proper iterator adaptor.
+    //
+    // Each 32-bit entry packs the position in the low 30 bits, the
+    // reverse-complement flag in bit 30 (0x40000000) and the
+    // "first entry of a new transcript" flag in bit 31 (0x80000000).
+    // pos(), isRC() and isNewTxp() dereference the iterator, so they must
+    // not be called once done() is true.
+    struct PositionListHelper{
+        using PLIt = PositionList::iterator;
+
+        PositionListHelper(PLIt itIn, PLIt endIn) :
+            it_(itIn), end_(endIn) {}
+        // The underlying iterator shouldn't be advanced further
+        inline bool done() const { return it_ == end_; }
+
+        // The actual position on the transcript
+        int32_t pos() const { return static_cast<int32_t>((*it_) & 0x3FFFFFFF); }
+
+        // True if the position encoded was on the reverse complement strand
+        // of the reference transcript, false otherwise.
+        bool isRC() const { return (*it_) & 0x40000000; }
+
+        // True if we hit the position list for a new transcript, false otherwise
+        bool isNewTxp() const { return (*it_) & 0x80000000; }
+
+        // Advance to the first entry of the next transcript, or to end_ if
+        // there is none. Does nothing when already at the end.
+        void advanceToNextTranscript() {
+            if (it_ < end_) {
+                do {
+                    ++it_;
+                    // Test for the end first: isNewTxp() dereferences it_,
+                    // which is invalid once it_ == end_.
+                } while (it_ != end_ and !isNewTxp());
+
+            }
+        }
+
+        PLIt it_; // The underlying iterator
+        PLIt end_; // The end of the container
+    };
+
+
+    // One candidate mapping of a read (or read pair) to a transcript.
+    //
+    // The constructors set only the fields describing the read itself.
+    // The mate-related fields (matePos, mateIsFwd, mateLen, mateStatus)
+    // take the in-class defaults given at their declarations, so they
+    // hold defined values until a caller fills them in.
+    struct QuasiAlignment {
+        // "Invalid" alignment: tid, pos, fragLen and readLen hold the
+        // maximum value of their type.
+        QuasiAlignment() :
+            tid(std::numeric_limits<uint32_t>::max()),
+            pos(std::numeric_limits<int32_t>::max()),
+            fwd(true),
+            fragLen(std::numeric_limits<uint32_t>::max()),
+            readLen(std::numeric_limits<uint32_t>::max()),
+            isPaired(false)
+#ifdef RAPMAP_SALMON_SUPPORT
+        ,format(LibraryFormat::formatFromID(0))
+#endif // RAPMAP_SALMON_SUPPORT
+        {}
+
+        // tidIn:      transcript id
+        // posIn:      left-most position of the hit
+        // fwdIn:      true if the read maps to the forward strand
+        // readLenIn:  length of the read
+        // fragLenIn:  fragment length (0 for single-end / orphaned reads)
+        // isPairedIn: true for a paired alignment
+        QuasiAlignment(uint32_t tidIn, int32_t posIn,
+                bool fwdIn, uint32_t readLenIn,
+                uint32_t fragLenIn = 0,
+                bool isPairedIn = false) :
+            tid(tidIn), pos(posIn), fwd(fwdIn),
+            fragLen(fragLenIn), readLen(readLenIn),
+            isPaired(isPairedIn)
+#ifdef RAPMAP_SALMON_SUPPORT
+        ,format(LibraryFormat::formatFromID(0))
+#endif // RAPMAP_SALMON_SUPPORT
+        {}
+        QuasiAlignment(QuasiAlignment&& other) = default;
+        QuasiAlignment& operator=(QuasiAlignment&) = default;
+        QuasiAlignment& operator=(QuasiAlignment&& o) = default;
+        QuasiAlignment(const QuasiAlignment& o) = default;
+        QuasiAlignment(QuasiAlignment& o) = default;
+
+        // Some convenience functions to allow salmon interop
+#ifdef RAPMAP_SALMON_SUPPORT
+        inline uint32_t transcriptID() const { return tid; }
+        inline double score() { return 1.0; }
+        inline uint32_t fragLength() { return fragLen; }
+        // Smaller of pos and matePos. With the default matePos (INT32_MAX)
+        // this is simply pos.
+        inline int32_t hitPos() { return std::min(pos, matePos); }
+        double logProb{HUGE_VAL};
+        double logBias{HUGE_VAL};
+        inline LibraryFormat libFormat() { return format; }
+        LibraryFormat format;
+#endif // RAPMAP_SALMON_SUPPORT
+
+        // Only 1 since the mate must have the same tid
+        // we won't call *chimeric* alignments here.
+        uint32_t tid;
+        // Left-most position of the hit
+        int32_t pos;
+        // left-most position of the mate
+        int32_t matePos{std::numeric_limits<int32_t>::max()};
+        // Is the read from the forward strand
+        bool fwd;
+        // Is the mate from the forward strand
+        bool mateIsFwd{true};
+        // The fragment length (template length)
+        // This is 0 for single-end or orphaned reads.
+        uint32_t fragLen;
+        // The read's length
+        uint32_t readLen;
+        // The mate's length
+        uint32_t mateLen{0};
+        // Is this a paired *alignment* or not
+        bool isPaired;
+        MateStatus mateStatus{MateStatus::SINGLE_END};
+    };
+
+    // A k-mer hit: an iterator to the k-mer's KmerInfo entry, the k-mer's
+    // id, and where (and in which orientation) it was found on the query.
+    struct HitInfo {
+        HitInfo(KmerInfoList::iterator kit, uint32_t merIDIn,
+                int32_t queryPosIn, bool queryRCIn) :
+                kinfo(kit), merID(merIDIn), queryPos(queryPosIn),
+                queryRC(queryRCIn) {}
+
+        KmerInfoList::iterator kinfo;
+        uint32_t merID;
+        int32_t queryPos;
+        bool queryRC;
+    };
+
+    // A suffix-array interval hit: the interval bounds [begin, end), a
+    // length `len` (presumably the match length -- confirm with the
+    // producer), and the query position/orientation.
+    template <typename OffsetT>
+    struct SAIntervalHit {
+        SAIntervalHit(OffsetT beginIn, OffsetT endIn, uint32_t lenIn, uint32_t queryPosIn, bool queryRCIn) :
+            begin(beginIn), end(endIn), len(lenIn), queryPos(queryPosIn), queryRC(queryRCIn) {}
+
+	      // Width of the interval: end - begin.
+	      OffsetT span() { return end - begin; }
+        OffsetT begin, end;
+        uint32_t len, queryPos;
+        bool queryRC;
+    };
+
+    // Pairs a position on a transcript (pos) with the matching position on
+    // the query (queryPos), the query orientation, and an `active` flag
+    // that defaults to false.
+    struct SATxpQueryPos {
+	SATxpQueryPos(uint32_t posIn, uint32_t qposIn, bool queryRCIn, bool activeIn = false) :
+		pos(posIn), queryPos(qposIn), queryRC(queryRCIn), active(activeIn) {}
+	uint32_t pos, queryPos;
+	bool queryRC, active;
+    };
+
+    // All suffix-array hits of one query against one transcript (tid),
+    // stored as (transcript position, query position, orientation) triples.
+    struct ProcessedSAHit {
+        ProcessedSAHit() : tid(std::numeric_limits<uint32_t>::max()), active(false), numActive(1) {}
+
+        // Starts the hit list with a single (txpPosIn, queryPosIn, queryRCIn) entry.
+        ProcessedSAHit(uint32_t txpIDIn, uint32_t txpPosIn, uint32_t queryPosIn, bool queryRCIn) :
+            tid(txpIDIn), active(false), numActive(1)
+        {
+            tqvec.emplace_back(txpPosIn, queryPosIn, queryRCIn);
+        }
+
+        /**
+         * This enforces a more stringent consistency check on
+         * the hits for this transcript.  The hits must be co-linear
+         * with respect to the query and target.
+         *
+         * input: numToCheck --- the number of hits to check in sorted order
+         *                       hits after the last of these need not be consistent.
+         * return:
+         *   - exactly 1 hit:  numToCheck.
+         *   - exactly 2 hits: numToCheck if the hit with the larger query
+         *     position also has the larger transcript position, else -1.
+         *   - otherwise: the hits are sorted by query position (this
+         *     reorders tqvec); returns numToCheck if the first numToCheck
+         *     transcript positions are strictly increasing, else the index
+         *     of the first hit that breaks the order.
+         **/
+        int32_t checkConsistent(int32_t numToCheck) {
+            auto numHits = tqvec.size();
+
+            // special case for only 1 or two hits (common)
+            if (numHits == 1) {
+                return numToCheck;
+            } else if (numHits == 2) {
+                const bool firstIsLeft = tqvec[0].queryPos < tqvec[1].queryPos;
+                auto& h1 = firstIsLeft ? tqvec[0] : tqvec[1];
+                // The other of the two hits. This used to index tqvec[2],
+                // which is out of bounds for a two-element vector.
+                auto& h2 = firstIsLeft ? tqvec[1] : tqvec[0];
+                return (h2.pos > h1.pos) ? (numToCheck) : -1;
+            } else {
+                // first, sort by query position
+                std::sort(tqvec.begin(), tqvec.end(),
+                          [](const SATxpQueryPos& q1, const SATxpQueryPos& q2) -> bool {
+                              return q1.queryPos < q2.queryPos;
+                          });
+
+                // Never look past the hits we actually have; a signed index
+                // also keeps a negative numToCheck from wrapping around.
+                const int32_t limit =
+                    std::min<int32_t>(numToCheck, static_cast<int32_t>(numHits));
+                int32_t lastRefPos{std::numeric_limits<int32_t>::min()};
+                for (int32_t i = 0; i < limit; ++i) {
+                    int32_t refPos = static_cast<int32_t>(tqvec[i].pos);
+                    if (refPos > lastRefPos) {
+                        lastRefPos = refPos;
+                    } else {
+                        return i;
+                    }
+                }
+                return numToCheck;
+            }
+        }
+
+        uint32_t tid;
+        std::vector<SATxpQueryPos> tqvec;
+        bool active;
+        uint32_t numActive;
+    };
+
+    // A single suffix-array hit: transcript id, position on the transcript,
+    // position on the query, and query orientation.
+    struct SAHitInfo {
+	    SAHitInfo(uint32_t txpIDIn, uint32_t txpPosIn, uint32_t queryPosIn, bool queryRCIn) :
+		    tid(txpIDIn), pos(txpPosIn), queryPos(queryPosIn), queryRC(queryRCIn) {}
+	    uint32_t tid;
+	    uint32_t pos;
+	    uint32_t queryPos;
+	    bool queryRC;
+    };
+
+    // Ties a k-mer's position list within one transcript to the k-mer's
+    // position and orientation on the query. The PositionListHelper is
+    // copied into the struct.
+    struct TxpQueryPos {
+        TxpQueryPos(PositionListHelper& ph, int32_t queryPosIn, bool queryRCIn) :
+                txpPosInfo(ph), queryPos(queryPosIn), queryRC(queryRCIn) {}
+        // Iterator for the beginning of the position list
+        // of a given k-mer into a given transcript.
+        PositionListHelper txpPosInfo;
+        // The position of the k-mer on the query.
+        int32_t queryPos;
+        bool queryRC;
+    };
+
+    // All k-mer hits of one query against one transcript. A
+    // default-constructed value has tid set to the maximum uint32_t.
+    struct ProcessedHit {
+        ProcessedHit() : tid(std::numeric_limits<uint32_t>::max()) {}
+        // Starts tqvec with a single entry built from (ph, queryPos, queryRC).
+        ProcessedHit(uint32_t tidIn,
+                     PositionListHelper ph, int32_t queryPos, bool queryRC) :
+                     tid(tidIn) {
+                         tqvec.emplace_back(ph, queryPos, queryRC);
+                     }
+
+
+        uint32_t tid; // transcript id
+        // A vector of iterators into the position list
+        // for the k-mers hitting this transcript
+        std::vector<TxpQueryPos> tqvec;
+    };
+
+
+    // An equivalence class described as a (start, length) slice; the
+    // slice presumably indexes a shared transcript-label list (confirm
+    // with the index code). A default-constructed value has both fields
+    // set to uint32Invalid. Serialized as txpListStart, txpListLen.
+    struct EqClass {
+        EqClass() :
+            txpListStart(uint32Invalid), txpListLen(uint32Invalid) {}
+        EqClass(uint32_t txpListStartIn, uint32_t txpListLenIn) :
+            txpListStart(txpListStartIn), txpListLen(txpListLenIn) {}
+
+        template <typename Archive>
+        void load (Archive& ar) {
+            ar(txpListStart, txpListLen);
+        }
+
+        template <typename Archive>
+        void save (Archive& ar) const {
+            ar(txpListStart, txpListLen);
+        }
+
+        uint32_t txpListStart;
+        uint32_t txpListLen;
+    };
+
+    // Writes a human-readable label for `ms` to std::cerr (no trailing
+    // newline). A value outside the four enumerators prints nothing.
+    inline void printMateStatus(rapmap::utils::MateStatus ms) {
+        using Status = rapmap::utils::MateStatus;
+        const char* label = nullptr;
+        if (ms == Status::SINGLE_END) {
+            label = "SINGLE END";
+        } else if (ms == Status::PAIRED_END_LEFT) {
+            label = "PAIRED END (LEFT)";
+        } else if (ms == Status::PAIRED_END_RIGHT) {
+            label = "PAIRED END (RIGHT)";
+        } else if (ms == Status::PAIRED_END_PAIRED) {
+            label = "PAIRED END (PAIRED)";
+        }
+        if (label != nullptr) {
+            std::cerr << label;
+        }
+    }
+
+
+    // Declarations for functions dealing with SAM formatting and output
+    //
+    // Builds the CIGAR string for a read placed at `pos` on a transcript
+    // of length txpLen, soft-clipping whatever hangs off either end.
+    //
+    //   pos      in/out: left-most mapping position; reset to 0 when the
+    //            read starts before the transcript.
+    //   readLen  length of the read.
+    //   txpLen   length of the transcript.
+    //   cigarStr cleared, then filled with one of
+    //            "{n}S", "{clip}S{match}M", "{match}M{clip}S" or "{n}M".
+    //
+    // The comparisons are done in 64-bit signed arithmetic. In the mixed
+    // int32_t/uint32_t form, `pos + readLen < 0` was evaluated as unsigned
+    // and could never be true, so a read lying entirely to the left of the
+    // transcript produced a negative match length.
+    inline void adjustOverhang(int32_t& pos, uint32_t readLen,
+            uint32_t txpLen, FixedWriter& cigarStr) {
+        cigarStr.clear();
+        const int64_t start = pos;
+        const int64_t stop = start + static_cast<int64_t>(readLen);
+        const int64_t refLen = txpLen;
+
+        if (stop < 0) {
+            // Entirely before the transcript: everything is clipped.
+            cigarStr.write("{}S", readLen);
+            pos = 0;
+        } else if (start < 0) {
+            // Hangs off the left end.
+            int32_t matchLen = static_cast<int32_t>(stop);
+            int32_t clipLen = static_cast<int32_t>(-start);
+            cigarStr.write("{}S{}M", clipLen, matchLen);
+            // Now adjust the mapping position
+            pos = 0;
+        } else if (start > refLen) {
+            // Entirely past the transcript: everything is clipped.
+            cigarStr.write("{}S", readLen);
+        } else if (stop > refLen) {
+            // Hangs off the right end.
+            int32_t matchLen = static_cast<int32_t>(refLen - start);
+            int32_t clipLen = static_cast<int32_t>(stop - refLen);
+            cigarStr.write("{}M{}S", matchLen, clipLen);
+        } else {
+            cigarStr.write("{}M", readLen);
+        }
+    }
+
+    // Builds the CIGAR strings for both mates of `qa` against a transcript
+    // of length txpLen. A mapped mate goes through the single-read
+    // adjustOverhang (which may also reset its position to 0); an unmapped
+    // mate is written as a full-length soft clip. When qa is neither
+    // paired nor a left/right orphan, nothing is written.
+    inline void adjustOverhang(QuasiAlignment& qa, uint32_t txpLen,
+            FixedWriter& cigarStr1, FixedWriter& cigarStr2) {
+        if (qa.isPaired) {
+            // both mapped
+            adjustOverhang(qa.pos, qa.readLen, txpLen, cigarStr1);
+            adjustOverhang(qa.matePos, qa.mateLen, txpLen, cigarStr2);
+            return;
+        }
+
+        switch (qa.mateStatus) {
+            case MateStatus::PAIRED_END_LEFT:
+                // left read mapped
+                adjustOverhang(qa.pos, qa.readLen, txpLen, cigarStr1);
+                // right read un-mapped will just be read length * S
+                cigarStr2.clear();
+                cigarStr2.write("{}S", qa.mateLen);
+                break;
+            case MateStatus::PAIRED_END_RIGHT:
+                // right read mapped
+                adjustOverhang(qa.pos, qa.readLen, txpLen, cigarStr2);
+                // left read un-mapped will just be read length * S
+                cigarStr1.clear();
+                cigarStr1.write("{}S", qa.readLen);
+                break;
+            default:
+                break;
+        }
+    }
+
+
+
+        // Compute the SAM FLAG value for a single-end quasi-alignment.
+        //
+        //   qaln   the alignment; only qaln.fwd is consulted.
+        //   flags  output: 0x10 (read reverse-complemented) when the read
+        //          is not on the forward strand, 0 otherwise.
+        //
+        // No other SAM bit is produced here: pairing and mate bits do not
+        // apply to single-end reads, and unmapped reads are not output yet.
+        inline void getSamFlags(const QuasiAlignment& qaln,
+                uint16_t& flags) {
+            constexpr uint16_t isRC = 0x10;
+            flags = (qaln.fwd) ? 0 : isRC;
+        }
+
+        // Compute the SAM FLAG values for both mates of a quasi-alignment.
+        //
+        //   qaln     the alignment.
+        //   peInput  true if the read is paired in *sequencing*.
+        //   flags1   output: flags for mate 1.
+        //   flags2   output: flags for mate 2.
+        inline void getSamFlags(const QuasiAlignment& qaln,
+                bool peInput,
+                uint16_t& flags1,
+                uint16_t& flags2) {
+            constexpr uint16_t pairedInSeq = 0x1;
+            constexpr uint16_t properlyAligned = 0x2;
+            constexpr uint16_t unmapped = 0x4;
+            constexpr uint16_t mateUnmapped = 0x8;
+            constexpr uint16_t isRC = 0x10;
+            constexpr uint16_t mateIsRC = 0x20;
+            constexpr uint16_t isRead1 = 0x40;
+            constexpr uint16_t isRead2 = 0x80;
+
+            // Bits shared by both mates.
+            flags1 = 0;
+            if (peInput) { flags1 |= pairedInSeq; }
+            if (qaln.isPaired) { flags1 |= properlyAligned; }
+            flags2 = flags1;
+
+            // we don't output unmapped yet
+            // An orphan marks the missing read "unmapped" and tells the
+            // present read that its mate is unmapped.
+            if (qaln.mateStatus == MateStatus::PAIRED_END_RIGHT) {
+                // read 1 is unaligned
+                flags1 |= unmapped;
+                flags2 |= mateUnmapped;
+            }
+            if (qaln.mateStatus == MateStatus::PAIRED_END_LEFT) {
+                // read 2 is unaligned
+                flags2 |= unmapped;
+                flags1 |= mateUnmapped;
+            }
+
+            // Strand bits: each mate records its own strand and its mate's.
+            if (!qaln.fwd) { flags1 |= isRC; }
+            if (!qaln.mateIsFwd) { flags1 |= mateIsRC; }
+            if (!qaln.mateIsFwd) { flags2 |= isRC; }
+            if (!qaln.fwd) { flags2 |= mateIsRC; }
+
+            flags1 |= isRead1;
+            flags2 |= isRead2;
+        }
+
+	// Adapted from
+        // https://github.com/mengyao/Complete-Striped-Smith-Waterman-Library/blob/8c9933a1685e0ab50c7d8b7926c9068bc0c9d7d2/src/main.c#L36
+        //
+        // Declaration only; the definition is not part of this section of the header.
+        // NOTE(review): judging by the name and the parameter names, this presumably
+        // reverses the read `seq` and its qualities `qual`, with `readWork` and
+        // `qualWork` serving as work buffers -- confirm against the definition.
+        void reverseRead(std::string& seq,
+                std::string& qual,
+                std::string& readWork,
+                std::string& qualWork);
+
+        // Paired-end overload (declaration only; defined elsewhere).
+        // Takes the read pair `r`, a PairAlignmentFormatter for the index type,
+        // the hit counters, the joint hits of the pair and the MemoryWriter that
+        // serves as the output stream.
+        // NOTE(review): the meaning of the uint32_t return value is not visible
+        // here -- confirm against the definition.
+        template <typename ReadPairT, typename IndexT>
+        uint32_t writeAlignmentsToStream(
+                ReadPairT& r,
+                PairAlignmentFormatter<IndexT>& formatter,
+                HitCounters& hctr,
+                std::vector<QuasiAlignment>& jointHits,
+                fmt::MemoryWriter& sstream);
+
+        // Single-end overload (declaration only; defined elsewhere).
+        // Same parameter layout as the paired-end overload, but takes a single
+        // read `r` and a SingleAlignmentFormatter.
+        // NOTE(review): the meaning of the uint32_t return value is not visible
+        // here -- confirm against the definition.
+        template <typename ReadT, typename IndexT>
+        uint32_t writeAlignmentsToStream(
+                ReadT& r,
+                SingleAlignmentFormatter<IndexT>& formatter,
+                HitCounters& hctr,
+                std::vector<QuasiAlignment>& jointHits,
+                fmt::MemoryWriter& sstream);
+
+        // Merge the per-mate hit lists of a read pair into jointHits ("fuzzy" variant).
+        //
+        //   leftMatches / rightMatches : flags supplied by the caller. The hits of one
+        //       mate are accepted as orphans only when the other mate has no hits
+        //       *and* the flag of that other mate is false.
+        //       NOTE(review): the exact meaning of the flags is defined by the
+        //       caller -- confirm there.
+        //   leftHits / rightHits : hits of mate 1 / mate 2. The intersection below
+        //       walks both lists by increasing tid, so both are expected to be
+        //       ordered by tid.
+        //   jointHits  : output; receives either the orphan hits (moved out of the
+        //       input list) or one paired hit per transcript match.
+        //   readLen    : not used by this function.
+        //   maxNumHits : if more than this many paired hits are produced, tooManyHits
+        //       is set, jointHits is cleared and hctr.tooManyHits is incremented.
+        //   hctr       : seHits grows by the number of accepted orphan hits, peHits
+        //       by the number of paired hits. Orphan hits are counted in seHits
+        //       only, matching the accounting of mergeLeftRightHits.
+        inline void mergeLeftRightHitsFuzzy(
+                bool leftMatches,
+                bool rightMatches,
+                std::vector<QuasiAlignment>& leftHits,
+                std::vector<QuasiAlignment>& rightHits,
+                std::vector<QuasiAlignment>& jointHits,
+                uint32_t readLen,
+                uint32_t maxNumHits,
+                bool& tooManyHits,
+                HitCounters& hctr) {
+
+            // Mate 1 has no hits: the hits of mate 2 (if any) become orphans,
+            // unless mate 1 is flagged as matching.
+            if (leftHits.empty()) {
+                if (!leftMatches && !rightHits.empty()) {
+                    jointHits.insert(jointHits.end(),
+                            std::make_move_iterator(rightHits.begin()),
+                            std::make_move_iterator(rightHits.end()));
+                    hctr.seHits += rightHits.size();
+                }
+                // Orphans are single-end hits; they must not reach the
+                // paired-end counter below.
+                return;
+            }
+
+            // Mate 2 has no hits: symmetric to the case above.
+            if (rightHits.empty()) {
+                if (!rightMatches) {
+                    jointHits.insert(jointHits.end(),
+                            std::make_move_iterator(leftHits.begin()),
+                            std::make_move_iterator(leftHits.end()));
+                    hctr.seHits += leftHits.size();
+                }
+                return;
+            }
+
+            // Both mates have hits: intersect the two lists on transcript id.
+            constexpr const int32_t signedZero{0};
+            auto leftIt = leftHits.begin();
+            auto leftEnd = leftHits.end();
+            auto rightIt = rightHits.begin();
+            auto rightEnd = rightHits.end();
+            size_t numHits{0};
+            jointHits.reserve(std::min(leftHits.size(), rightHits.size()));
+            uint32_t leftTxp, rightTxp;
+            while (leftIt != leftEnd && rightIt != rightEnd) {
+                leftTxp = leftIt->tid;
+                rightTxp = rightIt->tid;
+                if (leftTxp < rightTxp) {
+                    ++leftIt;
+                } else {
+                    if (!(rightTxp < leftTxp)) {
+                        // Same transcript. The fragment spans from the start of the
+                        // leftmost mate to the end of the other one; negative
+                        // positions are clamped to 0 for this computation only.
+                        int32_t startRead1 = std::max(leftIt->pos, signedZero);
+                        int32_t startRead2 = std::max(rightIt->pos, signedZero);
+                        bool read1First{(startRead1 < startRead2)};
+                        int32_t fragStartPos = read1First ? startRead1 : startRead2;
+                        int32_t fragEndPos = read1First ?
+                            (startRead2 + rightIt->readLen) : (startRead1 + leftIt->readLen);
+                        uint32_t fragLen = fragEndPos - fragStartPos;
+                        jointHits.emplace_back(leftTxp,
+                                leftIt->pos,
+                                leftIt->fwd,
+                                leftIt->readLen,
+                                fragLen, true);
+                        // Fill in the mate info
+                        auto& qaln = jointHits.back();
+                        qaln.mateLen = rightIt->readLen;
+                        qaln.matePos = rightIt->pos;
+                        qaln.mateIsFwd = rightIt->fwd;
+                        qaln.mateStatus = MateStatus::PAIRED_END_PAIRED;
+
+                        ++numHits;
+                        if (numHits > maxNumHits) { tooManyHits = true; break; }
+                        ++leftIt;
+                    }
+                    ++rightIt;
+                }
+            }
+            if (tooManyHits) { jointHits.clear(); ++hctr.tooManyHits; }
+
+            // If we had proper paired hits
+            if (jointHits.size() > 0) {
+                hctr.peHits += jointHits.size();
+            }
+        }
+
+        // Merge the per-mate hit lists of a read pair into jointHits.
+        //
+        //   leftHits / rightHits : hits of mate 1 / mate 2. The intersection walks
+        //       both lists by increasing tid, so both are expected to be ordered
+        //       by tid.
+        //   jointHits  : output. Receives one paired hit per transcript match; if
+        //       there is none (and the read was not discarded), it receives all
+        //       left hits followed by all right hits instead.
+        //   readLen    : not used by this function.
+        //   maxNumHits : if more than this many paired hits are produced, tooManyHits
+        //       is set, jointHits is cleared and hctr.tooManyHits is incremented.
+        //   hctr       : peHits grows by the number of paired hits, otherwise seHits
+        //       grows by the number of single-end hits taken.
+        inline void mergeLeftRightHits(
+                std::vector<QuasiAlignment>& leftHits,
+                std::vector<QuasiAlignment>& rightHits,
+                std::vector<QuasiAlignment>& jointHits,
+                uint32_t readLen,
+                uint32_t maxNumHits,
+                bool& tooManyHits,
+                HitCounters& hctr) {
+            if (!leftHits.empty()) {
+                if (!rightHits.empty()) {
+                    constexpr const int32_t zero{0};
+                    auto lhs = leftHits.begin();
+                    auto rhs = rightHits.begin();
+                    const auto lhsEnd = leftHits.end();
+                    const auto rhsEnd = rightHits.end();
+                    size_t pairedCount{0};
+                    jointHits.reserve(std::min(leftHits.size(), rightHits.size()));
+
+                    while (lhs != lhsEnd && rhs != rhsEnd) {
+                        uint32_t lTid = lhs->tid;
+                        uint32_t rTid = rhs->tid;
+                        // Advance whichever side is behind until the ids agree.
+                        if (lTid < rTid) { ++lhs; continue; }
+                        if (rTid < lTid) { ++rhs; continue; }
+
+                        // Both mates hit the same transcript. Clamp negative
+                        // positions to 0, then measure the fragment from the start
+                        // of the leftmost mate to the end of the other one.
+                        int32_t startRead1 = std::max(lhs->pos, zero);
+                        int32_t startRead2 = std::max(rhs->pos, zero);
+                        int32_t fragStartPos;
+                        int32_t fragEndPos;
+                        if (startRead1 < startRead2) {
+                            fragStartPos = startRead1;
+                            fragEndPos = startRead2 + rhs->readLen;
+                        } else {
+                            fragStartPos = startRead2;
+                            fragEndPos = startRead1 + lhs->readLen;
+                        }
+                        uint32_t fragLen = fragEndPos - fragStartPos;
+
+                        jointHits.emplace_back(lTid,
+                                startRead1,
+                                lhs->fwd,
+                                lhs->readLen,
+                                fragLen, true);
+                        // Record the mate on the hit that was just created.
+                        auto& paired = jointHits.back();
+                        paired.mateLen = rhs->readLen;
+                        paired.matePos = startRead2;
+                        paired.mateIsFwd = rhs->fwd;
+                        paired.mateStatus = MateStatus::PAIRED_END_PAIRED;
+
+                        if (++pairedCount > maxNumHits) { tooManyHits = true; break; }
+                        ++lhs;
+                        ++rhs;
+                    }
+                }
+                if (tooManyHits) {
+                    jointHits.clear();
+                    ++hctr.tooManyHits;
+                }
+            }
+
+            // Proper paired hits take precedence over everything else.
+            if (!jointHits.empty()) {
+                hctr.peHits += jointHits.size();
+                return;
+            }
+
+            // No paired hits: either the read was discarded for having too many
+            // hits, or we fall back to the single-end hits of both mates.
+            const auto singleEndCount = leftHits.size() + rightHits.size();
+            if (singleEndCount == 0 || tooManyHits) { return; }
+
+            hctr.seHits += singleEndCount;
+            jointHits.insert(jointHits.end(),
+                    std::make_move_iterator(leftHits.begin()),
+                    std::make_move_iterator(leftHits.end()));
+            jointHits.insert(jointHits.end(),
+                    std::make_move_iterator(rightHits.begin()),
+                    std::make_move_iterator(rightHits.end()));
+        }
+
+    /*
+    template <typename Archive>
+    void save(Archive& archive, const my_mer& mer);
+
+    template <typename Archive>
+    void load(Archive& archive, my_mer& mer);
+    */
+    }
+}
+
+
+#endif // __RAP_MAP_UTILS_HPP__
diff --git a/include/SACollector.hpp b/include/SACollector.hpp
new file mode 100644
index 0000000..261b2ce
--- /dev/null
+++ b/include/SACollector.hpp
@@ -0,0 +1,580 @@
+#ifndef SA_COLLECTOR_HPP
+#define SA_COLLECTOR_HPP
+
+#include "RapMapUtils.hpp"
+#include "RapMapSAIndex.hpp"
+#include "SASearcher.hpp"
+
+#include <iostream>
+#include <algorithm>
+#include <iterator>
+
+template <typename RapMapIndexT>
+class SACollector {
+    public:
+    using OffsetT = typename RapMapIndexT::IndexType;
+
+    SACollector(RapMapIndexT* rmi) : rmi_(rmi) {}
+    bool operator()(std::string& read,
+                    std::vector<rapmap::utils::QuasiAlignment>& hits,
+                    SASearcher<RapMapIndexT>& saSearcher,
+                    rapmap::utils::MateStatus mateStatus,
+                    bool strictCheck=false,
+                    bool consistentHits=false) {
+
+        using QuasiAlignment = rapmap::utils::QuasiAlignment;
+        using MateStatus = rapmap::utils::MateStatus;
+
+        //auto& posIDs = rmi_->positionIDs;
+        auto& rankDict = rmi_->rankDict;
+        auto& txpStarts = rmi_->txpOffsets;
+        auto& SA = rmi_->SA;
+        auto& khash = rmi_->khash;
+        auto& text = rmi_->seq;
+        uint32_t sampFactor{1};
+        auto salen = SA.size();
+
+        auto readLen = read.length();
+        auto maxDist = 1.5 * readLen;
+        auto k = rapmap::utils::my_mer::k();
+        auto readStartIt = read.begin();
+        auto readEndIt = read.end();
+
+        auto readRevStartIt = read.rbegin();
+        auto readRevEndIt = read.rend();
+
+        auto rb = read.begin();
+        auto re = rb + k;
+        OffsetT lbLeftFwd = 0, ubLeftFwd = 0;
+        OffsetT lbLeftRC = 0, ubLeftRC = 0;
+        OffsetT lbRightFwd = 0, ubRightFwd = 0;
+        OffsetT lbRightRC = 0, ubRightRC = 0;
+        OffsetT matchedLen;
+
+        uint32_t fwdHit{0};
+        uint32_t rcHit{0};
+
+        bool foundHit = false;
+        bool isRev = false;
+        rapmap::utils::my_mer mer;
+        rapmap::utils::my_mer rcMer;
+
+        enum HitStatus { ABSENT = -1, UNTESTED = 0, PRESENT = 1 };
+        // Record if k-mers are hits in the
+        // fwd direction, rc direction or both
+        struct KmerDirScore {
+	  KmerDirScore(rapmap::utils::my_mer kmerIn, int32_t kposIn, HitStatus fwdScoreIn, HitStatus rcScoreIn) :
+	    kmer(kmerIn), kpos(kposIn), fwdScore(fwdScoreIn), rcScore(rcScoreIn) {}
+	  KmerDirScore() : kpos(0), fwdScore(UNTESTED), rcScore(UNTESTED) {}
+	  bool operator==(const KmerDirScore& other) const { return kpos == other.kpos; }
+	  bool operator<(const KmerDirScore& other) const { return kpos < other.kpos; }
+          void print() { 
+	    std::cerr << "{ " << kmer.to_str() << ", " <<  kpos << ", " << ((fwdScore) ? "PRESENT" : "ABSENT") << ", " << ((rcScore) ? "PRESENT" : "ABSENT") << "}\t";
+	  }
+            rapmap::utils::my_mer kmer;
+	    int32_t kpos;
+            HitStatus fwdScore;
+            HitStatus rcScore;
+        };
+
+        // This allows implementing our heuristic for comparing
+        // forward and reverse-complement strand matches
+        std::vector<KmerDirScore> kmerScores;
+
+        using SAIntervalHit = rapmap::utils::SAIntervalHit<OffsetT>;
+
+        std::vector<SAIntervalHit> fwdSAInts;
+        std::vector<SAIntervalHit> rcSAInts;
+
+        std::vector<uint32_t> leftTxps, leftTxpsRC;
+        std::vector<uint32_t> rightTxps, rightTxpsRC;
+        OffsetT maxInterval{1000};
+
+        // The number of bases that a new query position (to which
+        // we skipped) should overlap the previous extension. A
+        // value of 0 means no overlap (the new search begins at the next
+        // base) while a value of (k - 1) means that k-1 bases (one less than
+        // the k-mer size) must overlap.
+        OffsetT skipOverlap = k-1;
+        // Number of nucleotides to skip when encountering a homopolymer k-mer.
+        OffsetT homoPolymerSkip = k/2;
+
+        // Find a hit within the read
+        // While we haven't fallen off the end
+        while (re < read.end()) {
+
+            // Get the k-mer at the current start position.
+            // And make sure that it's valid (contains no Ns).
+            auto pos = std::distance(readStartIt, rb);
+            auto invalidPos = read.find_first_of("nN", pos);
+            if (invalidPos <= pos + k) {
+                rb = read.begin() + invalidPos + 1;
+                re = rb + k;
+                continue;
+            }
+
+            // If the next k-bases are valid, get the k-mer and
+            // reverse complement k-mer
+            mer = rapmap::utils::my_mer(read.c_str() + pos);
+            if (mer.is_homopolymer()) { rb += homoPolymerSkip; re += homoPolymerSkip; continue; }
+            rcMer = mer.get_reverse_complement();
+
+            // See if we can find this k-mer in the hash
+            auto merIt = khash.find(mer.get_bits(0, 2*k));
+            auto rcMerIt = khash.find(rcMer.get_bits(0, 2*k));
+
+            // If we can find the k-mer in the hash, get its SA interval
+            if (merIt != khash.end()) {
+                OffsetT lb = merIt->second.begin;
+                OffsetT ub = merIt->second.end;
+
+                // lb must be 1 *less* than the current lb
+                auto lbRestart = std::max(static_cast<OffsetT>(0), lb-1);
+                // Extend the SA interval using the read sequence as far as
+                // possible
+                std::tie(lbLeftFwd, ubLeftFwd, matchedLen) =
+                    saSearcher.extendSearchNaive(lbRestart, ub, k, rb, readEndIt);
+
+                // If the SA interval is valid, and not too wide, then record
+                // the hit.
+                OffsetT diff = ubLeftFwd - lbLeftFwd;
+                if (ubLeftFwd > lbLeftFwd and diff < maxInterval) {
+                    auto queryStart = std::distance(read.begin(), rb);
+                    fwdSAInts.emplace_back(lbLeftFwd, ubLeftFwd, matchedLen, queryStart, false);
+                    if (strictCheck) {
+                        ++fwdHit;
+                        // If we also match this k-mer in the rc direction
+			if (rcMerIt != khash.end()) {
+			  ++rcHit;
+			  kmerScores.emplace_back(mer, pos, PRESENT, PRESENT);
+			} else { // Otherwise it doesn't match in the rc direction
+			  kmerScores.emplace_back(mer, pos, PRESENT, ABSENT);
+			}
+
+			// If we didn't end the match b/c we exhausted the query
+                        // test the mismatching k-mer in the other strand
+                        // TODO: check for 'N'?
+                        if (rb + matchedLen < readEndIt){
+                            auto kmerPos = std::distance(readStartIt, rb + matchedLen - skipOverlap);
+                            mer = rapmap::utils::my_mer(read.c_str() + kmerPos);
+                            kmerScores.emplace_back(mer, kmerPos, ABSENT, UNTESTED);
+                        }
+                    } else { // no strict check
+                        ++fwdHit;
+                        if (rcMerIt != khash.end()) { ++rcHit; }
+                    }
+                }
+            }
+
+            // See if the reverse complement k-mer is in the hash
+            if (rcMerIt != khash.end()) {
+                lbLeftRC = rcMerIt->second.begin;
+                ubLeftRC = rcMerIt->second.end;
+                OffsetT diff = ubLeftRC - lbLeftRC;
+                if (ubLeftRC > lbLeftRC) {
+                    // The original k-mer didn't match in the forward direction
+                    if (!fwdHit) {
+                        ++rcHit;
+                        if (strictCheck) {
+			  kmerScores.emplace_back(mer, pos, ABSENT, PRESENT);
+                        }
+                    }
+                }
+            }
+
+            // If we had a hit with either k-mer then we can
+            // break out of this loop to look for the next informative position
+            if (fwdHit + rcHit > 0) {
+                foundHit = true;
+                break;
+            }
+            ++rb; ++re;
+        }
+
+        // If we went the entire length of the read without finding a hit
+        // then we can bail.
+        if (!foundHit) { return false; }
+
+        bool lastSearch{false};
+        // If we had a hit on the forward strand
+        if (fwdHit) {
+
+            // The length of this match
+            auto matchLen = fwdSAInts.front().len;
+            // The iterator to where this match began
+            rb = read.begin() + fwdSAInts.front().queryPos;
+
+            // [lb, ub) is the suffix array interval for the MMP (maximum mappable prefix)
+            // of the k-mer we found.  The NIP (next informative position) in the sequence
+            // is the position after the LCE (longest common extension) of
+            // T[SA[lb]:] and T[SA[ub-1]:]
+            auto remainingLength = std::distance(rb + matchLen, readEndIt);
+            auto lce = saSearcher.lce(lbLeftFwd, ubLeftFwd-1, matchLen, remainingLength);
+            auto fwdSkip = std::max(static_cast<OffsetT>(matchLen) - skipOverlap,
+                                    static_cast<OffsetT>(lce) - skipOverlap);
+
+            size_t nextInformativePosition = std::min(
+                    std::max(static_cast<OffsetT>(0),
+                    static_cast<OffsetT>(readLen)- static_cast<OffsetT>(k)),
+                    static_cast<OffsetT>(std::distance(readStartIt, rb) + fwdSkip)
+                    );
+
+            rb = read.begin() + nextInformativePosition;
+            re = rb + k;
+
+            size_t invalidPos{0};
+            while (re <= readEndIt) {
+                // The offset into the string
+                auto pos = std::distance(readStartIt, rb);
+
+                // The position of the first N in the k-mer (if there is one)
+                // If we have already verified there are no Ns in the remainder
+                // of the string (invalidPos is std::string::npos) then we can
+                // skip this test.
+                if (invalidPos != std::string::npos) {
+                    invalidPos = read.find_first_of("nN", pos);
+                }
+
+                // If the first N is within k bases, then this k-mer is invalid
+                if (invalidPos < pos + k) {
+                    // A valid k-mer can't start until after the 'N'
+                    nextInformativePosition = invalidPos + 1;
+                    rb = read.begin() + nextInformativePosition;
+                    re = rb + k;
+                    // Go to the next iteration of the while loop
+                    continue;
+                }
+
+                // If the current end position is valid
+                if (re <= readEndIt) {
+
+                    mer = rapmap::utils::my_mer(read.c_str() + pos);
+                    if (mer.is_homopolymer()) { rb += homoPolymerSkip; re = rb + k; continue; }
+                    auto merIt = khash.find(mer.get_bits(0, 2*k));
+
+                    if (merIt != khash.end()) {
+                        if (strictCheck) {
+                            ++fwdHit;
+                            kmerScores.emplace_back(mer, pos, PRESENT, UNTESTED);
+                            auto rcMer = mer.get_reverse_complement();
+                            auto rcMerIt = khash.find(rcMer.get_bits(0, 2*k));
+                            if (rcMerIt != khash.end()) {
+                                ++rcHit;
+                                kmerScores.back().rcScore = PRESENT;
+                            }
+                        }
+
+                        lbRightFwd = merIt->second.begin;
+                        ubRightFwd = merIt->second.end;
+
+                        // lb must be 1 *less* then the current lb
+                        lbRightFwd = std::max(static_cast<OffsetT>(0), lbRightFwd - 1);
+                        std::tie(lbRightFwd, ubRightFwd, matchedLen) =
+                            saSearcher.extendSearchNaive(lbRightFwd, ubRightFwd,
+                                    k, rb, readEndIt);
+
+                        OffsetT diff = ubRightFwd - lbRightFwd;
+                        if (ubRightFwd > lbRightFwd and diff < maxInterval) {
+                            auto queryStart = std::distance(read.begin(), rb);
+                            fwdSAInts.emplace_back(lbRightFwd, ubRightFwd, matchedLen, queryStart, false);
+                            // If we didn't end the match b/c we exhausted the query
+                            // test the mismatching k-mer in the other strand
+                            // TODO: check for 'N'?
+                            if (strictCheck and rb + matchedLen < readEndIt){
+                                auto kmerPos = std::distance(readStartIt, rb + matchedLen - skipOverlap);
+                                mer = rapmap::utils::my_mer(read.c_str() + kmerPos);
+				// TODO: 04/11/16
+                                kmerScores.emplace_back(mer, kmerPos, UNTESTED, UNTESTED);
+                            }
+
+                        }
+
+                        if (lastSearch) { break; }
+                        auto mismatchIt = rb + matchedLen;
+                        if (mismatchIt < readEndIt) {
+                            auto remainingDistance = std::distance(mismatchIt, readEndIt);
+                            auto lce = saSearcher.lce(lbRightFwd, ubRightFwd-1, matchedLen, remainingDistance);
+
+                            // Where we would jump if we just used the MMP
+                            auto skipMatch = mismatchIt - skipOverlap;
+                            // Where we would jump if we used the LCE
+                            auto skipLCE = rb + lce - skipOverlap;
+                            // Pick the larger of the two
+                            rb = std::max(skipLCE, skipMatch);
+                            if (rb > (readEndIt - k)) {
+                                rb = readEndIt - k;
+                                lastSearch = true;
+                            }
+                            re = rb + k;
+                        } else {
+                            lastSearch = true;
+                            rb = readEndIt - k;
+                            re = rb + k;
+                        }
+
+                    } else {
+                        rb += sampFactor;
+                        re = rb + k;
+                    }
+                }
+            }
+        }
+
+        lastSearch = false;
+        if (rcHit >= fwdHit) {
+            size_t pos{read.length() - k};
+
+            auto revReadEndIt = read.rend();
+
+            auto revRB = read.rbegin();
+            auto revRE = revRB + k;
+
+            auto invalidPosIt = revRB;
+            while (revRE <= revReadEndIt){
+
+                revRE = revRB + k;
+                if (revRE > revReadEndIt) { break; }
+
+                // See if this k-mer would contain an N
+                // only check if we don't yet know that there are no remaining
+                // Ns
+                if (invalidPosIt != revReadEndIt) {
+                    invalidPosIt = std::find_if(revRB, revRE,
+                                                 [](const char c) -> bool {
+                                                     return c == 'n' or c == 'N';
+                                                 });
+                }
+
+                // If we found an N before the end of the k-mer
+                if (invalidPosIt < revRE) {
+                    // Skip to the k-mer starting at the next position
+                    // (i.e. right past the N)
+                    revRB = invalidPosIt + 1;
+                    continue;
+                }
+
+                // The distance from the beginning of the read to the
+                // start of the k-mer
+                pos = std::distance(revRE, revReadEndIt);
+
+                // Get the k-mer and query it in the hash
+                mer = rapmap::utils::my_mer(read.c_str() + pos);
+                if (mer.is_homopolymer()) { revRB += homoPolymerSkip; revRE += homoPolymerSkip; continue; }
+                rcMer = mer.get_reverse_complement();
+                auto rcMerIt = khash.find(rcMer.get_bits(0, 2*k));
+
+                // If we found the k-mer
+                if (rcMerIt != khash.end()) {
+                    if (strictCheck) {
+                        ++rcHit;
+                        kmerScores.emplace_back(mer, pos, UNTESTED, PRESENT);
+                        auto merIt = khash.find(mer.get_bits(0, 2*k));
+                        if (merIt != khash.end()) {
+                            ++fwdHit;
+                            kmerScores.back().fwdScore = PRESENT;
+                        }
+                    }
+
+
+                    lbRightRC = rcMerIt->second.begin;
+                    ubRightRC = rcMerIt->second.end;
+
+                    // lb must be 1 *less* then the current lb
+                    // We can't move any further in the reverse complement direction
+                    lbRightRC = std::max(static_cast<OffsetT>(0), lbRightRC - 1);
+                    std::tie(lbRightRC, ubRightRC, matchedLen) =
+                        saSearcher.extendSearchNaive(lbRightRC, ubRightRC, k,
+                                revRB, revReadEndIt, true);
+
+                    OffsetT diff = ubRightRC - lbRightRC;
+                    if (ubRightRC > lbRightRC and diff < maxInterval) {
+                        auto queryStart = std::distance(read.rbegin(), revRB);
+                        rcSAInts.emplace_back(lbRightRC, ubRightRC, matchedLen, queryStart, true);
+                        // If we didn't end the match b/c we exhausted the query
+                        // test the mismatching k-mer in the other strand
+                        // TODO: check for 'N'?
+                        if (strictCheck and revRB + matchedLen < revReadEndIt){
+                            auto kmerPos = std::distance(revRB + matchedLen, revReadEndIt);
+                            mer = rapmap::utils::my_mer(read.c_str() + kmerPos);
+                            // TODO: 04/11/16
+                            kmerScores.emplace_back(mer, kmerPos, UNTESTED, UNTESTED);
+                        }
+                    }
+
+                    if (lastSearch) { break; }
+                    auto mismatchIt = revRB + matchedLen;
+                    if (mismatchIt < revReadEndIt) {
+                        auto remainingDistance = std::distance(mismatchIt, revReadEndIt);
+                        auto lce = saSearcher.lce(lbRightRC, ubRightRC-1, matchedLen, remainingDistance);
+
+                        // Where we would jump if we just used the MMP
+                        auto skipMatch = mismatchIt - skipOverlap;
+                        // Where we would jump if we used the lce
+                        auto skipLCE = revRB + lce - skipOverlap;
+                        // Choose the larger of the two
+                        revRB = std::max(skipLCE, skipMatch);
+                        if (revRB > (revReadEndIt - k)) {
+                            revRB = revReadEndIt - k;
+                            lastSearch = true;
+                        }
+                        revRE = revRB + k;
+                    } else {
+                        lastSearch = true;
+                        revRB = revReadEndIt - k;
+                        revRE = revRB + k;
+                    }
+
+                } else {
+                    revRB += sampFactor;
+                    revRE = revRB + k;
+                }
+            }
+        }
+
+        if (strictCheck) {
+            // The first two conditions shouldn't happen
+            // but I'm just being paranoid here
+            if (fwdHit > 0 and rcHit == 0) {
+                rcSAInts.clear();
+            } else if (rcHit > 0 and fwdHit == 0) {
+                fwdSAInts.clear();
+            } else {
+	      std::sort( kmerScores.begin(), kmerScores.end() );
+	      auto e = std::unique(kmerScores.begin(), kmerScores.end());
+                // Compute the score for the k-mers we need to
+                // test in both the forward and rc directions.
+                int32_t fwdScore{0};
+                int32_t rcScore{0};
+                // For every kmer score structure
+		//std::cerr << "[\n";
+                for (auto kmsIt = kmerScores.begin(); kmsIt != e; ++kmsIt) {//: kmerScores) {
+   		    auto& kms = *kmsIt;
+                    // If the forward k-mer is untested, then test it
+                    if (kms.fwdScore == UNTESTED) {
+                        auto merIt = khash.find(kms.kmer.get_bits(0, 2*k));
+                        kms.fwdScore = (merIt != khash.end()) ? PRESENT : ABSENT;
+                    }
+                    // accumulate the score
+                    fwdScore += kms.fwdScore;
+
+                    // If the rc k-mer is untested, then test it
+                    if (kms.rcScore == UNTESTED) {
+                        rcMer = kms.kmer.get_reverse_complement();
+                        auto rcMerIt = khash.find(rcMer.get_bits(0, 2*k));
+                        kms.rcScore = (rcMerIt != khash.end()) ? PRESENT : ABSENT;
+                    }
+                    // accumulate the score
+                    rcScore += kms.rcScore;
+		    //kms.print();
+		    //std::cerr << "\n";
+                }
+		//std::cerr << "]\n";
+                // If the forward score is strictly greater
+                // then get rid of the rc hits.
+                if (fwdScore > rcScore) {
+                    rcSAInts.clear();
+                } else if (rcScore > fwdScore) {
+                    // If the rc score is strictly greater
+                    // get rid of the forward hits
+                    fwdSAInts.clear();
+                }
+            }
+        }
+
+        auto fwdHitsStart = hits.size();
+        // If we had > 1 forward hit
+        if (fwdSAInts.size() > 1) {
+            auto processedHits = rapmap::hit_manager::intersectSAHits(fwdSAInts, *rmi_, consistentHits);
+            rapmap::hit_manager::collectHitsSimpleSA(processedHits, readLen, maxDist, hits, mateStatus);
+        } else if (fwdSAInts.size() == 1) { // only 1 hit!
+            auto& saIntervalHit = fwdSAInts.front();
+                auto initialSize = hits.size();
+                for (OffsetT i = saIntervalHit.begin; i != saIntervalHit.end; ++i) {
+                        auto globalPos = SA[i];
+		            	auto txpID = rmi_->transcriptAtPosition(globalPos);
+                        // the offset into this transcript
+                        auto pos = globalPos - txpStarts[txpID];
+                        int32_t hitPos = pos - saIntervalHit.queryPos;
+                        hits.emplace_back(txpID, hitPos, true, readLen);
+                        hits.back().mateStatus = mateStatus;
+                }
+                // Now sort by transcript ID (then position) and eliminate
+                // duplicates
+                auto sortStartIt = hits.begin() + initialSize;
+                auto sortEndIt = hits.end();
+                std::sort(sortStartIt, sortEndIt,
+                                [](const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
+                                if (a.tid == b.tid) {
+                                return a.pos < b.pos;
+                                } else {
+                                return a.tid < b.tid;
+                                }
+                                });
+                auto newEnd = std::unique(hits.begin() + initialSize, hits.end(),
+                                [] (const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
+                                return a.tid == b.tid;
+                                });
+                hits.resize(std::distance(hits.begin(), newEnd));
+        }
+        auto fwdHitsEnd = hits.size();
+
+        auto rcHitsStart = fwdHitsEnd;
+        // If we had > 1 rc hit
+        if (rcSAInts.size() > 1) {
+            auto processedHits = rapmap::hit_manager::intersectSAHits(rcSAInts, *rmi_, consistentHits);
+            rapmap::hit_manager::collectHitsSimpleSA(processedHits, readLen, maxDist, hits, mateStatus);
+        } else if (rcSAInts.size() == 1) { // only 1 hit!
+            auto& saIntervalHit = rcSAInts.front();
+            auto initialSize = hits.size();
+            for (OffsetT i = saIntervalHit.begin; i != saIntervalHit.end; ++i) {
+                auto globalPos = SA[i];
+		        auto txpID = rmi_->transcriptAtPosition(globalPos);
+                // the offset into this transcript
+                auto pos = globalPos - txpStarts[txpID];
+                int32_t hitPos = pos - saIntervalHit.queryPos;
+                hits.emplace_back(txpID, hitPos, false, readLen);
+                hits.back().mateStatus = mateStatus;
+            }
+            // Now sort by transcript ID (then position) and eliminate
+            // duplicates
+            auto sortStartIt = hits.begin() + rcHitsStart;
+            auto sortEndIt = hits.end();
+            std::sort(sortStartIt, sortEndIt,
+                    [](const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
+                    if (a.tid == b.tid) {
+                    return a.pos < b.pos;
+                    } else {
+                    return a.tid < b.tid;
+                    }
+                    });
+            auto newEnd = std::unique(sortStartIt, sortEndIt,
+                    [] (const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
+                    return a.tid == b.tid;
+                    });
+            hits.resize(std::distance(hits.begin(), newEnd));
+        }
+        auto rcHitsEnd = hits.size();
+
+        // If we had both forward and RC hits, then merge them
+        if ((fwdHitsEnd > fwdHitsStart) and (rcHitsEnd > rcHitsStart)) {
+            // Merge the forward and reverse hits
+            std::inplace_merge(hits.begin() + fwdHitsStart, hits.begin() + fwdHitsEnd, hits.begin() + rcHitsEnd,
+                    [](const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
+                    return a.tid < b.tid;
+                    });
+            // And get rid of duplicate transcript IDs
+            auto newEnd = std::unique(hits.begin() + fwdHitsStart, hits.begin() + rcHitsEnd,
+                    [] (const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
+                    return a.tid == b.tid;
+                    });
+            hits.resize(std::distance(hits.begin(), newEnd));
+        }
+        // Return true if we had any valid hits and false otherwise.
+        return foundHit;
+    }
+
+    private:
+        RapMapIndexT* rmi_;
+};
+
+#endif // SA_COLLECTOR_HPP
diff --git a/include/SASearcher.hpp b/include/SASearcher.hpp
new file mode 100644
index 0000000..b36e476
--- /dev/null
+++ b/include/SASearcher.hpp
@@ -0,0 +1,631 @@
+#ifndef SA_SEARCHER_HPP
+#define SA_SEARCHER_HPP
+
+#include <vector>
+#include <algorithm>
+#include <iterator>
+#include "jellyfish/mer_dna.hpp"
+
+#include "RapMapUtils.hpp"
+#include "RapMapSAIndex.hpp"
+
+template <typename RapMapIndexT>
+class SASearcher {
+    public:
+        using OffsetT = typename RapMapIndexT::IndexType;
+
+        SASearcher(RapMapIndexT* rmi) : // keeps pointers into *rmi; the index must outlive this object
+            rmi_(rmi), seq_(&rmi->seq), sa_(&rmi->SA), textLen_(static_cast<OffsetT>(rmi->seq.length())) {} // textLen_ bounds the scan in lce()
+
+        // Compare [abeg, aend) with [bbeg, bend) character by character.  Returns -1
+        // or 1 at the first position where a's character is smaller or larger.  With
+        // no differing position, returns 1 if b ran out while a has characters left
+        // and 0 otherwise -- so an `a` that is a proper prefix of `b` yields 0, not -1.
+        int cmp(std::string::iterator abeg,
+                std::string::iterator aend,
+                std::string::iterator bbeg,
+                std::string::iterator bend) {
+            auto lhs = abeg;
+            auto rhs = bbeg;
+            for (; lhs < aend and rhs < bend; ++lhs, ++rhs) {
+                const char ca = *lhs;
+                const char cb = *rhs;
+                if (ca != cb) {
+                    return (ca < cb) ? -1 : 1;
+                }
+            }
+            // No mismatch inside the overlap: only "b exhausted first" is reported.
+            const bool aHasMore = lhs < aend;
+            const bool bDone = (rhs == bend);
+            return (bDone and aHasMore) ? 1 : 0;
+        }
+
+        enum class SearchDirection : uint8_t { // only consulted by the commented-out extendSearch at the end of this file
+            UP = 0, DOWN
+        };
+    
+        template <typename IndexT>
+        struct BoundSearchResult { // scratch record filled in by extendSearchNaive
+            IndexT maxLen; // set by the first search: longest query prefix that was matched
+            IndexT bound; // suffix-array index chosen by a bound search
+            SearchDirection dir; // neither assigned nor read by extendSearchNaive
+        };
+
+
+
+	/**
+	 * It should be possible to get the result with only two binary searches (see the
+	 * commented-out extendSearch at the end of this file), but that approach has tricky
+	 * corner cases and has proven elusive so far.  This "naive" version performs *3*
+	 * binary searches.  The first determines the length of the maximum mappable prefix
+	 * (MMP).  The second finds the lower bound of the query interval and the third finds
+	 * the upper bound; the third starts from the bound returned by the second.  Returns
+	 * the tuple (lower bound, upper bound, MMP length).  NOTE(review): lbIn looks like an
+	 * exclusive bound (the trivial case returns lbIn + 1) -- confirm against callers.
+	 */
+        template <typename IteratorT>
+        std::tuple<OffsetT, OffsetT, OffsetT> extendSearchNaive(
+                OffsetT lbIn, // The lower bound for the search
+                OffsetT ubIn, // The upper bound for the search
+                OffsetT startAt, // The offset at which to start looking
+                IteratorT qb, // Iterator to the beginning of the query
+                IteratorT qe, // Iterator to the end of the query
+                bool complementBases=false // True if bases should be complemented
+                                           // before comparison
+                ) {
+
+            std::vector<OffsetT>& SA = *sa_;
+            std::string& seq = *seq_;
+
+            int64_t m = std::distance(qb, qe); // query length
+            size_t n = seq.length(); // text length
+
+            auto sb = seq.begin();
+            auto se = seq.end(); // not referenced again in this function
+
+            // If the bounds are already trivial, just figure how long
+            // of a prefix we share and return the interval.
+            if (ubIn - lbIn == 2) {
+                lbIn += 1; // the only candidate is the entry between the two bounds
+                auto i = startAt;
+                while (i < m and SA[lbIn] + i < n) {
+                    char queryChar = ::toupper(*(qb + i)); // query is upper-cased before comparing
+                    // If we're reverse complementing
+                    if (complementBases) {
+                        queryChar = rapmap::utils::my_mer::complement(queryChar);
+                    }
+                    if ( queryChar < *(sb + SA[lbIn] + i) ) {
+                        break;
+                    } else if ( queryChar > *(sb + SA[lbIn] + i)) {
+                        break;
+                    }
+                    ++i;
+                }
+                return std::make_tuple(lbIn, ubIn, static_cast<OffsetT>(i));
+            }
+
+            BoundSearchResult<OffsetT> res1, res2; // res1: MMP length + lower bound, res2: upper bound
+
+            char smallest = '#';
+            char largest = '}'; // not referenced again in this function
+            char sentinel = smallest;
+
+            // 64-bit on purpose: the midpoint uses l + r, which must not overflow a narrower OffsetT.
+            int64_t l = lbIn, r = ubIn;
+            int64_t lcpLP = startAt, lcpRP = startAt; // matched lengths recorded when l / r last moved
+            int64_t c{0};
+            int64_t i{0};
+
+            int64_t maxI{startAt};
+            int64_t prevI = startAt; // not referenced again in this function
+            int64_t prevILow = startAt; // longest match seen where the query compared greater
+            int64_t prevIHigh = startAt; // longest match seen where the query compared less (or ran out)
+            int64_t validBoundLow = ubIn; // bookkeeping only; never feeds the returned tuple
+            int64_t validBoundHigh = lbIn; // bookkeeping only; never feeds the returned tuple
+            int64_t validBound = 0; // not referenced again in this function
+            bool plt{true}; // stays true unless the query compares greater than the suffix at SA[c]
+            // Reduce the search interval until we hit a border
+            // i.e. until c == r - 1 or c == l + 1
+            while (true) {
+                c = (l + r) / 2;
+                plt = true;
+                i = std::min(lcpLP, lcpRP); // resume after the prefix recorded for both boundaries
+                while (i < m and SA[c] + i < n) {
+                    char queryChar = ::toupper(*(qb + i));
+                    // If we're reverse complementing
+                    if (complementBases) {
+                        queryChar = rapmap::utils::my_mer::complement(queryChar);
+                    }
+
+                    if ( queryChar < *(sb + SA[c] + i) ) {
+                        if (i > prevIHigh) {
+                            prevIHigh = i;
+                            validBoundHigh = c;
+                        } else if (i == prevIHigh) {
+                            validBoundHigh = c < validBoundHigh ? c : validBoundHigh;
+                        }
+
+                        break;
+                    } else if ( queryChar > *(sb + SA[c] + i)) {
+                        if (i > prevILow) {
+                            prevILow = i;
+                            validBoundLow = c;
+                        } else if (i == prevILow) {
+                            validBoundLow = c > validBoundLow ? c : validBoundLow;
+                        }
+                        plt = false;
+                        break;
+                    }
+
+                    ++i;
+                }
+                if (i == m or SA[c] + i == n) { // query or text exhausted: recorded on the "high" side
+                    if (i > prevIHigh) {
+                        prevIHigh = i;
+                        validBoundHigh = c;
+                    } else if (i == prevIHigh) {
+                        validBoundHigh = c < validBoundHigh ? c : validBoundHigh;
+                    }
+                }
+
+                if (plt) {
+                    if (c == l + 1) {
+                        auto maxI = std::max(std::max(i, prevILow), prevIHigh); // shadows the outer maxI
+                        res1.maxLen = maxI;
+                        break;
+                    }
+                    r = c;
+                    lcpRP = i;
+                } else {
+                    if (c == r - 1) {
+                        maxI = std::max(std::max(i, prevILow), prevIHigh);
+                        res1.maxLen = maxI;
+                        break;
+                    }
+                    l = c;
+                    lcpLP = i;
+                }
+            }
+
+            bool knownValid{true}; // not referenced again in this function
+            m = res1.maxLen + 1; // from here on the query is the MMP plus one sentinel at index m - 1
+
+            // first search for the lower bound
+            sentinel = '#'; // ASCII 35: lower than '$' and than any letter
+            l = lbIn;
+            r = ubIn;
+
+            lcpLP = startAt;
+            lcpRP = startAt;
+            c = 0;
+            plt = true;
+            i = startAt;
+            while (true) {
+                c = (l + r) / 2;
+                plt = true;
+                i = std::min(lcpLP, lcpRP);
+                while (i < m and SA[c] + i < n) {
+                    char queryChar = (i < m - 1) ? ::toupper(*(qb + i)) : sentinel;
+                    // If we're reverse complementing
+                    if (queryChar != sentinel and complementBases) {
+                        queryChar = rapmap::utils::my_mer::complement(queryChar);
+                    }
+
+                    if ( queryChar < *(sb + SA[c] + i) ) {
+                     	break;
+                    } else if ( queryChar > *(sb + SA[c] + i)) {
+                        plt = false;
+                        break;
+                    }
+                    ++i;
+                }
+                if (plt) {
+                    if (c == l + 1) {
+                        res1.bound = c;
+                        break;
+                    }
+                    r = c;
+                    lcpRP = i;
+                } else {
+                    if (c == r - 1) {
+                        res1.bound = r;
+                        break;
+                    }
+                    l = c;
+                    lcpLP = i;
+                }
+            }
+
+            // then search for the upper bound
+            sentinel = '{'; // ASCII 123: higher than any letter
+            l = res1.bound - 1; // reuse the lower bound just found as the left edge
+            r = ubIn;
+
+            lcpLP = startAt;
+            lcpRP = startAt;
+            c = 0;
+            plt = true;
+            i = startAt;
+            while (true) {
+                c = (l + r) / 2;
+                plt = true;
+                i = std::min(lcpLP, lcpRP);
+                while (i < m and SA[c] + i < n) {
+                    char queryChar = (i < m - 1) ? ::toupper(*(qb + i)) : sentinel;
+                    // If we're reverse complementing
+                    if (queryChar != sentinel and complementBases) {
+                        queryChar = rapmap::utils::my_mer::complement(queryChar);
+                    }
+
+                    if ( queryChar < *(sb + SA[c] + i) ) {
+                     	break;
+                    } else if ( queryChar > *(sb + SA[c] + i)) {
+                        plt = false;
+                        break;
+                    }
+                    ++i;
+                }
+                if (plt) {
+                    if (c == l + 1) {
+                        res2.bound = c;
+                        break;
+                    }
+                    r = c;
+                    lcpRP = i;
+                } else {
+                    if (c == r - 1) {
+                        res2.bound = r;
+                        break;
+                    }
+                    l = c;
+                    lcpLP = i;
+                }
+            }
+
+            // Must occur at least once!
+            if (res1.bound == res2.bound) { res2.bound += 1; }
+            return std::make_tuple(static_cast<OffsetT>(res1.bound), static_cast<OffsetT>(res2.bound), static_cast<OffsetT>(res1.maxLen));
+        }
+
+
+        /**
+         * Longest common extension of the suffixes T[SA[p1]...] and T[SA[p2]...].
+         * The first `startAt` characters are taken as already matching; the scan ends
+         * at a mismatch, a '$', the end of the text, or when the length hits `stopAt`.
+         * Returns the length counted from the suffix starts; `verbose` is unused.
+         */
+        OffsetT lce(OffsetT p1, OffsetT p2,
+                    OffsetT startAt=0,
+                    OffsetT stopAt=std::numeric_limits<OffsetT>::max(),
+                    bool verbose=false) {
+            std::string& seq = *seq_;
+            std::vector<OffsetT>& SA = *sa_;
+            OffsetT len = static_cast<OffsetT>(startAt);
+            auto o1 = SA[p1]; // `len` already starts at `startAt`, so index from the suffix
+            auto o2 = SA[p2]; // starts; adding `startAt` here too would skip it twice
+            auto maxIndex = std::max(o1, o2);
+            while (maxIndex + len < textLen_ and seq[o1+len] == seq[o2+len]) {
+                if (seq[o1+len] == '$') { break; }
+                if (len >= stopAt) { break; }
+                ++len;
+            }
+            return len;
+        }
+
+    private:
+        RapMapIndexT* rmi_;
+        std::string* seq_;
+        std::vector<OffsetT>* sa_;
+        OffsetT textLen_;
+};
+
+
+        /*
+        // http://www.cs.jhu.edu/~langmea/resources/lecture_notes/suffix_arrays.pdf
+        std::tuple<int, int> querySimpleAccel(std::string::iterator qb,
+                                              std::string::iterator qe) {
+            std::vector<int>& SA = *sa_;
+            std::string& seq = *seq_;
+            //ForwardIt it;
+            auto sb = seq.begin();
+            auto se = seq.end();
+
+            size_t n = seq.length();
+            size_t m = std::distance(qb, qe);
+            size_t l = 0, r = n;
+            size_t lcpLP = 0, lcpRP = 0;
+            size_t c{0};
+            size_t i{0};
+            bool plt{true};
+            size_t lower{0};
+            while (true) {
+                c = (l + r) / 2;
+                plt = true;
+                i = std::min(lcpLP, lcpRP);
+                while (i < m and SA[c] + i < n) {
+                    if ( *(qb + i) < *(sb + SA[c] + i) ) {
+                        break;
+                    } else if ( *(qb + i) > *(sb + SA[c] + i)) {
+                        plt = false;
+                        break;
+                    }
+                    ++i;
+                }
+                if (plt) {
+                    if (c == l + 1) { lower = c; break; }
+                    r = c;
+                    lcpRP = i;
+                } else {
+                    if (c == r - 1) { lower = r; break; }
+                    l = c;
+                    lcpLP = i;
+                }
+            }
+
+            i = 0;
+            l = 0;
+            r = n;
+            lcpLP = 0;
+            lcpRP = 0;
+            size_t upper{0};
+            while (true) {
+                c = (l + r) / 2;
+                plt = true;
+                i = std::min(lcpLP, lcpRP);
+                while (i < m and SA[c] + i < n) {
+                    if ( *(qb + i) < *(sb + SA[c] + i) ) {
+                        break;
+                    } else if ( *(qb + i) > *(sb + SA[c] + i)) {
+                        plt = false;
+                        break;
+                    }
+                    ++i;
+                }
+                if (plt) {
+                    if (c == l + 1) { upper = c; break; }
+                    r = c;
+                    lcpRP = i;
+                } else {
+                    if (c == r - 1) { upper = r; break; }
+                    l = c;
+                    lcpLP = i;
+                }
+            }
+            return std::make_tuple(lower, upper);
+        }
+
+
+        // http://www.cs.jhu.edu/~langmea/resources/lecture_notes/suffix_arrays.pdf
+        // templated on the iterator type so we can use a forward or reverse iterator
+        template <typename IteratorT>
+        std::tuple<int, int, int> extendSearch(
+                int lbIn, // The lower bound for the search
+                int ubIn, // The upper bound for the search
+                int startAt, // The offset at which to start looking
+                IteratorT qb, // Iterator to the beginning of the query
+                IteratorT qe, // Iterator to the end of the query
+                bool complementBases=false // True if bases should be complemented
+                                           // before comparison
+                ) {
+
+            std::vector<int>& SA = *sa_;
+            std::string& seq = *seq_;
+
+            int m = std::distance(qb, qe);
+            size_t n = seq.length();
+
+            auto sb = seq.begin();
+            auto se = seq.end();
+
+            // If the bounds are already trivial, just figure how long
+            // of a prefix we share and return the interval.
+            if (ubIn - lbIn == 2) {
+                lbIn += 1;
+                auto i = startAt;
+                while (i < m and SA[lbIn] + i < n) {
+                    char queryChar = ::toupper(*(qb + i));
+                    // If we're reverse complementing
+                    if (complementBases) {
+                        queryChar = rapmap::utils::my_mer::complement(queryChar);
+                    }
+                    if ( queryChar < *(sb + SA[lbIn] + i) ) {
+                        break;
+                    } else if ( queryChar > *(sb + SA[lbIn] + i)) {
+                        break;
+                    }
+                    ++i;
+                }
+                return std::make_tuple(lbIn, ubIn, i);
+            }
+
+            BoundSearchResult res1, res2;
+
+            char smallest = '#';
+            char largest = '}';
+            char sentinel = smallest;
+
+            int l = lbIn, r = ubIn;
+            int lcpLP = startAt, lcpRP = startAt;
+            int c{0};
+            int i{0};
+            int maxI{startAt};
+            int prevI = startAt;
+            int prevILow = startAt;
+            int prevIHigh = startAt;
+            int validBoundLow = ubIn;
+            int validBoundHigh = lbIn;
+            int validBound = 0;
+            bool plt{true};
+            bool prevPLT{true};
+            //std::cerr << "lbIn = " << lbIn << ", ubIn = " << ubIn << "\n";
+            // Reduce the search interval until we hit a border
+            // i.e. until c == r - 1 or c == l + 1
+            while (true) {
+                c = (l + r) / 2;
+                //std::cerr << "l = " << l << ", r = " << r << ", c = " << c << '\n';
+                plt = true;
+                i = std::min(lcpLP, lcpRP);
+                while (i < m and SA[c] + i < n) {
+                    char queryChar = ::toupper(*(qb + i));
+                    // If we're reverse complementing
+                    if (complementBases) {
+                        queryChar = rapmap::utils::my_mer::complement(queryChar);
+                    }
+
+                    if ( queryChar < *(sb + SA[c] + i) ) {
+                        if (i > prevIHigh) {
+                            prevIHigh = i;
+                            validBoundHigh = c;
+                        } else if (i == prevIHigh) {
+                            validBoundHigh = c < validBoundHigh ? c : validBoundHigh;
+                        }
+                        //std::cerr << "(l = " << l << ", r = " << r << ") pattern < SA[" << c << "]\n";
+                        //std::cerr << "(i = " << i << ", m = " << m << ") " << queryChar << " < " <<  *(sb + SA[c] + i) << "\n";
+
+                        break;
+                    } else if ( queryChar > *(sb + SA[c] + i)) {
+                        if (i > prevILow) {
+                            prevILow = i;
+                            validBoundLow = c;
+                        } else if (i == prevILow) {
+                            validBoundLow = c > validBoundLow ? c : validBoundLow;
+                        }
+                        //std::cerr << "(l = " << l << ", r = " << r << ") pattern > SA[" << c << "]\n";
+                        //std::cerr << "(i = " << i << ", m = " << m << ") " << queryChar << " > " <<  *(sb + SA[c] + i) << "\n";
+                        plt = false;
+                        break;
+                    }
+
+                    ++i;
+		}
+		if (i == m or SA[c] + i == n) {
+			if (i > prevIHigh) {
+				prevIHigh = i;
+				validBoundHigh = c;
+			} else if (i == prevIHigh) {
+				validBoundHigh = c < validBoundHigh ? c : validBoundHigh;
+			}
+		}
+
+                if (plt) {
+                    if (c == l + 1) {
+                        std::cerr << "path 1\n";
+                        auto maxI = std::max(std::max(i, prevILow), prevIHigh);
+                        res1.maxLen = maxI;
+                        if (maxI == m) {
+                            res1.dir = SearchDirection::DOWN;
+                            res1.bound = c;
+                        } else {
+                            validBound = (prevILow >= prevIHigh) ? validBoundLow : validBoundHigh;
+                            res1.bound = validBound;
+                            res1.dir = (res1.bound == validBoundLow) ? SearchDirection::DOWN : SearchDirection::UP;
+                        }
+                        break;
+                    }
+                    r = c;
+                    lcpRP = i;
+                } else {
+                    if (c == r - 1) {
+                        std::cerr << "path 2\n";
+                        maxI = std::max(std::max(i, prevILow), prevIHigh);
+                        res1.maxLen = maxI;
+                        validBound = (prevILow >= prevIHigh) ? validBoundLow : validBoundHigh;
+                        if (maxI == m) {
+                            res1.bound = r;
+                        } else {
+                            res1.bound = validBound;
+                        }
+                        res1.dir = (res1.bound == validBoundLow) ? SearchDirection::DOWN : SearchDirection::UP;
+                        break;
+                    }
+                    l = c;
+                    lcpLP = i;
+                }
+            }
+
+
+            bool knownValid{true};
+            m = res1.maxLen + 1;
+
+            switch (res1.dir) {
+                case SearchDirection::UP:
+                    sentinel = '#';
+                    r = res1.bound;
+                    l = lbIn;
+                    std::cerr << "direction was UP; lb = " << l << ", ub = " << r << "\n";
+                    std::cerr << "direction was UP; origLb = " << lbIn << ", origUb = " << ubIn << "\n";
+                    break;
+                case SearchDirection::DOWN:
+                    sentinel = '{';
+                    r = ubIn;
+                    l = res1.bound;
+                    std::cerr << "direction was DOWN; lb = " << l << ", ub = " << r << "\n";
+                    std::cerr << "direction was UP; origLb = " << lbIn << ", origUb = " << ubIn << "\n";
+                    break;
+            }
+
+            if (r - l < 2) {
+                if (r == l) { r += 1; }
+                //std::cerr << "early exit!\n";
+                return std::make_tuple(l, r, res1.maxLen);
+            }
+
+
+            lcpLP = startAt;
+            lcpRP = startAt;
+            c = 0;
+            plt = true;
+            prevPLT = true;
+            prevI = 0;
+            prevILow = 0;
+            prevIHigh = 0;
+            i = startAt;
+            validBound = 0;
+            validBoundLow = ubIn;
+            validBoundHigh = lbIn;
+            while (true) {
+                c = (l + r) / 2;
+                plt = true;
+                i = std::min(lcpLP, lcpRP);
+                while (i < m and SA[c] + i < n) {
+                    char queryChar = (i < m - 1) ? ::toupper(*(qb + i)) : sentinel;
+                    // If we're reverse complementing
+                    if (queryChar != sentinel and complementBases) {
+                        queryChar = rapmap::utils::my_mer::complement(queryChar);
+                    }
+
+                    if ( queryChar < *(sb + SA[c] + i) ) {
+                     	break;
+                    } else if ( queryChar > *(sb + SA[c] + i)) {
+                        plt = false;
+                        break;
+                    }
+                    ++i;
+                }
+                if (plt) {
+                    if (c == l + 1) {
+                        res2.dir = SearchDirection::DOWN;
+                        res2.bound = c;
+                        break;
+                    }
+                    r = c;
+                    lcpRP = i;
+                } else {
+                    if (c == r - 1) {
+                        res2.bound = r;
+                        break;
+                    }
+                    l = c;
+                    lcpLP = i;
+                }
+            }
+
+            auto bound1 = std::min(res1.bound, res2.bound);
+            auto bound2 = std::max(res1.bound, res2.bound);
+            // Must occur at least once!
+            if (bound1 == bound2) { bound2 += 1; }
+            return std::make_tuple(bound1, bound2, res1.maxLen);
+        }
+        */
+
+#endif //SA_SEARCHER_HPP
diff --git a/include/ScopedTimer.hpp b/include/ScopedTimer.hpp
new file mode 100644
index 0000000..de1121c
--- /dev/null
+++ b/include/ScopedTimer.hpp
@@ -0,0 +1,22 @@
+#ifndef __SCOPED_TIMER_HPP__
+#define __SCOPED_TIMER_HPP__
+// from https://gist.github.com/justgord/4482447
+#include <chrono>
+#include <iostream>
+
+// Scope-bound timer: stores the clock reading taken at construction and, when
+// destroyed, prints the time elapsed since then (in seconds) to std::cerr.
+struct ScopedTimer
+{
+    std::chrono::high_resolution_clock::time_point t0; // reading taken by the constructor
+
+    ScopedTimer() : t0(std::chrono::high_resolution_clock::now()) {}
+    ~ScopedTimer()
+    {
+        const auto finished = std::chrono::high_resolution_clock::now();
+        const std::chrono::duration<double> elapsedSec = finished - t0;
+        std::cerr << "Elapsed time: " << elapsedSec.count() << "s\n";
+    }
+};
+
+#endif //__SCOPED_TIMER_HPP__
diff --git a/include/SingleAlignmentFormatter.hpp b/include/SingleAlignmentFormatter.hpp
new file mode 100644
index 0000000..2510082
--- /dev/null
+++ b/include/SingleAlignmentFormatter.hpp
@@ -0,0 +1,22 @@
+#ifndef __SINGLE_ALIGNMENT_FORMATTER_HPP__
+#define __SINGLE_ALIGNMENT_FORMATTER_HPP__
+
+#include "RapMapUtils.hpp"
+
+template <typename IndexPtrT>
+struct SingleAlignmentFormatter { // an index handle bundled with preallocated scratch buffers
+    SingleAlignmentFormatter(IndexPtrT indexIn) : index(indexIn),
+    readTemp(1000, 'A'), // starts as 1000 'A' characters
+    qualTemp(1000, '~'), // starts as 1000 '~' characters
+    cigarStr(buff, 1000){ // writer constructed over buff with 1000 as its size argument
+    }
+
+    // Data members (initialized in this declaration order, so buff exists before cigarStr)
+    IndexPtrT index;
+    std::string readTemp;
+    std::string qualTemp;
+    char buff[1000]; // storage handed to cigarStr; must stay declared ahead of it
+    rapmap::utils::FixedWriter cigarStr;
+};
+
+#endif //__SINGLE_ALIGNMENT_FORMATTER_HPP__
diff --git a/include/SpinLock.hpp b/include/SpinLock.hpp
new file mode 100644
index 0000000..56647fa
--- /dev/null
+++ b/include/SpinLock.hpp
@@ -0,0 +1,25 @@
+#ifndef __SPIN_LOCK_HPP__
+#define __SPIN_LOCK_HPP__
+
+#include <atomic>
+
+// Taken from http://stackoverflow.com/questions/26583433/c11-implementation-of-spinlock-using-atomic
+class SpinLock {
+    std::atomic_flag locked = ATOMIC_FLAG_INIT; // clear means "not held"
+public:
+    void lock() { // spin until test_and_set reports that the flag was previously clear
+        bool wasSet = true;
+        do { wasSet = locked.test_and_set(std::memory_order_acquire); } while (wasSet);
+    }
+
+    // One non-blocking attempt: true iff the flag was clear and this call set it.
+    // (from http://stackoverflow.com/questions/19742993/implementing-a-spinlock-in-boost-example-needed)
+    bool try_lock() {
+        const bool wasSet = locked.test_and_set(std::memory_order_acquire);
+        return not wasSet;
+    }
+
+    void unlock() { locked.clear(std::memory_order_release); } // clear the flag again
+};
+
+#endif //__SPIN_LOCK_HPP__
diff --git a/include/Type.hpp b/include/Type.hpp
new file mode 100644
index 0000000..ef64e92
--- /dev/null
+++ b/include/Type.hpp
@@ -0,0 +1,30 @@
+/* 
+ *  Copyright (c) 2012 Daisuke Okanohara
+ * 
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ * 
+ *   1. Redistributions of source code must retain the above Copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *   2. Redistributions in binary form must reproduce the above Copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *
+ *   3. Neither the name of the authors nor the names of its contributors
+ *      may be used to endorse or promote products derived from this
+ *      software without specific prior written permission.
+ */
+
+#ifndef RSDIC_TYPE_HPP_
+#define RSDIC_TYPE_HPP_
+
+#include <stdint.h>
+
+namespace rsdic{
+typedef uint32_t rsdic_uint; // use uint64_t for bitvec >= 4GB
+}
+
+
+#endif // RSDIC_TYPE_HPP_
diff --git a/include/Util.hpp b/include/Util.hpp
new file mode 100644
index 0000000..5896ab9
--- /dev/null
+++ b/include/Util.hpp
@@ -0,0 +1,69 @@
+/* 
+ *  Copyright (c) 2012 Daisuke Okanohara
+ * 
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ * 
+ *   1. Redistributions of source code must retain the above Copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *   2. Redistributions in binary form must reproduce the above Copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *
+ *   3. Neither the name of the authors nor the names of its contributors
+ *      may be used to endorse or promote products derived from this
+ *      software without specific prior written permission.
+ */
+
+#ifndef RSDIC_UTIL_HPP_
+#define RSDIC_UTIL_HPP_
+
+#include <vector>
+#include <stdint.h>
+#include "Const.hpp"
+
+namespace rsdic{
+
+class Util{
+public:
+  static uint64_t GetSlice(const std::vector<uint64_t>& bits,  // returns `len` bits of `bits` starting at bit `pos`
+                           uint64_t pos, uint64_t len) {
+    if (len == 0) return 0;
+    uint64_t block = pos / kSmallBlockSize;   // kSmallBlockSize comes from Const.hpp; presumably 64 -- confirm
+    uint64_t offset = pos % kSmallBlockSize;
+    uint64_t ret = bits[block] >> offset;
+    if (offset + len > kSmallBlockSize){  // slice continues into the following word
+      ret |= (bits[block+1] << (kSmallBlockSize - offset));
+    }
+    if (len == 64) return ret;  // skip the mask: 1LLU << 64 would be undefined
+    return ret & ((1LLU << len) - 1);
+  }
+  // ORs `val` into `bits` starting at bit `pos`; never clears bits, and `val` is not masked to `len`.
+  static void SetSlice(std::vector<uint64_t>& bits,
+                       uint64_t pos, uint64_t len, uint64_t val) {
+    if (len == 0) return;
+    uint64_t block = pos / kSmallBlockSize;
+    uint64_t offset = pos % kSmallBlockSize;
+    bits[block] |= val << offset;
+    if (offset + len > kSmallBlockSize){  // upper part of `val` lands in the following word
+      bits[block+1] |= val >> (kSmallBlockSize - offset);
+    }
+  }
+  // NOTE: despite its name this rounds UP: it returns ceil(num / div).
+  static uint64_t Floor(uint64_t num, uint64_t div){
+    return (num + div - 1) / div;
+  }
+  // Returns `num` when `bit` is true, otherwise `total - num`.
+  static uint64_t GetNum(bool bit, uint64_t num, uint64_t total) {
+    if (bit) return num;
+    else return total - num;
+  }
+
+
+};
+
+}
+
+#endif // RSDIC_UTIL_HPP_
diff --git a/include/bar.h b/include/bar.h
new file mode 100644
index 0000000..2dc3b1c
--- /dev/null
+++ b/include/bar.h
@@ -0,0 +1,148 @@
+/*
+ bar.h
+ project: bit array C library
+ url: https://github.com/noporpoise/BitArray/
+ maintainer: Isaac Turner <turner.isaac at gmail.com>
+ license: Public Domain, no warranty
+ date: Sept 2014
+*/
+
+// shorten the names of some of the bit_array functions to be more
+// like the str* function names.  The prefix "bar" is used to represent
+// bit_array and is analogous to "str".
+
+#ifndef BAR_HEADER_SEEN
+#define BAR_HEADER_SEEN
+
+#include "bit_array.h"
+
+#define bar BIT_ARRAY
+
+#define barcreate  bit_array_create
+#define bardestroy bit_array_free
+#define baralloc   bit_array_alloc
+#define barfree    bit_array_dealloc
+#define barlen     bit_array_length
+
+#define barsize    bit_array_resize
+#define barcap     bit_array_ensure_size
+
+// These five are MACROs
+#define barget     bit_array_get
+#define barset     bit_array_set
+#define barclr     bit_array_clear
+#define barflip    bit_array_toggle
+#define barmake    bit_array_assign
+
+/* Functions instead of macros bars* => s for safe */
+#define barsget    bit_array_get_bit
+#define barsset    bit_array_set_bit
+#define barsclr    bit_array_clear_bit
+#define barsflip   bit_array_toggle_bit
+#define barsmake   bit_array_assign_bit
+
+/* "resize" functions barr*: automatically enlarge array if needed */
+#define barrget    bit_array_rget
+#define barrset    bit_array_rset
+#define barrclr    bit_array_rclear
+#define barrflip   bit_array_rtoggle
+#define barrmake   bit_array_rassign
+
+#define barsetn    bit_array_set_bits
+#define barclrn    bit_array_clear_bits
+#define barflipn   bit_array_toggle_bits
+
+#define barsetr    bit_array_set_region
+#define barclrr    bit_array_clear_region
+#define barflipr   bit_array_toggle_region
+
+#define barfill    bit_array_set_all
+#define barzero    bit_array_clear_all
+#define bartogl    bit_array_toggle_all
+
+/* gw "get word" */
+#define bargw64    bit_array_get_word64
+#define bargw32    bit_array_get_word32
+#define bargw16    bit_array_get_word16
+#define bargw8     bit_array_get_word8
+#define bargwn     bit_array_get_wordn
+
+/* sw "set word" */
+#define barsw64    bit_array_set_word64
+#define barsw32    bit_array_set_word32
+#define barsw16    bit_array_set_word16
+#define barswn     bit_array_set_wordn
+
+#define barncpy    bit_array_copy
+#define barcpy     bit_array_copy_all
+#define bardup     bit_array_clone
+
+#define barpopc    bit_array_num_bits_set
+#define barzeros   bit_array_num_bits_cleared
+#define bardist    bit_array_hamming_distance
+#define barparity  bit_array_parity
+
+#define barfns     bit_array_find_next_set_bit
+#define barfps     bit_array_find_prev_set_bit
+#define barffs     bit_array_find_first_set_bit
+#define barfls     bit_array_find_last_set_bit
+
+#define barfnz     bit_array_find_next_clear_bit
+#define barfpz     bit_array_find_prev_clear_bit
+#define barffz     bit_array_find_first_clear_bit
+#define barflz     bit_array_find_last_clear_bit
+
+#define barsort    bit_array_sort_bits
+#define barsortr   bit_array_sort_bits_rev
+
+#define barand     bit_array_and
+#define baror      bit_array_or
+#define barxor     bit_array_xor
+#define barnot     bit_array_not
+
+#define barcmp     bit_array_cmp
+#define barcmpbe   bit_array_cmp_big_endian
+#define barcmpw    bit_array_cmp_words
+#define barcmp64   bit_array_cmp_uint64
+
+#define barshr     bit_array_shift_right
+#define barshl     bit_array_shift_left
+#define bareshl    bit_array_shift_left_extend
+
+#define barcycr    bit_array_cycle_right
+#define barcycl    bit_array_cycle_left
+
+#define barmix     bit_array_interleave
+
+#define barrev     bit_array_reverse
+#define barrevr    bit_array_reverse_region
+
+#define bar2num    bit_array_as_num
+
+/* Add/sub/mult/div a bit array with: */
+/*   _i unsigned integer, _si shifted integer, _sb shifted bitarray */
+#define baraddi    bit_array_add_uint64
+#define baraddsi   bit_array_add_word
+#define baraddsb   bit_array_add_words
+#define barsubi    bit_array_sub_uint64
+#define barsubsi   bit_array_sub_word
+#define barsubsb   bit_array_sub_words
+#define barmuli    bit_array_mul_uint64
+#define bardivi    bit_array_div_uint64
+
+/* arguments are both bit arrays */
+#define baradd     bit_array_add
+#define barsub     bit_array_subtract
+#define barmul     bit_array_multiply
+#define bardiv     bit_array_divide
+
+#define barsave    bit_array_save
+#define barload    bit_array_load
+
+#define barhash    bit_array_hash
+
+#define barrand    bit_array_random
+#define barshfl    bit_array_shuffle
+#define barperm    bit_array_next_permutation
+
+#endif /* BAR_HEADER_SEEN */
diff --git a/include/bit_array.h b/include/bit_array.h
new file mode 100644
index 0000000..70b50ad
--- /dev/null
+++ b/include/bit_array.h
@@ -0,0 +1,552 @@
+/*
+ bit_array.h
+ project: bit array C library
+ url: https://github.com/noporpoise/BitArray/
+ maintainer: Isaac Turner <turner.isaac at gmail.com>
+ license: Public Domain, no warranty
+ date: Sep 2014
+*/
+
+#ifndef BIT_ARRAY_HEADER_SEEN
+#define BIT_ARRAY_HEADER_SEEN
+
+#include <stdio.h>
+#include <inttypes.h>
+
+#include "bit_macros.h"
+
+typedef struct BIT_ARRAY BIT_ARRAY;
+
+// 64 bit words
+typedef uint64_t word_t, word_addr_t, bit_index_t;
+typedef uint8_t word_offset_t; // Offset within a 64 bit word
+
+#define BIT_INDEX_MIN 0
+#define BIT_INDEX_MAX (~(bit_index_t)0)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//
+// Structs
+//
+
+struct BIT_ARRAY
+{
+  word_t* words;
+  bit_index_t num_of_bits;
+  // Number of words used -- this is just round_up(num_of_bits / 64)
+  // if num_of_bits == 0, this is 0
+  word_addr_t num_of_words;
+  // For more efficient allocation we use realloc only to double size --
+  // not for adding every word.  Initial size is INIT_CAPACITY_WORDS.
+  word_addr_t capacity_in_words;
+};
+
+//
+// Basics: Constructor, destructor, get length, resize
+//
+
+// Constructor - create a new bit array of length nbits
+BIT_ARRAY* bit_array_create(bit_index_t nbits);
+
+// Destructor - free the memory used for a bit array
+void bit_array_free(BIT_ARRAY* bitarray);
+
+// Allocate using existing struct
+BIT_ARRAY* bit_array_alloc(BIT_ARRAY* bitarr, bit_index_t nbits);
+void bit_array_dealloc(BIT_ARRAY* bitarr);
+
+// Get length of bit array
+bit_index_t bit_array_length(const BIT_ARRAY* bit_arr);
+
+// Change the size of a bit array. Enlarging an array will add zeros
+// to the end of it. Returns 1 on success, 0 on failure (e.g. not enough memory)
+char bit_array_resize(BIT_ARRAY* bitarr, bit_index_t new_num_of_bits);
+
+// If bitarr length < num_bits, resizes to num_bits
+char bit_array_ensure_size(BIT_ARRAY* bitarr, bit_index_t ensure_num_of_bits);
+
+// Same as above but exit with an error message if out of memory
+void bit_array_resize_critical(BIT_ARRAY* bitarr, bit_index_t num_of_bits);
+void bit_array_ensure_size_critical(BIT_ARRAY* bitarr, bit_index_t num_of_bits);
+
+
+//
+// Macros
+//
+
+//
+// Get, set, clear, assign and toggle individual bits
+// Macros for fast access -- beware: no bounds checking
+//
+
+#define bit_array_get(arr,i)      bitset_get((arr)->words, i)
+#define bit_array_set(arr,i)      bitset_set((arr)->words, i)
+#define bit_array_clear(arr,i)    bitset_del((arr)->words, i)
+#define bit_array_toggle(arr,i)   bitset_tgl((arr)->words, i)
+// c must be 0 or 1
+#define bit_array_assign(arr,i,c) bitset_cpy((arr)->words,i,c)
+
+//
+// Get, set, clear, assign and toggle individual bits
+// "Safe": use assert() to check bounds
+//
+
+// Get the value of a bit (returns 0 or 1)
+char bit_array_get_bit(const BIT_ARRAY* bitarr, bit_index_t b);
+void bit_array_set_bit(BIT_ARRAY* bitarr, bit_index_t b);
+void bit_array_clear_bit(BIT_ARRAY* bitarr, bit_index_t b);
+void bit_array_toggle_bit(BIT_ARRAY* bitarr, bit_index_t b);
+// If char c != 0, set bit; otherwise clear bit
+void bit_array_assign_bit(BIT_ARRAY* bitarr, bit_index_t b, char c);
+
+//
+// "Resizing": enlarge array if needed
+//
+
+char bit_array_rget(BIT_ARRAY* bitarr, bit_index_t b);
+void bit_array_rset(BIT_ARRAY* bitarr, bit_index_t b);
+void bit_array_rclear(BIT_ARRAY* bitarr, bit_index_t b);
+void bit_array_rtoggle(BIT_ARRAY* bitarr, bit_index_t b);
+void bit_array_rassign(BIT_ARRAY* bitarr, bit_index_t b, char c);
+
+//
+// Set, clear and toggle several bits at once
+//
+
+// Set multiple bits at once.
+// e.g. set bits 1, 20 & 31: bit_array_set_bits(bitarr, 3, 1,20,31);
+// Note: variable args are of type unsigned int
+void bit_array_set_bits(BIT_ARRAY* bitarr, size_t n, ...);
+
+// Clear multiple bits at once.
+// e.g. clear bits 1, 20 & 31: bit_array_clear_bits(bitarr, 3, 1,20,31);
+// Note: variable args are of type unsigned int
+void bit_array_clear_bits(BIT_ARRAY* bitarr, size_t n, ...);
+
+// Toggle multiple bits at once
+// e.g. toggle bits 1, 20 & 31: bit_array_toggle_bits(bitarr, 3, 1,20,31);
+// Note: variable args are of type unsigned int
+void bit_array_toggle_bits(BIT_ARRAY* bitarr, size_t n, ...);
+
+//
+// Set, clear and toggle all bits in a region
+//
+
+// Set all the bits in a region
+void bit_array_set_region(BIT_ARRAY* bitarr, bit_index_t start, bit_index_t len);
+
+// Clear all the bits in a region
+void bit_array_clear_region(BIT_ARRAY* bitarr, bit_index_t start, bit_index_t len);
+
+// Toggle all the bits in a region
+void bit_array_toggle_region(BIT_ARRAY* bitarr, bit_index_t start, bit_index_t len);
+
+//
+// Set, clear and toggle all bits at once
+//
+
+// Set all bits in this array to 1
+void bit_array_set_all(BIT_ARRAY* bitarr);
+
+// Set all bits in this array to 0
+void bit_array_clear_all(BIT_ARRAY* bitarr);
+
+// Set all 1 bits to 0, and all 0 bits to 1
+void bit_array_toggle_all(BIT_ARRAY* bitarr);
+
+//
+// Get / set a word of a given size
+//
+
+// First bit is in the least significant bit position
+// start index must be within the range of the bit array (0 <= x < length)
+uint64_t bit_array_get_word64(const BIT_ARRAY* bitarr, bit_index_t start);
+uint32_t bit_array_get_word32(const BIT_ARRAY* bitarr, bit_index_t start);
+uint16_t bit_array_get_word16(const BIT_ARRAY* bitarr, bit_index_t start);
+uint8_t  bit_array_get_word8(const BIT_ARRAY* bitarr, bit_index_t start);
+uint64_t bit_array_get_wordn(const BIT_ARRAY* bitarr, bit_index_t start, int n);
+
+// Set 64 bits at once from a particular start position
+void bit_array_set_word64(BIT_ARRAY* bitarr, bit_index_t start, uint64_t word);
+void bit_array_set_word32(BIT_ARRAY* bitarr, bit_index_t start, uint32_t word);
+void bit_array_set_word16(BIT_ARRAY* bitarr, bit_index_t start, uint16_t word);
+void bit_array_set_word8(BIT_ARRAY* bitarr, bit_index_t start, uint8_t byte);
+void bit_array_set_wordn(BIT_ARRAY* bitarr, bit_index_t start, uint64_t word, int n);
+
+//
+// Number of bits set
+//
+
+// Get the number of bits set (hamming weight)
+bit_index_t bit_array_num_bits_set(const BIT_ARRAY* bitarr);
+
+// Get the number of bits not set (length - hamming weight)
+bit_index_t bit_array_num_bits_cleared(const BIT_ARRAY* bitarr);
+
+// Get the number of bits set in one array and not the other.  This is equivalent
+// to hamming weight of the XOR when the two arrays are the same length.
+// e.g. 10101 vs 00111 => hamming distance 2 (XOR is 10010)
+bit_index_t bit_array_hamming_distance(const BIT_ARRAY* arr1,
+                                       const BIT_ARRAY* arr2);
+
+// Parity - returns 1 if odd number of bits set, 0 if even
+char bit_array_parity(const BIT_ARRAY* bitarr);
+
+//
+// Find indices of set/clear bits
+//
+
+// Find the index of the next bit that is set, at or after `offset`
+// Returns 1 if a bit is set, otherwise 0
+// Index of next set bit is stored in the integer pointed to by result
+// If no next bit is set result is not changed
+char bit_array_find_next_set_bit(const BIT_ARRAY* bitarr, bit_index_t offset,
+                                 bit_index_t* result);
+
+// Find the index of the next bit that is NOT set, at or after `offset`
+// Returns 1 if a bit is NOT set, otherwise 0
+// Index of next zero bit is stored in the integer pointed to by `result`
+// If no next bit is zero, value at `result` is not changed
+char bit_array_find_next_clear_bit(const BIT_ARRAY* bitarr, bit_index_t offset,
+                                 bit_index_t* result);
+
+// Find the index of the previous bit that is set, before offset.
+// Returns 1 if a bit is set, otherwise 0
+// Index of previous set bit is stored in the integer pointed to by `result`
+// If no previous bit is set result is not changed
+char bit_array_find_prev_set_bit(const BIT_ARRAY* bitarr, bit_index_t offset,
+                                 bit_index_t* result);
+
+// Find the index of the previous bit that is NOT set, before offset.
+// Returns 1 if a bit is clear, otherwise 0
+// Index of previous zero bit is stored in the integer pointed to by `result`
+// If no previous bit is zero result is not changed
+char bit_array_find_prev_clear_bit(const BIT_ARRAY* bitarr, bit_index_t offset,
+                                   bit_index_t* result);
+
+// Find the index of the first bit that is set.
+// Returns 1 if a bit is set, otherwise 0
+// Index of first set bit is stored in the integer pointed to by `result`
+// If no bit is set result is not changed
+char bit_array_find_first_set_bit(const BIT_ARRAY* bitarr, bit_index_t* result);
+
+// Find the index of the first bit that is NOT set.
+// Returns 1 if a bit is clear, otherwise 0
+// Index of first zero bit is stored in the integer pointed to by `result`
+// If no bit is zero result is not changed
+char bit_array_find_first_clear_bit(const BIT_ARRAY* bitarr, bit_index_t* result);
+
+// Find the index of the last bit that is set.
+// Returns 1 if a bit is set, otherwise 0
+// Index of last set bit is stored in the integer pointed to by `result`
+// If no bit is set result is not changed
+char bit_array_find_last_set_bit(const BIT_ARRAY* bitarr, bit_index_t* result);
+
+// Find the index of the last bit that is NOT set.
+// Returns 1 if a bit is clear, otherwise 0
+// Index of last zero bit is stored in the integer pointed to by `result`
+// If no bit is zero result is not changed
+char bit_array_find_last_clear_bit(const BIT_ARRAY* bitarr, bit_index_t* result);
+
+
+//
+// Sorting
+//
+
+// Put all the 0s before all the 1s
+void bit_array_sort_bits(BIT_ARRAY* bitarr);
+
+// Put all the 1s before all the 0s
+void bit_array_sort_bits_rev(BIT_ARRAY* bitarr);
+
+
+//
+// String and printing methods
+//
+
+// Construct a BIT_ARRAY from a string.
+void bit_array_from_str(BIT_ARRAY* bitarr, const char* bitstr);
+
+// Construct a BIT_ARRAY from a substring with given on and off characters.
+void bit_array_from_substr(BIT_ARRAY* bitarr, bit_index_t offset,
+                           const char* str, size_t len,
+                           const char *on, const char *off, char left_to_right);
+
+// Takes a char array to write to.  `str` must be bitarr->num_of_bits+1 in
+// length. Terminates string with '\0'
+char* bit_array_to_str(const BIT_ARRAY* bitarr, char* str);
+char* bit_array_to_str_rev(const BIT_ARRAY* bitarr, char* str);
+
+// Get a string representations for a given region, using given on/off
+// characters.
+// Note: does not null-terminate
+void bit_array_to_substr(const BIT_ARRAY* bitarr,
+                         bit_index_t start, bit_index_t length,
+                         char* str, char on, char off, char left_to_right);
+
+// Print this array to a file stream.  Prints '0's and '1'.  Doesn't print
+// newline.
+void bit_array_print(const BIT_ARRAY* bitarr, FILE* fout);
+
+// Print a string representations for a given region, using given on/off
+// characters. Reverse prints from highest to lowest -- this is useful for
+// printing binary numbers
+void bit_array_print_substr(const BIT_ARRAY* bitarr,
+                            bit_index_t start, bit_index_t length,
+                            FILE* fout, char on, char off, char left_to_right);
+
+//
+// Decimal
+//
+
+// Get bit array as decimal str (e.g. 0b1101 -> "13")
+size_t bit_array_to_decimal(const BIT_ARRAY *bitarr, char *str, size_t len);
+
+// Return number of characters used
+size_t bit_array_from_decimal(BIT_ARRAY *bitarr, const char* decimal);
+
+//
+// Hexadecimal
+//
+
+// Loads array from hex string
+// Returns the number of bits loaded (will be chars rounded up to multiple of 8)
+// (0 on failure)
+bit_index_t bit_array_from_hex(BIT_ARRAY* bitarr, bit_index_t offset,
+                               const char* str, size_t len);
+
+// Returns number of characters written
+size_t bit_array_to_hex(const BIT_ARRAY* bitarr,
+                        bit_index_t start, bit_index_t length,
+                        char* str, char uppercase);
+
+// Print bit array as hex
+size_t bit_array_print_hex(const BIT_ARRAY* bitarr,
+                           bit_index_t start, bit_index_t length,
+                           FILE* fout, char uppercase);
+
+//
+// Clone and copy
+//
+
+// Copy a BIT_ARRAY struct and the data it holds - returns pointer to new object
+#define bit_array_dup	bit_array_clone
+BIT_ARRAY* bit_array_clone(const BIT_ARRAY* bitarr);
+
+// Copy bits from one array to another
+// Note: use MACRO bit_array_copy
+// Destination and source can be the same bit_array and
+// src/dst regions can overlap
+void bit_array_copy(BIT_ARRAY* dst, bit_index_t dstindx,
+                    const BIT_ARRAY* src, bit_index_t srcindx,
+                    bit_index_t length);
+
+// copy all of src to dst. dst is resized to match src.
+void bit_array_copy_all(BIT_ARRAY* dst, const BIT_ARRAY* src);
+
+//
+// Logic operators
+//
+
+// BIT_ARRAYs can all be different or the same object
+// dest array will be resized if it is too short
+//
+void bit_array_and(BIT_ARRAY* dest, const BIT_ARRAY* src1, const BIT_ARRAY* src2);
+void bit_array_or (BIT_ARRAY* dest, const BIT_ARRAY* src1, const BIT_ARRAY* src2);
+void bit_array_xor(BIT_ARRAY* dest, const BIT_ARRAY* src1, const BIT_ARRAY* src2);
+void bit_array_not(BIT_ARRAY* dest, const BIT_ARRAY* src);
+
+//
+// Comparisons
+//
+
+// Note: (bit_array_cmp(a,b) == 0) <=> (bit_array_cmp_big_endian(a,b) == 0)
+
+// comparison functions return:
+//   1 iff bitarr1 > bitarr2
+//   0 iff bitarr1 == bitarr2
+//  -1 iff bitarr1 < bitarr2
+
+// Compare two bit arrays by value stored, with index 0 being the Least
+// Significant Bit (LSB). Arrays do not have to be the same length.
+// Example: ..0101 (5) > ...0011 (3) [index 0 is LSB at right hand side]
+int bit_array_cmp(const BIT_ARRAY* bitarr1, const BIT_ARRAY* bitarr2);
+
+// Compare two bit arrays by value stored, with index 0 being the Most
+// Significant Bit (MSB). Arrays do not have to be the same length.
+// Example: 10.. > 01.. [index 0 is MSB at left hand side]
+int bit_array_cmp_big_endian(const BIT_ARRAY* bitarr1, const BIT_ARRAY* bitarr2);
+
+// compare bitarr with (bitarr2 << pos)
+int bit_array_cmp_words(const BIT_ARRAY *bitarr,
+                        bit_index_t pos, const BIT_ARRAY *bitarr2);
+
+//
+// Shift, interleave, reverse
+//
+
+// Shift array left/right.  If fill is zero, filled with 0, otherwise 1
+void bit_array_shift_right(BIT_ARRAY* bitarr, bit_index_t shift_dist, char fill);
+void bit_array_shift_left (BIT_ARRAY* bitarr, bit_index_t shift_dist, char fill);
+
+// shift left without losing any bits. Resizes bitarr.
+void bit_array_shift_left_extend(BIT_ARRAY* bitarr, bit_index_t shift_dist,
+                                 char fill);
+
+// Cyclic shift
+void bit_array_cycle_right(BIT_ARRAY* bitarr, bit_index_t dist);
+void bit_array_cycle_left (BIT_ARRAY* bitarr, bit_index_t dist);
+
+// Interleave
+// dst cannot point to the same bit array as src1 or src2
+// src1, src2 may point to the same bit array
+// abcd 1234 -> a1b2c3d4
+// 0011 0000 -> 00001010
+// 1111 0000 -> 10101010
+// 0101 1010 -> 01100110
+// Extends dst if it is too short, but does not shrink it if it is too long
+// if dst is longer than length(src1)+length(src2), the end bits are not altered
+void bit_array_interleave(BIT_ARRAY* dst,
+                          const BIT_ARRAY* src1,
+                          const BIT_ARRAY* src2);
+
+// Reverse the whole array or part of it
+void bit_array_reverse(BIT_ARRAY* bitarr);
+void bit_array_reverse_region(BIT_ARRAY* bitarr, bit_index_t start, bit_index_t len);
+
+//
+// Numeric
+//
+
+// Returns 1 on success, 0 if value in array is too big
+char bit_array_as_num(const BIT_ARRAY* bitarr, uint64_t* result);
+
+// 1 iff bitarr > value
+// 0 iff bitarr == value
+// -1 iff bitarr < value
+int bit_array_cmp_uint64(const BIT_ARRAY* bitarr, uint64_t value);
+
+//
+// Arithmetic
+//
+
+// bitarr will be extended if needed
+void bit_array_add_uint64(BIT_ARRAY* bitarr, uint64_t value);
+
+// Add `add` to `bitarr` at `pos` -- same as:
+//   bitarr + (add << pos)
+// where pos can be bigger than the length of the array (bitarr will be resized)
+void bit_array_add_word(BIT_ARRAY *bitarr, bit_index_t pos, uint64_t add);
+
+// Add `add` to `bitarr` at `pos`
+void bit_array_add_words(BIT_ARRAY *bitarr, bit_index_t pos, const BIT_ARRAY *add);
+
+// If value is greater than bitarr, bitarr is not changed and 0 is returned
+// Returns 1 on success, 0 if value > bitarr
+char bit_array_sub_uint64(BIT_ARRAY* bitarr, uint64_t value);
+
+// Subtract `minus` from `bitarr` at `pos` -- same as:
+//   bitarr - (minus << pos)
+// Returns 1 on success, 0 if value > bitarr
+char bit_array_sub_word(BIT_ARRAY *bitarr, bit_index_t pos, word_t minus);
+
+// Subtract the bit array `minus` from `bitarr` at `pos`
+// Returns 1 on success, 0 if value > bitarr
+char bit_array_sub_words(BIT_ARRAY* bitarr, bit_index_t pos, BIT_ARRAY* minus);
+
+// Multiply by some value
+void bit_array_mul_uint64(BIT_ARRAY *bitarr, uint64_t multiplier);
+
+// bitarr = round_down(bitarr / divisor)
+// rem = bitarr % divisor
+void bit_array_div_uint64(BIT_ARRAY *bitarr, uint64_t divisor, uint64_t *rem);
+
+//
+// Arithmetic between arrays
+//
+
+// dst = src1 + src2
+// src1, src2 and dst can all be the same BIT_ARRAY
+// If dst is shorter than either of src1, src2, it is enlarged
+void bit_array_add(BIT_ARRAY* dst, const BIT_ARRAY* src1, const BIT_ARRAY* src2);
+
+// dst = src1 - src2
+// src1, src2 and dst can all be the same BIT_ARRAY
+// If dst is shorter than src1, it will be extended to be as long as src1
+// src1 must be greater than or equal to src2 (src1 >= src2)
+void bit_array_subtract(BIT_ARRAY* dst,
+                        const BIT_ARRAY* src1, const BIT_ARRAY* src2);
+
+// dst = src1 * src2
+// Pointers cannot all point to the same BIT_ARRAY
+void bit_array_multiply(BIT_ARRAY *dst, BIT_ARRAY *src1, BIT_ARRAY *src2);
+
+// Results in:
+//   quotient = dividend / divisor
+//   dividend = dividend % divisor
+// (dividend is used to return the remainder)
+void bit_array_divide(BIT_ARRAY *dividend, BIT_ARRAY *quotient, BIT_ARRAY *divisor);
+
+//
+// Read/Write bit_array to a file
+//
+// File format is [8 bytes: for number of elements in array][data]
+// Number of bytes of data is: (int)((num_of_bits + 7) / 8)
+//
+
+// Saves bit array to a file
+// returns the number of bytes written
+bit_index_t bit_array_save(const BIT_ARRAY* bitarr, FILE* f);
+
+// Reads bit array from a file. bitarr is resized and filled.
+// Returns 1 on success, 0 on failure
+char bit_array_load(BIT_ARRAY* bitarr, FILE* f);
+
+
+//
+// Hash function
+//
+
+// Pass seed as 0 on first call, pass previous hash value if rehashing due
+// to a collision
+// Using bob jenkins hash lookup3
+uint64_t bit_array_hash(const BIT_ARRAY* bitarr, uint64_t seed);
+
+//
+// Randomness
+//
+
+// Set bits randomly with probability prob : 0 <= prob <= 1
+void bit_array_random(BIT_ARRAY* bitarr, float prob);
+
+// Shuffle the bits in an array randomly
+void bit_array_shuffle(BIT_ARRAY* bitarr);
+
+// Get the next permutation of an array with a fixed size and given number of
+// bits set.  Also known as next lexicographic permutation.
+// Given a bit array find the next lexicographic organisation of the bits
+// Number of possible combinations given by (size choose bits_set) i.e. nCk
+// 00011 -> 00101 -> 00110 -> 01001 -> 01010 ->
+// 01100 -> 10001 -> 10010 -> 10100 -> 11000 -> 00011 (back to start)
+void bit_array_next_permutation(BIT_ARRAY* bitarr);
+
+//
+// Generally useful functions
+//
+
+// Generalised 'binary to string' function
+// Adds bits to the string in order of lsb to msb
+// e.g. 0b11010 (26 in decimal) would come out as "01011"
+char* bit_array_word2str(const void *ptr, size_t num_of_bits, char *str);
+
+// Same as above but in reverse
+char* bit_array_word2str_rev(const void *ptr, size_t num_of_bits, char *str);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/bit_macros.h b/include/bit_macros.h
new file mode 100644
index 0000000..7eebfcb
--- /dev/null
+++ b/include/bit_macros.h
@@ -0,0 +1,205 @@
+/*
+ bit_macros.h
+ project: bit array C library
+ url: https://github.com/noporpoise/BitArray/
+ author: Isaac Turner <turner.isaac at gmail.com>
+ license: Public Domain, no warranty
+ date: Dec 2013
+*/
+
+#ifndef BITSET_H_
+#define BITSET_H_
+
+#include <inttypes.h>
+#include <sched.h>
+
+// trailing_zeros is number of least significant zeros
+// leading_zeros is number of most significant zeros
+#if defined(_WIN32)
+  #define trailing_zeros(x) ({ unsigned long _r; _BitScanForward64(&_r, (x)) ? (__typeof(x))_r : (__typeof(x))sizeof(x)*8; }) /* index of lowest set bit */
+  #define leading_zeros(x) ({ unsigned long _r; _BitScanReverse64(&_r, (x)) ? (__typeof(x))(63 - _r) : (__typeof(x))sizeof(x)*8; }) /* 63 - index of highest set bit, matching clzll */
+#else
+  #define trailing_zeros(x) ((x) ? (__typeof(x))__builtin_ctzll(x) : (__typeof(x))sizeof(x)*8)
+  #define leading_zeros(x) ((x) ? (__typeof(x))__builtin_clzll(x) : (__typeof(x))sizeof(x)*8)
+#endif
+
+// Get index of top set bit. If x is 0 return nbits
+#define top_set_bit(x) ((x) ? sizeof(x)*8-leading_zeros(x)-1 : sizeof(x)*8)
+
+#define roundup_bits2bytes(bits)   (((bits)+7)/8)
+#define roundup_bits2words32(bits) (((bits)+31)/32)
+#define roundup_bits2words64(bits) (((bits)+63)/64)
+
+// Round a number up to the nearest number that is a power of two
+#define roundup2pow(x) (1UL << (64 - leading_zeros(x)))
+
+#define rot32(x,r) (((x)<<(r)) | ((x)>>(32-(r))))
+#define rot64(x,r) (((x)<<(r)) | ((x)>>(64-(r))))
+
+// need to check for length == 0, undefined behaviour if uint64_t >> 64 etc
+#define bitmask(nbits,type) ((nbits) ? ~(type)0 >> (sizeof(type)*8-(nbits)): (type)0)
+#define bitmask32(nbits) bitmask(nbits,uint32_t)
+#define bitmask64(nbits) bitmask(nbits,uint64_t)
+
+// A possibly faster way to combine two words with a mask
+//#define bitmask_merge(a,b,abits) ((a & abits) | (b & ~abits))
+#define bitmask_merge(a,b,abits) (b ^ ((a ^ b) & abits))
+
+// Swap lowest four bits. A nibble is 4 bits (i.e. half a byte)
+#define rev_nibble(x) ((((x)&1)<<3)|(((x)&2)<<1)|(((x)&4)>>1)|(((x)&8)>>3))
+
+//
+// Bit array (bitset)
+//
+// bitsetX_wrd(): get word for a given position
+// bitsetX_idx(): get index within word for a given position
+#define _VOLPTR(x) ((volatile __typeof(x) *)(&(x)))
+#define _VOLVALUE(x) (*_VOLPTR(x))
+
+#define _TYPESHIFT(arr,word,shift) \
+        ((__typeof(*(arr)))((__typeof(*(arr)))(word) << (shift)))
+
+#define bitsetX_wrd(wrdbits,pos) ((pos) / (wrdbits))
+#define bitsetX_idx(wrdbits,pos) ((pos) % (wrdbits))
+
+#define bitset32_wrd(pos) ((pos) >> 5)
+#define bitset32_idx(pos) ((pos) & 31)
+
+#define bitset64_wrd(pos) ((pos) >> 6)
+#define bitset64_idx(pos) ((pos) & 63)
+
+//
+// Bit functions on arrays
+//
+#define bitset2_get(arr,wrd,idx)     (((arr)[wrd] >> (idx)) & 0x1)
+#define bitset2_set(arr,wrd,idx)     ((arr)[wrd] |=  _TYPESHIFT(arr,1,idx))
+#define bitset2_del(arr,wrd,idx)     ((arr)[wrd] &=~ _TYPESHIFT(arr,1,idx))
+#define bitset2_tgl(arr,wrd,idx)     ((arr)[wrd] ^=  _TYPESHIFT(arr,1,idx))
+#define bitset2_or(arr,wrd,idx,bit)  ((arr)[wrd] |=  _TYPESHIFT(arr,bit,idx))
+#define bitset2_xor(arr,wrd,idx,bit) ((arr)[wrd]  = ~((arr)[wrd] ^ (~_TYPESHIFT(arr,bit,idx))))
+#define bitset2_and(arr,wrd,idx,bit) ((arr)[wrd] &= (_TYPESHIFT(arr,bit,idx) | ~_TYPESHIFT(arr,1,idx)))
+#define bitset2_cpy(arr,wrd,idx,bit) ((arr)[wrd]  = ((arr)[wrd] &~ _TYPESHIFT(arr,1,idx)) | _TYPESHIFT(arr,bit,idx))
+
+//
+// Thread safe versions
+//
+// They return the value of the bit (0 or 1) before it was updated
+#define bitset2_get_mt(arr,wrd,idx)     bitset2_get(_VOLPTR(*(arr)),wrd,idx)
+#define bitset2_set_mt(arr,wrd,idx)     ((__sync_fetch_and_or (_VOLPTR((arr)[wrd]),  _TYPESHIFT(arr,1,idx)) >> (idx))&1)
+#define bitset2_del_mt(arr,wrd,idx)     ((__sync_fetch_and_and(_VOLPTR((arr)[wrd]), ~_TYPESHIFT(arr,1,idx)) >> (idx))&1)
+#define bitset2_tgl_mt(arr,wrd,idx)     ((__sync_fetch_and_xor(_VOLPTR((arr)[wrd]),  _TYPESHIFT(arr,1,idx)) >> (idx))&1)
+#define bitset2_or_mt(arr,wrd,idx,bit)  ((__sync_fetch_and_or (_VOLPTR((arr)[wrd]),  _TYPESHIFT(arr,bit,idx)) >> (idx))&1)
+#define bitset2_xor_mt(arr,wrd,idx,bit) ((__sync_fetch_and_xor(_VOLPTR((arr)[wrd]),  _TYPESHIFT(arr,bit,idx)) >> (idx))&1)
+#define bitset2_and_mt(arr,wrd,idx,bit) ((__sync_fetch_and_and(_VOLPTR((arr)[wrd]), (_TYPESHIFT(arr,bit,idx) | ~_TYPESHIFT(arr,1,idx))) >> (idx))&1)
+#define bitset2_cpy_mt(arr,wrd,idx,bit) ((bit) ? bitset2_set_mt(arr,wrd,idx) : bitset2_del_mt(arr,wrd,idx))
+
+//
+// Auto detect size of type from pointer
+//
+#define bitset_wrd(arr,pos) bitsetX_wrd(sizeof(*(arr))*8,pos)
+#define bitset_idx(arr,pos) bitsetX_idx(sizeof(*(arr))*8,pos)
+#define bitset_op(func,arr,pos)      func(arr, bitset_wrd(arr,pos), bitset_idx(arr,pos))
+#define bitset_op2(func,arr,pos,bit) func(arr, bitset_wrd(arr,pos), bitset_idx(arr,pos), bit)
+
+// Auto-detect type size: bit functions
+#define bitset_get(arr,pos)     bitset_op(bitset2_get, arr, pos)
+#define bitset_set(arr,pos)     bitset_op(bitset2_set, arr, pos)
+#define bitset_del(arr,pos)     bitset_op(bitset2_del, arr, pos)
+#define bitset_tgl(arr,pos)     bitset_op(bitset2_tgl, arr, pos)
+#define bitset_or(arr,pos,bit)  bitset_op2(bitset2_or, arr, pos, bit)
+#define bitset_xor(arr,pos,bit) bitset_op2(bitset2_xor, arr, pos, bit)
+#define bitset_and(arr,pos,bit) bitset_op2(bitset2_and, arr, pos, bit)
+#define bitset_cpy(arr,pos,bit) bitset_op2(bitset2_cpy, arr, pos, bit)
+
+// Auto-detect type size: thread safe bit functions
+// They return the value of the bit (0 or 1) before it was updated
+#define bitset_get_mt(arr,pos)     bitset_op(bitset2_get_mt,  arr, pos)
+#define bitset_set_mt(arr,pos)     bitset_op(bitset2_set_mt,  arr, pos)
+#define bitset_del_mt(arr,pos)     bitset_op(bitset2_del_mt,  arr, pos)
+#define bitset_tgl_mt(arr,pos)     bitset_op(bitset2_tgl_mt,  arr, pos)
+#define bitset_or_mt(arr,pos,bit)  bitset_op2(bitset2_or_mt,  arr, pos, bit)
+#define bitset_xor_mt(arr,pos,bit) bitset_op2(bitset2_xor_mt, arr, pos, bit)
+#define bitset_and_mt(arr,pos,bit) bitset_op2(bitset2_and_mt, arr, pos, bit)
+#define bitset_cpy_mt(arr,pos,bit) bitset_op2(bitset2_cpy_mt, arr, pos, bit)
+
+// Clearing a word does not return a meaningful value
+#define bitset_clear_word(arr,pos) ((arr)[bitset_wrd(arr,pos)] = 0)
+#define bitset_clear_word_mt(arr,pos) (_VOLVALUE((arr)[bitset_wrd(arr,pos)]) = 0)
+
+//
+// Compact bit array of spin locks
+// These are most effecient when arr is of type: volatile char*
+//
+// Acquire a lock
+#define bitlock_acquire_block(arr,pos,wait,abandon) do {                       \
+  size_t _w = bitset_wrd(arr,pos);                                             \
+  __typeof(*(arr)) _o, _n, _b = _TYPESHIFT(arr, 1, bitset_idx(arr,pos));       \
+  do {                                                                         \
+    while((_o = _VOLVALUE((arr)[_w])) & _b) { wait }                           \
+    abandon                                                                    \
+    _n = _o | _b;                                                              \
+  } while(!__sync_bool_compare_and_swap(_VOLPTR((arr)[_w]), _o, _n));          \
+  __sync_synchronize(); /* Must not move commands to before acquiring lock */  \
+} while(0)
+
+// Undefined behaviour if you do not already hold the lock
+#define bitlock_release(arr,pos) do {                                          \
+  size_t _w = bitset_wrd(arr,pos);                                             \
+  __typeof(*(arr)) _mask = ~_TYPESHIFT(arr, 1, bitset_idx(arr,pos));           \
+  __sync_synchronize(); /* Must get the lock before releasing it */            \
+  __sync_and_and_fetch(_VOLPTR((arr)[_w]), _mask);                             \
+} while(0)
+
+#define bitlock_acquire(arr,pos) bitlock_acquire_block(arr,pos,{},{})
+
+// calls yield if cannot acquire the lock
+#define bitlock_yield_acquire(arr,pos) bitlock_acquire_block(arr,pos,sched_yield();,{})
+
+// Block until we get the lock or someone else does
+// sets the memory pointed to by retptr to 1 if we got the lock, 0 otherwise
+#define bitlock_try_acquire(arr,pos,retptr) do {                               \
+  *retptr = 1; /* default to success, set to zero if locked */                 \
+  bitlock_acquire_block(arr,pos,{*retptr=0;break;},if(!*retptr){break;});      \
+} while(0)
+
+/*
+ * Byteswapping
+ */
+
+/* clang uses these to check for features */
+#ifndef __has_feature
+#define __has_feature(x) 0
+#endif
+
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
+/* GCC versions < 4.3 do not have __builtin_bswapX() */
+#if ( defined(__clang__) && !__has_builtin(__builtin_bswap64) ) ||             \
+    ( !defined(__clang__) && defined(__GNUC__) && defined(__GNUC_MINOR__) &&   \
+      ( (__GNUC__ < 4)  || (__GNUC__ == 4 && __GNUC_MINOR__ < 3)) )
+  #define byteswap64(x) ( (((uint64_t)(x) << 56))                       | \
+                          (((uint64_t)(x) << 40) & 0xff000000000000ULL) | \
+                          (((uint64_t)(x) << 24) & 0xff0000000000ULL)   | \
+                          (((uint64_t)(x) <<  8) & 0xff00000000ULL)     | \
+                          (((uint64_t)(x) >>  8) & 0xff000000ULL)       | \
+                          (((uint64_t)(x) >> 24) & 0xff0000ULL)         | \
+                          (((uint64_t)(x) >> 40) & 0xff00ULL)           | \
+                          (((uint64_t)(x) >> 56)) )
+
+  #define byteswap32(x) ( (((uint32_t)(x) << 24))                       | \
+                          (((uint32_t)(x) <<  8) & 0xff0000U)           | \
+                          (((uint32_t)(x) >>  8) & 0xff00U)             | \
+                          (((uint32_t)(x) >> 24)) )
+
+  /* uint16_t type might be bigger than 2 bytes, so need to mask */
+  #define byteswap16(x) ( (((uint16_t)(x) & 0xff) << 8) | \
+                          (((uint16_t)(x) >> 8) & 0xff) )
+#else
+  #define byteswap64(x) __builtin_bswap64(x)
+  #define byteswap32(x) __builtin_bswap32(x)
+  #define byteswap16(x) ((uint16_t)(__builtin_bswap32((uint32_t)(x)) >> 16)) /* __builtin_bswap16 needs GCC >= 4.8 */
+#endif
+
+#endif /* BITLOCK_H_ */
diff --git a/include/btree/btree.h b/include/btree/btree.h
new file mode 100644
index 0000000..49310a2
--- /dev/null
+++ b/include/btree/btree.h
@@ -0,0 +1,2394 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// A btree implementation of the STL set and map interfaces. A btree is both
+// smaller and faster than STL set/map. The red-black tree implementation of
+// STL set/map has an overhead of 3 pointers (left, right and parent) plus the
+// node color information for each stored value. So a set<int32> consumes 20
+// bytes for each value stored. This btree implementation stores multiple
+// values on fixed size nodes (usually 256 bytes) and doesn't store child
+// pointers for leaf nodes. The result is that a btree_set<int32> may use much
+// less memory per stored value. For the random insertion benchmark in
+// btree_test.cc, a btree_set<int32> with node-size of 256 uses 4.9 bytes per
+// stored value.
+//
+// The packing of multiple values on to each node of a btree has another effect
+// besides better space utilization: better cache locality due to fewer cache
+// lines being accessed. Better cache locality translates into faster
+// operations.
+//
+// CAVEATS
+//
+// Insertions and deletions on a btree can cause splitting, merging or
+// rebalancing of btree nodes. And even without these operations, insertions
+// and deletions on a btree will move values around within a node. In both
+// cases, the result is that insertions and deletions can invalidate iterators
+// pointing to values other than the one being inserted/deleted. This is
+// notably different from STL set/map which takes care to not invalidate
+// iterators on insert/erase except, of course, for iterators pointing to the
+// value being erased.  A partial workaround when erasing is available:
+// erase() returns an iterator pointing to the item just after the one that was
+// erased (or end() if none exists).  See also safe_btree.
+
+// PERFORMANCE
+//
+//   btree_bench --benchmarks=. 2>&1 | ./benchmarks.awk
+//
+// Run on pmattis-warp.nyc (4 X 2200 MHz CPUs); 2010/03/04-15:23:06
+// Benchmark                 STL(ns) B-Tree(ns) @    <size>
+// --------------------------------------------------------
+// BM_set_int32_insert        1516      608  +59.89%  <256>    [40.0,  5.2]
+// BM_set_int32_lookup        1160      414  +64.31%  <256>    [40.0,  5.2]
+// BM_set_int32_fulllookup     960      410  +57.29%  <256>    [40.0,  4.4]
+// BM_set_int32_delete        1741      528  +69.67%  <256>    [40.0,  5.2]
+// BM_set_int32_queueaddrem   3078     1046  +66.02%  <256>    [40.0,  5.5]
+// BM_set_int32_mixedaddrem   3600     1384  +61.56%  <256>    [40.0,  5.3]
+// BM_set_int32_fifo           227      113  +50.22%  <256>    [40.0,  4.4]
+// BM_set_int32_fwditer        158       26  +83.54%  <256>    [40.0,  5.2]
+// BM_map_int32_insert        1551      636  +58.99%  <256>    [48.0, 10.5]
+// BM_map_int32_lookup        1200      508  +57.67%  <256>    [48.0, 10.5]
+// BM_map_int32_fulllookup     989      487  +50.76%  <256>    [48.0,  8.8]
+// BM_map_int32_delete        1794      628  +64.99%  <256>    [48.0, 10.5]
+// BM_map_int32_queueaddrem   3189     1266  +60.30%  <256>    [48.0, 11.6]
+// BM_map_int32_mixedaddrem   3822     1623  +57.54%  <256>    [48.0, 10.9]
+// BM_map_int32_fifo           151      134  +11.26%  <256>    [48.0,  8.8]
+// BM_map_int32_fwditer        161       32  +80.12%  <256>    [48.0, 10.5]
+// BM_set_int64_insert        1546      636  +58.86%  <256>    [40.0, 10.5]
+// BM_set_int64_lookup        1200      512  +57.33%  <256>    [40.0, 10.5]
+// BM_set_int64_fulllookup     971      487  +49.85%  <256>    [40.0,  8.8]
+// BM_set_int64_delete        1745      616  +64.70%  <256>    [40.0, 10.5]
+// BM_set_int64_queueaddrem   3163     1195  +62.22%  <256>    [40.0, 11.6]
+// BM_set_int64_mixedaddrem   3760     1564  +58.40%  <256>    [40.0, 10.9]
+// BM_set_int64_fifo           146      103  +29.45%  <256>    [40.0,  8.8]
+// BM_set_int64_fwditer        162       31  +80.86%  <256>    [40.0, 10.5]
+// BM_map_int64_insert        1551      720  +53.58%  <256>    [48.0, 20.7]
+// BM_map_int64_lookup        1214      612  +49.59%  <256>    [48.0, 20.7]
+// BM_map_int64_fulllookup     994      592  +40.44%  <256>    [48.0, 17.2]
+// BM_map_int64_delete        1778      764  +57.03%  <256>    [48.0, 20.7]
+// BM_map_int64_queueaddrem   3189     1547  +51.49%  <256>    [48.0, 20.9]
+// BM_map_int64_mixedaddrem   3779     1887  +50.07%  <256>    [48.0, 21.6]
+// BM_map_int64_fifo           147      145   +1.36%  <256>    [48.0, 17.2]
+// BM_map_int64_fwditer        162       41  +74.69%  <256>    [48.0, 20.7]
+// BM_set_string_insert       1989     1966   +1.16%  <256>    [64.0, 44.5]
+// BM_set_string_lookup       1709     1600   +6.38%  <256>    [64.0, 44.5]
+// BM_set_string_fulllookup   1573     1529   +2.80%  <256>    [64.0, 35.4]
+// BM_set_string_delete       2520     1920  +23.81%  <256>    [64.0, 44.5]
+// BM_set_string_queueaddrem  4706     4309   +8.44%  <256>    [64.0, 48.3]
+// BM_set_string_mixedaddrem  5080     4654   +8.39%  <256>    [64.0, 46.7]
+// BM_set_string_fifo          318      512  -61.01%  <256>    [64.0, 35.4]
+// BM_set_string_fwditer       182       93  +48.90%  <256>    [64.0, 44.5]
+// BM_map_string_insert       2600     2227  +14.35%  <256>    [72.0, 55.8]
+// BM_map_string_lookup       2068     1730  +16.34%  <256>    [72.0, 55.8]
+// BM_map_string_fulllookup   1859     1618  +12.96%  <256>    [72.0, 44.0]
+// BM_map_string_delete       3168     2080  +34.34%  <256>    [72.0, 55.8]
+// BM_map_string_queueaddrem  5840     4701  +19.50%  <256>    [72.0, 59.4]
+// BM_map_string_mixedaddrem  6400     5200  +18.75%  <256>    [72.0, 57.8]
+// BM_map_string_fifo          398      596  -49.75%  <256>    [72.0, 44.0]
+// BM_map_string_fwditer       243      113  +53.50%  <256>    [72.0, 55.8]
+
+#ifndef UTIL_BTREE_BTREE_H__
+#define UTIL_BTREE_BTREE_H__
+
+#include <assert.h>
+#include <stddef.h>
+#include <string.h>
+#include <sys/types.h>
+#include <algorithm>
+#include <functional>
+#include <iostream>
+#include <iterator>
+#include <limits>
+#include <type_traits>
+#include <new>
+#include <ostream>
+#include <string>
+#include <utility>
+
+#ifndef NDEBUG
+#define NDEBUG 1
+#endif
+
+namespace btree {
+
+// Inside a btree method, if we just call swap(), it will choose the
+// btree::swap method, which we don't want. And we can't say ::swap
+// because then MSVC won't pickup any std::swap() implementations. We
+// can't just use std::swap() directly because then we don't get the
+// specialization for types outside the std namespace. So the solution
+// is to have a special swap helper function whose name doesn't
+// collide with other swap functions defined by the btree classes.
+template <typename T>
+inline void btree_swap_helper(T &a, T &b) {
+  using std::swap;
+  swap(a, b);
+}
+
+// A template helper used to select A or B based on a condition.
+template<bool cond, typename A, typename B>
+struct if_{
+  typedef A type;
+};
+
+template<typename A, typename B>
+struct if_<false, A, B> {
+  typedef B type;
+};
+
+// Types small_ and big_ are promise that sizeof(small_) < sizeof(big_)
+typedef char small_;
+
+struct big_ {
+  char dummy[2];
+};
+
+// A compile-time assertion.
+template <bool>
+struct CompileAssert {
+};
+
+#define COMPILE_ASSERT(expr, msg) \
+  typedef CompileAssert<(bool(expr))> msg[bool(expr) ? 1 : -1]
+
+// A helper type used to indicate that a key-compare-to functor has been
+// provided. A user can specify a key-compare-to functor by doing:
+//
+//  struct MyStringComparer
+//      : public util::btree::btree_key_compare_to_tag {
+//    int operator()(const string &a, const string &b) const {
+//      return a.compare(b);
+//    }
+//  };
+//
+// Note that the return type is an int and not a bool. There is a
+// COMPILE_ASSERT which enforces this return type.
+struct btree_key_compare_to_tag {
+};
+
+// A helper class that indicates if the Compare parameter is derived from
+// btree_key_compare_to_tag.
+template <typename Compare>
+struct btree_is_key_compare_to
+    : public std::is_convertible<Compare, btree_key_compare_to_tag> {
+};
+
+// A helper class to convert a boolean comparison into a three-way
+// "compare-to" comparison that returns a negative value to indicate
+// less-than, zero to indicate equality and a positive value to
+// indicate greater-than. This helper class is specialized for
+// less<string> and greater<string>. The btree_key_compare_to_adapter
+// class is provided so that btree users automatically get the more
+// efficient compare-to code when using common google string types
+// with common comparison functors.
+template <typename Compare>
+struct btree_key_compare_to_adapter : Compare {
+  btree_key_compare_to_adapter() { }
+  btree_key_compare_to_adapter(const Compare &c) : Compare(c) { }
+  btree_key_compare_to_adapter(const btree_key_compare_to_adapter<Compare> &c)
+      : Compare(c) {
+  }
+};
+
+template <>
+struct btree_key_compare_to_adapter<std::less<std::string> >
+    : public btree_key_compare_to_tag {
+  btree_key_compare_to_adapter() {}
+  btree_key_compare_to_adapter(const std::less<std::string>&) {}
+  btree_key_compare_to_adapter(
+      const btree_key_compare_to_adapter<std::less<std::string> >&) {}
+  int operator()(const std::string &a, const std::string &b) const {
+    return a.compare(b);
+  }
+};
+
+template <>
+struct btree_key_compare_to_adapter<std::greater<std::string> >
+    : public btree_key_compare_to_tag {
+  btree_key_compare_to_adapter() {}
+  btree_key_compare_to_adapter(const std::greater<std::string>&) {}
+  btree_key_compare_to_adapter(
+      const btree_key_compare_to_adapter<std::greater<std::string> >&) {}
+  int operator()(const std::string &a, const std::string &b) const {
+    return b.compare(a);
+  }
+};
+
+// A helper class that allows a compare-to functor to behave like a plain
+// compare functor. This specialization is used when we do not have a
+// compare-to functor.
+template <typename Key, typename Compare, bool HaveCompareTo>
+struct btree_key_comparer {
+  btree_key_comparer() {}
+  btree_key_comparer(Compare c) : comp(c) {}
+  static bool bool_compare(const Compare &comp, const Key &x, const Key &y) {
+    return comp(x, y);
+  }
+  bool operator()(const Key &x, const Key &y) const {
+    return bool_compare(comp, x, y);
+  }
+  Compare comp;
+};
+
+// A specialization of btree_key_comparer when a compare-to functor is
+// present. We need a plain (boolean) comparison in some parts of the btree
+// code, such as insert-with-hint.
+template <typename Key, typename Compare>
+struct btree_key_comparer<Key, Compare, true> {
+  btree_key_comparer() {}
+  btree_key_comparer(Compare c) : comp(c) {}
+  static bool bool_compare(const Compare &comp, const Key &x, const Key &y) {
+    return comp(x, y) < 0;
+  }
+  bool operator()(const Key &x, const Key &y) const {
+    return bool_compare(comp, x, y);
+  }
+  Compare comp;
+};
+
+// A helper function to compare to keys using the specified compare
+// functor. This dispatches to the appropriate btree_key_comparer comparison,
+// depending on whether we have a compare-to functor or not (which depends on
+// whether Compare is derived from btree_key_compare_to_tag).
+template <typename Key, typename Compare>
+static bool btree_compare_keys(
+    const Compare &comp, const Key &x, const Key &y) {
+  typedef btree_key_comparer<Key, Compare,
+      btree_is_key_compare_to<Compare>::value> key_comparer;
+  return key_comparer::bool_compare(comp, x, y);
+}
+
+template <typename Key, typename Compare,
+          typename Alloc, int TargetNodeSize, int ValueSize>
+struct btree_common_params {
+  // If Compare is derived from btree_key_compare_to_tag then use it as the
+  // key_compare type. Otherwise, use btree_key_compare_to_adapter<> which will
+  // fall-back to Compare if we don't have an appropriate specialization.
+  typedef typename if_<
+    btree_is_key_compare_to<Compare>::value,
+    Compare, btree_key_compare_to_adapter<Compare> >::type key_compare;
+  // A type which indicates if we have a key-compare-to functor or a plain old
+  // key-compare functor.
+  typedef btree_is_key_compare_to<key_compare> is_key_compare_to;
+
+  typedef Alloc allocator_type;
+  typedef Key key_type;
+  typedef ssize_t size_type;
+  typedef ptrdiff_t difference_type;
+
+  enum {
+    kTargetNodeSize = TargetNodeSize,
+
+    // Available space for values.  This is largest for leaf nodes,
+    // which has overhead no fewer than two pointers.
+    kNodeValueSpace = TargetNodeSize - 2 * sizeof(void*),
+  };
+
+  // This is an integral type large enough to hold as many
+  // ValueSize-values as will fit a node of TargetNodeSize bytes.
+  typedef typename if_<
+    (kNodeValueSpace / ValueSize) >= 256,
+    uint16_t,
+    uint8_t>::type node_count_type;
+};
+
+// A parameters structure for holding the type parameters for a btree_map.
+template <typename Key, typename Data, typename Compare,
+          typename Alloc, int TargetNodeSize>
+struct btree_map_params
+    : public btree_common_params<Key, Compare, Alloc, TargetNodeSize,
+                                 sizeof(Key) + sizeof(Data)> {
+  typedef Data data_type;
+  typedef Data mapped_type;
+  typedef std::pair<const Key, data_type> value_type;
+  typedef std::pair<Key, data_type> mutable_value_type;
+  typedef value_type* pointer;
+  typedef const value_type* const_pointer;
+  typedef value_type& reference;
+  typedef const value_type& const_reference;
+
+  enum {
+    kValueSize = sizeof(Key) + sizeof(data_type),
+  };
+
+  static const Key& key(const value_type &x) { return x.first; }
+  static const Key& key(const mutable_value_type &x) { return x.first; }
+  static void swap(mutable_value_type *a, mutable_value_type *b) {
+    btree_swap_helper(a->first, b->first);
+    btree_swap_helper(a->second, b->second);
+  }
+};
+
+// A parameters structure for holding the type parameters for a btree_set.
+template <typename Key, typename Compare, typename Alloc, int TargetNodeSize>
+struct btree_set_params
+    : public btree_common_params<Key, Compare, Alloc, TargetNodeSize,
+                                 sizeof(Key)> {
+  typedef std::false_type data_type;
+  typedef std::false_type mapped_type;
+  typedef Key value_type;
+  typedef value_type mutable_value_type;
+  typedef value_type* pointer;
+  typedef const value_type* const_pointer;
+  typedef value_type& reference;
+  typedef const value_type& const_reference;
+
+  enum {
+    kValueSize = sizeof(Key),
+  };
+
+  static const Key& key(const value_type &x) { return x; }
+  static void swap(mutable_value_type *a, mutable_value_type *b) {
+    btree_swap_helper<mutable_value_type>(*a, *b);
+  }
+};
+
+// An adapter class that converts a lower-bound compare into an upper-bound
+// compare.
+template <typename Key, typename Compare>
+struct btree_upper_bound_adapter : public Compare {
+  btree_upper_bound_adapter(Compare c) : Compare(c) {}
+  bool operator()(const Key &a, const Key &b) const {
+    return !static_cast<const Compare&>(*this)(b, a);
+  }
+};
+
+template <typename Key, typename CompareTo>
+struct btree_upper_bound_compare_to_adapter : public CompareTo {
+  btree_upper_bound_compare_to_adapter(CompareTo c) : CompareTo(c) {}
+  int operator()(const Key &a, const Key &b) const {
+    return static_cast<const CompareTo&>(*this)(b, a);
+  }
+};
+
+// Dispatch helper class for using linear search with plain compare.
+template <typename K, typename N, typename Compare>
+struct btree_linear_search_plain_compare {
+  static int lower_bound(const K &k, const N &n, Compare comp)  {
+    return n.linear_search_plain_compare(k, 0, n.count(), comp);
+  }
+  static int upper_bound(const K &k, const N &n, Compare comp)  {
+    typedef btree_upper_bound_adapter<K, Compare> upper_compare;
+    return n.linear_search_plain_compare(k, 0, n.count(), upper_compare(comp));
+  }
+};
+
+// Dispatch helper class for using linear search with compare-to
+template <typename K, typename N, typename CompareTo>
+struct btree_linear_search_compare_to {
+  static int lower_bound(const K &k, const N &n, CompareTo comp)  {
+    return n.linear_search_compare_to(k, 0, n.count(), comp);
+  }
+  static int upper_bound(const K &k, const N &n, CompareTo comp)  {
+    typedef btree_upper_bound_adapter<K,
+        btree_key_comparer<K, CompareTo, true> > upper_compare;
+    return n.linear_search_plain_compare(k, 0, n.count(), upper_compare(comp));
+  }
+};
+
+// Dispatch helper class for using binary search with plain compare.
+template <typename K, typename N, typename Compare>
+struct btree_binary_search_plain_compare {
+  static int lower_bound(const K &k, const N &n, Compare comp)  {
+    return n.binary_search_plain_compare(k, 0, n.count(), comp);
+  }
+  static int upper_bound(const K &k, const N &n, Compare comp)  {
+    typedef btree_upper_bound_adapter<K, Compare> upper_compare;
+    return n.binary_search_plain_compare(k, 0, n.count(), upper_compare(comp));
+  }
+};
+
+// Dispatch helper for binary search with compare-to; both bounds use the caller's comp.
+template <typename K, typename N, typename CompareTo>
+struct btree_binary_search_compare_to {
+  static int lower_bound(const K &k, const N &n, CompareTo comp)  {
+    return n.binary_search_compare_to(k, 0, n.count(), comp);
+  }
+  static int upper_bound(const K &k, const N &n, CompareTo comp)  {
+    typedef btree_upper_bound_adapter<K,
+        btree_key_comparer<K, CompareTo, true> > upper_compare;
+    return n.binary_search_plain_compare(k, 0, n.count(), upper_compare(comp));
+  }
+};
+
+// A node in the btree holding. The same node type is used for both internal
+// and leaf nodes in the btree, though the nodes are allocated in such a way
+// that the children array is only valid in internal nodes.
+template <typename Params>
+class btree_node {
+ public:
+  typedef Params params_type;
+  typedef btree_node<Params> self_type;
+  typedef typename Params::key_type key_type;
+  typedef typename Params::data_type data_type;
+  typedef typename Params::value_type value_type;
+  typedef typename Params::mutable_value_type mutable_value_type;
+  typedef typename Params::pointer pointer;
+  typedef typename Params::const_pointer const_pointer;
+  typedef typename Params::reference reference;
+  typedef typename Params::const_reference const_reference;
+  typedef typename Params::key_compare key_compare;
+  typedef typename Params::size_type size_type;
+  typedef typename Params::difference_type difference_type;
+  // Typedefs for the various types of node searches.
+  typedef btree_linear_search_plain_compare<
+    key_type, self_type, key_compare> linear_search_plain_compare_type;
+  typedef btree_linear_search_compare_to<
+    key_type, self_type, key_compare> linear_search_compare_to_type;
+  typedef btree_binary_search_plain_compare<
+    key_type, self_type, key_compare> binary_search_plain_compare_type;
+  typedef btree_binary_search_compare_to<
+    key_type, self_type, key_compare> binary_search_compare_to_type;
+  // If we have a valid key-compare-to type, use linear_search_compare_to,
+  // otherwise use linear_search_plain_compare.
+  typedef typename if_<
+    Params::is_key_compare_to::value,
+    linear_search_compare_to_type,
+    linear_search_plain_compare_type>::type linear_search_type;
+  // If we have a valid key-compare-to type, use binary_search_compare_to,
+  // otherwise use binary_search_plain_compare.
+  typedef typename if_<
+    Params::is_key_compare_to::value,
+    binary_search_compare_to_type,
+    binary_search_plain_compare_type>::type binary_search_type;
+  // If the key is an integral or floating point type, use linear search which
+  // is faster than binary search for such types. Might be wise to also
+  // configure linear search based on node-size.
+  typedef typename if_<
+    std::is_integral<key_type>::value ||
+    std::is_floating_point<key_type>::value,
+    linear_search_type, binary_search_type>::type search_type;
+
+  struct base_fields {
+    typedef typename Params::node_count_type field_type;
+
+    // A boolean indicating whether the node is a leaf or not.
+    bool leaf;
+    // The position of the node in the node's parent.
+    field_type position;
+    // The maximum number of values the node can hold.
+    field_type max_count;
+    // The count of the number of values in the node.
+    field_type count;
+    // A pointer to the node's parent.
+    btree_node *parent;
+  };
+
+  enum {
+    kValueSize = params_type::kValueSize,
+    kTargetNodeSize = params_type::kTargetNodeSize,
+
+    // Compute how many values we can fit onto a leaf node.
+    kNodeTargetValues = (kTargetNodeSize - sizeof(base_fields)) / kValueSize,
+    // We need a minimum of 3 values per internal node in order to perform
+    // splitting (1 value for the two nodes involved in the split and 1 value
+    // propagated to the parent as the delimiter for the split).
+    kNodeValues = kNodeTargetValues >= 3 ? kNodeTargetValues : 3,
+
+    kExactMatch = 1 << 30,
+    kMatchMask = kExactMatch - 1,
+  };
+
+  struct leaf_fields : public base_fields {
+    // The array of values. Only the first count of these values have been
+    // constructed and are valid.
+    mutable_value_type values[kNodeValues];
+  };
+
+  // Layout of an internal node: a leaf layout followed by the child pointers.
+  struct internal_fields : public leaf_fields {
+    // The array of child pointers. The keys in children[i] are all less than
+    // key(i). The keys in children[i + 1] are all greater than key(i). There
+    // are always count + 1 children.
+    btree_node *children[kNodeValues + 1];
+  };
+
+  // Layout of an internal root node: an internal layout plus the two
+  // tree-wide fields that are only meaningful on the root.
+  struct root_fields : public internal_fields {
+    // The rightmost node of the tree (see btree_node::rightmost()).
+    btree_node *rightmost;
+    // The tree size kept on the root (see btree_node::size()).
+    size_type size;
+  };
+
+ public:
+  // Getter for whether this is a leaf node or not. This value doesn't
+  // change after the node is created.
+  bool leaf() const { return fields_.leaf; }
+
+  // Getter/setter for the position of this node in its parent.
+  int position() const { return fields_.position; }
+  void set_position(int v) { fields_.position = v; }
+
+  // Getter/setter for the number of values stored in this node, and getter
+  // for the maximum number of values the node can hold.
+  int count() const { return fields_.count; }
+  void set_count(int v) { fields_.count = v; }
+  int max_count() const { return fields_.max_count; }
+
+  // Getter for the parent of this node.
+  btree_node* parent() const { return fields_.parent; }
+  // Getter for whether the node is the root of the tree. The parent of the
+  // root of the tree is the leftmost node in the tree which is guaranteed to
+  // be a leaf.
+  bool is_root() const { return parent()->leaf(); }
+  // Turns this node into the root. Requires that the current parent is the
+  // root; this node then takes over the old root's parent pointer so that
+  // is_root() holds for it under the convention described above.
+  void make_root() {
+    assert(parent()->is_root());
+    fields_.parent = fields_.parent->parent();
+  }
+
+  // Getter for the rightmost root node field. Only valid on the root node.
+  // mutable_rightmost() returns the address of that field so it can be
+  // updated in place.
+  btree_node* rightmost() const { return fields_.rightmost; }
+  btree_node** mutable_rightmost() { return &fields_.rightmost; }
+
+  // Getter for the size root node field. Only valid on the root node.
+  // mutable_size() returns the address of that field so it can be updated in
+  // place.
+  size_type size() const { return fields_.size; }
+  size_type* mutable_size() { return &fields_.size; }
+
+  // Getters for the key/value at position i in the node. key() extracts the
+  // key through params_type::key(). value() exposes the stored
+  // mutable_value_type slot through the public (const_)reference type via
+  // reinterpret_cast. mutable_value() returns the address of the raw slot.
+  const key_type& key(int i) const {
+    return params_type::key(fields_.values[i]);
+  }
+  reference value(int i) {
+    return reinterpret_cast<reference>(fields_.values[i]);
+  }
+  const_reference value(int i) const {
+    return reinterpret_cast<const_reference>(fields_.values[i]);
+  }
+  mutable_value_type* mutable_value(int i) {
+    return &fields_.values[i];
+  }
+
+  // Swap value i in this node with value j in node x. x may be this node
+  // itself (insert_value() and remove_value() use it that way to shift
+  // values). The swap is delegated to params_type::swap.
+  void value_swap(int i, btree_node *x, int j) {
+    params_type::swap(mutable_value(i), x->mutable_value(j));
+  }
+
+  // Child accessors for slot i. set_child() stores c in the slot and also
+  // re-parents it: c records this node as its parent and i as its position.
+  btree_node* child(int i) const { return fields_.children[i]; }
+  btree_node** mutable_child(int i) { return fields_.children + i; }
+  void set_child(int i, btree_node *c) {
+    fields_.children[i] = c;
+    c->fields_.position = i;
+    c->fields_.parent = this;
+  }
+
+  // Returns the position of the first value whose key is not less than k.
+  // The work is forwarded to search_type, the linear or binary search
+  // strategy selected at compile time from the key type.
+  template <typename Compare>
+  int lower_bound(const key_type &k, const Compare &comp) const {
+    return search_type::lower_bound(k, *this, comp);
+  }
+  // Returns the position of the first value whose key is greater than k.
+  // Forwards to search_type in the same way as lower_bound().
+  template <typename Compare>
+  int upper_bound(const key_type &k, const Compare &comp) const {
+    return search_type::upper_bound(k, *this, comp);
+  }
+
+  // Linear scan over positions [s, e) with a plain (less-than style)
+  // comparator. Returns the first position whose key is not less than k, or e
+  // if every key in the range is less than k.
+  template <typename Compare>
+  int linear_search_plain_compare(
+      const key_type &k, int s, int e, const Compare &comp) const {
+    // Advance while key(s) still compares less than k.
+    while (s < e && btree_compare_keys(comp, key(s), k)) {
+      ++s;
+    }
+    return s;
+  }
+
+  // Linear scan over positions [s, e) with a three-way (compare-to)
+  // comparator. Returns the first position whose key is not less than k. If
+  // that key compares equal to k, kExactMatch is OR-ed into the result.
+  template <typename Compare>
+  int linear_search_compare_to(
+      const key_type &k, int s, int e, const Compare &comp) const {
+    for (; s < e; ++s) {
+      const int c = comp(key(s), k);
+      if (c == 0) {
+        return s | kExactMatch;
+      }
+      if (c > 0) {
+        // key(s) is already greater than k: stop here without the flag.
+        break;
+      }
+    }
+    return s;
+  }
+
+  // Binary search over positions [s, e) with a plain (less-than style)
+  // comparator. Returns the first position whose key is not less than k.
+  template <typename Compare>
+  int binary_search_plain_compare(
+      const key_type &k, int s, int e, const Compare &comp) const {
+    for (; s != e; ) {
+      const int mid = (s + e) / 2;
+      if (!btree_compare_keys(comp, key(mid), k)) {
+        // key(mid) >= k: the answer is at mid or to its left.
+        e = mid;
+        continue;
+      }
+      // key(mid) < k: the answer is strictly to the right of mid.
+      s = mid + 1;
+    }
+    return s;
+  }
+
+  // Binary search over positions [s, e) with a three-way (compare-to)
+  // comparator. Returns the first position whose key is not less than k. When
+  // an equal key is found, kExactMatch is OR-ed into the result.
+  template <typename CompareTo>
+  int binary_search_compare_to(
+      const key_type &k, int s, int e, const CompareTo &comp) const {
+    while (s != e) {
+      const int mid = (s + e) / 2;
+      const int c = comp(key(mid), k);
+      if (c == 0) {
+        // key(mid) matches, but an earlier equal key may exist. Keep
+        // searching [s, mid) for the first position whose key is not less
+        // than k. The final answer is always an exact match: if everything
+        // left of mid is less than k, the nested search returns mid itself.
+        return binary_search_compare_to(k, s, mid, comp) | kExactMatch;
+      }
+      if (c < 0) {
+        s = mid + 1;
+      } else {
+        e = mid;
+      }
+    }
+    return s;
+  }
+
+  // Inserts the value x at position i, shifting all existing values and
+  // children at positions >= i to the right by 1.
+  void insert_value(int i, const value_type &x);
+
+  // Removes the value at position i, shifting all existing values and children
+  // at positions > i to the left by 1.
+  void remove_value(int i);
+
+  // Rebalances a node with its right sibling. rebalance_right_to_left() is
+  // called on the left node and pulls to_move values over from the sibling on
+  // its right. rebalance_left_to_right() is presumably the mirror operation
+  // (confirm against its definition).
+  void rebalance_right_to_left(btree_node *sibling, int to_move);
+  void rebalance_left_to_right(btree_node *sibling, int to_move);
+
+  // Splits a node, moving a portion of the node's values to its right sibling.
+  void split(btree_node *sibling, int insert_position);
+
+  // Merges a node with its right sibling, moving all of the values and the
+  // delimiting key in the parent node onto itself.
+  void merge(btree_node *sibling);
+
+  // Swap the contents of "this" and "src".
+  void swap(btree_node *src);
+
+  // Node initialization routines. init_leaf() fills in the header of the raw
+  // storage f so that it describes an empty leaf with the given parent and
+  // capacity, and returns f viewed as a btree_node. No values are
+  // constructed.
+  static btree_node* init_leaf(
+      leaf_fields *f, btree_node *parent, int max_count) {
+    f->parent = parent;
+    f->leaf = true;
+    f->position = 0;
+    f->count = 0;
+    f->max_count = max_count;
+    if (!NDEBUG) {
+      // When NDEBUG evaluates to 0, zero the value slots.
+      memset(&f->values, 0, max_count * sizeof(value_type));
+    }
+    return reinterpret_cast<btree_node*>(f);
+  }
+  // Initializes the raw storage f as an empty internal node with capacity
+  // kNodeValues: set it up as a leaf first, then clear the leaf flag.
+  static btree_node* init_internal(internal_fields *f, btree_node *parent) {
+    btree_node *const node = init_leaf(f, parent, kNodeValues);
+    f->leaf = false;
+    if (!NDEBUG) {
+      // When NDEBUG evaluates to 0, zero the child pointers as well.
+      memset(f->children, 0, sizeof(f->children));
+    }
+    return node;
+  }
+  // Initializes the raw storage f as an internal root node. The root-only
+  // fields are seeded from parent: it becomes the rightmost node and its
+  // value count becomes the initial tree size.
+  static btree_node* init_root(root_fields *f, btree_node *parent) {
+    btree_node *const node = init_internal(f, parent);
+    f->size = parent->count();
+    f->rightmost = parent;
+    return node;
+  }
+  // Runs the destructor of every constructed value (positions [0, count())).
+  // Does not release the node's storage and does not touch the children.
+  void destroy() {
+    const int constructed = count();
+    for (int i = 0; i != constructed; ++i) {
+      fields_.values[i].~mutable_value_type();
+    }
+  }
+
+ private:
+  // Default-constructs the value in slot i with placement new.
+  void value_init(int i) {
+    new (&fields_.values[i]) mutable_value_type;
+  }
+  // Constructs the value in slot i from x with placement new.
+  void value_init(int i, const value_type &x) {
+    new (&fields_.values[i]) mutable_value_type(x);
+  }
+  // Explicitly runs the destructor of the value in slot i. The slot's storage
+  // stays part of the node.
+  void value_destroy(int i) {
+    fields_.values[i].~mutable_value_type();
+  }
+
+ private:
+  // The node's storage, declared with the largest layout (root_fields). The
+  // btree allocates leaf and internal nodes with the smaller leaf_fields /
+  // internal_fields sizes, so only the part matching the node's actual kind
+  // may be accessed.
+  root_fields fields_;
+
+ private:
+  // Copying is disabled: these are declared private and presumably never
+  // defined.
+  btree_node(const btree_node&);
+  void operator=(const btree_node&);
+};
+
+// Bidirectional iterator over the values of a btree. Node is the (possibly
+// const-qualified) node type; Reference and Pointer are the types produced by
+// operator* and operator->. An iterator is a (node, position) pair.
+template <typename Node, typename Reference, typename Pointer>
+struct btree_iterator {
+  typedef typename Node::key_type key_type;
+  typedef typename Node::size_type size_type;
+  typedef typename Node::difference_type difference_type;
+  typedef typename Node::params_type params_type;
+
+  typedef Node node_type;
+  typedef typename std::remove_const<Node>::type normal_node;
+  typedef const Node const_node;
+  typedef typename params_type::value_type value_type;
+  typedef typename params_type::pointer normal_pointer;
+  typedef typename params_type::reference normal_reference;
+  typedef typename params_type::const_pointer const_pointer;
+  typedef typename params_type::const_reference const_reference;
+
+  typedef Pointer pointer;
+  typedef Reference reference;
+  typedef std::bidirectional_iterator_tag iterator_category;
+
+  // The mutable and const flavours of this iterator, plus this very type.
+  typedef btree_iterator<
+    normal_node, normal_reference, normal_pointer> iterator;
+  typedef btree_iterator<
+    const_node, const_reference, const_pointer> const_iterator;
+  typedef btree_iterator<Node, Reference, Pointer> self_type;
+
+  // A default-constructed iterator points at no node.
+  btree_iterator() : node(NULL), position(-1) {}
+  btree_iterator(Node *n, int p) : node(n), position(p) {}
+  // Construction from the mutable iterator; for const_iterator this is the
+  // implicit iterator -> const_iterator conversion.
+  btree_iterator(const iterator &x) : node(x.node), position(x.position) {}
+
+  // Increment/decrement the iterator. Stepping within a leaf is handled
+  // inline; everything else is delegated to the *_slow() routines.
+  void increment() {
+    if (!node->leaf() || ++position >= node->count()) {
+      increment_slow();
+    }
+  }
+  void increment_by(int count);
+  void increment_slow();
+
+  void decrement() {
+    if (!node->leaf() || --position < 0) {
+      decrement_slow();
+    }
+  }
+  void decrement_slow();
+
+  // Two iterators are equal when they name the same node and position.
+  bool operator==(const const_iterator &x) const {
+    return node == x.node && position == x.position;
+  }
+  bool operator!=(const const_iterator &x) const {
+    return !(node == x.node && position == x.position);
+  }
+
+  // Accessors for the key/value the iterator is pointing at.
+  const key_type& key() const {
+    return node->key(position);
+  }
+  reference operator*() const {
+    return node->value(position);
+  }
+  pointer operator->() const {
+    return &node->value(position);
+  }
+
+  // Pre-increment/decrement: step, then return this iterator.
+  self_type& operator++() {
+    increment();
+    return *this;
+  }
+  self_type& operator--() {
+    decrement();
+    return *this;
+  }
+  // Post-increment/decrement: step, but return a copy of the old state.
+  self_type operator++(int) {
+    self_type previous(*this);
+    increment();
+    return previous;
+  }
+  self_type operator--(int) {
+    self_type previous(*this);
+    decrement();
+    return previous;
+  }
+
+  // The node in the tree the iterator is pointing at.
+  Node *node;
+  // The position within the node of the tree the iterator is pointing at.
+  int position;
+};
+
+// Dispatch helper class for using btree::internal_locate with plain compare.
+// dispatch() forwards the key and the starting iterator to
+// t.internal_locate_plain_compare() and returns its (iterator, int) result
+// unchanged.
+struct btree_internal_locate_plain_compare {
+  template <typename K, typename T, typename Iter>
+  static std::pair<Iter, int> dispatch(const K &k, const T &t, Iter iter) {
+    return t.internal_locate_plain_compare(k, iter);
+  }
+};
+
+// Dispatch helper class for using btree::internal_locate with compare-to.
+// dispatch() forwards the key and the starting iterator to
+// t.internal_locate_compare_to() and returns its (iterator, int) result
+// unchanged.
+struct btree_internal_locate_compare_to {
+  template <typename K, typename T, typename Iter>
+  static std::pair<Iter, int> dispatch(const K &k, const T &t, Iter iter) {
+    return t.internal_locate_compare_to(k, iter);
+  }
+};
+
+// The B-tree container core, parameterized by a Params policy that supplies
+// the key/value types, comparator, allocator and node sizing. The class
+// derives from the comparator, and the root pointer is stored together with
+// the allocator in an empty_base_handle, so that empty comparators and
+// allocators need not take up space. An empty tree has a NULL root.
+template <typename Params>
+class btree : public Params::key_compare {
+  typedef btree<Params> self_type;
+  typedef btree_node<Params> node_type;
+  typedef typename node_type::base_fields base_fields;
+  typedef typename node_type::leaf_fields leaf_fields;
+  typedef typename node_type::internal_fields internal_fields;
+  typedef typename node_type::root_fields root_fields;
+  typedef typename Params::is_key_compare_to is_key_compare_to;
+
+  // The dispatch helpers are declared with the struct keyword; use the same
+  // class-key here so the declarations agree.
+  friend struct btree_internal_locate_plain_compare;
+  friend struct btree_internal_locate_compare_to;
+  typedef typename if_<
+    is_key_compare_to::value,
+    btree_internal_locate_compare_to,
+    btree_internal_locate_plain_compare>::type internal_locate_type;
+
+  enum {
+    kNodeValues = node_type::kNodeValues,
+    kMinNodeValues = kNodeValues / 2,
+    kValueSize = node_type::kValueSize,
+    kExactMatch = node_type::kExactMatch,
+    kMatchMask = node_type::kMatchMask,
+  };
+
+  // A helper class to get the empty base class optimization for 0-size
+  // allocators. Base is internal_allocator_type.
+  // (e.g. empty_base_handle<internal_allocator_type, node_type*>). If Base is
+  // 0-size, the compiler doesn't have to reserve any space for it and
+  // sizeof(empty_base_handle) will simply be sizeof(Data). Google [empty base
+  // class optimization] for more details.
+  template <typename Base, typename Data>
+  struct empty_base_handle : public Base {
+    empty_base_handle(const Base &b, const Data &d)
+        : Base(b),
+          data(d) {
+    }
+    Data data;
+  };
+
+  // Counts of leaf and internal nodes, accumulated by internal_stats().
+  struct node_stats {
+    node_stats(ssize_t l, ssize_t i)
+        : leaf_nodes(l),
+          internal_nodes(i) {
+    }
+
+    node_stats& operator+=(const node_stats &x) {
+      leaf_nodes += x.leaf_nodes;
+      internal_nodes += x.internal_nodes;
+      return *this;
+    }
+
+    ssize_t leaf_nodes;
+    ssize_t internal_nodes;
+  };
+
+ public:
+  typedef Params params_type;
+  typedef typename Params::key_type key_type;
+  typedef typename Params::data_type data_type;
+  typedef typename Params::mapped_type mapped_type;
+  typedef typename Params::value_type value_type;
+  typedef typename Params::key_compare key_compare;
+  typedef typename Params::pointer pointer;
+  typedef typename Params::const_pointer const_pointer;
+  typedef typename Params::reference reference;
+  typedef typename Params::const_reference const_reference;
+  typedef typename Params::size_type size_type;
+  typedef typename Params::difference_type difference_type;
+  typedef btree_iterator<node_type, reference, pointer> iterator;
+  typedef typename iterator::const_iterator const_iterator;
+  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
+  typedef std::reverse_iterator<iterator> reverse_iterator;
+
+  typedef typename Params::allocator_type allocator_type;
+  typedef typename allocator_type::template rebind<char>::other
+    internal_allocator_type;
+
+ public:
+  // Constructor taking the key comparator and the allocator.
+  btree(const key_compare &comp, const allocator_type &alloc);
+
+  // Copy constructor.
+  btree(const self_type &x);
+
+  // Destructor.
+  ~btree() {
+    clear();
+  }
+
+  // Iterator routines.
+  iterator begin() {
+    return iterator(leftmost(), 0);
+  }
+  const_iterator begin() const {
+    return const_iterator(leftmost(), 0);
+  }
+  iterator end() {
+    return iterator(rightmost(), rightmost() ? rightmost()->count() : 0);
+  }
+  const_iterator end() const {
+    return const_iterator(rightmost(), rightmost() ? rightmost()->count() : 0);
+  }
+  reverse_iterator rbegin() {
+    return reverse_iterator(end());
+  }
+  const_reverse_iterator rbegin() const {
+    return const_reverse_iterator(end());
+  }
+  reverse_iterator rend() {
+    return reverse_iterator(begin());
+  }
+  const_reverse_iterator rend() const {
+    return const_reverse_iterator(begin());
+  }
+
+  // Finds the first element whose key is not less than key.
+  iterator lower_bound(const key_type &key) {
+    return internal_end(
+        internal_lower_bound(key, iterator(root(), 0)));
+  }
+  const_iterator lower_bound(const key_type &key) const {
+    return internal_end(
+        internal_lower_bound(key, const_iterator(root(), 0)));
+  }
+
+  // Finds the first element whose key is greater than key.
+  iterator upper_bound(const key_type &key) {
+    return internal_end(
+        internal_upper_bound(key, iterator(root(), 0)));
+  }
+  const_iterator upper_bound(const key_type &key) const {
+    return internal_end(
+        internal_upper_bound(key, const_iterator(root(), 0)));
+  }
+
+  // Finds the range of values which compare equal to key. The first member of
+  // the returned pair is equal to lower_bound(key). The second member of the
+  // pair is equal to upper_bound(key).
+  std::pair<iterator,iterator> equal_range(const key_type &key) {
+    return std::make_pair(lower_bound(key), upper_bound(key));
+  }
+  std::pair<const_iterator,const_iterator> equal_range(const key_type &key) const {
+    return std::make_pair(lower_bound(key), upper_bound(key));
+  }
+
+  // Inserts a value into the btree only if it does not already exist. The
+  // boolean return value indicates whether insertion succeeded or failed. The
+  // ValuePointer type is used to avoid instantiating the value unless the key
+  // is being inserted. Value is not dereferenced if the key already exists in
+  // the btree. See btree_map::operator[].
+  template <typename ValuePointer>
+  std::pair<iterator,bool> insert_unique(const key_type &key, ValuePointer value);
+
+  // Inserts a value into the btree only if it does not already exist. The
+  // boolean return value indicates whether insertion succeeded or failed.
+  std::pair<iterator,bool> insert_unique(const value_type &v) {
+    return insert_unique(params_type::key(v), &v);
+  }
+
+  // Insert with hint. Check to see if the value should be placed immediately
+  // before position in the tree. If it does, then the insertion will take
+  // amortized constant time. If not, the insertion will take amortized
+  // logarithmic time as if a call to insert_unique(v) were made.
+  iterator insert_unique(iterator position, const value_type &v);
+
+  // Insert a range of values into the btree.
+  template <typename InputIterator>
+  void insert_unique(InputIterator b, InputIterator e);
+
+  // Inserts a value into the btree. The ValuePointer type is used to avoid
+  // instantiating the value unless the key is being inserted. Value is not
+  // dereferenced if the key already exists in the btree. See
+  // btree_map::operator[].
+  template <typename ValuePointer>
+  iterator insert_multi(const key_type &key, ValuePointer value);
+
+  // Inserts a value into the btree.
+  iterator insert_multi(const value_type &v) {
+    return insert_multi(params_type::key(v), &v);
+  }
+
+  // Insert with hint. Check to see if the value should be placed immediately
+  // before position in the tree. If it does, then the insertion will take
+  // amortized constant time. If not, the insertion will take amortized
+  // logarithmic time as if a call to insert_multi(v) were made.
+  iterator insert_multi(iterator position, const value_type &v);
+
+  // Insert a range of values into the btree.
+  template <typename InputIterator>
+  void insert_multi(InputIterator b, InputIterator e);
+
+  void assign(const self_type &x);
+
+  // Erase the specified iterator from the btree. The iterator must be valid
+  // (i.e. not equal to end()).  Return an iterator pointing to the node after
+  // the one that was erased (or end() if none exists).
+  iterator erase(iterator iter);
+
+  // Erases range. Returns the number of keys erased.
+  int erase(iterator begin, iterator end);
+
+  // Erases the specified key from the btree. Returns 1 if an element was
+  // erased and 0 otherwise.
+  int erase_unique(const key_type &key);
+
+  // Erases all of the entries matching the specified key from the
+  // btree. Returns the number of elements erased.
+  int erase_multi(const key_type &key);
+
+  // Finds the iterator corresponding to a key or returns end() if the key is
+  // not present.
+  iterator find_unique(const key_type &key) {
+    return internal_end(
+        internal_find_unique(key, iterator(root(), 0)));
+  }
+  const_iterator find_unique(const key_type &key) const {
+    return internal_end(
+        internal_find_unique(key, const_iterator(root(), 0)));
+  }
+  iterator find_multi(const key_type &key) {
+    return internal_end(
+        internal_find_multi(key, iterator(root(), 0)));
+  }
+  const_iterator find_multi(const key_type &key) const {
+    return internal_end(
+        internal_find_multi(key, const_iterator(root(), 0)));
+  }
+
+  // Returns a count of the number of times the key appears in the btree
+  // (0 or 1 for the unique flavour).
+  size_type count_unique(const key_type &key) const {
+    const_iterator begin = internal_find_unique(
+        key, const_iterator(root(), 0));
+    if (!begin.node) {
+      // The key doesn't exist in the tree.
+      return 0;
+    }
+    return 1;
+  }
+  // Returns a count of the number of times the key appears in the btree.
+  size_type count_multi(const key_type &key) const {
+    // Qualify the call: an unqualified distance() is only found through
+    // argument-dependent lookup, which fails when none of the template
+    // arguments involved come from namespace std.
+    return std::distance(lower_bound(key), upper_bound(key));
+  }
+
+  // Clear the btree, deleting all of the values it contains.
+  void clear();
+
+  // Swap the contents of *this and x.
+  void swap(self_type &x);
+
+  // Assign the contents of x to *this.
+  self_type& operator=(const self_type &x) {
+    if (&x == this) {
+      // Don't copy onto ourselves.
+      return *this;
+    }
+    assign(x);
+    return *this;
+  }
+
+  // Access to the comparator, which is a base class of the btree.
+  key_compare* mutable_key_comp() {
+    return this;
+  }
+  const key_compare& key_comp() const {
+    return *this;
+  }
+  bool compare_keys(const key_type &x, const key_type &y) const {
+    return btree_compare_keys(key_comp(), x, y);
+  }
+
+  // Dump the btree to the specified ostream. Requires that operator<< is
+  // defined for Key and Value.
+  void dump(std::ostream &os) const {
+    if (root() != NULL) {
+      internal_dump(os, root(), 0);
+    }
+  }
+
+  // Verifies the structure of the btree.
+  void verify() const;
+
+  // Size routines. Note that empty() is slightly faster than doing size()==0.
+  size_type size() const {
+    if (empty()) return 0;
+    if (root()->leaf()) return root()->count();
+    return root()->size();
+  }
+  size_type max_size() const { return std::numeric_limits<size_type>::max(); }
+  bool empty() const { return root() == NULL; }
+
+  // The height of the btree. An empty tree will have height 0.
+  size_type height() const {
+    size_type h = 0;
+    if (root()) {
+      // Count the length of the chain from the leftmost node up to the
+      // root. We actually count from the root back around to the level below
+      // the root, but the calculation is the same because of the circularity
+      // of that traversal.
+      const node_type *n = root();
+      do {
+        ++h;
+        n = n->parent();
+      } while (n != root());
+    }
+    return h;
+  }
+
+  // The number of internal, leaf and total nodes used by the btree.
+  size_type leaf_nodes() const {
+    return internal_stats(root()).leaf_nodes;
+  }
+  size_type internal_nodes() const {
+    return internal_stats(root()).internal_nodes;
+  }
+  size_type nodes() const {
+    node_stats stats = internal_stats(root());
+    return stats.leaf_nodes + stats.internal_nodes;
+  }
+
+  // The total number of bytes used by the btree.
+  size_type bytes_used() const {
+    node_stats stats = internal_stats(root());
+    if (stats.leaf_nodes == 1 && stats.internal_nodes == 0) {
+      // A lone leaf root may have been allocated with a reduced capacity, so
+      // use its actual max_count().
+      return sizeof(*this) +
+          sizeof(base_fields) + root()->max_count() * sizeof(value_type);
+    } else {
+      return sizeof(*this) +
+          sizeof(root_fields) - sizeof(internal_fields) +
+          stats.leaf_nodes * sizeof(leaf_fields) +
+          stats.internal_nodes * sizeof(internal_fields);
+    }
+  }
+
+  // The average number of bytes used per value stored in the btree.
+  static double average_bytes_per_value() {
+    // Returns the number of bytes per value on a leaf node that is 75%
+    // full. Experimentally, this matches up nicely with the computed number of
+    // bytes per value in trees that had their values inserted in random order.
+    return sizeof(leaf_fields) / (kNodeValues * 0.75);
+  }
+
+  // The fullness of the btree. Computed as the number of elements in the btree
+  // divided by the maximum number of elements a tree with the current number
+  // of nodes could hold. A value of 1 indicates perfect space
+  // utilization. Smaller values indicate space wastage. An empty tree reports
+  // 0, matching overhead().
+  double fullness() const {
+    if (empty()) {
+      // No nodes at all: avoid evaluating 0.0 / 0, which would yield NaN.
+      return 0.0;
+    }
+    return double(size()) / (nodes() * kNodeValues);
+  }
+  // The overhead of the btree structure in bytes per element. Computed as the
+  // total number of bytes used by the btree minus the number of bytes used for
+  // storing elements divided by the number of elements.
+  double overhead() const {
+    if (empty()) {
+      return 0.0;
+    }
+    return (bytes_used() - size() * kValueSize) / double(size());
+  }
+
+ private:
+  // Internal accessor routines.
+  node_type* root() { return root_.data; }
+  const node_type* root() const { return root_.data; }
+  node_type** mutable_root() { return &root_.data; }
+
+  // The rightmost node is stored in the root node.
+  node_type* rightmost() {
+    return (!root() || root()->leaf()) ? root() : root()->rightmost();
+  }
+  const node_type* rightmost() const {
+    return (!root() || root()->leaf()) ? root() : root()->rightmost();
+  }
+  node_type** mutable_rightmost() { return root()->mutable_rightmost(); }
+
+  // The leftmost node is stored as the parent of the root node.
+  node_type* leftmost() { return root() ? root()->parent() : NULL; }
+  const node_type* leftmost() const { return root() ? root()->parent() : NULL; }
+
+  // The size of the tree is stored in the root node.
+  size_type* mutable_size() { return root()->mutable_size(); }
+
+  // Allocator routines. The allocator is the base class of root_.
+  internal_allocator_type* mutable_internal_allocator() {
+    return static_cast<internal_allocator_type*>(&root_);
+  }
+  const internal_allocator_type& internal_allocator() const {
+    return *static_cast<const internal_allocator_type*>(&root_);
+  }
+
+  // Node creation/deletion routines. Each kind of node is allocated with the
+  // size of its own field layout; the matching delete routine must be used so
+  // that the same size is handed back to the allocator.
+  node_type* new_internal_node(node_type *parent) {
+    internal_fields *p = reinterpret_cast<internal_fields*>(
+        mutable_internal_allocator()->allocate(sizeof(internal_fields)));
+    return node_type::init_internal(p, parent);
+  }
+  node_type* new_internal_root_node() {
+    root_fields *p = reinterpret_cast<root_fields*>(
+        mutable_internal_allocator()->allocate(sizeof(root_fields)));
+    return node_type::init_root(p, root()->parent());
+  }
+  node_type* new_leaf_node(node_type *parent) {
+    leaf_fields *p = reinterpret_cast<leaf_fields*>(
+        mutable_internal_allocator()->allocate(sizeof(leaf_fields)));
+    return node_type::init_leaf(p, parent, kNodeValues);
+  }
+  // A leaf root is allocated with room for only max_count values and is its
+  // own parent.
+  node_type* new_leaf_root_node(int max_count) {
+    leaf_fields *p = reinterpret_cast<leaf_fields*>(
+        mutable_internal_allocator()->allocate(
+            sizeof(base_fields) + max_count * sizeof(value_type)));
+    return node_type::init_leaf(p, reinterpret_cast<node_type*>(p), max_count);
+  }
+  void delete_internal_node(node_type *node) {
+    node->destroy();
+    assert(node != root());
+    mutable_internal_allocator()->deallocate(
+        reinterpret_cast<char*>(node), sizeof(internal_fields));
+  }
+  void delete_internal_root_node() {
+    root()->destroy();
+    mutable_internal_allocator()->deallocate(
+        reinterpret_cast<char*>(root()), sizeof(root_fields));
+  }
+  void delete_leaf_node(node_type *node) {
+    node->destroy();
+    mutable_internal_allocator()->deallocate(
+        reinterpret_cast<char*>(node),
+        sizeof(base_fields) + node->max_count() * sizeof(value_type));
+  }
+
+  // Rebalances or splits the node iter points to.
+  void rebalance_or_split(iterator *iter);
+
+  // Merges the values of left, right and the delimiting key on their parent
+  // onto left, removing the delimiting key and deleting right.
+  void merge_nodes(node_type *left, node_type *right);
+
+  // Tries to merge node with its left or right sibling, and failing that,
+  // rebalance with its left or right sibling. Returns true if a merge
+  // occurred, at which point it is no longer valid to access node. Returns
+  // false if no merging took place.
+  bool try_merge_or_rebalance(iterator *iter);
+
+  // Tries to shrink the height of the tree by 1.
+  void try_shrink();
+
+  // Maps the "not found" result of the internal routines (an iterator with a
+  // NULL node) to end(); any other iterator is returned unchanged.
+  iterator internal_end(iterator iter) {
+    return iter.node ? iter : end();
+  }
+  const_iterator internal_end(const_iterator iter) const {
+    return iter.node ? iter : end();
+  }
+
+  // Inserts a value into the btree immediately before iter. Requires that
+  // key(v) <= iter.key() and (--iter).key() <= key(v).
+  iterator internal_insert(iterator iter, const value_type &v);
+
+  // Returns an iterator pointing to the first value >= the value "iter" is
+  // pointing at. Note that "iter" might be pointing to an invalid location as
+  // iter.position == iter.node->count(). This routine simply moves iter up in
+  // the tree to a valid location.
+  template <typename IterType>
+  static IterType internal_last(IterType iter);
+
+  // Returns an iterator pointing to the leaf position at which key would
+  // reside in the tree. We provide 2 versions of internal_locate. The first
+  // version (internal_locate_plain_compare) always returns 0 for the second
+  // field of the pair. The second version (internal_locate_compare_to) is for
+  // the key-compare-to specialization and returns either kExactMatch (if the
+  // key was found in the tree) or -kExactMatch (if it wasn't) in the second
+  // field of the pair. The compare_to specialization allows the caller to
+  // avoid a subsequent comparison to determine if an exact match was made,
+  // speeding up string keys.
+  template <typename IterType>
+  std::pair<IterType, int> internal_locate(
+      const key_type &key, IterType iter) const;
+  template <typename IterType>
+  std::pair<IterType, int> internal_locate_plain_compare(
+      const key_type &key, IterType iter) const;
+  template <typename IterType>
+  std::pair<IterType, int> internal_locate_compare_to(
+      const key_type &key, IterType iter) const;
+
+  // Internal routine which implements lower_bound().
+  template <typename IterType>
+  IterType internal_lower_bound(
+      const key_type &key, IterType iter) const;
+
+  // Internal routine which implements upper_bound().
+  template <typename IterType>
+  IterType internal_upper_bound(
+      const key_type &key, IterType iter) const;
+
+  // Internal routine which implements find_unique().
+  template <typename IterType>
+  IterType internal_find_unique(
+      const key_type &key, IterType iter) const;
+
+  // Internal routine which implements find_multi().
+  template <typename IterType>
+  IterType internal_find_multi(
+      const key_type &key, IterType iter) const;
+
+  // Deletes a node and all of its children.
+  void internal_clear(node_type *node);
+
+  // Dumps a node and all of its children to the specified ostream.
+  void internal_dump(std::ostream &os, const node_type *node, int level) const;
+
+  // Verifies the tree structure of node.
+  int internal_verify(const node_type *node,
+                      const key_type *lo, const key_type *hi) const;
+
+  // Recursively counts the leaf and internal nodes in the subtree rooted at
+  // node. A NULL node contributes nothing.
+  node_stats internal_stats(const node_type *node) const {
+    if (!node) {
+      return node_stats(0, 0);
+    }
+    if (node->leaf()) {
+      return node_stats(1, 0);
+    }
+    node_stats res(0, 1);
+    // An internal node with count() values has count() + 1 children.
+    for (int i = 0; i <= node->count(); ++i) {
+      res += internal_stats(node->child(i));
+    }
+    return res;
+  }
+
+ private:
+  // The allocator (as an empty base) together with the root pointer.
+  empty_base_handle<internal_allocator_type, node_type*> root_;
+
+ private:
+  // A never instantiated helper function that returns big_ if R is the result
+  // type expected from the comparator (int for a key-compare-to functor, bool
+  // otherwise) and small_ if it is not.
+  template <typename R>
+  static typename if_<
+   if_<is_key_compare_to::value,
+             std::is_same<R, int>,
+             std::is_same<R, bool> >::type::value,
+   big_, small_>::type key_compare_checker(R);
+
+  // A never instantiated helper function that returns the key comparison
+  // functor.
+  static key_compare key_compare_helper();
+
+  // Verify that key_compare returns a bool (or an int for a key-compare-to
+  // functor). This is similar to the way is_convertible in base/type_traits.h
+  // works. Note that key_compare_checker is never actually invoked. The
+  // compiler will select which key_compare_checker() to instantiate and then
+  // figure out the size of the return type of key_compare_checker() at compile
+  // time which we then check against the sizeof of big_.
+  COMPILE_ASSERT(
+      sizeof(key_compare_checker(key_compare_helper()(key_type(), key_type()))) ==
+      sizeof(big_),
+      key_comparison_function_must_return_bool);
+
+  // Note: We insist that kNodeValues, which is computed from
+  // Params::kTargetNodeSize, must fit the base_fields::field_type.
+  COMPILE_ASSERT(kNodeValues <
+                 (1 << (8 * sizeof(typename base_fields::field_type))),
+                 target_node_size_too_large);
+
+  // Test the assumption made in computing node_type::kNodeTargetValues.
+  COMPILE_ASSERT(sizeof(base_fields) >= 2 * sizeof(void*),
+                 node_space_assumption_incorrect);
+};
+
+////
+// btree_node methods
+template <typename P>
+inline void btree_node<P>::insert_value(int i, const value_type &x) {  // Inserts x at slot i, shifting later values (and children) right.
+  assert(i <= count());
+  value_init(count(), x);  // Construct x in the first unused slot...
+  for (int j = count(); j > i; --j) {
+    value_swap(j, this, j - 1);  // ...then swap it backwards until it sits at slot i.
+  }
+  set_count(count() + 1);
+
+  if (!leaf()) {
+    ++i;  // The child slot to open is the one to the right of the new value.
+    for (int j = count(); j > i; --j) {
+      *mutable_child(j) = child(j - 1);
+      child(j)->set_position(j);  // Keep the moved child's stored position in sync.
+    }
+    *mutable_child(i) = NULL;  // Left empty; the caller installs the child (split() does so via set_child()).
+  }
+}
+
+template <typename P>
+inline void btree_node<P>::remove_value(int i) {  // Removes the value at slot i and, on internal nodes, the child at i + 1.
+  if (!leaf()) {
+    assert(child(i + 1)->count() == 0);  // The child being dropped must already be empty.
+    for (int j = i + 1; j < count(); ++j) {
+      *mutable_child(j) = child(j + 1);  // Close the gap by sliding later children left.
+      child(j)->set_position(j);
+    }
+    *mutable_child(count()) = NULL;
+  }
+
+  set_count(count() - 1);
+  for (; i < count(); ++i) {
+    value_swap(i, this, i + 1);  // Bubble the removed value to the end of the array.
+  }
+  value_destroy(i);  // i == count() here: the slot now holding the removed value.
+}
+
+template <typename P>
+void btree_node<P>::rebalance_right_to_left(btree_node *src, int to_move) {  // Shifts to_move values from right sibling src into this node.
+  assert(parent() == src->parent());
+  assert(position() + 1 == src->position());  // src must be our immediate right sibling.
+  assert(src->count() >= count());
+  assert(to_move >= 1);
+  assert(to_move <= src->count());
+
+  // Make room in the left node for the new values.
+  for (int i = 0; i < to_move; ++i) {
+    value_init(i + count());
+  }
+
+  // Move the delimiting value to the left node and the new delimiting value
+  // from the right node.
+  value_swap(count(), parent(), position());
+  parent()->value_swap(position(), src, to_move - 1);
+
+  // Move the values from the right to the left node.
+  for (int i = 1; i < to_move; ++i) {
+    value_swap(count() + i, src, i - 1);
+  }
+  // Shift the values in the right node to their correct position.
+  for (int i = to_move; i < src->count(); ++i) {
+    src->value_swap(i - to_move, src, i);
+  }
+  for (int i = 1; i <= to_move; ++i) {
+    src->value_destroy(src->count() - i);  // Destroy the to_move vacated slots at the tail of src.
+  }
+
+  if (!leaf()) {
+    // Move the child pointers from the right to the left node.
+    for (int i = 0; i < to_move; ++i) {
+      set_child(1 + count() + i, src->child(i));
+    }
+    for (int i = 0; i <= src->count() - to_move; ++i) {  // Slide src's remaining children down by to_move.
+      assert(i + to_move <= src->max_count());
+      src->set_child(i, src->child(i + to_move));
+      *src->mutable_child(i + to_move) = NULL;
+    }
+  }
+
+  // Fixup the counts on the src and dest nodes.
+  set_count(count() + to_move);
+  src->set_count(src->count() - to_move);
+}
+
+template <typename P>
+void btree_node<P>::rebalance_left_to_right(btree_node *dest, int to_move) {  // Shifts to_move values from this node into right sibling dest.
+  assert(parent() == dest->parent());
+  assert(position() + 1 == dest->position());  // dest must be our immediate right sibling.
+  assert(count() >= dest->count());
+  assert(to_move >= 1);
+  assert(to_move <= count());
+
+  // Make room in the right node for the new values.
+  for (int i = 0; i < to_move; ++i) {
+    dest->value_init(i + dest->count());
+  }
+  for (int i = dest->count() - 1; i >= 0; --i) {
+    dest->value_swap(i, dest, i + to_move);  // Walk backwards so no value is overwritten.
+  }
+
+  // Move the delimiting value to the right node and the new delimiting value
+  // from the left node.
+  dest->value_swap(to_move - 1, parent(), position());
+  parent()->value_swap(position(), this, count() - to_move);
+  value_destroy(count() - to_move);
+
+  // Move the values from the left to the right node.
+  for (int i = 1; i < to_move; ++i) {
+    value_swap(count() - to_move + i, dest, i - 1);
+    value_destroy(count() - to_move + i);  // The source slot is vacated immediately after the swap.
+  }
+
+  if (!leaf()) {
+    // Move the child pointers from the left to the right node.
+    for (int i = dest->count(); i >= 0; --i) {
+      dest->set_child(i + to_move, dest->child(i));
+      *dest->mutable_child(i) = NULL;
+    }
+    for (int i = 1; i <= to_move; ++i) {
+      dest->set_child(i - 1, child(count() - to_move + i));
+      *mutable_child(count() - to_move + i) = NULL;
+    }
+  }
+
+  // Fixup the counts on the src and dest nodes.
+  set_count(count() - to_move);
+  dest->set_count(dest->count() + to_move);
+}
+
+template <typename P>
+void btree_node<P>::split(btree_node *dest, int insert_position) {  // Splits this node into this + empty dest, pushing one value up to the parent.
+  assert(dest->count() == 0);
+
+  // We bias the split based on the position being inserted. If we're
+  // inserting at the beginning of the left node then bias the split to put
+  // more values on the right node. If we're inserting at the end of the
+  // right node then bias the split to put more values on the left node.
+  if (insert_position == 0) {
+    dest->set_count(count() - 1);
+  } else if (insert_position == max_count()) {
+    dest->set_count(0);
+  } else {
+    dest->set_count(count() / 2);
+  }
+  set_count(count() - dest->count());  // Counts are set first; the values are moved below.
+  assert(count() >= 1);
+
+  // Move values from the left sibling to the right sibling.
+  for (int i = 0; i < dest->count(); ++i) {
+    dest->value_init(i);
+    value_swap(count() + i, dest, i);
+    value_destroy(count() + i);
+  }
+
+  // The split key is the largest value in the left sibling.
+  set_count(count() - 1);
+  parent()->insert_value(position(), value_type());  // Open a slot in the parent with a placeholder value...
+  value_swap(count(), parent(), position());  // ...and swap our largest value into it.
+  value_destroy(count());
+  parent()->set_child(position() + 1, dest);  // dest becomes the child right of the split key.
+
+  if (!leaf()) {
+    for (int i = 0; i <= dest->count(); ++i) {  // Hand the children right of the split key over to dest.
+      assert(child(count() + i + 1) != NULL);
+      dest->set_child(i, child(count() + i + 1));
+      *mutable_child(count() + i + 1) = NULL;
+    }
+  }
+}
+
+template <typename P>
+void btree_node<P>::merge(btree_node *src) {  // Absorbs right sibling src and the parent delimiter into this node.
+  assert(parent() == src->parent());
+  assert(position() + 1 == src->position());  // src must be our immediate right sibling.
+
+  // Move the delimiting value to the left node.
+  value_init(count());
+  value_swap(count(), parent(), position());
+
+  // Move the values from the right to the left node.
+  for (int i = 0; i < src->count(); ++i) {
+    value_init(1 + count() + i);  // The "1 +" skips the delimiter placed at slot count().
+    value_swap(1 + count() + i, src, i);
+    src->value_destroy(i);
+  }
+
+  if (!leaf()) {
+    // Move the child pointers from the right to the left node.
+    for (int i = 0; i <= src->count(); ++i) {
+      set_child(1 + count() + i, src->child(i));
+      *src->mutable_child(i) = NULL;
+    }
+  }
+
+  // Fixup the counts on the src and dest nodes.
+  set_count(1 + count() + src->count());
+  src->set_count(0);  // src is now empty, as remove_value() asserts for the dropped child.
+
+  // Remove the value on the parent node.
+  parent()->remove_value(position());
+}
+
+template <typename P>
+void btree_node<P>::swap(btree_node *x) {  // Exchanges values, children and counts with x (both leaf or both internal).
+  assert(leaf() == x->leaf());
+
+  // Swap the values.
+  for (int i = count(); i < x->count(); ++i) {
+    value_init(i);  // Pad whichever node is shorter so both hold n initialized slots.
+  }
+  for (int i = x->count(); i < count(); ++i) {
+    x->value_init(i);
+  }
+  int n = std::max(count(), x->count());
+  for (int i = 0; i < n; ++i) {
+    value_swap(i, x, i);
+  }
+  for (int i = count(); i < x->count(); ++i) {
+    x->value_destroy(i);  // Counts are not swapped yet, so these are the padding slots now in x.
+  }
+  for (int i = x->count(); i < count(); ++i) {
+    value_destroy(i);
+  }
+
+  if (!leaf()) {
+    // Swap the child pointers.
+    for (int i = 0; i <= n; ++i) {
+      btree_swap_helper(*mutable_child(i), *x->mutable_child(i));
+    }
+    for (int i = 0; i <= count(); ++i) {
+      x->child(i)->fields_.parent = x;  // Our old children (count() + 1 of them) now live in x.
+    }
+    for (int i = 0; i <= x->count(); ++i) {
+      child(i)->fields_.parent = this;  // x's old children now live in this node.
+    }
+  }
+
+  // Swap the counts.
+  btree_swap_helper(fields_.count, x->fields_.count);
+}
+
+////
+// btree_iterator methods
+template <typename N, typename R, typename P>
+void btree_iterator<N, R, P>::increment_slow() {  // Handles increments that must leave the current node.
+  if (node->leaf()) {
+    assert(position >= node->count());
+    self_type save(*this);  // Remembered so we can stay put if there is no next value.
+    while (position == node->count() && !node->is_root()) {  // Climb while we sit past the node's last value.
+      assert(node->parent()->child(node->position()) == node);
+      position = node->position();
+      node = node->parent();
+    }
+    if (position == node->count()) {
+      *this = save;  // Ran off the root: restore the original (end) position.
+    }
+  } else {
+    assert(position < node->count());
+    node = node->child(position + 1);  // Step into the subtree right of the current value...
+    while (!node->leaf()) {
+      node = node->child(0);  // ...and descend to its leftmost leaf.
+    }
+    position = 0;
+  }
+}
+
+template <typename N, typename R, typename P>
+void btree_iterator<N, R, P>::increment_by(int count) {  // Advances the iterator by count positions.
+  while (count > 0) {
+    if (node->leaf()) {
+      int rest = node->count() - position;  // Values left in this leaf from position onward.
+      position += std::min(rest, count);  // Skip as much as possible within the leaf.
+      count = count - rest;
+      if (position < node->count()) {
+        return;  // Landed on a value inside this leaf.
+      }
+    } else {
+      --count;  // On an internal node we advance a single value per iteration.
+    }
+    increment_slow();
+  }
+}
+
+template <typename N, typename R, typename P>
+void btree_iterator<N, R, P>::decrement_slow() {  // Handles decrements that must leave the current node.
+  if (node->leaf()) {
+    assert(position <= -1);
+    self_type save(*this);  // Remembered so we can stay put if there is no previous value.
+    while (position < 0 && !node->is_root()) {  // Climb while we sit before the node's first value.
+      assert(node->parent()->child(node->position()) == node);
+      position = node->position() - 1;
+      node = node->parent();
+    }
+    if (position < 0) {
+      *this = save;  // Ran off the root: restore the original position.
+    }
+  } else {
+    assert(position >= 0);
+    node = node->child(position);  // Step into the subtree left of the current value...
+    while (!node->leaf()) {
+      node = node->child(node->count());  // ...and descend to its rightmost leaf.
+    }
+    position = node->count() - 1;
+  }
+}
+
+////
+// btree methods
+template <typename P>
+btree<P>::btree(const key_compare &comp, const allocator_type &alloc)  // Constructs an empty tree.
+    : key_compare(comp),  // The comparator is stored as a base-class subobject.
+      root_(alloc, NULL) {  // root_ holds the allocator together with a NULL root pointer.
+}
+
+template <typename P>
+btree<P>::btree(const self_type &x)  // Copy constructor: starts empty, then copies x's contents.
+    : key_compare(x.key_comp()),
+      root_(x.internal_allocator(), NULL) {
+  assign(x);  // assign() re-copies the comparator/allocator and inserts every value of x.
+}
+
+template <typename P> template <typename ValuePointer>
+std::pair<typename btree<P>::iterator, bool>  // (iterator to the value, true if inserted / false if the key existed).
+btree<P>::insert_unique(const key_type &key, ValuePointer value) {
+  if (empty()) {
+    *mutable_root() = new_leaf_root_node(1);  // Start with a leaf root created with argument 1.
+  }
+
+  std::pair<iterator, int> res = internal_locate(key, iterator(root(), 0));
+  iterator &iter = res.first;
+  if (res.second == kExactMatch) {
+    // The key already exists in the tree, do nothing.
+    return std::make_pair(internal_last(iter), false);
+  } else if (!res.second) {  // 0 is what internal_locate_plain_compare() returns: equality still unknown.
+    iterator last = internal_last(iter);
+    if (last.node && !compare_keys(key, last.key())) {  // last.key() >= key (lower bound) and !(key < last.key()).
+      // The key already exists in the tree, do nothing.
+      return std::make_pair(last, false);
+    }
+  }
+
+  return std::make_pair(internal_insert(iter, *value), true);  // value is dereferenced only when inserting.
+}
+
+template <typename P>
+inline typename btree<P>::iterator  // Hinted unique insert; returns the inserted or already-present element.
+btree<P>::insert_unique(iterator position, const value_type &v) {
+  if (!empty()) {
+    const key_type &key = params_type::key(v);
+    if (position == end() || compare_keys(key, position.key())) {  // key belongs before the hint.
+      iterator prev = position;
+      if (position == begin() || compare_keys((--prev).key(), key)) {
+        // prev.key() < key < position.key()
+        return internal_insert(position, v);
+      }
+    } else if (compare_keys(position.key(), key)) {  // key belongs after the hint.
+      iterator next = position;
+      ++next;
+      if (next == end() || compare_keys(key, next.key())) {
+        // position.key() < key < next.key()
+        return internal_insert(next, v);
+      }
+    } else {
+      // position.key() == key
+      return position;
+    }
+  }
+  return insert_unique(v).first;  // The hint was not usable: fall back to a full lookup.
+}
+
+template <typename P> template <typename InputIterator>
+void btree<P>::insert_unique(InputIterator b, InputIterator e) {  // Unique-inserts every element of [b, e).
+  for (; b != e; ++b) {
+    insert_unique(end(), *b);  // end() as hint: no lookup is needed when the value belongs at the back.
+  }
+}
+
+template <typename P> template <typename ValuePointer>
+typename btree<P>::iterator  // Iterator to the newly inserted value.
+btree<P>::insert_multi(const key_type &key, ValuePointer value) {
+  if (empty()) {
+    *mutable_root() = new_leaf_root_node(1);
+  }
+
+  iterator iter = internal_upper_bound(key, iterator(root(), 0));  // Insert after any existing equal keys.
+  if (!iter.node) {
+    iter = end();  // A NULL node means no upper bound was found: insert at the end.
+  }
+  return internal_insert(iter, *value);
+}
+
+template <typename P>
+typename btree<P>::iterator  // Hinted multi insert; returns an iterator to the inserted value.
+btree<P>::insert_multi(iterator position, const value_type &v) {
+  if (!empty()) {
+    const key_type &key = params_type::key(v);
+    if (position == end() || !compare_keys(position.key(), key)) {  // key <= position.key().
+      iterator prev = position;
+      if (position == begin() || !compare_keys(key, (--prev).key())) {
+        // prev.key() <= key <= position.key()
+        return internal_insert(position, v);
+      }
+    } else {
+      iterator next = position;
+      ++next;
+      if (next == end() || !compare_keys(next.key(), key)) {
+        // position.key() < key <= next.key()
+        return internal_insert(next, v);
+      }
+    }
+  }
+  return insert_multi(v);  // The hint was not usable: fall back to a full lookup.
+}
+
+template <typename P> template <typename InputIterator>
+void btree<P>::insert_multi(InputIterator b, InputIterator e) {  // Multi-inserts every element of [b, e).
+  for (; b != e; ++b) {
+    insert_multi(end(), *b);  // end() as hint: no lookup is needed when the value belongs at the back.
+  }
+}
+
+template <typename P>
+void btree<P>::assign(const self_type &x) {  // Replaces this tree's contents, comparator and allocator with x's.
+  clear();  // NOTE(review): this would also empty x if &x == this -- callers presumably guard; confirm.
+
+  *mutable_key_comp() = x.key_comp();
+  *mutable_internal_allocator() = x.internal_allocator();
+
+  // Assignment can avoid key comparisons because we know the order of the
+  // values is the same order we'll store them in.
+  for (const_iterator iter = x.begin(); iter != x.end(); ++iter) {
+    if (empty()) {
+      insert_multi(*iter);  // The first value goes through insert_multi, which creates the root.
+    } else {
+      // If the btree is not empty, we can just insert the new value at the end
+      // of the tree!
+      internal_insert(end(), *iter);
+    }
+  }
+}
+
+template <typename P>
+typename btree<P>::iterator btree<P>::erase(iterator iter) {  // Erases *iter; returns the following element, or end().
+  bool internal_delete = false;
+  if (!iter.node->leaf()) {
+    // Deletion of a value on an internal node. Swap the key with the largest
+    // value of our left child. This is easy, we just decrement iter.
+    iterator tmp_iter(iter--);  // tmp_iter keeps the internal slot; iter moves to its predecessor.
+    assert(iter.node->leaf());
+    assert(!compare_keys(tmp_iter.key(), iter.key()));
+    iter.node->value_swap(iter.position, tmp_iter.node, tmp_iter.position);
+    internal_delete = true;
+    --*mutable_size();
+  } else if (!root()->leaf()) {
+    --*mutable_size();  // Presumably a leaf root derives size from its count, so no update there -- confirm.
+  }
+
+  // Delete the key from the leaf.
+  iter.node->remove_value(iter.position);
+
+  // We want to return the next value after the one we just erased. If we
+  // erased from an internal node (internal_delete == true), then the next
+  // value is ++(++iter). If we erased from a leaf node (internal_delete ==
+  // false) then the next value is ++iter. Note that ++iter may point to an
+  // internal node and the value in the internal node may move to a leaf node
+  // (iter.node) when rebalancing is performed at the leaf level.
+
+  // Merge/rebalance as we walk back up the tree.
+  iterator res(iter);
+  for (;;) {
+    if (iter.node == root()) {
+      try_shrink();
+      if (empty()) {
+        return end();
+      }
+      break;
+    }
+    if (iter.node->count() >= kMinNodeValues) {
+      break;  // The node is still full enough; nothing more to fix.
+    }
+    bool merged = try_merge_or_rebalance(&iter);
+    if (iter.node->leaf()) {
+      res = iter;  // Track where the erased slot ended up at the leaf level.
+    }
+    if (!merged) {
+      break;  // Only a merge removes a value from the parent and can underflow it.
+    }
+    iter.node = iter.node->parent();
+  }
+
+  // Adjust our return value. If we're pointing at the end of a node, advance
+  // the iterator.
+  if (res.position == res.node->count()) {
+    res.position = res.node->count() - 1;  // Step back onto the last value so ++ moves to the next element.
+    ++res;
+  }
+  // If we erased from an internal node, advance the iterator.
+  if (internal_delete) {
+    ++res;
+  }
+  return res;
+}
+
+template <typename P>
+int btree<P>::erase(iterator begin, iterator end) {  // Erases [begin, end); returns the number of values erased.
+  int count = distance(begin, end);  // Counted up front: end is not compared again while erasing.
+  for (int i = 0; i < count; i++) {
+    begin = erase(begin);  // Continue from the iterator that erase() returns.
+  }
+  return count;
+}
+
+template <typename P>
+int btree<P>::erase_unique(const key_type &key) {  // Erases the value matching key, if any; returns 0 or 1.
+  iterator iter = internal_find_unique(key, iterator(root(), 0));
+  if (!iter.node) {
+    // The key doesn't exist in the tree, return nothing done.
+    return 0;
+  }
+  erase(iter);
+  return 1;
+}
+
+template <typename P>
+int btree<P>::erase_multi(const key_type &key) {  // Erases the range from lower_bound(key) to upper_bound(key); returns the count.
+  iterator begin = internal_lower_bound(key, iterator(root(), 0));
+  if (!begin.node) {
+    // The key doesn't exist in the tree, return nothing done.
+    return 0;
+  }
+  // Delete all of the keys between begin and upper_bound(key).
+  iterator end = internal_end(
+      internal_upper_bound(key, iterator(root(), 0)));
+  return erase(begin, end);
+}
+
+template <typename P>
+void btree<P>::clear() {  // Deletes every node and leaves the tree empty.
+  if (root() != NULL) {
+    internal_clear(root());  // Recursively deletes the root and all of its descendants.
+  }
+  *mutable_root() = NULL;
+}
+
+template <typename P>
+void btree<P>::swap(self_type &x) {  // Exchanges the comparator and root_ with x.
+  std::swap(static_cast<key_compare&>(*this), static_cast<key_compare&>(x));  // Swap the comparator base subobjects.
+  std::swap(root_, x.root_);  // root_ is the only data member declared in the visible part of the class.
+}
+
+template <typename P>
+void btree<P>::verify() const {  // Asserts the tree's structural invariants.
+  if (root() != NULL) {
+    assert(size() == internal_verify(root(), NULL, NULL));  // internal_verify() returns the number of values found.
+    assert(leftmost() == (++const_iterator(root(), -1)).node);  // Incrementing from before the root's first value.
+    assert(rightmost() == (--const_iterator(root(), root()->count())).node);  // Decrementing from past its last value.
+    assert(leftmost()->leaf());
+    assert(rightmost()->leaf());
+  } else {
+    assert(size() == 0);
+    assert(leftmost() == NULL);
+    assert(rightmost() == NULL);
+  }
+}
+
+template <typename P>
+void btree<P>::rebalance_or_split(iterator *iter) {  // Makes room in the full node at *iter; updates *iter to the new insert slot.
+  node_type *&node = iter->node;  // References: writes below update the caller's iterator.
+  int &insert_position = iter->position;
+  assert(node->count() == node->max_count());
+
+  // First try to make room on the node by rebalancing.
+  node_type *parent = node->parent();
+  if (node != root()) {
+    if (node->position() > 0) {
+      // Try rebalancing with our left sibling.
+      node_type *left = parent->child(node->position() - 1);
+      if (left->count() < left->max_count()) {
+        // We bias rebalancing based on the position being inserted. If we're
+        // inserting at the end of the right node then we bias rebalancing to
+        // fill up the left node.
+        int to_move = (left->max_count() - left->count()) /
+            (1 + (insert_position < left->max_count()));  // Divide by 2 unless inserting at the very end.
+        to_move = std::max(1, to_move);
+
+        if (((insert_position - to_move) >= 0) ||
+            ((left->count() + to_move) < left->max_count())) {
+          left->rebalance_right_to_left(node, to_move);
+
+          assert(node->max_count() - node->count() == to_move);
+          insert_position = insert_position - to_move;
+          if (insert_position < 0) {
+            insert_position = insert_position + left->count() + 1;  // The slot moved into the left sibling.
+            node = left;
+          }
+
+          assert(node->count() < node->max_count());
+          return;
+        }
+      }
+    }
+
+    if (node->position() < parent->count()) {
+      // Try rebalancing with our right sibling.
+      node_type *right = parent->child(node->position() + 1);
+      if (right->count() < right->max_count()) {
+        // We bias rebalancing based on the position being inserted. If we're
+        // inserting at the beginning of the left node then we bias rebalancing
+        // to fill up the right node.
+        int to_move = (right->max_count() - right->count()) /
+            (1 + (insert_position > 0));  // Divide by 2 unless inserting at the very front.
+        to_move = std::max(1, to_move);
+
+        if ((insert_position <= (node->count() - to_move)) ||
+            ((right->count() + to_move) < right->max_count())) {
+          node->rebalance_left_to_right(right, to_move);
+
+          if (insert_position > node->count()) {
+            insert_position = insert_position - node->count() - 1;  // The slot moved into the right sibling.
+            node = right;
+          }
+
+          assert(node->count() < node->max_count());
+          return;
+        }
+      }
+    }
+
+    // Rebalancing failed, make sure there is room on the parent node for a new
+    // value.
+    if (parent->count() == parent->max_count()) {
+      iterator parent_iter(node->parent(), node->position());
+      rebalance_or_split(&parent_iter);  // Recurse upwards so the split below can insert into the parent.
+    }
+  } else {
+    // Rebalancing not possible because this is the root node.
+    if (root()->leaf()) {
+      // The root node is currently a leaf node: create a new root node and set
+      // the current root node as the child of the new root.
+      parent = new_internal_root_node();
+      parent->set_child(0, root());
+      *mutable_root() = parent;
+      assert(*mutable_rightmost() == parent->child(0));
+    } else {
+      // The root node is an internal node. We do not want to create a new root
+      // node because the root node is special and holds the size of the tree
+      // and a pointer to the rightmost node. So we create a new internal node
+      // and move all of the items on the current root into the new node.
+      parent = new_internal_node(parent);
+      parent->set_child(0, parent);
+      parent->swap(root());  // The old root's contents move to the new node; the root keeps it as child 0.
+      node = parent;
+    }
+  }
+
+  // Split the node.
+  node_type *split_node;
+  if (node->leaf()) {
+    split_node = new_leaf_node(parent);
+    node->split(split_node, insert_position);
+    if (rightmost() == node) {
+      *mutable_rightmost() = split_node;  // The new right half is now the rightmost leaf.
+    }
+  } else {
+    split_node = new_internal_node(parent);
+    node->split(split_node, insert_position);
+  }
+
+  if (insert_position > node->count()) {
+    insert_position = insert_position - node->count() - 1;  // The slot landed in the new right half.
+    node = split_node;
+  }
+}
+
+template <typename P>
+void btree<P>::merge_nodes(node_type *left, node_type *right) {  // Merges right into left and frees right.
+  left->merge(right);
+  if (right->leaf()) {
+    if (rightmost() == right) {
+      *mutable_rightmost() = left;  // Keep the rightmost pointer off the node about to be deleted.
+    }
+    delete_leaf_node(right);
+  } else {
+    delete_internal_node(right);
+  }
+}
+
+template <typename P>
+bool btree<P>::try_merge_or_rebalance(iterator *iter) {  // Returns true only if iter->node was merged with a sibling.
+  node_type *parent = iter->node->parent();
+  if (iter->node->position() > 0) {
+    // Try merging with our left sibling.
+    node_type *left = parent->child(iter->node->position() - 1);
+    if ((1 + left->count() + iter->node->count()) <= left->max_count()) {  // The 1 is the parent's delimiter.
+      iter->position += 1 + left->count();  // Re-express the position relative to the merged node.
+      merge_nodes(left, iter->node);
+      iter->node = left;
+      return true;
+    }
+  }
+  if (iter->node->position() < parent->count()) {
+    // Try merging with our right sibling.
+    node_type *right = parent->child(iter->node->position() + 1);
+    if ((1 + iter->node->count() + right->count()) <= right->max_count()) {
+      merge_nodes(iter->node, right);
+      return true;
+    }
+    // Try rebalancing with our right sibling. We don't perform rebalancing if
+    // we deleted the first element from iter->node and the node is not
+    // empty. This is a small optimization for the common pattern of deleting
+    // from the front of the tree.
+    if ((right->count() > kMinNodeValues) &&
+        ((iter->node->count() == 0) ||
+         (iter->position > 0))) {
+      int to_move = (right->count() - iter->node->count()) / 2;
+      to_move = std::min(to_move, right->count() - 1);  // Always leave at least one value in right.
+      iter->node->rebalance_right_to_left(right, to_move);
+      return false;
+    }
+  }
+  if (iter->node->position() > 0) {
+    // Try rebalancing with our left sibling. We don't perform rebalancing if
+    // we deleted the last element from iter->node and the node is not
+    // empty. This is a small optimization for the common pattern of deleting
+    // from the back of the tree.
+    node_type *left = parent->child(iter->node->position() - 1);
+    if ((left->count() > kMinNodeValues) &&
+        ((iter->node->count() == 0) ||
+         (iter->position < iter->node->count()))) {
+      int to_move = (left->count() - iter->node->count()) / 2;
+      to_move = std::min(to_move, left->count() - 1);  // Always leave at least one value in left.
+      left->rebalance_left_to_right(iter->node, to_move);
+      iter->position += to_move;  // Values were prepended, so the position shifts right.
+      return false;
+    }
+  }
+  return false;
+}
+
+template <typename P>
+void btree<P>::try_shrink() {  // If the root holds no values, removes one level from the tree.
+  if (root()->count() > 0) {
+    return;
+  }
+  // Deleted the last item on the root node, shrink the height of the tree.
+  if (root()->leaf()) {
+    assert(size() == 0);
+    delete_leaf_node(root());
+    *mutable_root() = NULL;  // The tree is now empty.
+  } else {
+    node_type *child = root()->child(0);  // With zero values the root has exactly one child.
+    if (child->leaf()) {
+      // The child is a leaf node so simply make it the root node in the tree.
+      child->make_root();
+      delete_internal_root_node();
+      *mutable_root() = child;
+    } else {
+      // The child is an internal node. We want to keep the existing root node
+      // so we move all of the values from the child node into the existing
+      // (empty) root node.
+      child->swap(root());
+      delete_internal_node(child);
+    }
+  }
+}
+
+template <typename P> template <typename IterType>
+inline IterType btree<P>::internal_last(IterType iter) {  // Moves an iterator sitting past a node's last value up to an ancestor slot.
+  while (iter.node && iter.position == iter.node->count()) {
+    iter.position = iter.node->position();
+    iter.node = iter.node->parent();
+    if (iter.node->leaf()) {  // Presumably only the root's parent link can lead to a leaf -- confirm.
+      iter.node = NULL;  // Walked off the top: there is no such value.
+    }
+  }
+  return iter;
+}
+
+template <typename P>
+inline typename btree<P>::iterator  // Iterator to the newly inserted value.
+btree<P>::internal_insert(iterator iter, const value_type &v) {
+  if (!iter.node->leaf()) {
+    // We can't insert on an internal node. Instead, we'll insert after the
+    // previous value which is guaranteed to be on a leaf node.
+    --iter;
+    ++iter.position;
+  }
+  if (iter.node->count() == iter.node->max_count()) {
+    // Make room in the leaf for the new item.
+    if (iter.node->max_count() < kNodeValues) {
+      // Insertion into the root where the root is smaller than the full node
+      // size. Simply grow the size of the root node.
+      assert(iter.node == root());
+      iter.node = new_leaf_root_node(
+          std::min<int>(kNodeValues, 2 * iter.node->max_count()));  // Double the capacity, capped at kNodeValues.
+      iter.node->swap(root());
+      delete_leaf_node(root());
+      *mutable_root() = iter.node;
+    } else {
+      rebalance_or_split(&iter);  // May move iter to a sibling or to the new split node.
+      ++*mutable_size();
+    }
+  } else if (!root()->leaf()) {
+    ++*mutable_size();  // Presumably a leaf root derives size from its count, so no update there -- confirm.
+  }
+  iter.node->insert_value(iter.position, v);
+  return iter;
+}
+
+template <typename P> template <typename IterType>
+inline std::pair<IterType, int> btree<P>::internal_locate(  // Locates key starting from iter; returns (iterator, match status).
+    const key_type &key, IterType iter) const {
+  return internal_locate_type::dispatch(key, *this, iter);  // Presumably picks the plain or compare-to variant -- confirm.
+}
+
+template <typename P> template <typename IterType>
+inline std::pair<IterType, int> btree<P>::internal_locate_plain_compare(  // Descends to the leaf position given by lower_bound at each level.
+    const key_type &key, IterType iter) const {
+  for (;;) {
+    iter.position = iter.node->lower_bound(key, key_comp());
+    if (iter.node->leaf()) {
+      break;
+    }
+    iter.node = iter.node->child(iter.position);
+  }
+  return std::make_pair(iter, 0);  // 0: no match information; callers must test for equality themselves.
+}
+
+template <typename P> template <typename IterType>
+inline std::pair<IterType, int> btree<P>::internal_locate_compare_to(  // Like the plain variant, but stops early on an exact match.
+    const key_type &key, IterType iter) const {
+  for (;;) {
+    int res = iter.node->lower_bound(key, key_comp());  // Here the result carries a position plus an exact-match flag.
+    iter.position = res & kMatchMask;
+    if (res & kExactMatch) {
+      return std::make_pair(iter, static_cast<int>(kExactMatch));  // The match may be on an internal node.
+    }
+    if (iter.node->leaf()) {
+      break;
+    }
+    iter.node = iter.node->child(iter.position);
+  }
+  return std::make_pair(iter, -kExactMatch);  // Reached a leaf without a match: key is definitely absent.
+}
+
+template <typename P> template <typename IterType>
+IterType btree<P>::internal_lower_bound(  // Descends via lower_bound at each level; iter.node is NULL if no position is found.
+    const key_type &key, IterType iter) const {
+  if (iter.node) {
+    for (;;) {
+      iter.position =
+          iter.node->lower_bound(key, key_comp()) & kMatchMask;  // The mask keeps only the position bits.
+      if (iter.node->leaf()) {
+        break;
+      }
+      iter.node = iter.node->child(iter.position);
+    }
+    iter = internal_last(iter);  // Normalize a past-the-end leaf position to an ancestor slot (or NULL).
+  }
+  return iter;
+}
+
+template <typename P> template <typename IterType>
+IterType btree<P>::internal_upper_bound(  // Descends via upper_bound at each level; iter.node is NULL if no position is found.
+    const key_type &key, IterType iter) const {
+  if (iter.node) {
+    for (;;) {
+      iter.position = iter.node->upper_bound(key, key_comp());
+      if (iter.node->leaf()) {
+        break;
+      }
+      iter.node = iter.node->child(iter.position);
+    }
+    iter = internal_last(iter);  // Normalize a past-the-end leaf position to an ancestor slot (or NULL).
+  }
+  return iter;
+}
+
+template <typename P> template <typename IterType>
+IterType btree<P>::internal_find_unique(  // Returns an iterator to key, or IterType(NULL, 0) if it is absent.
+    const key_type &key, IterType iter) const {
+  if (iter.node) {
+    std::pair<IterType, int> res = internal_locate(key, iter);
+    if (res.second == kExactMatch) {
+      return res.first;
+    }
+    if (!res.second) {  // 0 comes from the plain-compare locate: equality must be checked here.
+      iter = internal_last(res.first);
+      if (iter.node && !compare_keys(key, iter.key())) {  // Lower bound and !(key < found): the keys are equal.
+        return iter;
+      }
+    }
+  }
+  return IterType(NULL, 0);
+}
+
+template <typename P> template <typename IterType>
+IterType btree<P>::internal_find_multi(  // Returns the lower-bound position if its key equals key, else IterType(NULL, 0).
+    const key_type &key, IterType iter) const {
+  if (iter.node) {
+    iter = internal_lower_bound(key, iter);
+    if (iter.node) {
+      iter = internal_last(iter);  // internal_lower_bound() already applied this; repeated here as written.
+      if (iter.node && !compare_keys(key, iter.key())) {  // Lower bound and !(key < found): the keys are equal.
+        return iter;
+      }
+    }
+  }
+  return IterType(NULL, 0);
+}
+
+template <typename P>
+void btree<P>::internal_clear(node_type *node) {  // Deletes node and all of its descendants (children first).
+  if (!node->leaf()) {
+    for (int i = 0; i <= node->count(); ++i) {
+      internal_clear(node->child(i));
+    }
+    if (node == root()) {
+      delete_internal_root_node();  // An internal root is freed through its own dedicated routine.
+    } else {
+      delete_internal_node(node);
+    }
+  } else {
+    delete_leaf_node(node);
+  }
+}
+
+template <typename P>
+void btree<P>::internal_dump(  // Writes the subtree's keys to os in order, indented by depth.
+    std::ostream &os, const node_type *node, int level) const {
+  for (int i = 0; i < node->count(); ++i) {
+    if (!node->leaf()) {
+      internal_dump(os, node->child(i), level + 1);  // The left subtree of key i is printed before the key.
+    }
+    for (int j = 0; j < level; ++j) {
+      os << "  ";
+    }
+    os << node->key(i) << " [" << level << "]\n";
+  }
+  if (!node->leaf()) {
+    internal_dump(os, node->child(node->count()), level + 1);  // Finally the rightmost subtree.
+  }
+}
+
+template <typename P>
+int btree<P>::internal_verify(  // Asserts ordering/link invariants below node; returns the number of values there.
+    const node_type *node, const key_type *lo, const key_type *hi) const {
+  assert(node->count() > 0);
+  assert(node->count() <= node->max_count());
+  if (lo) {
+    assert(!compare_keys(node->key(0), *lo));  // The first key is not less than the lower bound.
+  }
+  if (hi) {
+    assert(!compare_keys(*hi, node->key(node->count() - 1)));  // The last key is not greater than the upper bound.
+  }
+  for (int i = 1; i < node->count(); ++i) {
+    assert(!compare_keys(node->key(i), node->key(i - 1)));  // Keys are non-decreasing (equal keys allowed).
+  }
+  int count = node->count();
+  if (!node->leaf()) {
+    for (int i = 0; i <= node->count(); ++i) {
+      assert(node->child(i) != NULL);
+      assert(node->child(i)->parent() == node);
+      assert(node->child(i)->position() == i);
+      count += internal_verify(
+          node->child(i),
+          (i == 0) ? lo : &node->key(i - 1),  // Child i is bounded by the keys on either side of it.
+          (i == node->count()) ? hi : &node->key(i));
+    }
+  }
+  return count;
+}
+
+} // namespace btree
+
+#endif  // UTIL_BTREE_BTREE_H__
diff --git a/include/btree/btree_container.h b/include/btree/btree_container.h
new file mode 100644
index 0000000..fb617ab
--- /dev/null
+++ b/include/btree/btree_container.h
@@ -0,0 +1,349 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UTIL_BTREE_BTREE_CONTAINER_H__
+#define UTIL_BTREE_BTREE_CONTAINER_H__
+
+#include <iosfwd>
+#include <utility>
+
+#include "btree.h"
+
+namespace btree {
+
+// Common base of the btree containers (btree_set, btree_map, btree_multiset
+// and btree_multimap): owns a Tree and forwards the shared operations to it.
+template <typename Tree>
+class btree_container {
+  typedef btree_container<Tree> self_type;
+
+ public:
+  typedef typename Tree::params_type params_type;
+  typedef typename Tree::key_type key_type;
+  typedef typename Tree::value_type value_type;
+  typedef typename Tree::key_compare key_compare;
+  typedef typename Tree::allocator_type allocator_type;
+  typedef typename Tree::pointer pointer;
+  typedef typename Tree::const_pointer const_pointer;
+  typedef typename Tree::reference reference;
+  typedef typename Tree::const_reference const_reference;
+  typedef typename Tree::size_type size_type;
+  typedef typename Tree::difference_type difference_type;
+  typedef typename Tree::iterator iterator;
+  typedef typename Tree::const_iterator const_iterator;
+  typedef typename Tree::reverse_iterator reverse_iterator;
+  typedef typename Tree::const_reverse_iterator const_reverse_iterator;
+
+ public:
+  // Builds the underlying tree from a comparator and an allocator (both required).
+  btree_container(const key_compare &comp, const allocator_type &alloc)
+      : tree_(comp, alloc) {
+  }
+
+  // Copy constructor: copy-constructs the underlying tree from x's tree.
+  btree_container(const self_type &x)
+      : tree_(x.tree_) {
+  }
+
+  // Iterator routines (each forwards to the underlying tree).
+  iterator begin() { return tree_.begin(); }
+  const_iterator begin() const { return tree_.begin(); }
+  iterator end() { return tree_.end(); }
+  const_iterator end() const { return tree_.end(); }
+  reverse_iterator rbegin() { return tree_.rbegin(); }
+  const_reverse_iterator rbegin() const { return tree_.rbegin(); }
+  reverse_iterator rend() { return tree_.rend(); }
+  const_reverse_iterator rend() const { return tree_.rend(); }
+
+  // Lookup routines.
+  iterator lower_bound(const key_type &key) {
+    return tree_.lower_bound(key);
+  }
+  const_iterator lower_bound(const key_type &key) const {
+    return tree_.lower_bound(key);
+  }
+  iterator upper_bound(const key_type &key) {
+    return tree_.upper_bound(key);
+  }
+  const_iterator upper_bound(const key_type &key) const {
+    return tree_.upper_bound(key);
+  }
+  std::pair<iterator,iterator> equal_range(const key_type &key) {
+    return tree_.equal_range(key);
+  }
+  std::pair<const_iterator,const_iterator> equal_range(const key_type &key) const {
+    return tree_.equal_range(key);
+  }
+
+  // Utility routines.
+  void clear() {
+    tree_.clear();
+  }
+  void swap(self_type &x) {
+    tree_.swap(x.tree_);
+  }
+  void dump(std::ostream &os) const {
+    tree_.dump(os);
+  }
+  void verify() const {
+    tree_.verify();
+  }
+
+  // Size and tree-statistics routines.
+  size_type size() const { return tree_.size(); }
+  size_type max_size() const { return tree_.max_size(); }
+  bool empty() const { return tree_.empty(); }
+  size_type height() const { return tree_.height(); }
+  size_type internal_nodes() const { return tree_.internal_nodes(); }
+  size_type leaf_nodes() const { return tree_.leaf_nodes(); }
+  size_type nodes() const { return tree_.nodes(); }
+  size_type bytes_used() const { return tree_.bytes_used(); }
+  static double average_bytes_per_value() {
+    return Tree::average_bytes_per_value();
+  }
+  double fullness() const { return tree_.fullness(); }
+  double overhead() const { return tree_.overhead(); }
+
+  bool operator==(const self_type& x) const {
+    if (size() != x.size()) {  // Different sizes can never compare equal.
+      return false;
+    }
+    // Sizes match, so xi reaches x.end() exactly when i reaches end().
+    for (const_iterator i = begin(), xi = x.begin(); i != end(); ++i, ++xi) {
+      if (*i != *xi) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  bool operator!=(const self_type& other) const {
+    return !operator==(other);
+  }
+
+  // The wrapped tree; derived containers reach it as this->tree_.
+ protected:
+  Tree tree_;
+};
+
+template <typename T>
+inline std::ostream& operator<<(std::ostream &os, const btree_container<T> &b) {
+  b.dump(os);  // Stream insertion is just dump(); the tree decides the format.
+  return os;
+}
+
+// A common base class for btree_set and safe_btree_set: adds the unique-key
+// lookup, insertion and erasure entry points on top of btree_container.
+template <typename Tree>
+class btree_unique_container : public btree_container<Tree> {
+  typedef btree_unique_container<Tree> self_type;
+  typedef btree_container<Tree> super_type;
+
+ public:
+  typedef typename Tree::key_type key_type;
+  typedef typename Tree::value_type value_type;
+  typedef typename Tree::size_type size_type;
+  typedef typename Tree::key_compare key_compare;
+  typedef typename Tree::allocator_type allocator_type;
+  typedef typename Tree::iterator iterator;
+  typedef typename Tree::const_iterator const_iterator;
+
+ public:
+  // Default constructor (comparator and allocator are default-constructed).
+  btree_unique_container(const key_compare &comp = key_compare(),
+                         const allocator_type &alloc = allocator_type())
+      : super_type(comp, alloc) {
+  }
+
+  // Copy constructor.
+  btree_unique_container(const self_type &x)
+      : super_type(x) {
+  }
+
+  // Range constructor: builds the tree, then inserts [b, e) via insert().
+  template <class InputIterator>
+  btree_unique_container(InputIterator b, InputIterator e,
+                         const key_compare &comp = key_compare(),
+                         const allocator_type &alloc = allocator_type())
+      : super_type(comp, alloc) {
+    insert(b, e);
+  }
+
+  // Lookup routines (the *_unique tree entry points).
+  iterator find(const key_type &key) {
+    return this->tree_.find_unique(key);
+  }
+  const_iterator find(const key_type &key) const {
+    return this->tree_.find_unique(key);
+  }
+  size_type count(const key_type &key) const {
+    return this->tree_.count_unique(key);
+  }
+
+  // Insertion routines. The iterator overload passes |position| to the tree.
+  std::pair<iterator,bool> insert(const value_type &x) {
+    return this->tree_.insert_unique(x);
+  }
+  iterator insert(iterator position, const value_type &x) {
+    return this->tree_.insert_unique(position, x);
+  }
+  template <typename InputIterator>
+  void insert(InputIterator b, InputIterator e) {
+    this->tree_.insert_unique(b, e);
+  }
+
+  // Deletion routines. erase(key) returns whatever erase_unique() reports.
+  int erase(const key_type &key) {
+    return this->tree_.erase_unique(key);
+  }
+  // Erase the specified iterator from the btree. The iterator must be valid
+  // (i.e. not equal to end()).  Return an iterator pointing to the node after
+  // the one that was erased (or end() if none exists).
+  iterator erase(const iterator &iter) {
+    return this->tree_.erase(iter);
+  }
+  void erase(const iterator &first, const iterator &last) {
+    this->tree_.erase(first, last);
+  }
+};
+
+// A common base class for btree_map and safe_btree_map: adds operator[] on
+// top of the unique-key container interface.
+template <typename Tree>
+class btree_map_container : public btree_unique_container<Tree> {
+  typedef btree_map_container<Tree> self_type;
+  typedef btree_unique_container<Tree> super_type;
+
+ public:
+  typedef typename Tree::key_type key_type;
+  typedef typename Tree::data_type data_type;
+  typedef typename Tree::value_type value_type;
+  typedef typename Tree::mapped_type mapped_type;
+  typedef typename Tree::key_compare key_compare;
+  typedef typename Tree::allocator_type allocator_type;
+
+ private:
+  // A pointer-like object which only generates its value when
+  // dereferenced. Used by operator[] to avoid constructing an empty data_type
+  // if the key already exists in the map.
+  struct generate_value {
+    generate_value(const key_type &k)
+        : key(k) {
+    }
+    value_type operator*() const {
+      return std::make_pair(key, data_type());
+    }
+    const key_type &key;  // Held by reference: must not outlive the caller's key.
+  };
+
+ public:
+  // Default constructor.
+  btree_map_container(const key_compare &comp = key_compare(),
+                      const allocator_type &alloc = allocator_type())
+      : super_type(comp, alloc) {
+  }
+
+  // Copy constructor.
+  btree_map_container(const self_type &x)
+      : super_type(x) {
+  }
+
+  // Range constructor (the base class performs the insertion of [b, e)).
+  template <class InputIterator>
+  btree_map_container(InputIterator b, InputIterator e,
+                      const key_compare &comp = key_compare(),
+                      const allocator_type &alloc = allocator_type())
+      : super_type(b, e, comp, alloc) {
+  }
+
+  // Insertion routines. operator[] returns the mapped value for |key|.
+  data_type& operator[](const key_type &key) {
+    return this->tree_.insert_unique(key, generate_value(key)).first->second;
+  }
+};
+
+// A common base class for btree_multiset and btree_multimap: adds the
+// *_multi lookup, insertion and erasure entry points to btree_container.
+template <typename Tree>
+class btree_multi_container : public btree_container<Tree> {
+  typedef btree_multi_container<Tree> self_type;
+  typedef btree_container<Tree> super_type;
+
+ public:
+  typedef typename Tree::key_type key_type;
+  typedef typename Tree::value_type value_type;
+  typedef typename Tree::size_type size_type;
+  typedef typename Tree::key_compare key_compare;
+  typedef typename Tree::allocator_type allocator_type;
+  typedef typename Tree::iterator iterator;
+  typedef typename Tree::const_iterator const_iterator;
+
+ public:
+  // Default constructor (comparator and allocator are default-constructed).
+  btree_multi_container(const key_compare &comp = key_compare(),
+                        const allocator_type &alloc = allocator_type())
+      : super_type(comp, alloc) {
+  }
+
+  // Copy constructor.
+  btree_multi_container(const self_type &x)
+      : super_type(x) {
+  }
+
+  // Range constructor: builds the tree, then inserts [b, e) via insert().
+  template <class InputIterator>
+  btree_multi_container(InputIterator b, InputIterator e,
+                        const key_compare &comp = key_compare(),
+                        const allocator_type &alloc = allocator_type())
+      : super_type(comp, alloc) {
+    insert(b, e);
+  }
+
+  // Lookup routines (the *_multi tree entry points).
+  iterator find(const key_type &key) {
+    return this->tree_.find_multi(key);
+  }
+  const_iterator find(const key_type &key) const {
+    return this->tree_.find_multi(key);
+  }
+  size_type count(const key_type &key) const {
+    return this->tree_.count_multi(key);
+  }
+
+  // Insertion routines. Single-value insert returns an iterator only (no bool).
+  iterator insert(const value_type &x) {
+    return this->tree_.insert_multi(x);
+  }
+  iterator insert(iterator position, const value_type &x) {
+    return this->tree_.insert_multi(position, x);
+  }
+  template <typename InputIterator>
+  void insert(InputIterator b, InputIterator e) {
+    this->tree_.insert_multi(b, e);
+  }
+
+  // Deletion routines. erase(key) returns whatever erase_multi() reports.
+  int erase(const key_type &key) {
+    return this->tree_.erase_multi(key);
+  }
+  // Erase the specified iterator from the btree. The iterator must be valid
+  // (i.e. not equal to end()).  Return an iterator pointing to the node after
+  // the one that was erased (or end() if none exists).
+  iterator erase(const iterator &iter) {
+    return this->tree_.erase(iter);
+  }
+  void erase(const iterator &first, const iterator &last) {
+    this->tree_.erase(first, last);
+  }
+};
+
+} // namespace btree
+
+#endif  // UTIL_BTREE_BTREE_CONTAINER_H__
diff --git a/include/btree/btree_map.h b/include/btree/btree_map.h
new file mode 100644
index 0000000..b83489f
--- /dev/null
+++ b/include/btree/btree_map.h
@@ -0,0 +1,130 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// A btree_map<> implements the STL unique sorted associative container
+// interface and the pair associative container interface (a.k.a map<>) using a
+// btree. A btree_multimap<> implements the STL multiple sorted associative
+// container interface and the pair associative container interface (a.k.a
+// multimap<>) using a btree. See btree.h for details of the btree
+// implementation and caveats.
+
+#ifndef UTIL_BTREE_BTREE_MAP_H__
+#define UTIL_BTREE_BTREE_MAP_H__
+
+#include <algorithm>
+#include <functional>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "btree.h"
+#include "btree_container.h"
+
+namespace btree {
+
+// The btree_map class is needed mainly for its constructors; everything else
+// is inherited from btree_map_container instantiated over btree_map_params.
+template <typename Key, typename Value,
+          typename Compare = std::less<Key>,
+          typename Alloc = std::allocator<std::pair<const Key, Value> >,
+          int TargetNodeSize = 256>  // Forwarded to btree_map_params.
+class btree_map : public btree_map_container<
+  btree<btree_map_params<Key, Value, Compare, Alloc, TargetNodeSize> > > {
+
+  typedef btree_map<Key, Value, Compare, Alloc, TargetNodeSize> self_type;
+  typedef btree_map_params<
+    Key, Value, Compare, Alloc, TargetNodeSize> params_type;
+  typedef btree<params_type> btree_type;
+  typedef btree_map_container<btree_type> super_type;
+
+ public:
+  typedef typename btree_type::key_compare key_compare;
+  typedef typename btree_type::allocator_type allocator_type;
+
+ public:
+  // Default constructor.
+  btree_map(const key_compare &comp = key_compare(),
+            const allocator_type &alloc = allocator_type())
+      : super_type(comp, alloc) {
+  }
+
+  // Copy constructor.
+  btree_map(const self_type &x)
+      : super_type(x) {
+  }
+
+  // Range constructor: [b, e) is handed to the base class for insertion.
+  template <class InputIterator>
+  btree_map(InputIterator b, InputIterator e,
+            const key_compare &comp = key_compare(),
+            const allocator_type &alloc = allocator_type())
+      : super_type(b, e, comp, alloc) {
+  }
+};
+
+template <typename K, typename V, typename C, typename A, int N>
+inline void swap(btree_map<K, V, C, A, N> &x,
+                 btree_map<K, V, C, A, N> &y) {
+  x.swap(y);  // Free-function form of the member swap.
+}
+
+// The btree_multimap class is needed mainly for its constructors; everything
+// else is inherited from btree_multi_container over btree_map_params.
+template <typename Key, typename Value,
+          typename Compare = std::less<Key>,
+          typename Alloc = std::allocator<std::pair<const Key, Value> >,
+          int TargetNodeSize = 256>  // Forwarded to btree_map_params.
+class btree_multimap : public btree_multi_container<
+  btree<btree_map_params<Key, Value, Compare, Alloc, TargetNodeSize> > > {
+
+  typedef btree_multimap<Key, Value, Compare, Alloc, TargetNodeSize> self_type;
+  typedef btree_map_params<
+    Key, Value, Compare, Alloc, TargetNodeSize> params_type;
+  typedef btree<params_type> btree_type;
+  typedef btree_multi_container<btree_type> super_type;
+
+ public:
+  typedef typename btree_type::key_compare key_compare;
+  typedef typename btree_type::allocator_type allocator_type;
+  typedef typename btree_type::data_type data_type;
+  typedef typename btree_type::mapped_type mapped_type;
+
+ public:
+  // Default constructor.
+  btree_multimap(const key_compare &comp = key_compare(),
+                 const allocator_type &alloc = allocator_type())
+      : super_type(comp, alloc) {
+  }
+
+  // Copy constructor.
+  btree_multimap(const self_type &x)
+      : super_type(x) {
+  }
+
+  // Range constructor: [b, e) is handed to the base class for insertion.
+  template <class InputIterator>
+  btree_multimap(InputIterator b, InputIterator e,
+                 const key_compare &comp = key_compare(),
+                 const allocator_type &alloc = allocator_type())
+      : super_type(b, e, comp, alloc) {
+  }
+};
+
+template <typename K, typename V, typename C, typename A, int N>
+inline void swap(btree_multimap<K, V, C, A, N> &x,
+                 btree_multimap<K, V, C, A, N> &y) {
+  x.swap(y);  // Free-function form of the member swap.
+}
+
+} // namespace btree
+
+#endif  // UTIL_BTREE_BTREE_MAP_H__
diff --git a/include/kseq.h b/include/kseq.h
new file mode 100644
index 0000000..b2238d1
--- /dev/null
+++ b/include/kseq.h
@@ -0,0 +1,235 @@
+/* The MIT License
+
+   Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor at live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/* Last Modified: 05MAR2012 */
+
+#ifndef AC_KSEQ_H
+#define AC_KSEQ_H
+
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+
+#define KS_SEP_SPACE 0 // any isspace() byte: ' ', \t, \n, \v, \f, \r
+#define KS_SEP_TAB   1 // isspace() && !' '
+#define KS_SEP_LINE  2 // line separator: "\n" (Unix) or "\r\n" (Windows)
+#define KS_SEP_MAX   2 // delimiters above this value are matched as a literal byte
+
+#define __KS_TYPE(type_t) /* declares kstream_t, a buffered reader over a type_t handle */ \
+	typedef struct __kstream_t {				\
+		unsigned char *buf; /* read buffer, allocated by ks_init() */ \
+		int begin, end, is_eof; /* unread bytes are buf[begin..end) */ \
+		type_t f; /* handle passed to the read function */ \
+	} kstream_t;
+
+#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) // EOF flagged and buffer fully consumed
+#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) // resets these fields only; the handle is untouched
+
+#define __KS_BASIC(type_t, __bufsize)								\
+	static inline kstream_t *ks_init(type_t f)						\
+	{ /* allocates a zeroed stream around f plus a __bufsize-byte buffer */ \
+		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t));	\
+		ks->f = f; /* NOTE(review): the calloc result is used unchecked */ \
+		ks->buf = (unsigned char*)malloc(__bufsize); /* likewise unchecked */ \
+		return ks;													\
+	}																\
+	static inline void ks_destroy(kstream_t *ks)					\
+	{ /* frees the buffer and the stream; ks->f is not closed here */ \
+		if (ks) {													\
+			free(ks->buf);											\
+			free(ks);												\
+		}															\
+	}
+
+#define __KS_GETC(__read, __bufsize)						\
+	static inline int ks_getc(kstream_t *ks)				\
+	{ /* returns the next byte (0..255), or -1 at end of stream or on a failed read */ \
+		if (ks->is_eof && ks->begin >= ks->end) return -1;	\
+		if (ks->begin >= ks->end) { /* buffer drained: refill it */ \
+			ks->begin = 0;									\
+			ks->end = __read(ks->f, ks->buf, __bufsize);	\
+			if (ks->end <= 0) { ks->end = 0; ks->is_eof = 1; return -1; } /* a negative count is a failed read, not data */ \
+		}													\
+		return (int)ks->buf[ks->begin++];					\
+	}
+
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+	size_t l, m; /* l: bytes in use (excluding the NUL), m: bytes allocated */
+	char *s; /* buffer grown with realloc(); may be NULL before first use */
+} kstring_t;
+#endif
+
+#ifndef kroundup32 /* rounds x up, in place, to a power of two; x is evaluated repeatedly and the shifts stop at 16 bits */
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#define __KS_GETUNTIL(__read, __bufsize)								\
+	static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
+	{ /* copies bytes up to (not including) the next delimiter into str; returns str->l, or -1 when nothing is left */ \
+		int gotany = 0;													\
+		if (dret) *dret = 0;											\
+		str->l = append? str->l : 0;									\
+		for (;;) {														\
+			int i;														\
+			if (ks->begin >= ks->end) {									\
+				if (!ks->is_eof) {										\
+					ks->begin = 0;										\
+					ks->end = __read(ks->f, ks->buf, __bufsize);		\
+					if (ks->end <= 0) { ks->end = 0; ks->is_eof = 1; break; } /* a failed read (negative count) ends the stream like EOF instead of looping */ \
+				} else break;											\
+			}															\
+			if (delimiter == KS_SEP_LINE) { \
+				for (i = ks->begin; i < ks->end; ++i) \
+					if (ks->buf[i] == '\n') break; \
+			} else if (delimiter > KS_SEP_MAX) {						\
+				for (i = ks->begin; i < ks->end; ++i)					\
+					if (ks->buf[i] == delimiter) break;					\
+			} else if (delimiter == KS_SEP_SPACE) {						\
+				for (i = ks->begin; i < ks->end; ++i)					\
+					if (isspace(ks->buf[i])) break;						\
+			} else if (delimiter == KS_SEP_TAB) {						\
+				for (i = ks->begin; i < ks->end; ++i)					\
+					if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
+			} else i = 0; /* never come to here! */						\
+			if (str->m - str->l < (size_t)(i - ks->begin + 1)) { /* grow so the chunk plus a NUL fits */ \
+				str->m = str->l + (i - ks->begin) + 1;					\
+				kroundup32(str->m);										\
+				str->s = (char*)realloc(str->s, str->m);				\
+			}															\
+			gotany = 1;													\
+			memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
+			str->l = str->l + (i - ks->begin);							\
+			ks->begin = i + 1;											\
+			if (i < ks->end) { /* stopped on a delimiter, not at the end of the buffer */ \
+				if (dret) *dret = ks->buf[i];							\
+				break;													\
+			}															\
+		}																\
+		if (!gotany && ks_eof(ks)) return -1;							\
+		if (str->s == 0) {												\
+			str->m = 1;													\
+			str->s = (char*)calloc(1, 1);								\
+		} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
+		str->s[str->l] = '\0';											\
+		return str->l;													\
+	} \
+	static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
+	{ return ks_getuntil2(ks, delimiter, str, dret, 0); }
+
+#define KSTREAM_INIT(type_t, __read, __bufsize) /* emits kstream_t and every ks_* function */ \
+	__KS_TYPE(type_t)							\
+	__KS_BASIC(type_t, __bufsize)				\
+	__KS_GETC(__read, __bufsize)				\
+	__KS_GETUNTIL(__read, __bufsize)
+
+#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0) // clears parser and buffer state only
+
+#define __KSEQ_BASIC(SCOPE, type_t)										\
+	SCOPE kseq_t *kseq_init(type_t fd)									\
+	{ /* zeroed record (all four strings empty) wrapping a new stream on fd */ \
+		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t));					\
+		s->f = ks_init(fd); /* NOTE(review): the calloc result is used unchecked */ \
+		return s;														\
+	}																	\
+	SCOPE void kseq_destroy(kseq_t *ks)									\
+	{ /* frees the four string buffers, the stream and the record */ \
+		if (!ks) return;												\
+		free(ks->name.s); free(ks->comment.s); free(ks->seq.s);	free(ks->qual.s); \
+		ks_destroy(ks->f);												\
+		free(ks);														\
+	}
+
+/* Return value of kseq_read():
+   >=0  length of the sequence (normal)
+   -1   end-of-file (no further header character or name could be read)
+   -2   quality string missing, or its length differs from the sequence
+ */
+#define __KSEQ_READ(SCOPE) \
+	SCOPE int kseq_read(kseq_t *seq) \
+	{ \
+		int c; \
+		kstream_t *ks = seq->f; \
+		if (seq->last_char == 0) { /* then jump to the next header line */ \
+			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
+			if (c == -1) return -1; /* end of file */ \
+			seq->last_char = c; \
+		} /* else: the first header char has been read in the previous call */ \
+		seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
+		if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
+		if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
+		if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
+			seq->seq.m = 256; \
+			seq->seq.s = (char*)malloc(seq->seq.m); \
+		} \
+		while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
+			if (c == '\n') continue; /* skip empty lines */ \
+			seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
+			ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
+		} \
+		if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */	\
+		if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
+			seq->seq.m = seq->seq.l + 2; \
+			kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
+			seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
+		} \
+		seq->seq.s[seq->seq.l] = 0;	/* null terminated string */ \
+		if (c != '+') return seq->seq.l; /* FASTA */ \
+		if (seq->qual.m < seq->seq.m) {	/* allocate memory for qual in case insufficient */ \
+			seq->qual.m = seq->seq.m; \
+			seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
+		} \
+		while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
+		if (c == -1) return -2; /* error: no quality string */ \
+		while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); /* append lines until qual is as long as seq */ \
+		seq->last_char = 0;	/* we have not come to the next header line */ \
+		if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
+		return seq->seq.l; \
+	}
+
+#define __KSEQ_TYPE(type_t) /* declares kseq_t, one parsed FASTA/FASTQ record plus its stream */ \
+	typedef struct {							\
+		kstring_t name, comment, seq, qual; /* buffers are reused across kseq_read() calls */ \
+		int last_char; /* header char ('>' or '@') already consumed by the previous read, else 0 */ \
+		kstream_t *f; /* created by kseq_init(), released by kseq_destroy() */ \
+	} kseq_t;
+
+#define KSEQ_INIT2(SCOPE, type_t, __read) /* emits the whole reader; SCOPE prefixes the kseq_* definitions */ \
+	KSTREAM_INIT(type_t, __read, 16384) /* 16384 is the stream buffer size in bytes */ \
+	__KSEQ_TYPE(type_t)							\
+	__KSEQ_BASIC(SCOPE, type_t)					\
+	__KSEQ_READ(SCOPE)
+/* KSEQ_INIT is KSEQ_INIT2 with file-local (static) kseq_* functions. */
+#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
+
+#define KSEQ_DECLARE(type_t) /* types and prototypes only, for kseq_* functions defined elsewhere */ \
+	__KS_TYPE(type_t) \
+	__KSEQ_TYPE(type_t) \
+	extern kseq_t *kseq_init(type_t fd); \
+	void kseq_destroy(kseq_t *ks); \
+	int kseq_read(kseq_t *seq);
+
+#endif
diff --git a/include/macros.h b/include/macros.h
new file mode 100644
index 0000000..8a0853d
--- /dev/null
+++ b/include/macros.h
@@ -0,0 +1,59 @@
+/*		 
+ * Sux: Succinct data structures
+ *
+ * Copyright (C) 2007-2013 Sebastiano Vigna 
+ *
+ *  This library is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU Lesser General Public License as published by the Free
+ *  Software Foundation; either version 3 of the License, or (at your option)
+ *  any later version.
+ *
+ *  This library is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef ranksel_macros_h
+#define ranksel_macros_h
+
+#define ONES_STEP_4 ( 0x1111111111111111ULL )
+#define ONES_STEP_8 ( 0x0101010101010101ULL )
+#define ONES_STEP_9 ( 1ULL << 0 | 1ULL << 9 | 1ULL << 18 | 1ULL << 27 | 1ULL << 36 | 1ULL << 45 | 1ULL << 54 )
+#define ONES_STEP_16 ( 1ULL << 0 | 1ULL << 16 | 1ULL << 32 | 1ULL << 48 )
+#define MSBS_STEP_4 ( 0x8ULL * ONES_STEP_4 )
+#define MSBS_STEP_8 ( 0x80ULL * ONES_STEP_8 )
+#define MSBS_STEP_9 ( 0x100ULL * ONES_STEP_9 )
+#define MSBS_STEP_16 ( 0x8000ULL * ONES_STEP_16 )
+#define INCR_STEP_8 ( 0x80ULL << 56 | 0x40ULL << 48 | 0x20ULL << 40 | 0x10ULL << 32 | 0x8ULL << 24 | 0x4ULL << 16 | 0x2ULL << 8 | 0x1 )
+/* ONES_STEP_k sets the lowest bit of every k-bit lane of a 64-bit word; MSBS_STEP_k sets the highest bit of every lane. */
+#define ONES_STEP_32 ( 0x0000000100000001ULL )
+#define MSBS_STEP_32 ( 0x8000000080000000ULL )
+/* Lane-wise comparisons: the result is masked with MSBS_STEP_k and shifted down, so each lane yields 0 or 1 in its lowest bit. */
+#define COMPARE_STEP_8(x,y) ( ( ( ( ( (x) | MSBS_STEP_8 ) - ( (y) & ~MSBS_STEP_8 ) ) ^ (x) ^ ~(y) ) & MSBS_STEP_8 ) >> 7 )
+#define LEQ_STEP_8(x,y) ( ( ( ( ( (y) | MSBS_STEP_8 ) - ( (x) & ~MSBS_STEP_8 ) ) ^ (x) ^ (y) ) & MSBS_STEP_8 ) >> 7 )
+/* Every use of x and y is parenthesized so expression arguments (e.g. b + 1) keep their value under ~, ^, | and &. */
+#define UCOMPARE_STEP_9(x,y) ( ( ( ( ( ( (x) | MSBS_STEP_9 ) - ( (y) & ~MSBS_STEP_9 ) ) | ( (x) ^ (y) ) ) ^ ( (x) | ~(y) ) ) & MSBS_STEP_9 ) >> 8 )
+#define UCOMPARE_STEP_16(x,y) ( ( ( ( ( ( (x) | MSBS_STEP_16 ) - ( (y) & ~MSBS_STEP_16 ) ) | ( (x) ^ (y) ) ) ^ ( (x) | ~(y) ) ) & MSBS_STEP_16 ) >> 15 )
+#define ULEQ_STEP_9(x,y) ( ( ( ( ( ( (y) | MSBS_STEP_9 ) - ( (x) & ~MSBS_STEP_9 ) ) | ( (x) ^ (y) ) ) ^ ( (x) & ~(y) ) ) & MSBS_STEP_9 ) >> 8 )
+#define ULEQ_STEP_16(x,y) ( ( ( ( ( ( (y) | MSBS_STEP_16 ) - ( (x) & ~MSBS_STEP_16 ) ) | ( (x) ^ (y) ) ) ^ ( (x) & ~(y) ) ) & MSBS_STEP_16 ) >> 15 )
+#define ZCOMPARE_STEP_8(x) ( ( ( (x) | ( ( (x) | MSBS_STEP_8 ) - ONES_STEP_8 ) ) & MSBS_STEP_8 ) >> 7 )
+/* The EASY_ variants subtract x without masking its lane MSBs first; NOTE(review): presumably they require those bits clear. */
+#define EASY_LEQ_STEP_8(x,y) ( ( ( ( ( (y) | MSBS_STEP_8 ) - (x) ) ) & MSBS_STEP_8 ) >> 7 )
+#define EASY_LEQ_STEP_8_MSBS(x,y) ( ( ( ( (y) | MSBS_STEP_8 ) - (x) ) ) & MSBS_STEP_8 )
+
+__inline static int ceil_log2( const uint64_t x ) {
+	return x > 2 ? 64 - __builtin_clzll( x - 1 ) : (int)( x - 1 ); /* 1 -> 0, 2 -> 1; larger x counts the bits of x - 1 */
+}
+
+__inline static int msb( const uint64_t x ) {
+	/* Index of the highest set bit of x; -1 when x is zero (clz of 0 is undefined). */
+	return x != 0 ? 63 - __builtin_clzll( x ) : -1;
+}
+
+
+#endif
diff --git a/include/rank9b.h b/include/rank9b.h
new file mode 100644
index 0000000..080d69a
--- /dev/null
+++ b/include/rank9b.h
@@ -0,0 +1,42 @@
+/*		 
+ * Sux: Succinct data structures
+ *
+ * Copyright (C) 2007-2013 Sebastiano Vigna 
+ *
+ *  This library is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU Lesser General Public License as published by the Free
+ *  Software Foundation; either version 3 of the License, or (at your option)
+ *  any later version.
+ *
+ *  This library is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef rank9b_h
+#define rank9b_h
+#include <stdint.h>
+#include "macros.h"
+
+class rank9b {  // Declaration only; the member functions are defined outside this header.
+private:
+	const uint64_t *bits;  // Bit vector handed to the constructor; never written through this pointer.
+	uint64_t *counts, *inventory;  // NOTE(review): presumably owned and freed by ~rank9b() - confirm in the .cpp.
+	uint64_t num_words, num_counts, inventory_size, ones_per_inventory, log2_ones_per_inventory, num_ones;
+
+public:
+	rank9b();
+	rank9b( const uint64_t * const bits, const uint64_t num_bits );
+	~rank9b();  // NOTE(review): no copy constructor/assignment is declared despite raw pointer members.
+	uint64_t rank( const uint64_t pos );
+	// Just for analysis purposes
+	void print_counts();
+	uint64_t bit_count();
+};
+
+#endif
diff --git a/include/stringpiece.h b/include/stringpiece.h
new file mode 100644
index 0000000..223e701
--- /dev/null
+++ b/include/stringpiece.h
@@ -0,0 +1,181 @@
+// Copyright 2001-2010 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// A string-like object that points to a sized piece of memory.
+//
+// Functions or methods may use const StringPiece& parameters to accept either
+// a "const char*" or a "string" value that will be implicitly converted to
+// a StringPiece.  The implicit conversion means that it is often appropriate
+// to include this .h file in other files rather than forward-declaring
+// StringPiece as would be appropriate for most other Google classes.
+//
+// Systematic usage of StringPiece is encouraged as it will reduce unnecessary
+// conversions from "const char*" to "string" and back again.
+//
+//
+// Arghh!  I wish C++ literals were "string".
+
+#ifndef STRINGS_STRINGPIECE_H__
+#define STRINGS_STRINGPIECE_H__
+
+#include <string.h>
+#include <algorithm>
+#include <cstddef>
+#include <iosfwd>
+#include <string>
+
+class StringPiece {  // A (pointer, length) pair referring to memory owned elsewhere; no destructor, so nothing is freed
+ private:
+  const char*   ptr_;
+  int           length_;  // byte count; int-typed, so lengths above INT_MAX are not representable
+
+ public:
+  // We provide non-explicit singleton constructors so users can pass
+  // in a "const char*" or a "string" wherever a "StringPiece" is
+  // expected.
+  StringPiece() : ptr_(NULL), length_(0) { }
+  StringPiece(const char* str)
+    : ptr_(str), length_((str == NULL) ? 0 : static_cast<int>(strlen(str))) { }
+  StringPiece(const std::string& str)
+    : ptr_(str.data()), length_(static_cast<int>(str.size())) { }
+  StringPiece(const char* offset, int len) : ptr_(offset), length_(len) { }
+
+  // data() may return a pointer to a buffer with embedded NULs, and the
+  // returned buffer may or may not be null terminated.  Therefore it is
+  // typically a mistake to pass data() to a routine that expects a NUL
+  // terminated string.
+  const char* data() const { return ptr_; }
+  int size() const { return length_; }
+  int length() const { return length_; }
+  bool empty() const { return length_ == 0; }
+
+  void clear() { ptr_ = NULL; length_ = 0; }
+  void set(const char* data, int len) { ptr_ = data; length_ = len; }
+  void set(const char* str) {  // NULL is accepted and produces an empty piece
+    ptr_ = str;
+    if (str != NULL)
+      length_ = static_cast<int>(strlen(str));
+    else
+      length_ = 0;
+  }
+  void set(const void* data, int len) {
+    ptr_ = reinterpret_cast<const char*>(data);
+    length_ = len;
+  }
+
+  char operator[](int i) const { return ptr_[i]; }  // no bounds check
+
+  void remove_prefix(int n) {  // drops the first n bytes; n is not range-checked
+    ptr_ += n;
+    length_ -= n;
+  }
+
+  void remove_suffix(int n) {  // drops the last n bytes; n is not range-checked
+    length_ -= n;
+  }
+
+  int compare(const StringPiece& x) const {  // memcmp over the common prefix; on a tie the shorter piece orders first
+    int r = memcmp(ptr_, x.ptr_, std::min(length_, x.length_));
+    if (r == 0) {
+      if (length_ < x.length_) r = -1;
+      else if (length_ > x.length_) r = +1;
+    }
+    return r;
+  }
+
+  std::string as_string() const {
+    return std::string(data(), size());
+  }
+  // We also define ToString() here, since many other string-like
+  // interfaces name the routine that converts to a C++ string
+  // "ToString", and it's confusing to have the method that does that
+  // for a StringPiece be called "as_string()".  We also leave the
+  // "as_string()" method defined here for existing code.
+  std::string ToString() const {
+    return std::string(data(), size());
+  }
+
+  void CopyToString(std::string* target) const;
+  void AppendToString(std::string* target) const;
+
+  // Does "this" start with "x"?
+  bool starts_with(const StringPiece& x) const {
+    return ((length_ >= x.length_) &&
+            (memcmp(ptr_, x.ptr_, x.length_) == 0));
+  }
+
+  // Does "this" end with "x"?
+  bool ends_with(const StringPiece& x) const {
+    return ((length_ >= x.length_) &&
+            (memcmp(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0));
+  }
+
+  // standard STL container boilerplate
+  typedef char value_type;
+  typedef const char* pointer;
+  typedef const char& reference;
+  typedef const char& const_reference;
+  typedef size_t size_type;
+  typedef ptrdiff_t difference_type;
+  static const size_type npos;
+  typedef const char* const_iterator;
+  typedef const char* iterator;
+  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;  // NOTE(review): std::reverse_iterator is declared in <iterator>, which this header does not include directly
+  typedef std::reverse_iterator<iterator> reverse_iterator;
+  iterator begin() const { return ptr_; }
+  iterator end() const { return ptr_ + length_; }
+  const_reverse_iterator rbegin() const {
+    return const_reverse_iterator(ptr_ + length_);
+  }
+  const_reverse_iterator rend() const {
+    return const_reverse_iterator(ptr_);
+  }
+  // The STL says to return size_type, but Google says to return int
+  int max_size() const { return length_; }
+  int capacity() const { return length_; }
+
+  int copy(char* buf, size_type n, size_type pos = 0) const;
+
+  bool contains(StringPiece s) const;
+
+  int find(const StringPiece& s, size_type pos = 0) const;
+  int find(char c, size_type pos = 0) const;
+  int rfind(const StringPiece& s, size_type pos = npos) const;
+  int rfind(char c, size_type pos = npos) const;
+
+  StringPiece substr(size_type pos, size_type n = npos) const;
+  
+  static bool _equal(const StringPiece&, const StringPiece&);  // backs operator==; defined outside this header
+};
+
+inline bool operator==(const StringPiece& x, const StringPiece& y) {
+  return StringPiece::_equal(x, y);  // the comparison itself lives in the out-of-line static helper
+}
+
+inline bool operator!=(const StringPiece& x, const StringPiece& y) {
+  return !(x == y);  // exact negation of operator==
+}
+
+inline bool operator<(const StringPiece& x, const StringPiece& y) {  // byte-wise order; on a shared prefix the shorter piece sorts first
+  const int common = std::min(x.size(), y.size());
+  const int order = memcmp(x.data(), y.data(), common);
+  return order != 0 ? order < 0 : x.size() < y.size();
+}
+
+inline bool operator>(const StringPiece& x, const StringPiece& y) {
+  return y < x;  // operator< with the operands swapped
+}
+
+inline bool operator<=(const StringPiece& x, const StringPiece& y) {
+  return !(x > y);  // "not greater than"
+}
+
+inline bool operator>=(const StringPiece& x, const StringPiece& y) {
+  return !(x < y);  // "not less than"
+}
+
+// allow StringPiece to be logged
+extern std::ostream& operator<<(std::ostream& o, const StringPiece& piece);
+
+#endif  // STRINGS_STRINGPIECE_H__
diff --git a/include/xxhash.h b/include/xxhash.h
new file mode 100644
index 0000000..c60aa61
--- /dev/null
+++ b/include/xxhash.h
@@ -0,0 +1,192 @@
+/*
+   xxHash - Extremely Fast Hash algorithm
+   Header File
+   Copyright (C) 2012-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - xxHash source repository : https://github.com/Cyan4973/xxHash
+*/
+
+/* Notice extracted from xxHash homepage :
+
+xxHash is an extremely fast Hash algorithm, running at RAM speed limits.
+It also successfully passes all tests from the SMHasher suite.
+
+Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)
+
+Name            Speed       Q.Score   Author
+xxHash          5.4 GB/s     10
+CrapWow         3.2 GB/s      2       Andrew
+MurmurHash 3a   2.7 GB/s     10       Austin Appleby
+SpookyHash      2.0 GB/s     10       Bob Jenkins
+SBox            1.4 GB/s      9       Bret Mulvey
+Lookup3         1.2 GB/s      9       Bob Jenkins
+SuperFastHash   1.2 GB/s      1       Paul Hsieh
+CityHash64      1.05 GB/s    10       Pike & Alakuijala
+FNV             0.55 GB/s     5       Fowler, Noll, Vo
+CRC32           0.43 GB/s     9
+MD5-32          0.33 GB/s    10       Ronald L. Rivest
+SHA1-32         0.28 GB/s    10
+
+Q.Score is a measure of quality of the hash function.
+It depends on successfully passing SMHasher test set.
+10 is a perfect score.
+
+A 64-bits version, named XXH64, is available since r35.
+It offers much better speed, but for 64-bits applications only.
+Name     Speed on 64 bits    Speed on 32 bits
+XXH64       13.8 GB/s            1.9 GB/s
+XXH32        6.8 GB/s            6.0 GB/s
+*/
+
+#pragma once
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/*****************************
+*  Definitions
+*****************************/
+#include <stddef.h>   /* size_t */
+typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;   /* status returned by the XXHnn_freeState / _reset / _update functions declared below */
+
+
+/*****************************
+*  Namespace Emulation
+*****************************/
+/* Motivations :
+
+If you need to include xxHash in your library
+but do not want xxHash symbols to appear in your library's interface
+(to avoid name collisions if another library also includes xxHash),
+
+you can define XXH_NAMESPACE, which automatically prefixes every xxHash symbol
+with the value of XXH_NAMESPACE (so do not leave it empty, and do not use a numeric value).
+
+Note that no change is required within the calling program :
+it can still call xxHash functions using their regular name.
+They will be automatically translated by this header.
+*/
+#ifdef XXH_NAMESPACE   /* when defined, every public xxHash name below is renamed to <XXH_NAMESPACE><name> */
+#  define XXH_CAT(A,B) A##B
+#  define XXH_NAME2(A,B) XXH_CAT(A,B)   /* extra level so that XXH_NAMESPACE is macro-expanded before ## pastes the tokens */
+#  define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
+#  define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
+#  define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
+#  define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
+#  define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
+#  define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
+#  define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
+#  define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
+#  define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
+#  define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
+#  define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
+#  define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
+#endif   /* XXH_NAMESPACE */
+
+
+/*****************************
+*  Simple Hash Functions
+*****************************/
+
+unsigned int       XXH32 (const void* input, size_t length, unsigned seed);
+unsigned long long XXH64 (const void* input, size_t length, unsigned long long seed);
+
+/*
+XXH32() :
+    Calculate the 32-bit hash of the sequence of "length" bytes stored at memory address "input".
+    The memory between input & input+length must be valid (allocated and read-accessible).
+    "seed" can be used to alter the result predictably.
+    This function successfully passes all SMHasher tests.
+    Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s
+XXH64() :
+    Calculate the 64-bit hash of the sequence of "length" bytes stored at memory address "input".
+    Faster on 64-bit systems. Slower on 32-bit systems.
+*/
+
+
+
+/*****************************
+*  Advanced Hash Functions
+*****************************/
+typedef struct { long long ll[ 6]; } XXH32_state_t;   /* 6 long long words of storage; presumably stands in for the real internal state defined in the .c file -- confirm */
+typedef struct { long long ll[11]; } XXH64_state_t;   /* 11 long long words of storage; same remark as for XXH32_state_t */
+
+/*
+These structures allow static allocation of XXH states.
+States must then be initialized using XXHnn_reset() before first use.
+
+If you prefer dynamic allocation, please refer to functions below.
+*/
+
+XXH32_state_t* XXH32_createState(void);
+XXH_errorcode  XXH32_freeState(XXH32_state_t* statePtr);
+
+XXH64_state_t* XXH64_createState(void);
+XXH_errorcode  XXH64_freeState(XXH64_state_t* statePtr);
+
+/*
+These functions create and release memory for XXH state.
+States must then be initialized using XXHnn_reset() before first use.
+*/
+
+
+XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, unsigned seed);
+XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
+unsigned int  XXH32_digest (const XXH32_state_t* statePtr);
+
+XXH_errorcode      XXH64_reset  (XXH64_state_t* statePtr, unsigned long long seed);
+XXH_errorcode      XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
+unsigned long long XXH64_digest (const XXH64_state_t* statePtr);
+
+/*
+These functions calculate the xxHash of an input provided in multiple smaller packets,
+as opposed to an input provided as a single block.
+
+XXH state space must first be allocated, using either static or dynamic method provided above.
+
+Start a new hash by initializing state with a seed, using XXHnn_reset().
+
+Then, feed the hash state by calling XXHnn_update() as many times as necessary.
+Obviously, input must be valid, meaning allocated and read accessible.
+The function returns an error code, with 0 meaning OK, and any other value meaning there is an error.
+
+Finally, you can produce a hash anytime, by using XXHnn_digest().
+This function returns the final nn-bits hash.
+You can nonetheless continue feeding the hash state with more input,
+and therefore get some new hashes, by calling again XXHnn_digest().
+
+When you are done, don't forget to free XXH state space, using typically XXHnn_freeState().
+*/
+
+
+#if defined (__cplusplus)
+}
+#endif
diff --git a/scripts/compile.sh b/scripts/compile.sh
new file mode 100755
index 0000000..96ba356
--- /dev/null
+++ b/scripts/compile.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+set -e
+# Usage: compile.sh -b <branch> -v <version> [-f <cxxflags>] [--no-native]
+# from http://stackoverflow.com/questions/192249/how-do-i-parse-command-line-arguments-in-bash
+no_native_arch=false
+cxxflags=""
+
+while [[ $# -gt 0 ]]  # numeric test, so a trailing flag such as --no-native is processed too
+do
+    key="$1"
+
+    case $key in
+        -b|--branch)
+            branch="$2"
+            shift # past argument
+            ;;
+        -v|--version)
+            version="$2"
+            shift # past argument
+            ;;
+        -f|--cxxflags)
+            cxxflags="$2"
+            shift # past argument
+            ;;
+        --no-native)
+            no_native_arch=true
+            ;;
+        *)
+            # unknown option
+            ;;
+    esac
+    shift || true # past argument or value; tolerate a value-taking flag given last without its value
+done
+: "${branch:?missing -b/--branch}" "${version:?missing -v/--version}"  # both are needed below; stop early otherwise
+echo "Building rapmap [branch = ${branch}]. Tagging version as ${version}"
+if [ "$no_native_arch" = true ] ; then
+    echo "Disabling -march=native"
+fi
+
+if [[ -n $cxxflags ]] ; then  # report the flags only when some were actually passed
+    echo "Passed CXXFLAGS ${cxxflags}"
+fi
+
+# Activate Holy Build Box environment.
+source /hbb_exe/activate
+
+set -x
+
+# Install things we need
+yum install -y --quiet wget
+wget http://download.fedoraproject.org/pub/epel/5/x86_64/epel-release-5-4.noarch.rpm
+rpm -i --quiet epel-release-5-4.noarch.rpm
+#yum install -y --quiet git
+#yum install -y --quiet xz-devel.x86_64
+#yum install -y --quiet bzip2-devel.x86_64
+yum install -y --quiet unzip
+
+curl -k -L "https://github.com/COMBINE-lab/RapMap/archive/${branch}.zip" -o "${branch}.zip"  # NOTE(review): -k skips TLS certificate verification
+unzip "${branch}.zip"
+mv "RapMap-${branch}" RapMap
+cd RapMap
+mkdir build
+cd build
+
+# The flags are quoted so that a multi-word CXXFLAGS value stays a single cmake argument.
+if [ "$no_native_arch" = true ] ; then
+    cmake -DFETCH_BOOST=TRUE "-DCMAKE_CXX_FLAGS=${cxxflags}" -DNO_NATIVE_ARCH=TRUE ..
+else
+    cmake -DFETCH_BOOST=TRUE "-DCMAKE_CXX_FLAGS=${cxxflags}" ..
+fi
+
+make
+make install
+make test
+cd ../scripts
+bash make-release.sh -v "${version}" -n CentOS5
+cd ../RELEASES
+cp *.tar.gz /io/
diff --git a/scripts/make-release.sh b/scripts/make-release.sh
new file mode 100755
index 0000000..e108d35
--- /dev/null
+++ b/scripts/make-release.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+
+SOURCE="${BASH_SOURCE[0]}"
+while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink
+  DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )"
+  SOURCE="$(readlink "$SOURCE")"
+  [[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE" # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located
+done
+DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )"
+
+host=
+version=
+
+while getopts "v:n:" opt; do
+  case $opt in
+    n)
+      echo "Host is $OPTARG" >&2
+      host=$OPTARG
+      ;;
+    v)
+      echo "Version is $OPTARG" >&2
+      version=$OPTARG
+      ;;
+    \?)
+      echo "Invalid option: -$OPTARG" >&2
+      exit 1
+      ;;
+  esac
+done
+
+echo -e "Preparing binary release\n=====================\n"
+echo -e "Version = ${version}"
+echo -e "Host = ${host}"
+
+# create the binary directory 
+betaname=RapMap-${version}_${host}
+mkdir -p ${DIR}/../RELEASES
+mkdir -p ${DIR}/../RELEASES/${betaname}
+mkdir -p ${DIR}/../RELEASES/${betaname}/bin
+mkdir -p ${DIR}/../RELEASES/${betaname}/lib
+
+echo -e "Copying over the binary\n"
+cp ${DIR}/../bin/rapmap ${DIR}/../RELEASES/${betaname}/bin/
+
+# copy other dependencies (shared libraries)
+#echo -e "Copying over other shared library dependencies\n"
+#bash ${DIR}/../scripts/cpld.bash ${DIR}/../bin/salmon ${DIR}/../RELEASES/${betaname}/lib/
+#echo -e "Removing dangerous dependencies\n"
+#rm ${DIR}/../RELEASES/${betaname}/lib/libc.so.6
+#rm ${DIR}/../RELEASES/${betaname}/lib/ld-linux-x86-64.so.2
+#rm ${DIR}/../RELEASES/${betaname}/lib/libdl.so.2
+#rm ${DIR}/../RELEASES/${betaname}/lib/libpthread*.so.*
+
+# now make the tarball
+echo -e "Making the tarball\n"
+cd ${DIR}/../RELEASES
+tar czvf ${betaname}.tar.gz ${betaname}
+
+echo -e "Done making release!"
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100755
index 0000000..2708677
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,143 @@
+set (RAPMAP_MAIN_SRCS
+    RapMap.cpp
+    RapMapIndexer.cpp
+    RapMapSAIndexer.cpp
+    RapMapUtils.cpp
+    RapMapMapper.cpp
+    RapMapSAMapper.cpp
+    RapMapFileSystem.cpp
+    RapMapSAIndex.cpp
+    RapMapIndex.cpp
+    HitManager.cpp
+    rank9b.cpp
+    stringpiece.cc
+    xxhash.c
+    bit_array.c
+)  # every source listed above is compiled into the single rapmap executable (see add_executable below)
+
+set (RAPMAP_ALIGN_SRCS
+)  # defined empty and not referenced again in this file
+
+set (RAPMAP_LIB_SRCS
+)  # defined empty and not referenced again in this file
+
+#set (RSDICT_LIB_SRCS
+#    EnumCoder.cpp
+#    RSDic.cpp
+#    RSDicBuilder.cpp
+#)
+
+include_directories(
+${GAT_SOURCE_DIR}/include
+${GAT_SOURCE_DIR}/external
+${GAT_SOURCE_DIR}/external/cereal/include
+${GAT_SOURCE_DIR}/external/install/include
+${GAT_SOURCE_DIR}/external/install/include/jellyfish-2.2.5
+${ZLIB_INCLUDE_DIR}
+${CEREAL_INCLUDE_DIRS}
+)
+
+if (JELLYFISH_FOUND)  # NOTE(review): the bundled jellyfish-2.2.5 directory is already added unconditionally above, ahead of this one
+    include_directories(${JELLYFISH_INCLUDE_DIR})
+else()
+    include_directories(${GAT_SOURCE_DIR}/external/install/include/jellyfish-2.2.5)
+endif()
+
+link_directories(
+${GAT_SOURCE_DIR}/lib
+${GAT_SOURCE_DIR}/external/install/lib
+#${Boost_LIBRARY_DIRS}
+#${TBB_LIBRARY_DIRS}
+#${LAPACK_LIBRARY_DIR}
+#${BLAS_LIBRARY_DIR}
+)
+
+#message("Boost_LIBRARIES = ${Boost_LIBRARIES}")
+
+# Set the RPATH
+if (APPLE)
+    ## This DOES NOT do what I / any one sane, expects.  Setting the
+    ## linker path on OSX is messed up.  Just tell the user to use
+    ## DYLD_FALLBACK_LIBRARY_PATH for now
+    set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
+else()
+    set(CMAKE_INSTALL_RPATH "$ORIGIN/../lib:$ORIGIN/../../lib:$ORIGIN/:$ORIGIN/../../external/install/lib")  # $ORIGIN is expanded by the dynamic loader to the executable's directory
+endif()
+
+set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)  # give build-tree binaries the install RPATH as well
+
+# Build the rsdic library
+# add_library(rsdic STATIC ${RSDICT_LIB_SRCS} )
+
+# Build the rapmap executable
+add_executable(rapmap ${RAPMAP_MAIN_SRCS})
+
+#set_target_properties(salmon_core salmon PROPERTIES LINK_SEARCH_END_STATIC TRUE)
+
+# our suffix array construction libraries
+set (SUFFARRAY_LIB ${GAT_SOURCE_DIR}/external/install/lib/libdivsufsort.a)
+set (SUFFARRAY64_LIB ${GAT_SOURCE_DIR}/external/install/lib/libdivsufsort64.a)
+
+# Link the executable
+target_link_libraries(rapmap
+    ${PTHREAD_LIB}
+    #${Boost_LIBRARIES}
+    ${ZLIB_LIBRARY}
+    ${SUFFARRAY_LIB}
+    ${SUFFARRAY64_LIB}
+    ${GAT_SOURCE_DIR}/external/install/lib/libjellyfish-2.0.a  # NOTE(review): bundled static library path is used even when JELLYFISH_FOUND is set
+    m
+    ${LIBLZMA_LIBRARIES}
+    #${LIBSALMON_LINKER_FLAGS}
+    ${NON_APPLECLANG_LIBS}
+    ${FAST_MALLOC_LIB}
+)
+
+#add_dependencies(salmon libbwa)
+
+##
+#  This ensures that the rapmap executable should work with or without `make install`
+##
+if (APPLE)
+	add_custom_command(TARGET rapmap
+		POST_BUILD
+		COMMAND install_name_tool -add_rpath ${GAT_SOURCE_DIR}/external/install/lib rapmap
+		COMMAND install_name_tool -add_rpath @executable_path/../lib rapmap
+		)
+endif()
+
+##### ======================================
+
+IF(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)  # no prefix chosen by the user: install into the source tree
+  SET(CMAKE_INSTALL_PREFIX
+    "${GAT_SOURCE_DIR}" CACHE PATH "Default install prefix" FORCE
+    )
+ENDIF(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
+
+set(INSTALL_LIB_DIR lib )  # NOTE(review): these three variables are not used by the install() call below, which spells out bin/lib literally
+set(INSTALL_BIN_DIR bin )
+set(INSTALL_INCLUDE_DIR include )
+
+# install(FILES ${Boost_LIBRARIES}
+# 	           DESTINATION ${INSTALL_LIB_DIR})
+
+install(TARGETS rapmap
+                RUNTIME DESTINATION bin
+                LIBRARY DESTINATION lib
+                ARCHIVE DESTINATION lib
+        )
+
+    #set(POST_INSTALL_SCRIPT ${GAT_SOURCE_DIR}/cmake/PostInstall.cmake)
+
+    #install(
+    #    CODE
+    #    "
+    #    execute_process(COMMAND \"${CMAKE_COMMAND}\"
+    #                            -DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME}
+    #                            -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}
+    #                            -P \"${POST_INSTALL_SCRIPT}\")
+    #    "
+    #)
+    #
+    #include(InstallRequiredSystemLibraries)
+    #add_test( NAME salmon_read_test COMMAND ${CMAKE_COMMAND} -DTOPLEVEL_DIR=${GAT_SOURCE_DIR} -P ${GAT_SOURCE_DIR}/cmake/TestSalmon.cmake )
diff --git a/src/EnumCoder.cpp b/src/EnumCoder.cpp
new file mode 100644
index 0000000..e415f2f
--- /dev/null
+++ b/src/EnumCoder.cpp
@@ -0,0 +1,264 @@
+/* 
+ *  Copyright (c) 2012 Daisuke Okanohara
+ * 
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ * 
+ *   1. Redistributions of source code must retain the above Copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *   2. Redistributions in binary form must reproduce the above Copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *
+ *   3. Neither the name of the authors nor the names of its contributors
+ *      may be used to endorse or promote products derived from this
+ *      software without specific prior written permission.
+ */
+
+#include <cassert>
+#include "Const.hpp"
+#include "EnumCoder.hpp"
+
+namespace rsdic{
+
+uint64_t EnumCoder::Encode(uint64_t val, uint64_t rank_sb){  // enumerative code of val; rank_sb is presumably val's number of set bits
+  uint64_t rank_code = 0;
+  for (uint64_t bit = 0; bit < kSmallBlockSize; ++bit){
+    // Only set bits contribute: each adds the table entry for the remaining suffix length.
+    if (((val >> bit) & 1LLU) == 0) continue;
+    const uint64_t suffix_len = kSmallBlockSize - bit - 1;
+    rank_code += kCombinationTable64_[suffix_len][rank_sb--];  // index with the current count, then use one up
+  }
+  return rank_code;
+}
+
+uint64_t EnumCoder::Decode(uint64_t code, uint64_t rank_sb){  // rebuild the bit pattern from its enumerative code
+  uint64_t bits = 0;
+  uint64_t remaining = code;
+  for (uint64_t pos = 0; pos < kSmallBlockSize; ++pos){
+    // Table entry for "bit pos is 0" given rank_sb ones still to place.
+    const uint64_t threshold = kCombinationTable64_[kSmallBlockSize - pos - 1][rank_sb];
+    if (remaining < threshold) continue;  // bit pos stays 0
+    bits |= (1LLU << pos);
+    remaining -= threshold;
+    --rank_sb;
+  }
+  return bits;
+}
+
+
+bool EnumCoder::GetBit(uint64_t code, uint64_t rank_sb, uint64_t pos){  // value of bit pos of the coded block
+  // When the code length equals the block size, the bit is read straight out of code.
+  if (Len(rank_sb) == kSmallBlockSize) return (code >> pos) & 1LLU;
+
+  // Otherwise replay the decoding of bits [0, pos) to reach the state at pos.
+  for (uint64_t idx = 0; idx != pos; ++idx){
+    const uint64_t threshold = kCombinationTable64_[kSmallBlockSize - idx - 1][rank_sb];
+    if (code < threshold) continue;
+    code -= threshold;
+    --rank_sb;
+  }
+  const uint64_t threshold_at_pos = kCombinationTable64_[kSmallBlockSize - pos - 1][rank_sb];
+  return code >= threshold_at_pos;
+}
+
+uint64_t EnumCoder::PopCount(uint64_t code){  // number of set bits in code, by parallel partial sums
+  const uint64_t kPairs   = 0x5555555555555555ULL;
+  const uint64_t kNibbles = 0x3333333333333333ULL;
+  const uint64_t kBytes   = 0x0f0f0f0f0f0f0f0fULL;
+  uint64_t sum = (code & kPairs) + ((code >> 1) & kPairs);  // 2-bit partial sums
+  sum = (sum & kNibbles) + ((sum >> 2) & kNibbles);         // 4-bit partial sums
+  sum = (sum + (sum >> 4)) & kBytes;                        // per-byte sums
+  sum += sum >> 8;
+  sum += sum >> 16;
+  sum += sum >> 32;
+  // The low 7 bits hold the total (at most 64); the higher bits are leftover partial sums.
+  return sum & 0x7f;
+}
+
+uint64_t EnumCoder::Rank(uint64_t code, uint64_t rank_sb, uint64_t pos){  // ones among bits [0, pos)
+  if (Len(rank_sb) == kSmallBlockSize){
+    const uint64_t below_pos = (1LLU << pos) - 1;  // code is used as the bit pattern itself: keep only bits under pos
+    return PopCount(code & below_pos);
+  }
+
+  // Coded block: decode bits [0, pos) and count the ones met on the way.
+  uint64_t ones_seen = 0;
+  for (uint64_t idx = 0; idx < pos; ++idx){
+    const uint64_t threshold = kCombinationTable64_[kSmallBlockSize - idx - 1][rank_sb - ones_seen];
+    if (code < threshold) continue;
+    code -= threshold;
+    ++ones_seen;
+  }
+  return ones_seen;
+}
+
+uint64_t EnumCoder::SelectRaw(uint64_t code, uint64_t num){  // offset of the num-th (1-based) set bit of code
+  uint64_t offset = 0;
+  // Skip whole bytes while the target lies beyond them, using the byte popcount table.
+  for (; offset < kSmallBlockSize; offset += 8){
+    const uint8_t r = kPopCount_[(code >> offset) & 0xff];
+    if (num <= r) break;
+    num -= r;
+  }
+  // Scan bit by bit from the byte that contains the target.
+  for (; offset < kSmallBlockSize; ++offset){
+    if ((code >> offset) & 1LLU){
+      --num;
+      if (num == 0) return offset;
+    }
+  }
+  // Reached only when code has fewer than num set bits, or num was 0: a caller error.
+  assert(false);
+  return kSmallBlockSize;  // out-of-range sentinel, so NDEBUG builds do not flow off the end (undefined behaviour)
+}
+
+uint64_t EnumCoder::Select0(uint64_t code, uint64_t rank_sb, uint64_t num){  // offset of the num-th (1-based) zero bit
+  if (Len(rank_sb) == kSmallBlockSize) return SelectRaw(~code, num);  // code used as raw bits: zeros of code are ones of ~code
+  for (uint64_t offset = 0; offset < kSmallBlockSize; ++ offset){
+    const uint64_t zero_case_num = kCombinationTable64_[kSmallBlockSize - offset - 1][rank_sb];
+    if (code >= zero_case_num){  // this bit decodes as 1: consume it and move on
+      code -= zero_case_num;
+      --rank_sb;
+    } else {
+      --num;
+      if (num == 0) return offset;
+    }
+  }
+  // Reached only when the block holds fewer than num zero bits, or num was 0: a caller error.
+  assert(false);
+  return kSmallBlockSize;  // out-of-range sentinel, so NDEBUG builds do not flow off the end (undefined behaviour)
+}
+
+uint64_t EnumCoder::Select1(uint64_t code, uint64_t rank_sb, uint64_t num){  // offset of the num-th (1-based) one bit
+  assert(num <= rank_sb);
+  if (Len(rank_sb) == kSmallBlockSize) return SelectRaw(code, num);  // code used as raw bits
+
+  for (uint64_t offset = 0; offset < kSmallBlockSize; ++ offset){
+    const uint64_t zero_case_num = kCombinationTable64_[kSmallBlockSize - offset - 1][rank_sb];
+    if (code >= zero_case_num){  // this bit decodes as 1
+      --num;
+      if (num == 0) return offset;
+      code -= zero_case_num;
+      --rank_sb;
+    }
+  }
+  // Reached only when the block holds fewer than num one bits, or num was 0: a caller error.
+  assert(false);
+  return kSmallBlockSize;  // out-of-range sentinel, so NDEBUG builds do not flow off the end (undefined behaviour)
+}
+
+uint64_t EnumCoder::Select(uint64_t code, uint64_t rank_sb, uint64_t num, bool bit){  // num-th position whose value is bit
+  // num == 0 is answered with 0 without looking at the code.
+  if (num != 0) return bit ? Select1(code, rank_sb, num) : Select0(code, rank_sb, num);
+  return 0;
+}
+
+const uint8_t EnumCoder::kEnumCodeLength_[65] = {  // 65 entries, symmetric around the middle; presumably the code length in bits per set-bit count (what Len() returns -- confirm)
+  0,  6,  11, 16, 20, 23, 27, 30, 33, 35, 38, 40, 42, 44, 46, 64,
+  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,  // entries 15..49 are all 64
+  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+  64, 64, 46, 44, 42, 40, 38, 35, 33, 30, 27, 23, 20, 16, 11, 6, 
+  0};
+
+
+/*
+const uint64_t EnumCoder::kEnumCodeLength_[65] = {
+  0,  6,  11, 16, 20, 23, 27, 30, 33, 35, 38, 40, 42, 44, 46, 48, 
+  49, 51, 52, 53, 55, 56, 57, 58, 58, 59, 60, 60, 60, 61, 61, 61,
+  61, 61, 61, 61, 60, 60, 60, 59, 58, 58, 57, 56, 55, 53, 52, 51, 
+  49, 48, 46, 44, 42, 40, 38, 35, 33, 30, 27, 23, 20, 16, 11, 6, 
+  0};
+*/
+
+const uint8_t EnumCoder::kPopCount_[256] = {  // kPopCount_[b] = number of set bits in the byte value b
+0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,
+  1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,
+  1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,
+  2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
+  1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,
+  2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
+  2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
+  3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
+  1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,
+  2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
+  2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
+  3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
+  2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
+  3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
+  3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
+  4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8
+};
+
+
+const uint64_t EnumCoder::kCombinationTable64_[65][65] = 
+{ 
+{ 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU},
+{ 1LLU, 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU},
+{ 1LLU, 2LLU, 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU},
+{ 1LLU, 3LLU, 3LLU, 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU},
+{ 1LLU, 4LLU, 6LLU, 4LLU, 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU},
+{ 1LLU, 5LLU, 10LLU, 10LLU, 5LLU, 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU},
+{ 1LLU, 6LLU, 15LLU, 20LLU, 15LLU, 6LLU, 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU},
+{ 1LLU, 7LLU, 21LLU, 35LLU, 35LLU, 21LLU, 7LLU, 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU},
+{ 1LLU, 8LLU, 28LLU, 56LLU, 70LLU, 56LLU, 28LLU, 8LLU, 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU},
+{ 1LLU, 9LLU, 36LLU, 84LLU, 126LLU, 126LLU, 84LLU, 36LLU, 9LLU, 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU},
+{ 1LLU, 10LLU, 45LLU, 120LLU, 210LLU, 252LLU, 210LLU, 120LLU, 45LLU, 10LLU, 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU},
+{ 1LLU, 11LLU, 55LLU, 165LLU, 330LLU, 462LLU, 462LLU, 330LLU, 165LLU, 55LLU, 11LLU, 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU},
+{ 1LLU, 12LLU, 66LLU, 220LLU, 495LLU, 792LLU, 924LLU, 792LLU, 495LLU, 220LLU, 66LLU, 12LLU, 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU},
+{ 1LLU, 13LLU, 78LLU, 286LLU, 715LLU, 1287LLU, 1716LLU, 1716LLU, 1287LLU, 715LLU, 286LLU, 78LLU, 13LLU, 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU},
+{ 1LLU, 14LLU, 91LLU, 364LLU, 1001LLU, 2002LLU, 3003LLU, 3432LLU, 3003LLU, 2002LLU, 1001LLU, 364LLU, 91LLU, 14LLU, 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU},
+{ 1LLU, 15LLU, 105LLU, 455LLU, 1365LLU, 3003LLU, 5005LLU, 6435LLU, 6435LLU, 5005LLU, 3003LLU, 1365LLU, 455LLU, 105LLU, 15LLU, 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU},
+{ 1LLU, 16LLU, 120LLU, 560LLU, 1820LLU, 4368LLU, 8008LLU, 11440LLU, 12870LLU, 11440LLU, 8008LLU, 4368LLU, 1820LLU, 560LLU, 120LLU, 16LLU, 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU},
+{ 1LLU, 17LLU, 136LLU, 680LLU, 2380LLU, 6188LLU, 12376LLU, 19448LLU, 24310LLU, 24310LLU, 19448LLU, 12376LLU, 6188LLU, 2380LLU, 680LLU, 136LLU, 17LLU, 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU},
+{ 1LLU, 18LLU, 153LLU, 816LLU, 3060LLU, 8568LLU, 18564LLU, 31824LLU, 43758LLU, 48620LLU, 43758LLU, 31824LLU, 18564LLU, 8568LLU, 3060LLU, 816LLU, 153LLU, 18LLU, 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU},
+{ 1LLU, 19LLU, 171LLU, 969LLU, 3876LLU, 11628LLU, 27132LLU, 50388LLU, 75582LLU, 92378LLU, 92378LLU, 75582LLU, 50388LLU, 27132LLU, 11628LLU, 3876LLU, 969LLU, 171LLU, 19LLU, 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU},
+{ 1LLU, 20LLU, 190LLU, 1140LLU, 4845LLU, 15504LLU, 38760LLU, 77520LLU, 125970LLU, 167960LLU, 184756LLU, 167960LLU, 125970LLU, 77520LLU, 38760LLU, 15504LLU, 4845LLU, 1140LLU, 190LLU, 20LLU, 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU},
+{ 1LLU, 21LLU, 210LLU, 1330LLU, 5985LLU, 20349LLU, 54264LLU, 116280LLU, 203490LLU, 293930LLU, 352716LLU, 352716LLU, 293930LLU, 203490LLU, 116280LLU, 54264LLU, 20349LLU, 5985LLU, 1330LLU, 210LLU, 21LLU, 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU},
+{ 1LLU, 22LLU, 231LLU, 1540LLU, 7315LLU, 26334LLU, 74613LLU, 170544LLU, 319770LLU, 497420LLU, 646646LLU, 705432LLU, 646646LLU, 497420LLU, 319770LLU, 170544LLU, 74613LLU, 26334LLU, 7315LLU, 1540LLU, 231LLU, 22LLU, 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU},
+{ 1LLU, 23LLU, 253LLU, 1771LLU, 8855LLU, 33649LLU, 100947LLU, 245157LLU, 490314LLU, 817190LLU, 1144066LLU, 1352078LLU, 1352078LLU, 1144066LLU, 817190LLU, 490314LLU, 245157LLU, 100947LLU, 33649LLU, 8855LLU, 1771LLU, 253LLU, 23LLU, 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU},
+{ 1LLU, 24LLU, 276LLU, 2024LLU, 10626LLU, 42504LLU, 134596LLU, 346104LLU, 735471LLU, 1307504LLU, 1961256LLU, 2496144LLU, 2704156LLU, 2496144LLU, 1961256LLU, 1307504LLU, 735471LLU, 346104LLU, 134596LLU, 42504LLU, 10626LLU, 2024LLU, 276LLU, 24LLU, 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU},
+{ 1LLU, 25LLU, 300LLU, 2300LLU, 12650LLU, 53130LLU, 177100LLU, 480700LLU, 1081575LLU, 2042975LLU, 3268760LLU, 4457400LLU, 5200300LLU, 5200300LLU, 4457400LLU, 3268760LLU, 2042975LLU, 1081575LLU, 480700LLU, 177100LLU, 53130LLU, 12650LLU, 2300LLU, 300LLU, 25LLU, 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU [...]
+{ 1LLU, 26LLU, 325LLU, 2600LLU, 14950LLU, 65780LLU, 230230LLU, 657800LLU, 1562275LLU, 3124550LLU, 5311735LLU, 7726160LLU, 9657700LLU, 10400600LLU, 9657700LLU, 7726160LLU, 5311735LLU, 3124550LLU, 1562275LLU, 657800LLU, 230230LLU, 65780LLU, 14950LLU, 2600LLU, 325LLU, 26LLU, 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LL [...]
+{ 1LLU, 27LLU, 351LLU, 2925LLU, 17550LLU, 80730LLU, 296010LLU, 888030LLU, 2220075LLU, 4686825LLU, 8436285LLU, 13037895LLU, 17383860LLU, 20058300LLU, 20058300LLU, 17383860LLU, 13037895LLU, 8436285LLU, 4686825LLU, 2220075LLU, 888030LLU, 296010LLU, 80730LLU, 17550LLU, 2925LLU, 351LLU, 27LLU, 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU [...]
+{ 1LLU, 28LLU, 378LLU, 3276LLU, 20475LLU, 98280LLU, 376740LLU, 1184040LLU, 3108105LLU, 6906900LLU, 13123110LLU, 21474180LLU, 30421755LLU, 37442160LLU, 40116600LLU, 37442160LLU, 30421755LLU, 21474180LLU, 13123110LLU, 6906900LLU, 3108105LLU, 1184040LLU, 376740LLU, 98280LLU, 20475LLU, 3276LLU, 378LLU, 28LLU, 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, [...]
+{ 1LLU, 29LLU, 406LLU, 3654LLU, 23751LLU, 118755LLU, 475020LLU, 1560780LLU, 4292145LLU, 10015005LLU, 20030010LLU, 34597290LLU, 51895935LLU, 67863915LLU, 77558760LLU, 77558760LLU, 67863915LLU, 51895935LLU, 34597290LLU, 20030010LLU, 10015005LLU, 4292145LLU, 1560780LLU, 475020LLU, 118755LLU, 23751LLU, 3654LLU, 406LLU, 29LLU, 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU,  [...]
+{ 1LLU, 30LLU, 435LLU, 4060LLU, 27405LLU, 142506LLU, 593775LLU, 2035800LLU, 5852925LLU, 14307150LLU, 30045015LLU, 54627300LLU, 86493225LLU, 119759850LLU, 145422675LLU, 155117520LLU, 145422675LLU, 119759850LLU, 86493225LLU, 54627300LLU, 30045015LLU, 14307150LLU, 5852925LLU, 2035800LLU, 593775LLU, 142506LLU, 27405LLU, 4060LLU, 435LLU, 30LLU, 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU,  [...]
+{ 1LLU, 31LLU, 465LLU, 4495LLU, 31465LLU, 169911LLU, 736281LLU, 2629575LLU, 7888725LLU, 20160075LLU, 44352165LLU, 84672315LLU, 141120525LLU, 206253075LLU, 265182525LLU, 300540195LLU, 300540195LLU, 265182525LLU, 206253075LLU, 141120525LLU, 84672315LLU, 44352165LLU, 20160075LLU, 7888725LLU, 2629575LLU, 736281LLU, 169911LLU, 31465LLU, 4495LLU, 465LLU, 31LLU, 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0L [...]
+{ 1LLU, 32LLU, 496LLU, 4960LLU, 35960LLU, 201376LLU, 906192LLU, 3365856LLU, 10518300LLU, 28048800LLU, 64512240LLU, 129024480LLU, 225792840LLU, 347373600LLU, 471435600LLU, 565722720LLU, 601080390LLU, 565722720LLU, 471435600LLU, 347373600LLU, 225792840LLU, 129024480LLU, 64512240LLU, 28048800LLU, 10518300LLU, 3365856LLU, 906192LLU, 201376LLU, 35960LLU, 4960LLU, 496LLU, 32LLU, 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0L [...]
+{ 1LLU, 33LLU, 528LLU, 5456LLU, 40920LLU, 237336LLU, 1107568LLU, 4272048LLU, 13884156LLU, 38567100LLU, 92561040LLU, 193536720LLU, 354817320LLU, 573166440LLU, 818809200LLU, 1037158320LLU, 1166803110LLU, 1166803110LLU, 1037158320LLU, 818809200LLU, 573166440LLU, 354817320LLU, 193536720LLU, 92561040LLU, 38567100LLU, 13884156LLU, 4272048LLU, 1107568LLU, 237336LLU, 40920LLU, 5456LLU, 528LLU, 33LLU, 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU,  [...]
+{ 1LLU, 34LLU, 561LLU, 5984LLU, 46376LLU, 278256LLU, 1344904LLU, 5379616LLU, 18156204LLU, 52451256LLU, 131128140LLU, 286097760LLU, 548354040LLU, 927983760LLU, 1391975640LLU, 1855967520LLU, 2203961430LLU, 2333606220LLU, 2203961430LLU, 1855967520LLU, 1391975640LLU, 927983760LLU, 548354040LLU, 286097760LLU, 131128140LLU, 52451256LLU, 18156204LLU, 5379616LLU, 1344904LLU, 278256LLU, 46376LLU, 5984LLU, 561LLU, 34LLU, 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, [...]
+{ 1LLU, 35LLU, 595LLU, 6545LLU, 52360LLU, 324632LLU, 1623160LLU, 6724520LLU, 23535820LLU, 70607460LLU, 183579396LLU, 417225900LLU, 834451800LLU, 1476337800LLU, 2319959400LLU, 3247943160LLU, 4059928950LLU, 4537567650LLU, 4537567650LLU, 4059928950LLU, 3247943160LLU, 2319959400LLU, 1476337800LLU, 834451800LLU, 417225900LLU, 183579396LLU, 70607460LLU, 23535820LLU, 6724520LLU, 1623160LLU, 324632LLU, 52360LLU, 6545LLU, 595LLU, 35LLU, 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU,  [...]
+{ 1LLU, 36LLU, 630LLU, 7140LLU, 58905LLU, 376992LLU, 1947792LLU, 8347680LLU, 30260340LLU, 94143280LLU, 254186856LLU, 600805296LLU, 1251677700LLU, 2310789600LLU, 3796297200LLU, 5567902560LLU, 7307872110LLU, 8597496600LLU, 9075135300LLU, 8597496600LLU, 7307872110LLU, 5567902560LLU, 3796297200LLU, 2310789600LLU, 1251677700LLU, 600805296LLU, 254186856LLU, 94143280LLU, 30260340LLU, 8347680LLU, 1947792LLU, 376992LLU, 58905LLU, 7140LLU, 630LLU, 36LLU, 1LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0LLU, 0 [...]
+{ 1LLU, 37LLU, 666LLU, 7770LLU, 66045LLU, 435897LLU, 2324784LLU, 10295472LLU, 38608020LLU, 124403620LLU, 348330136LLU, 854992152LLU, 1852482996LLU, 3562467300LLU, 6107086800LLU, 9364199760LLU, 12875774670LLU, 15905368710LLU, 17672631900LLU, 17672631900LLU, 15905368710LLU, 12875774670LLU, 9364199760LLU, 6107086800LLU, 3562467300LLU, 1852482996LLU, 854992152LLU, 348330136LLU, 124403620LLU, 38608020LLU, 10295472LLU, 2324784LLU, 435897LLU, 66045LLU, 7770LLU, 666LLU, 37LLU, 1LLU, 0LLU, 0LLU,  [...]
+{ 1LLU, 38LLU, 703LLU, 8436LLU, 73815LLU, 501942LLU, 2760681LLU, 12620256LLU, 48903492LLU, 163011640LLU, 472733756LLU, 1203322288LLU, 2707475148LLU, 5414950296LLU, 9669554100LLU, 15471286560LLU, 22239974430LLU, 28781143380LLU, 33578000610LLU, 35345263800LLU, 33578000610LLU, 28781143380LLU, 22239974430LLU, 15471286560LLU, 9669554100LLU, 5414950296LLU, 2707475148LLU, 1203322288LLU, 472733756LLU, 163011640LLU, 48903492LLU, 12620256LLU, 2760681LLU, 501942LLU, 73815LLU, 8436LLU, 703LLU, 38LLU [...]
+{ 1LLU, 39LLU, 741LLU, 9139LLU, 82251LLU, 575757LLU, 3262623LLU, 15380937LLU, 61523748LLU, 211915132LLU, 635745396LLU, 1676056044LLU, 3910797436LLU, 8122425444LLU, 15084504396LLU, 25140840660LLU, 37711260990LLU, 51021117810LLU, 62359143990LLU, 68923264410LLU, 68923264410LLU, 62359143990LLU, 51021117810LLU, 37711260990LLU, 25140840660LLU, 15084504396LLU, 8122425444LLU, 3910797436LLU, 1676056044LLU, 635745396LLU, 211915132LLU, 61523748LLU, 15380937LLU, 3262623LLU, 575757LLU, 82251LLU, 9139 [...]
+{ 1LLU, 40LLU, 780LLU, 9880LLU, 91390LLU, 658008LLU, 3838380LLU, 18643560LLU, 76904685LLU, 273438880LLU, 847660528LLU, 2311801440LLU, 5586853480LLU, 12033222880LLU, 23206929840LLU, 40225345056LLU, 62852101650LLU, 88732378800LLU, 113380261800LLU, 131282408400LLU, 137846528820LLU, 131282408400LLU, 113380261800LLU, 88732378800LLU, 62852101650LLU, 40225345056LLU, 23206929840LLU, 12033222880LLU, 5586853480LLU, 2311801440LLU, 847660528LLU, 273438880LLU, 76904685LLU, 18643560LLU, 3838380LLU, 65 [...]
+{ 1LLU, 41LLU, 820LLU, 10660LLU, 101270LLU, 749398LLU, 4496388LLU, 22481940LLU, 95548245LLU, 350343565LLU, 1121099408LLU, 3159461968LLU, 7898654920LLU, 17620076360LLU, 35240152720LLU, 63432274896LLU, 103077446706LLU, 151584480450LLU, 202112640600LLU, 244662670200LLU, 269128937220LLU, 269128937220LLU, 244662670200LLU, 202112640600LLU, 151584480450LLU, 103077446706LLU, 63432274896LLU, 35240152720LLU, 17620076360LLU, 7898654920LLU, 3159461968LLU, 1121099408LLU, 350343565LLU, 95548245LLU, 22 [...]
+{ 1LLU, 42LLU, 861LLU, 11480LLU, 111930LLU, 850668LLU, 5245786LLU, 26978328LLU, 118030185LLU, 445891810LLU, 1471442973LLU, 4280561376LLU, 11058116888LLU, 25518731280LLU, 52860229080LLU, 98672427616LLU, 166509721602LLU, 254661927156LLU, 353697121050LLU, 446775310800LLU, 513791607420LLU, 538257874440LLU, 513791607420LLU, 446775310800LLU, 353697121050LLU, 254661927156LLU, 166509721602LLU, 98672427616LLU, 52860229080LLU, 25518731280LLU, 11058116888LLU, 4280561376LLU, 1471442973LLU, 445891810 [...]
+{ 1LLU, 43LLU, 903LLU, 12341LLU, 123410LLU, 962598LLU, 6096454LLU, 32224114LLU, 145008513LLU, 563921995LLU, 1917334783LLU, 5752004349LLU, 15338678264LLU, 36576848168LLU, 78378960360LLU, 151532656696LLU, 265182149218LLU, 421171648758LLU, 608359048206LLU, 800472431850LLU, 960566918220LLU, 1052049481860LLU, 1052049481860LLU, 960566918220LLU, 800472431850LLU, 608359048206LLU, 421171648758LLU, 265182149218LLU, 151532656696LLU, 78378960360LLU, 36576848168LLU, 15338678264LLU, 5752004349LLU, 191 [...]
+{ 1LLU, 44LLU, 946LLU, 13244LLU, 135751LLU, 1086008LLU, 7059052LLU, 38320568LLU, 177232627LLU, 708930508LLU, 2481256778LLU, 7669339132LLU, 21090682613LLU, 51915526432LLU, 114955808528LLU, 229911617056LLU, 416714805914LLU, 686353797976LLU, 1029530696964LLU, 1408831480056LLU, 1761039350070LLU, 2012616400080LLU, 2104098963720LLU, 2012616400080LLU, 1761039350070LLU, 1408831480056LLU, 1029530696964LLU, 686353797976LLU, 416714805914LLU, 229911617056LLU, 114955808528LLU, 51915526432LLU, 2109068 [...]
+{ 1LLU, 45LLU, 990LLU, 14190LLU, 148995LLU, 1221759LLU, 8145060LLU, 45379620LLU, 215553195LLU, 886163135LLU, 3190187286LLU, 10150595910LLU, 28760021745LLU, 73006209045LLU, 166871334960LLU, 344867425584LLU, 646626422970LLU, 1103068603890LLU, 1715884494940LLU, 2438362177020LLU, 3169870830126LLU, 3773655750150LLU, 4116715363800LLU, 4116715363800LLU, 3773655750150LLU, 3169870830126LLU, 2438362177020LLU, 1715884494940LLU, 1103068603890LLU, 646626422970LLU, 344867425584LLU, 166871334960LLU, 73 [...]
+{ 1LLU, 46LLU, 1035LLU, 15180LLU, 163185LLU, 1370754LLU, 9366819LLU, 53524680LLU, 260932815LLU, 1101716330LLU, 4076350421LLU, 13340783196LLU, 38910617655LLU, 101766230790LLU, 239877544005LLU, 511738760544LLU, 991493848554LLU, 1749695026860LLU, 2818953098830LLU, 4154246671960LLU, 5608233007146LLU, 6943526580276LLU, 7890371113950LLU, 8233430727600LLU, 7890371113950LLU, 6943526580276LLU, 5608233007146LLU, 4154246671960LLU, 2818953098830LLU, 1749695026860LLU, 991493848554LLU, 511738760544LLU [...]
+{ 1LLU, 47LLU, 1081LLU, 16215LLU, 178365LLU, 1533939LLU, 10737573LLU, 62891499LLU, 314457495LLU, 1362649145LLU, 5178066751LLU, 17417133617LLU, 52251400851LLU, 140676848445LLU, 341643774795LLU, 751616304549LLU, 1503232609098LLU, 2741188875414LLU, 4568648125690LLU, 6973199770790LLU, 9762479679106LLU, 12551759587422LLU, 14833897694226LLU, 16123801841550LLU, 16123801841550LLU, 14833897694226LLU, 12551759587422LLU, 9762479679106LLU, 6973199770790LLU, 4568648125690LLU, 2741188875414LLU, 150323 [...]
+{ 1LLU, 48LLU, 1128LLU, 17296LLU, 194580LLU, 1712304LLU, 12271512LLU, 73629072LLU, 377348994LLU, 1677106640LLU, 6540715896LLU, 22595200368LLU, 69668534468LLU, 192928249296LLU, 482320623240LLU, 1093260079344LLU, 2254848913647LLU, 4244421484512LLU, 7309837001104LLU, 11541847896480LLU, 16735679449896LLU, 22314239266528LLU, 27385657281648LLU, 30957699535776LLU, 32247603683100LLU, 30957699535776LLU, 27385657281648LLU, 22314239266528LLU, 16735679449896LLU, 11541847896480LLU, 7309837001104LLU,  [...]
+{ 1LLU, 49LLU, 1176LLU, 18424LLU, 211876LLU, 1906884LLU, 13983816LLU, 85900584LLU, 450978066LLU, 2054455634LLU, 8217822536LLU, 29135916264LLU, 92263734836LLU, 262596783764LLU, 675248872536LLU, 1575580702584LLU, 3348108992991LLU, 6499270398159LLU, 11554258485616LLU, 18851684897584LLU, 28277527346376LLU, 39049918716424LLU, 49699896548176LLU, 58343356817424LLU, 63205303218876LLU, 63205303218876LLU, 58343356817424LLU, 49699896548176LLU, 39049918716424LLU, 28277527346376LLU, 18851684897584LLU [...]
+{ 1LLU, 50LLU, 1225LLU, 19600LLU, 230300LLU, 2118760LLU, 15890700LLU, 99884400LLU, 536878650LLU, 2505433700LLU, 10272278170LLU, 37353738800LLU, 121399651100LLU, 354860518600LLU, 937845656300LLU, 2250829575120LLU, 4923689695575LLU, 9847379391150LLU, 18053528883775LLU, 30405943383200LLU, 47129212243960LLU, 67327446062800LLU, 88749815264600LLU, 108043253365600LLU, 121548660036300LLU, 126410606437752LLU, 121548660036300LLU, 108043253365600LLU, 88749815264600LLU, 67327446062800LLU, 4712921224 [...]
+{ 1LLU, 51LLU, 1275LLU, 20825LLU, 249900LLU, 2349060LLU, 18009460LLU, 115775100LLU, 636763050LLU, 3042312350LLU, 12777711870LLU, 47626016970LLU, 158753389900LLU, 476260169700LLU, 1292706174900LLU, 3188675231420LLU, 7174519270695LLU, 14771069086725LLU, 27900908274925LLU, 48459472266975LLU, 77535155627160LLU, 114456658306760LLU, 156077261327400LLU, 196793068630200LLU, 229591913401900LLU, 247959266474052LLU, 247959266474052LLU, 229591913401900LLU, 196793068630200LLU, 156077261327400LLU, 114 [...]
+{ 1LLU, 52LLU, 1326LLU, 22100LLU, 270725LLU, 2598960LLU, 20358520LLU, 133784560LLU, 752538150LLU, 3679075400LLU, 15820024220LLU, 60403728840LLU, 206379406870LLU, 635013559600LLU, 1768966344600LLU, 4481381406320LLU, 10363194502115LLU, 21945588357420LLU, 42671977361650LLU, 76360380541900LLU, 125994627894135LLU, 191991813933920LLU, 270533919634160LLU, 352870329957600LLU, 426384982032100LLU, 477551179875952LLU, 495918532948104LLU, 477551179875952LLU, 426384982032100LLU, 352870329957600LLU, 2 [...]
+{ 1LLU, 53LLU, 1378LLU, 23426LLU, 292825LLU, 2869685LLU, 22957480LLU, 154143080LLU, 886322710LLU, 4431613550LLU, 19499099620LLU, 76223753060LLU, 266783135710LLU, 841392966470LLU, 2403979904200LLU, 6250347750920LLU, 14844575908435LLU, 32308782859535LLU, 64617565719070LLU, 119032357903550LLU, 202355008436035LLU, 317986441828055LLU, 462525733568080LLU, 623404249591760LLU, 779255311989700LLU, 903936161908052LLU, 973469712824056LLU, 973469712824056LLU, 903936161908052LLU, 779255311989700LLU,  [...]
+{ 1LLU, 54LLU, 1431LLU, 24804LLU, 316251LLU, 3162510LLU, 25827165LLU, 177100560LLU, 1040465790LLU, 5317936260LLU, 23930713170LLU, 95722852680LLU, 343006888770LLU, 1108176102180LLU, 3245372870670LLU, 8654327655120LLU, 21094923659355LLU, 47153358767970LLU, 96926348578605LLU, 183649923622620LLU, 321387366339585LLU, 520341450264090LLU, 780512175396135LLU, 1085929983159840LLU, 1402659561581460LLU, 1683191473897752LLU, 1877405874732108LLU, 1946939425648112LLU, 1877405874732108LLU, 168319147389 [...]
+{ 1LLU, 55LLU, 1485LLU, 26235LLU, 341055LLU, 3478761LLU, 28989675LLU, 202927725LLU, 1217566350LLU, 6358402050LLU, 29248649430LLU, 119653565850LLU, 438729741450LLU, 1451182990950LLU, 4353548972850LLU, 11899700525790LLU, 29749251314475LLU, 68248282427325LLU, 144079707346575LLU, 280576272201225LLU, 505037289962205LLU, 841728816603675LLU, 1300853625660225LLU, 1866442158555975LLU, 2488589544741300LLU, 3085851035479212LLU, 3560597348629860LLU, 3824345300380220LLU, 3824345300380220LLU, 35605973 [...]
+{ 1LLU, 56LLU, 1540LLU, 27720LLU, 367290LLU, 3819816LLU, 32468436LLU, 231917400LLU, 1420494075LLU, 7575968400LLU, 35607051480LLU, 148902215280LLU, 558383307300LLU, 1889912732400LLU, 5804731963800LLU, 16253249498640LLU, 41648951840265LLU, 97997533741800LLU, 212327989773900LLU, 424655979547800LLU, 785613562163430LLU, 1346766106565880LLU, 2142582442263900LLU, 3167295784216200LLU, 4355031703297275LLU, 5574440580220512LLU, 6646448384109072LLU, 7384942649010080LLU, 7648690600760440LLU, 7384942 [...]
+{ 1LLU, 57LLU, 1596LLU, 29260LLU, 395010LLU, 4187106LLU, 36288252LLU, 264385836LLU, 1652411475LLU, 8996462475LLU, 43183019880LLU, 184509266760LLU, 707285522580LLU, 2448296039700LLU, 7694644696200LLU, 22057981462440LLU, 57902201338905LLU, 139646485582065LLU, 310325523515700LLU, 636983969321700LLU, 1210269541711230LLU, 2132379668729310LLU, 3489348548829780LLU, 5309878226480100LLU, 7522327487513475LLU, 9929472283517787LLU, 12220888964329584LLU, 14031391033119152LLU, 15033633249770520LLU, 15 [...]
+{ 1LLU, 58LLU, 1653LLU, 30856LLU, 424270LLU, 4582116LLU, 40475358LLU, 300674088LLU, 1916797311LLU, 10648873950LLU, 52179482355LLU, 227692286640LLU, 891794789340LLU, 3155581562280LLU, 10142940735900LLU, 29752626158640LLU, 79960182801345LLU, 197548686920970LLU, 449972009097765LLU, 947309492837400LLU, 1847253511032930LLU, 3342649210440540LLU, 5621728217559090LLU, 8799226775309880LLU, 12832205713993575LLU, 17451799771031262LLU, 22150361247847371LLU, 26252279997448736LLU, 29065024282889672LLU [...]
+{ 1LLU, 59LLU, 1711LLU, 32509LLU, 455126LLU, 5006386LLU, 45057474LLU, 341149446LLU, 2217471399LLU, 12565671261LLU, 62828356305LLU, 279871768995LLU, 1119487075980LLU, 4047376351620LLU, 13298522298180LLU, 39895566894540LLU, 109712808959985LLU, 277508869722315LLU, 647520696018735LLU, 1397281501935165LLU, 2794563003870330LLU, 5189902721473470LLU, 8964377427999630LLU, 14420954992868970LLU, 21631432489303455LLU, 30284005485024837LLU, 39602161018878633LLU, 48402641245296107LLU, 5531730428033840 [...]
+{ 1LLU, 60LLU, 1770LLU, 34220LLU, 487635LLU, 5461512LLU, 50063860LLU, 386206920LLU, 2558620845LLU, 14783142660LLU, 75394027566LLU, 342700125300LLU, 1399358844975LLU, 5166863427600LLU, 17345898649800LLU, 53194089192720LLU, 149608375854525LLU, 387221678682300LLU, 925029565741050LLU, 2044802197953900LLU, 4191844505805495LLU, 7984465725343800LLU, 14154280149473100LLU, 23385332420868600LLU, 36052387482172425LLU, 51915437974328292LLU, 69886166503903470LLU, 88004802264174740LLU, 103719945525634 [...]
+{ 1LLU, 61LLU, 1830LLU, 35990LLU, 521855LLU, 5949147LLU, 55525372LLU, 436270780LLU, 2944827765LLU, 17341763505LLU, 90177170226LLU, 418094152866LLU, 1742058970275LLU, 6566222272575LLU, 22512762077400LLU, 70539987842520LLU, 202802465047245LLU, 536830054536825LLU, 1312251244423350LLU, 2969831763694950LLU, 6236646703759395LLU, 12176310231149295LLU, 22138745874816900LLU, 37539612570341700LLU, 59437719903041025LLU, 87967825456500717LLU, 121801604478231762LLU, 157890968768078210LLU, 19172474778 [...]
+{ 1LLU, 62LLU, 1891LLU, 37820LLU, 557845LLU, 6471002LLU, 61474519LLU, 491796152LLU, 3381098545LLU, 20286591270LLU, 107518933731LLU, 508271323092LLU, 2160153123141LLU, 8308281242850LLU, 29078984349975LLU, 93052749919920LLU, 273342452889765LLU, 739632519584070LLU, 1849081298960175LLU, 4282083008118300LLU, 9206478467454345LLU, 18412956934908690LLU, 34315056105966195LLU, 59678358445158600LLU, 96977332473382725LLU, 147405545359541742LLU, 209769429934732479LLU, 279692573246309972LLU, 349615716 [...]
+{ 1LLU, 63LLU, 1953LLU, 39711LLU, 595665LLU, 7028847LLU, 67945521LLU, 553270671LLU, 3872894697LLU, 23667689815LLU, 127805525001LLU, 615790256823LLU, 2668424446233LLU, 10468434365991LLU, 37387265592825LLU, 122131734269895LLU, 366395202809685LLU, 1012974972473835LLU, 2588713818544245LLU, 6131164307078475LLU, 13488561475572645LLU, 27619435402363035LLU, 52728013040874885LLU, 93993414551124795LLU, 156655690918541325LLU, 244382877832924467LLU, 357174975294274221LLU, 489462003181042451LLU, 6293 [...]
+{ 1LLU, 64LLU, 2016LLU, 41664LLU, 635376LLU, 7624512LLU, 74974368LLU, 621216192LLU, 4426165368LLU, 27540584512LLU, 151473214816LLU, 743595781824LLU, 3284214703056LLU, 13136858812224LLU, 47855699958816LLU, 159518999862720LLU, 488526937079580LLU, 1379370175283520LLU, 3601688791018080LLU, 8719878125622720LLU, 19619725782651120LLU, 41107996877935680LLU, 80347448443237920LLU, 146721427591999680LLU, 250649105469666120LLU, 401038568751465792LLU, 601557853127198688LLU, 846636978475316672LLU, 111 [...]
+
+}
diff --git a/src/EnumCoderTest.cpp b/src/EnumCoderTest.cpp
new file mode 100644
index 0000000..372e063
--- /dev/null
+++ b/src/EnumCoderTest.cpp
@@ -0,0 +1,46 @@
+/* 
+ *  Copyright (c) 2012 Daisuke Okanohara
+ * 
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ * 
+ *   1. Redistributions of source code must retain the above Copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *   2. Redistributions in binary form must reproduce the above Copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *
+ *   3. Neither the name of the authors nor the names of its contributors
+ *      may be used to endorse or promote products derived from this
+ *      software without specific prior written permission.
+ */
+
+#include <gtest/gtest.h>
+#include "EnumCoder.hpp"
+
+using namespace std;
+using namespace rsdic;
+
+uint64_t PopCount(uint64_t x){
+  // Count set bits: each x &= x - 1 clears the lowest set bit, so the
+  // loop body runs exactly once per 1-bit in x.
+  uint64_t ones = 0;
+  for (; x != 0; x &= x - 1) ++ones;
+  return ones;
+}
+
+TEST(EnumCoder, small){  // zero must survive an Encode/Decode round trip
+  uint64_t code = EnumCoder::Encode(0, PopCount(0));
+  ASSERT_EQ(0, EnumCoder::Decode(code, PopCount(0)));  // Decode takes (code, rank)
+}
+
+TEST(EnumCoder, random){
+  // Round-trip 10000 rand() values through Encode and Decode.
+  for (uint64_t trial = 0; trial < 10000; ++trial){
+    const uint64_t value = rand();
+    const uint64_t ones = PopCount(value);
+    ASSERT_EQ(value, EnumCoder::Decode(EnumCoder::Encode(value, ones), ones));
+  }
+}
diff --git a/src/HitManager.cpp b/src/HitManager.cpp
new file mode 100644
index 0000000..b5fc9e7
--- /dev/null
+++ b/src/HitManager.cpp
@@ -0,0 +1,700 @@
+#include "HitManager.hpp"
+#include "BooMap.hpp"
+#include <type_traits>
+
+namespace rapmap {
+    namespace hit_manager {
+        // Emits one QuasiAlignment per entry of processedHits, anchored at the
+        // smallest transcript position in that entry's tqvec (sorted in place).
+        bool collectHitsSimple(std::vector<ProcessedHit>& processedHits,
+                uint32_t readLen,
+                uint32_t maxDist, // unused: no distance filtering is applied here
+                std::vector<QuasiAlignment>& hits,
+                MateStatus mateStatus){
+            // One processed hit per transcript
+            for (auto& ph : processedHits) {
+                // Nothing to anchor on; reading tqvec[0] would be undefined.
+                if (ph.tqvec.size() == 0) { continue; }
+                auto tid = ph.tid;
+                std::sort(ph.tqvec.begin(), ph.tqvec.end(),
+                        [](const TxpQueryPos& x, const TxpQueryPos& y) -> bool {
+                        return x.txpPosInfo.pos() < y.txpPosInfo.pos();
+                        });
+                auto& firstHit = ph.tqvec[0];
+                bool hitRC = firstHit.queryRC;
+                bool txpRC = firstHit.txpPosInfo.isRC();
+                bool isFwd = (hitRC == txpRC); // forward iff both orientations agree
+                int32_t hitPos = firstHit.txpPosInfo.pos() - firstHit.queryPos;
+
+                hits.emplace_back(tid, hitPos, isFwd, readLen);
+                hits.back().mateStatus = mateStatus;
+            }
+
+            return true; // unconditional; no found/not-found signal is produced
+        }
+
+
+        // Appends one QuasiAlignment to hits for every *active* entry of
+        // processedHits, anchored at the smallest position in the entry's tqvec.
+        //   processedHits  transcript id -> hit list (ph.first -> ph.second)
+        //   readLen        forwarded to each emitted QuasiAlignment
+        //   maxDist        unused; no distance filtering is applied here
+        //   hits           output; existing contents are kept, new hits appended
+        //   mateStatus     stored on each emitted hit
+        // Always returns true.
+        bool collectHitsSimpleSA(SAHitMap& processedHits,
+                        uint32_t readLen,
+                        uint32_t maxDist,
+                        std::vector<QuasiAlignment>& hits,
+                        MateStatus mateStatus){
+                for (auto& ph : processedHits) {
+                        auto& posInfo = ph.second;
+                        // Only *active* position lists produce a hit.
+                        if (!posInfo.active) { continue; }
+                        // min_element on an empty list returns end(), which must
+                        // not be dereferenced.
+                        if (posInfo.tqvec.begin() == posInfo.tqvec.end()) { continue; }
+                        auto tid = ph.first;
+                        auto minPosIt = std::min_element(posInfo.tqvec.begin(),
+                                        posInfo.tqvec.end(),
+                                        [](const SATxpQueryPos& a, const SATxpQueryPos& b) -> bool {
+                                            return a.pos < b.pos;
+                                        });
+                        bool hitRC = minPosIt->queryRC;
+                        int32_t hitPos = minPosIt->pos - minPosIt->queryPos;
+                        bool isFwd = !hitRC;
+                        hits.emplace_back(tid, hitPos, isFwd, readLen);
+                        hits.back().mateStatus = mateStatus;
+                }
+                // Hits come out in the iteration order of processedHits; no sort is done.
+                return true;
+        }
+
+
+        // Vector-based variant: appends one QuasiAlignment to hits for every
+        // active ProcessedSAHit, anchored at the smallest position in its tqvec.
+        // maxDist is unused. Always returns true.
+        bool collectHitsSimpleSA2(std::vector<ProcessedSAHit>& processedHits,
+                        uint32_t readLen,
+                        uint32_t maxDist,
+                        std::vector<QuasiAlignment>& hits,
+                        MateStatus mateStatus){
+                // One processed hit per transcript
+                for (auto& ph : processedHits) {
+                        // Inactive lists produce no hit; empty ones are skipped too,
+                        // because min_element would return end() for them.
+                        if (!ph.active or ph.tqvec.begin() == ph.tqvec.end()) {
+                                continue;
+                        }
+                        auto tid = ph.tid;
+                        auto minPosIt =
+                            std::min_element(ph.tqvec.begin(),
+                                             ph.tqvec.end(),
+                                             [](const SATxpQueryPos& a, const SATxpQueryPos& b) -> bool {
+                                                return a.pos < b.pos;
+                                                });
+                        bool hitRC = minPosIt->queryRC;
+                        int32_t hitPos = minPosIt->pos - minPosIt->queryPos;
+                        bool isFwd = !hitRC;
+                        hits.emplace_back(tid, hitPos, isFwd, readLen);
+                        hits.back().mateStatus = mateStatus;
+                }
+                return true;
+        }
+
+
+
+
+        // Intersects the hit h2 with outHits.
+        // outHits is walked in step with h2's transcript list; the two are
+        // compared by transcript id, so the walk presumes ascending ids in both.
+        // Every entry of outHits whose tid also occurs for h2 gets one more tqvec
+        // element built from h2's position-list helper, query position and strand.
+        void intersectWithOutput(HitInfo& h2, RapMapIndex& rmi,
+                std::vector<ProcessedHit>& outHits) {
+
+            // Convenient bindings for variables we'll use
+            auto& eqClasses = rmi.eqClassList;
+            auto& eqClassLabels = rmi.eqLabelList;
+            auto& posList = rmi.posList;
+
+            // Iterator to the beginning and end of the output hits
+            auto outHitIt = outHits.begin();
+            auto outHitEnd = outHits.end();
+
+            // Equiv. class for h2
+            auto& eqClassRight = eqClasses[h2.kinfo->eqId];
+
+            // Iterator into, length of and end of the position list for h2
+            auto rightPosIt = posList.begin() + h2.kinfo->offset;
+            auto rightPosLen = h2.kinfo->count; // only feeds rightPosEnd
+            auto rightPosEnd = rightPosIt + rightPosLen; // not read below
+            // Iterator into, length of and end of the transcript list for h2
+            auto rightTxpIt = eqClassLabels.begin() + eqClassRight.txpListStart;
+            auto rightTxpListLen = eqClassRight.txpListLen;
+            auto rightTxpEnd = rightTxpIt + rightTxpListLen;
+
+            auto rightQueryPos = h2.queryPos;
+            auto rightQueryRC = h2.queryRC;
+            PositionListHelper rightPosHelper(rightPosIt, posList.end());
+
+            uint32_t leftTxp, rightTxp;
+            while (outHitIt != outHitEnd and rightTxpIt != rightTxpEnd) {
+                // Get the current transcript ID for the left and right eq class
+                leftTxp = outHitIt->tid;
+                rightTxp = *rightTxpIt;
+                // If we need to advance the left txp, do it
+                if (leftTxp < rightTxp) {
+                    // Advance to the next entry of outHits; h2 has no
+                    // occurrence in the transcript we are leaving behind.
+                    ++outHitIt;
+                } else {
+                    // If the transcripts are equal (i.e. leftTxp >= rightTxp and !(rightTxp < leftTxp))
+                    // Then see if there are any hits here.
+                    if (!(rightTxp < leftTxp)) {
+                        // Add the position list helper and query pos for the
+                        // hit from h2 to the back of outHits' tqvec.
+                        outHitIt->tqvec.emplace_back(rightPosHelper, rightQueryPos, rightQueryRC);
+                        ++outHitIt;
+                    }
+                    // advance the hit we're intersecting to the next transcript
+                    rightPosHelper.advanceToNextTranscript();
+                    // Advance the right transcript id regardless of whether
+                    // we found a hit or not, keeping it in step with the helper.
+                    ++rightTxpIt;
+                }
+            }
+
+        }
+
+        /** Returns the first position in the sorted range [first, last) whose
+         *  element is not less than value, or last if there is none.
+         *  Follows http://en.cppreference.com/w/cpp/algorithm/lower_bound **/
+        template <typename ForwardIt>
+        ForwardIt binarySearch(
+                ForwardIt first,
+                ForwardIt last,
+                uint32_t value) {
+            using DiffT = typename std::iterator_traits<ForwardIt>::difference_type;
+            // The answer always lies within the `remaining` elements from `first` on.
+            for (DiffT remaining = std::distance(first, last); remaining > 0; ) {
+                const DiffT half = remaining / 2;
+                ForwardIt mid = first;
+                std::advance(mid, half);
+                if (*mid < value) {
+                    // mid and everything before it are too small.
+                    first = ++mid;
+                    remaining -= half + 1;
+                } else {
+                    remaining = half;
+                }
+            }
+            return first;
+        }
+
+        /** Sequential scan, after http://en.cppreference.com/w/cpp/algorithm/find **/
+        template<class InputIt>
+        InputIt linearSearch(InputIt first, InputIt last, uint32_t value) {
+            // Stop on the first element equal to value, or on last if none is.
+            while (first != last) {
+                if (*first == value) { break; }
+                ++first;
+            }
+            return first;
+        }
+
+        /** Looks key up in the first n entries of the ascending array arr and returns the index of its first occurrence, or UINT32_MAX when absent; adapted from https://schani.wordpress.com/2010/04/30/linear-vs-binary-search/ **/
+        uint32_t binarySearchFast(const std::vector<uint32_t>& arr, size_t n, uint32_t key) {
+            uint32_t min = 0, max = n;
+            while (min < max) {
+                uint32_t middle = min + ((max - min) >> 1); // same midpoint as (min + max) >> 1, without the overflow
+                min = (key > arr[middle]) ? middle+1 : min;
+                max = (key <= arr[middle]) ? middle : max;
+            }
+            return (min < n and arr[min] == key) ? min : std::numeric_limits<uint32_t>::max(); // min == n: all entries < key (or n == 0), so arr[min] is off limits
+        }
+
+        /** Returns the index of the first element of arr that is >= key (not necessarily == key); adapted from https://schani.wordpress.com/2010/04/30/linear-vs-binary-search/ **/
+        // ASSUMES SENTINEL VALUE (value in array >= key *MUST* exist); n is never read, so nothing else stops the scan
+        uint32_t linearSearchUnrolled16(const std::vector<uint32_t>& arr, size_t n, uint32_t key) {
+            uint32_t i{0};
+                for (;;) { // 16 comparisons per iteration, checked in ascending index order
+                    if ( arr[i + 0] >= key) return  i + 0;
+                    if ( arr[i + 1] >= key) return  i + 1;
+                    if ( arr[i + 2] >= key) return  i + 2;
+                    if ( arr[i + 3] >= key) return  i + 3;
+                    if ( arr[i + 4] >= key) return  i + 4;
+                    if ( arr[i + 5] >= key) return  i + 5;
+                    if ( arr[i + 6] >= key) return  i + 6;
+                    if ( arr[i + 7] >= key) return  i + 7;
+                    if ( arr[i + 8] >= key) return  i + 8;
+                    if ( arr[i + 9] >= key) return  i + 9;
+                    if ( arr[i + 10] >= key) return i + 10;
+                    if ( arr[i + 11] >= key) return i + 11;
+                    if ( arr[i + 12] >= key) return i + 12;
+                    if ( arr[i + 13] >= key) return i + 13;
+                    if ( arr[i + 14] >= key) return i + 14;
+                    if ( arr[i + 15] >= key) return i + 15;
+                    i += 16; // none of these 16 qualified; move on to the next group
+                }
+            }
+
+        // Adds the occurrences in suffix-array interval h to the transcripts that
+        // are already present in processedHits.
+        //   h              SA interval [h.begin, h.end) plus its query position/strand
+        //   rmi            index providing SA, positionIDs and txpOffsets
+        //   processedHits  .txps holds the candidate transcript ids followed by a
+        //                  sentinel; .hits[k] collects positions for .txps[k]
+        // Transcripts that are not in processedHits.txps are ignored.
+        template <typename RapMapIndexT>
+        void intersectSAIntervalWithOutput2(SAIntervalHit<typename RapMapIndexT::IndexType>& h,
+                RapMapIndexT& rmi,
+                SAProcessedHitVec& processedHits) {
+            // Convenient bindings for variables we'll use
+            auto& SA = rmi.SA;
+            auto& txpIDs = rmi.positionIDs;
+            auto& txpStarts = rmi.txpOffsets;
+
+            auto& outStructs = processedHits.hits;
+            auto& outTxps = processedHits.txps;
+
+            // Length of the id array, sentinel included.
+            uint32_t arraySize = processedHits.txps.size();
+            // Without even the sentinel there is nothing to search (and
+            // arraySize - 1 below would wrap around).
+            if (arraySize == 0) { return; }
+
+            uint32_t rightTxp;
+            uint32_t pos;
+            uint32_t searchInd{0};
+            for (auto i = h.begin; i < h.end; ++i) {
+                rightTxp = txpIDs[SA[i]];
+                // Long id arrays are bisected, short ones scanned linearly.
+                if (arraySize > 64) {
+                    searchInd = binarySearchFast(outTxps, arraySize, rightTxp);
+                } else {
+                    searchInd = linearSearchUnrolled16(outTxps, arraySize, rightTxp);
+                }
+                // The two searches disagree on a miss: binarySearchFast returns
+                // UINT32_MAX, while linearSearchUnrolled16 returns the index of the
+                // first id that is >= rightTxp. Accept the slot only when it is not
+                // the sentinel *and* really holds rightTxp; otherwise positions
+                // would be credited to the next larger transcript id.
+                if ( searchInd < arraySize - 1 and outTxps[searchInd] == rightTxp ) {
+                    pos = static_cast<uint32_t>(SA[i]) - txpStarts[rightTxp];
+                    outStructs[searchInd].tqvec.emplace_back(pos, h.queryPos, h.queryRC);
+                }
+            }
+        }
+
+
+        /*
+        void intersectSAIntervalWithOutput3(SAIntervalHit& h,
+                RapMapSAIndex& rmi,
+                SAProcessedHitVec& outHits) {
+            // Convenient bindings for variables we'll use
+            auto& SA = rmi.SA;
+            auto& txpIDs = rmi.positionIDs;
+            auto& txpStarts = rmi.txpOffsets;
+
+            // Iterator to the beginning and end of the output hits
+            auto outHitIt = outHits.begin();
+            auto outHitEnd = outHits.end();
+
+            // Make a vector of iterators into the right interval
+            std::vector<int*> rightHitIterators;
+            rightHitIterators.reserve(h.span());
+            for (auto i = h.begin; i < h.end; ++i) {
+                rightHitIterators.emplace_back(&SA[i]);
+            }
+            // Sort the iterators by their transcript ID
+            std::sort(rightHitIterators.begin(), rightHitIterators.end(),
+                    [&txpIDs](const int* a, const int* b) -> bool {
+                    return txpIDs[*a] < txpIDs[*b];
+                    });
+            auto rightIntHit = rightHitIterators.begin();
+            auto rightIntHitEnd = rightHitIterators.end();
+
+            uint32_t leftTxp, rightTxp;
+            uint32_t pos;
+            while (outHitIt != outHitEnd and rightIntHit != rightIntHitEnd) {
+                // Get the current transcript ID for the left and right eq class
+                leftTxp = outHitIt->tid;
+                rightTxp = txpIDs[(*(*rightIntHit))];
+                // If we need to advance the left txp, do it
+                if (leftTxp < rightTxp) {
+                    // Advance to the next transcript in the
+                    // equivalence class label
+                    ++outHitIt;
+                } else {
+                    // If the transcripts are equal (i.e. leftTxp >= rightTxp and !(rightTxp < leftTxp))
+                    // Then see if there are any hits here.
+                    if (!(rightTxp < leftTxp)) {
+                        // Add the position list iterator and query pos for the
+                        // hit from h2 to the back of outHits' tqvec.
+                        pos = static_cast<uint32_t>(*(*rightIntHit)) - txpStarts[rightTxp];
+                        outHitIt->tqvec.emplace_back(pos, h.queryPos, h.queryRC);
+                        //++outHitIt;
+                    }
+                    ++rightIntHit;
+                }
+            }
+        }
+        */
+
+
+
+        // Folds SA interval h into outHits; transcripts absent from outHits are
+        // ignored. A transcript's numActive advances to intervalCounter the
+        // first time this interval reaches it, but only from intervalCounter - 1.
+        // Positions are recorded only while numActive equals intervalCounter.
+        template <typename RapMapIndexT>
+        void intersectSAIntervalWithOutput(SAIntervalHit<typename RapMapIndexT::IndexType>& h,
+                                           RapMapIndexT& rmi,
+                                           uint32_t intervalCounter,
+                                           SAHitMap& outHits) {
+            using OffsetT = typename RapMapIndexT::IndexType;
+            auto& SA = rmi.SA;
+            auto& rankDict = rmi.rankDict; // bound but not read in this function
+            auto& txpStarts = rmi.txpOffsets;
+
+            // Visit every suffix-array slot of the new interval.
+            for (OffsetT i = h.begin; i != h.end; ++i) {
+              auto globalPos = SA[i];
+              auto txpID = rmi.transcriptAtPosition(globalPos);
+              auto found = outHits.find(txpID);
+              if (found == outHits.end()) { continue; }
+
+              auto& info = found->second;
+              if (info.numActive == intervalCounter - 1) { info.numActive += 1; }
+              if (info.numActive != intervalCounter) { continue; }
+
+              // Store the position relative to the start of its transcript.
+              auto localPos = globalPos - txpStarts[txpID];
+              info.tqvec.emplace_back(localPos, h.queryPos, h.queryRC);
+            }
+        }
+
+
+
+        std::vector<ProcessedHit> intersectHits( // returns one ProcessedHit per transcript that kept enough hits
+                std::vector<HitInfo>& inHits,
+                RapMapIndex& rmi
+                ) {
+            // Each inHit is a HitInfo structure that contains
+            // an iterator to the KmerInfo for this k-mer, the k-mer ID,
+            // and the query position where this k-mer appeared.
+            // We want to find the transcripts that appear in *every*
+            // hit.  Further, for each transcript, we want to
+            // know the k-mers that appear in this txp.
+
+            // Check this --- we should never call this function
+            // with less than 2 hits; if it happens, complain and return nothing.
+            if (inHits.size() < 2) {
+                std::cerr << "intersectHits() called with < 2 k-mer "
+                    " hits; this shouldn't happen\n";
+                return {};
+            }
+
+            auto& eqClasses = rmi.eqClassList;
+            auto& eqClassLabels = rmi.eqLabelList;
+            auto& posList = rmi.posList;
+
+            // The HitInfo with the smallest kinfo->count (the first such one on
+            // ties); count is used below as the length of its position list.
+            HitInfo* minHit = &inHits[0];
+            for (auto& h : inHits) {
+                if (h.kinfo->count < minHit->kinfo->count) {
+                    minHit = &h;
+                }
+            }
+
+            std::vector<ProcessedHit> outHits;
+            outHits.reserve(minHit->kinfo->count);
+            // =========
+            { // Add the info from minHit to outHits
+                // Equiv. class for minHit
+                auto& eqClass = eqClasses[minHit->kinfo->eqId];
+                // Iterator into, length of and end of the position list
+                auto posIt = posList.begin() + minHit->kinfo->offset;
+                auto posLen = minHit->kinfo->count; // only feeds posEnd
+                auto posEnd = posIt + posLen; // not read below
+                // Iterator into, length of and end of the transcript list
+                auto txpIt = eqClassLabels.begin() + eqClass.txpListStart;
+                auto txpListLen = eqClass.txpListLen;
+                auto txpEnd = txpIt + txpListLen;
+                PositionListHelper posHelper(posIt, posList.end());
+
+                // One output entry per transcript of minHit's class; the helper
+                // is moved to the next transcript in step with txpIt.
+                while (txpIt != txpEnd) {
+                    auto tid = *txpIt;
+                    outHits.emplace_back(tid, posHelper, minHit->queryPos, minHit->queryRC);
+                    posHelper.advanceToNextTranscript();
+                    ++txpIt;
+                }
+            }
+            // =========
+
+            // Now intersect everything in inHits (apart from minHits)
+            // to get the final set of mapping info.
+            for (auto& h : inHits) {
+                if (&h != minHit) { // don't intersect minHit with itself
+                    intersectWithOutput(h, rmi, outHits);
+                }
+            }
+
+            size_t requiredNumHits = inHits.size();
+            // do we need stable_partition? --- don't think so.
+            auto newEnd = std::stable_partition(outHits.begin(), outHits.end(),
+                    [requiredNumHits] (const ProcessedHit& ph) -> bool {
+                    // should never really be greater.
+                    return (ph.tqvec.size() >= requiredNumHits);
+                    });
+            /*
+               bool didDrop = false;
+               for (auto it = newEnd; it != outHits.end(); ++it) {
+               std::cerr << "Dropped hit for txp " << it->tid << "\n";
+               didDrop = true;
+               }
+               if (didDrop) {
+               auto& eqClass = eqClasses[inHits[0].kinfo->eqId];
+               auto txpIt = eqClassLabels.begin() + eqClass.txpListStart;
+               auto txpListLen = eqClass.txpListLen;
+               auto txpEnd = txpIt + txpListLen;
+               std::cerr << "hits1: {";
+               while (txpIt != txpEnd) {
+               std::cerr << *txpIt << ", ";
+               ++txpIt;
+               }
+               std::cerr << "}\n";
+               auto& eqClass2 = eqClasses[inHits[1].kinfo->eqId];
+               txpIt = eqClassLabels.begin() + eqClass2.txpListStart;
+               txpListLen = eqClass2.txpListLen;
+               txpEnd = txpIt + txpListLen;
+               std::cerr << "hits2: {";
+               while (txpIt != txpEnd) {
+               std::cerr << *txpIt << ", ";
+               ++txpIt;
+               }
+               std::cerr << "}\n";
+               }
+               */
+            // return only the valid hits (those moved in front of newEnd)
+            outHits.resize(std::distance(outHits.begin(), newEnd));
+            return outHits;
+        }
+
+        template <typename RapMapIndexT>
+        std::vector<ProcessedSAHit> intersectSAHits2(
+                std::vector<SAIntervalHit<typename RapMapIndexT::IndexType>>& inHits,
+                RapMapIndexT& rmi
+                ) {
+            using OffsetT = typename RapMapIndexT::IndexType;
+
+            // Collects, for the transcripts of the smallest SA interval in inHits,
+            // the positions contributed by every interval.
+            //   inHits  one SAIntervalHit (SA interval plus query position and
+            //           strand) per query location on the read
+            //   rmi     index supplying SA, txpOffsets and positionIDs
+            // Returns the per-transcript hits in ascending tid order; an entry is
+            // marked active once its tqvec holds at least inHits.size() positions.
+
+            // Check this --- we should never call this function
+            // with less than 2 hits.
+            SAProcessedHitVec outHits;
+            if (inHits.size() < 2) {
+                std::cerr << "intersectHitsSA() called with < 2 k-mer "
+                    " hits; this shouldn't happen\n";
+                return outHits.hits;
+            }
+
+            auto& SA = rmi.SA;
+            auto& txpStarts = rmi.txpOffsets;
+            auto& txpIDs = rmi.positionIDs;
+
+            // Start with the smallest interval
+            // i.e. interval with the fewest hits.
+            SAIntervalHit<OffsetT>* minHit = &inHits[0];
+            for (auto& h : inHits) {
+                if (h.span() < minHit->span()) {
+                    minHit = &h;
+                }
+            }
+
+            auto& outStructs = outHits.hits;
+            auto& outTxps = outHits.txps;
+            outStructs.reserve(minHit->span());
+            outTxps.reserve(minHit->span());
+            // tid -> slot in outStructs; the slots go stale once outStructs is sorted.
+            std::map<int, uint32_t> posMap;
+
+            // Seed outStructs from minHit, grouping its positions by transcript.
+            // The index uses the suffix array's own offset type: a plain `int`
+            // would truncate the bounds of an interval in a 64-bit index.
+            for (OffsetT i = minHit->begin; i < minHit->end; ++i) {
+                auto globalPos = SA[i];
+                auto tid = txpIDs[globalPos];
+                auto txpPos = globalPos - txpStarts[tid];
+                auto posIt = posMap.find(tid);
+                if (posIt == posMap.end()) {
+                    posMap[tid] = outStructs.size();
+                    outStructs.emplace_back(tid, txpPos, minHit->queryPos, minHit->queryRC);
+                } else {
+                    outStructs[posIt->second].tqvec.emplace_back(txpPos, minHit->queryPos, minHit->queryRC);
+                }
+            }
+            // Order by tid so that the id array built next is sorted for searching.
+            std::sort(outStructs.begin(), outStructs.end(),
+                      [] (const ProcessedSAHit& a, const ProcessedSAHit& b) -> bool {
+                        return a.tid < b.tid;
+                      });
+
+            // outTxps[k] mirrors outStructs[k].tid.
+            for (auto it = outStructs.begin(); it != outStructs.end(); ++it) {
+                outTxps.emplace_back(it->tid);
+            }
+            // Sentinel value for search: it is >= every key, so the unrolled
+            // linear scan is guaranteed to stop inside the array.
+            outTxps.emplace_back(std::numeric_limits<uint32_t>::max());
+
+            // Now intersect everything in inHits (apart from minHits)
+            // to get the final set of mapping info.
+            for (auto& h : inHits) {
+                if (&h != minHit) { // don't intersect minHit with itself
+                    intersectSAIntervalWithOutput2(h, rmi, outHits);
+                }
+            }
+
+            size_t requiredNumHits = inHits.size();
+            // Mark as active any transcripts with the required number of hits.
+            for (auto it = outStructs.begin(); it != outStructs.end(); ++it) {
+                if (it->tqvec.size() >= requiredNumHits) {
+                    it->active = true;
+                }
+            }
+            return outStructs;
+        }
+
+        // Computes, for a read, the transcripts hit by *every* suffix-array
+        // interval in inHits, along with the matching positions per transcript.
+        //
+        //   inHits        one SAIntervalHit per query location on the read
+        //   rmi           index supplying SA, txpOffsets and transcriptAtPosition()
+        //   strictFilter  when set, a transcript must additionally pass
+        //                 checkConsistent(inHits.size()) to be marked active
+        //
+        // Returns a map from transcript id to its collected hits; the `active`
+        // flag of each entry tells whether the transcript survived the filter.
+        // With fewer than two intervals a message is printed and an empty map
+        // is returned.
+        template <typename RapMapIndexT>
+        SAHitMap intersectSAHits(
+                std::vector<SAIntervalHit<typename RapMapIndexT::IndexType>>& inHits,
+                RapMapIndexT& rmi,
+                bool strictFilter
+                ) {
+            using OffsetT = typename RapMapIndexT::IndexType;
+            using IntervalT = SAIntervalHit<OffsetT>;
+
+            SAHitMap result;
+            const size_t numIntervals = inHits.size();
+            if (numIntervals < 2) {
+                std::cerr << "intersectHitsSA() called with < 2 hits "
+                    " hits; this shouldn't happen\n";
+                return result;
+            }
+
+            auto& SA = rmi.SA;
+            auto& txpStarts = rmi.txpOffsets;
+            auto& rankDict = rmi.rankDict; // bound but not read in this function
+
+            // Seed from the narrowest interval (the first one, on ties).
+            IntervalT* seed = &inHits[0];
+            for (IntervalT& candidate : inHits) {
+                if (candidate.span() < seed->span()) { seed = &candidate; }
+            }
+
+            // Every occurrence in the seed interval opens (or extends) the entry
+            // of the transcript that contains it.
+            for (OffsetT saIdx = seed->begin; saIdx < seed->end; ++saIdx) {
+                auto globalPos = SA[saIdx];
+                auto tid = rmi.transcriptAtPosition(globalPos);
+                auto txpPos = globalPos - txpStarts[tid];
+                result[tid].tqvec.emplace_back(txpPos, seed->queryPos, seed->queryRC);
+            }
+
+            // Fold in the remaining intervals. The running ordinal starts at 2
+            // because the seed counts as the first interval.
+            size_t ordinal{2};
+            for (IntervalT& other : inHits) {
+                if (&other == seed) { continue; } // never intersect the seed with itself
+                intersectSAIntervalWithOutput(other, rmi, ordinal, result);
+                ++ordinal;
+            }
+
+            // Activate the transcripts reached by all intervals.
+            for (auto& entry : result) {
+                auto& info = entry.second;
+                bool enoughHits = (info.numActive >= numIntervals);
+                // checkConsistent is consulted only in strict mode, and only for
+                // transcripts that already have enough hits.
+                if (strictFilter) {
+                    info.active = (enoughHits and info.checkConsistent(numIntervals));
+                } else {
+                    info.active = enoughHits;
+                }
+            }
+            return result;
+        }
+
+
+        /**
+        * The templates above are defined in this file, so the versions we use (32/64-bit offsets x dense-hash/BooMap k-mer tables) are explicitly instantiated here.
+        */
+      using SAIndex32BitDense = RapMapSAIndex<int32_t,google::dense_hash_map<uint64_t, rapmap::utils::SAInterval<int32_t>,
+									     rapmap::utils::KmerKeyHasher>>;
+      using SAIndex64BitDense = RapMapSAIndex<int64_t,google::dense_hash_map<uint64_t, rapmap::utils::SAInterval<int64_t>,
+									     rapmap::utils::KmerKeyHasher>>;
+      using SAIndex32BitPerfect = RapMapSAIndex<int32_t, BooMap<uint64_t, rapmap::utils::SAInterval<int32_t>>>;
+      using SAIndex64BitPerfect = RapMapSAIndex<int64_t, BooMap<uint64_t, rapmap::utils::SAInterval<int64_t>>>;
+
+        template // dense hash, 32-bit offsets
+        void intersectSAIntervalWithOutput<SAIndex32BitDense>(SAIntervalHit<int32_t>& h,
+                                                              SAIndex32BitDense& rmi, 
+                                                              uint32_t intervalCounter, 
+                                                              SAHitMap& outHits);
+
+        template // dense hash, 64-bit offsets
+        void intersectSAIntervalWithOutput<SAIndex64BitDense>(SAIntervalHit<int64_t>& h,
+                                                              SAIndex64BitDense& rmi, 
+                                                              uint32_t intervalCounter, 
+                                                              SAHitMap& outHits); 
+
+        template // dense hash, 32-bit offsets
+        SAHitMap intersectSAHits<SAIndex32BitDense>(std::vector<SAIntervalHit<int32_t>>& inHits,
+                                                    SAIndex32BitDense& rmi, bool strictFilter);
+
+        template // dense hash, 64-bit offsets
+        SAHitMap intersectSAHits<SAIndex64BitDense>(std::vector<SAIntervalHit<int64_t>>& inHits,
+          SAIndex64BitDense& rmi, bool strictFilter);
+
+        template // BooMap, 32-bit offsets
+        void intersectSAIntervalWithOutput<SAIndex32BitPerfect>(SAIntervalHit<int32_t>& h,
+                                                                SAIndex32BitPerfect& rmi, 
+                                                                uint32_t intervalCounter, 
+                                                                SAHitMap& outHits);
+
+        template // BooMap, 64-bit offsets
+        void intersectSAIntervalWithOutput<SAIndex64BitPerfect>(SAIntervalHit<int64_t>& h,
+                                                                SAIndex64BitPerfect& rmi, 
+                                                                uint32_t intervalCounter, 
+                                                                SAHitMap& outHits);
+
+        template // BooMap, 32-bit offsets
+        SAHitMap intersectSAHits<SAIndex32BitPerfect>(std::vector<SAIntervalHit<int32_t>>& inHits,
+                                                      SAIndex32BitPerfect& rmi, bool strictFilter);
+
+        template // BooMap, 64-bit offsets
+        SAHitMap intersectSAHits<SAIndex64BitPerfect>(std::vector<SAIntervalHit<int64_t>>& inHits,
+                                                      SAIndex64BitPerfect& rmi, bool strictFilter);
+    }
+}
diff --git a/src/RapMap.cpp b/src/RapMap.cpp
new file mode 100644
index 0000000..942b970
--- /dev/null
+++ b/src/RapMap.cpp
@@ -0,0 +1,74 @@
+#include <iostream>
+#include <fstream>
+#include <vector>
+
+#include <cereal/archives/json.hpp>
+
+#include "RapMapConfig.hpp"
+#include "IndexHeader.hpp"
+
+int rapMapIndex(int argc, char* argv[]);
+int rapMapSAIndex(int argc, char* argv[]);
+int rapMapMap(int argc, char* argv[]);
+int rapMapSAMap(int argc, char* argv[]);
+
+// Writes the program banner (name and version), a separator line and the
+// list of available subcommands to stderr.
+void printUsage() {
+    // Raw string literal: the help text is emitted exactly as laid out here,
+    // including its leading newline.
+    static const char* const helpText =
+        R"(
+There are currently 4 RapMap subcommands
+    pseudoindex   --- builds a k-mer-based index
+    pseudomap     --- map reads using a k-mer-based index
+    quasiindex --- builds a suffix array-based (SA) index
+    quasimap   --- map reads using the SA-based index
+
+Run a corresponding command "rapmap <cmd> -h" for
+more information on each of the possible RapMap
+commands.)";
+
+    const std::string versionString = rapmap::version;
+    std::cerr << "RapMap Transcriptome Aligner v" << versionString << '\n'
+              << "=====================================\n"
+              << helpText << '\n';
+}
+
+// Returns true when `arg` is exactly "-i" or "--index".
+bool isIndexArg(char* arg) {
+    const std::string candidate(arg);
+    if (candidate == "-i") {
+        return true;
+    }
+    return candidate == "--index";
+}
+
+
+// Entry point: dispatches to the subcommand named by argv[1].
+//
+// The subcommand receives an argument vector with argv[1] removed, i.e.
+// { argv[0], argv[2], ..., argv[argc-1] } and a count of argc - 1.
+//
+// Returns the subcommand's result, 0 after printing the usage text (no
+// arguments, "-h" or "--help"), and 1 for an unknown command.
+int main(int argc, char* argv[]) {
+    if (argc < 2) {
+        printUsage();
+        return 0;
+    }
+
+    // Build the command name once rather than for every comparison.
+    const std::string command(argv[1]);
+    if (command == "-h" or command == "--help") {
+        printUsage();
+        return 0;
+    }
+
+    // Copy argv[1..argc-1], then overwrite the first slot (the command name)
+    // with the program name so the subcommand sees a conventional argv.
+    std::vector<char*> args(argv + 1, argv + argc);
+    args[0] = argv[0];
+    const int subArgc = argc - 1;
+    // Keep the usual argv[argc] == nullptr guarantee for the forwarded vector;
+    // the terminator is not counted in subArgc.
+    args.push_back(nullptr);
+
+    if (command == "pseudoindex") {
+        return rapMapIndex(subArgc, args.data());
+    }
+    if (command == "quasiindex") {
+        return rapMapSAIndex(subArgc, args.data());
+    }
+    if (command == "pseudomap") {
+        return rapMapMap(subArgc, args.data());
+    }
+    if (command == "quasimap") {
+        return rapMapSAMap(subArgc, args.data());
+    }
+
+    std::cerr << "the command " << argv[1]
+              << " is not yet implemented\n";
+    return 1;
+}
diff --git a/src/RapMapFileSystem.cpp b/src/RapMapFileSystem.cpp
new file mode 100644
index 0000000..66e246b
--- /dev/null
+++ b/src/RapMapFileSystem.cpp
@@ -0,0 +1,37 @@
+#include "RapMapFileSystem.hpp"
+#include <sys/stat.h>
+
+
+namespace rapmap {
+    namespace fs {
+
+        // Returns true when `path` can be stat()ed and refers to a regular
+        // file; false otherwise (including when stat() fails).
+        // Taken from http://stackoverflow.com/questions/12774207/fastest-way-to-check-if-a-file-exist-using-standard-c-c11-c
+        bool FileExists(const char *path) {
+            struct stat fileStat;
+            return stat(path, &fileStat) == 0 && S_ISREG(fileStat.st_mode);
+        }
+
+        // Returns true when `path` can be stat()ed and refers to a directory;
+        // false otherwise (including when stat() fails).
+        // Taken from http://stackoverflow.com/questions/12774207/fastest-way-to-check-if-a-file-exist-using-standard-c-c11-c
+        bool DirExists(const char *path) {
+            struct stat fileStat;
+            return stat(path, &fileStat) == 0 && S_ISDIR(fileStat.st_mode);
+        }
+
+        // Creates the directory `path`, requesting read/write/execute
+        // permission for user, group and others. The mode is spelled with the
+        // POSIX S_IRWX* bits instead of ACCESSPERMS, which is a BSD/glibc
+        // extension and is not available on every libc.
+        // The result of mkdir() is not reported to the caller.
+        void MakeDir(const char* path) {
+            mkdir(path, S_IRWXU | S_IRWXG | S_IRWXO);
+        }
+
+    }
+}
diff --git a/src/RapMapIndex.cpp b/src/RapMapIndex.cpp
new file mode 100644
index 0000000..8efb45f
--- /dev/null
+++ b/src/RapMapIndex.cpp
@@ -0,0 +1,139 @@
+#include "RapMapIndex.hpp"
+
+// Default constructor: performs no work of its own; the index contents are
+// populated by load().
+RapMapIndex::RapMapIndex() {}
+
+// Loads every component of the k-mer-based index from the files named
+// indexPrefix + <suffix>, in this order: k-mer info list, k-mer => id hash,
+// equivalence classes, equivalence-class labels, position list, transcript
+// names, transcript lengths, and the forward and reverse jump tables.
+//
+// indexPrefix is concatenated directly with each file name, so it is
+// presumably expected to end with a path separator -- confirm with callers.
+//
+// Always returns true: none of the streams is checked after it is opened, so
+// a missing or unreadable file is not reported through the return value.
+// NOTE(review): what happens for such a file depends on cereal and
+// SpecialHeader, which are not visible here -- confirm.
+bool RapMapIndex::load(std::string& indexPrefix) {
+    // NOTE(review): the logger is used without a null check; presumably a
+    // logger named "stderrLog" is registered before load() is called.
+    auto logger = spdlog::get("stderrLog");
+    std::string kmerInfosName = indexPrefix + "kinfo.bin";
+    std::string eqClassListName = indexPrefix + "eqclass.bin";
+    std::string eqLabelListName = indexPrefix + "eqlab.bin";
+    std::string posListName = indexPrefix + "pos.bin";
+    std::string jfFileName = indexPrefix + "rapidx.jfhash";
+    std::string txpNameFile = indexPrefix + "txpnames.bin";
+    std::string txpLenFile = indexPrefix + "txplens.bin";
+    std::string fwdJumpFile = indexPrefix + "fwdjump.bin";
+    std::string revJumpFile = indexPrefix + "revjump.bin";
+
+    // Load the kmer info list first --- this will
+    // give us the # of unique k-mers
+    std::ifstream kmerInfoStream(kmerInfosName, std::ios::binary);
+    {
+        logger->info("loading k-mer info list . . .");
+        // The timer lives for this scope only (presumably reporting the
+        // elapsed time when it is destroyed -- see ScopedTimer.hpp).
+        ScopedTimer timer;
+        cereal::BinaryInputArchive kmerInfoArchive(kmerInfoStream);
+        kmerInfoArchive(kmerInfos);
+        logger->info("done\n");
+    }
+    kmerInfoStream.close();
+
+    // Only used for the log output below.
+    size_t numDistinctKmers = kmerInfos.size();
+    {
+        ScopedTimer timer;
+        logger->info("loading k-mer => id hash . . . ");
+        // Note: unlike the other streams, this one is opened without
+        // std::ios::binary.
+        std::ifstream bis(jfFileName);
+        // Constructing the header consumes it from the stream; the read()
+        // below then starts at whatever follows the header.
+        const SpecialHeader bh(bis);
+        // mapFile.reset(new jellyfish::mapped_file(jfFileName.c_str()));
+        const size_t sizeInBytes = bh.size_bytes();
+        // Load the hash from file
+        logger->info("\theader format = {}"      , bh.format());
+        logger->info("\t# distinct k-mers = {}"  , numDistinctKmers);
+        logger->info("\thash key len = {}"       , bh.key_len());
+        logger->info("\tcounter len = {}"        , bh.counter_len());
+        logger->info("\tmax reprobe offset = {}" , bh.max_reprobe());
+        logger->info("\tsize in bytes = {}"      , sizeInBytes);
+
+        // Allocate the actual storage
+        rawHashMem.reset(new char[sizeInBytes]);
+        // The number of bytes actually read is not verified.
+        bis.read(rawHashMem.get(), sizeInBytes);
+        // We can close the file now
+        bis.close();
+
+        // merHash is built over the buffer owned by rawHashMem, using the
+        // geometry recorded in the header.
+        merHash.reset( new FileMerArray(rawHashMem.get(),//mapFile->base() + bh.offset(),
+                    sizeInBytes,
+                    bh.size(),
+                    bh.key_len(),
+                    bh.counter_len(),
+                    bh.max_reprobe(),
+                    bh.matrix()));
+        // Set the key size
+        // (k is taken as half of key_len(); presumably key_len() is in bits
+        // with two bits per base -- confirm against the indexer.)
+        rapmap::utils::my_mer::k(bh.key_len() / 2);
+        logger->info("done");
+    }
+
+
+    // Each remaining component follows the same pattern: open the file in
+    // binary mode, deserialize it with a cereal binary archive inside a timed
+    // scope, log its size, then close the stream.
+    std::ifstream eqClassStream(eqClassListName, std::ios::binary);
+    {
+        logger->info("loading eq classes . . . ");
+        ScopedTimer timer;
+        cereal::BinaryInputArchive eqClassArchive(eqClassStream);
+        eqClassArchive(eqClassList);
+        logger->info("[{}] classes", eqClassList.size());
+        logger->info("done");
+    }
+    eqClassStream.close();
+    std::ifstream eqLabelStream(eqLabelListName, std::ios::binary);
+    {
+        logger->info("loading eq class labels . . . ");
+        ScopedTimer timer;
+        cereal::BinaryInputArchive eqLabelArchive(eqLabelStream);
+        eqLabelArchive(eqLabelList);
+        logger->info("[{}] labels", eqLabelList.size());
+        logger->info("done");
+    }
+    eqLabelStream.close();
+    std::ifstream posStream(posListName, std::ios::binary);
+    {
+        logger->info("loading position list . . . ");
+        ScopedTimer timer;
+        cereal::BinaryInputArchive posArchive(posStream);
+        posArchive(posList);
+        logger->info("[{}] total k-mer positions", posList.size());
+        logger->info("done");
+    }
+    posStream.close();
+    std::ifstream txpNameStream(txpNameFile, std::ios::binary);
+    {
+        logger->info("loading transcript names ");
+        ScopedTimer timer;
+        cereal::BinaryInputArchive txpNameArchive(txpNameStream);
+        txpNameArchive(txpNames);
+        logger->info("[{}] transcripts in index ", txpNames.size());
+        logger->info("done ");
+    }
+    txpNameStream.close();
+
+    std::ifstream txpLenStream(txpLenFile, std::ios::binary);
+    {
+        logger->info("loading transcript lengths");
+        ScopedTimer timer;
+        cereal::BinaryInputArchive txpLenArchive(txpLenStream);
+        txpLenArchive(txpLens);
+        logger->info("[{}] transcripts in index", txpLens.size());
+        logger->info("done ");
+    }
+    txpLenStream.close();
+
+    std::ifstream fwdJumpStream(fwdJumpFile, std::ios::binary);
+    {
+        logger->info("loading forward jumps");
+        ScopedTimer timer;
+        cereal::BinaryInputArchive fwdJumpArchive(fwdJumpStream);
+        fwdJumpArchive(fwdJumpTable);
+        logger->info("[{}] forward jumps", fwdJumpTable.size());
+        logger->info("done ");
+    }
+    fwdJumpStream.close();
+
+    std::ifstream revJumpStream(revJumpFile, std::ios::binary);
+    {
+        logger->info("loading reverse jumps");
+        ScopedTimer timer;
+        cereal::BinaryInputArchive revJumpArchive(revJumpStream);
+        revJumpArchive(revJumpTable);
+        logger->info("[{}] reverse jumps", revJumpTable.size());
+        logger->info("done ");
+    }
+    revJumpStream.close();
+    // No failure path exists above, so the result is unconditionally true.
+    return true;
+}
+
diff --git a/src/RapMapIndexer.cpp b/src/RapMapIndexer.cpp
new file mode 100644
index 0000000..c8608ca
--- /dev/null
+++ b/src/RapMapIndexer.cpp
@@ -0,0 +1,765 @@
+#include <iostream>
+#include <mutex>
+#include <vector>
+#include <random>
+#include <unordered_map>
+#include <type_traits>
+#include <fstream>
+
+#include "tclap/CmdLine.h"
+
+#include <cereal/types/unordered_map.hpp>
+#include <cereal/types/vector.hpp>
+#include <cereal/types/string.hpp>
+#include <cereal/archives/binary.hpp>
+#include <cereal/archives/json.hpp>
+
+#include "xxhash.h"
+#include "btree/btree_map.h"
+
+// Jellyfish 2 include
+#include "jellyfish/mer_dna.hpp"
+#include "jellyfish/stream_manager.hpp"
+#include "jellyfish/whole_sequence_parser.hpp"
+
+#include "RapMapUtils.hpp"
+#include "RapMapFileSystem.hpp"
+#include "ScopedTimer.hpp"
+#include "IndexHeader.hpp"
+
+#include "jellyfish/file_header.hpp"
+#include "jellyfish/binary_dumper.hpp"
+#include "jellyfish/thread_exec.hpp"
+#include "jellyfish/hash_counter.hpp"
+#include "jellyfish/mer_overlap_sequence_parser.hpp"
+#include "jellyfish/mer_iterator.hpp"
+#include "JFRaw.hpp"
+
+#include <chrono>
+
+using stream_manager = jellyfish::stream_manager<std::vector<std::string>::const_iterator>;
+using single_parser = jellyfish::whole_sequence_parser<stream_manager>;
+using TranscriptID = uint32_t;
+using TranscriptIDVector = std::vector<TranscriptID>;
+using KmerIDMap = std::vector<TranscriptIDVector>;
+using MerMapT = jellyfish::cooperative::hash_counter<rapmap::utils::my_mer>;
+
+// Packs a (tid, pos) pair into one 64-bit value: `tid` occupies the upper
+// 32 bits and `pos` the lower 32 bits.
+uint64_t encode(uint32_t tid, uint32_t pos) {
+    // Widen before shifting so the shift happens in 64 bits.
+    const uint64_t upper = tid;
+    return (upper << 32) | pos;
+}
+
+constexpr uint32_t uint32HighestBitMask = 0x80000000;
+constexpr uint32_t uint31LowBitMask = 0x7FFFFFFF;
+
+constexpr uint32_t uint30LowBitMask = 0x3FFFFFFF;
+
+constexpr uint32_t rcSetMask = 0x40000000;
+
+// marks the second highest bit
+void markRCBit(uint32_t& i) { i |= rcSetMask; }
+
+// marks the highest bit
+void markNewTxpBit(uint32_t& i) { i |= uint32HighestBitMask; }
+
+bool wasSeen(uint32_t i) { return ((i & uint32HighestBitMask) >> 31) == 1; }
+
+void markSeen(uint32_t& i) { i |= uint32HighestBitMask; }
+
+uint32_t unmarked(uint32_t i) { return (i & uint31LowBitMask); }
+
+// Hash functor for std::vector<uint32_t>: starts from 0 and folds every
+// element, in order, into the running value with rapmap::utils::hashCombine.
+class VectorHasher {
+    public:
+    size_t operator()(const std::vector<uint32_t>& vec) const {
+        size_t seed = 0;
+        for (size_t idx = 0; idx < vec.size(); ++idx) {
+            // Pass a local copy of the element to hashCombine.
+            uint32_t element = vec[idx];
+            rapmap::utils::hashCombine(seed, element);
+        }
+        return seed;
+    }
+};
+
+// One occurrence of a k-mer within a transcript.
+struct PosInfo {
+    // Identifier of the k-mer (used by the indexer as an index into its
+    // k-mer info table).
+    uint64_t merID;
+    // Whether the occurrence was recorded as reverse-complemented.
+    bool isRC;
+    // Position of the occurrence; the indexer later ORs flag bits into it.
+    uint32_t pos;
+
+    PosInfo(uint64_t merIDIn, bool isRCIn, uint32_t posIn)
+        : merID(merIDIn),
+          isRC(isRCIn),
+          pos(posIn) {}
+};
+
+// A k-mer occurrence waiting in a jump queue until the next breakpoint is
+// known.
+// (Open question kept from the original author: maybe unify with PosInfo?)
+struct JumpCell {
+    // Index of the k-mer's slot in the jump tables.
+    uint32_t merIdx;
+    // Position of the occurrence within the transcript.
+    int32_t pos;
+    // Whether the occurrence was recorded as reverse-complemented.
+    bool isRC;
+
+    JumpCell(uint32_t merIdxIn, int32_t posIn, bool isRCIn)
+        : merIdx(merIdxIn),
+          pos(posIn),
+          isRC(isRCIn) {}
+};
+
+// Debug helper: writes the elements of `vec` to stderr in the form
+// "{ e0, e1, ..., }" (every element is followed by ", "; no trailing newline).
+template <typename T>
+void printVec(std::vector<T>& vec) {
+    std::cerr << "{ ";
+    for (auto it = vec.begin(); it != vec.end(); ++it) {
+        std::cerr << *it << ", ";
+    }
+    std::cerr << "}";
+}
+
+// Reports whether `canonicalMer` has more than one neighbouring k-mer present
+// in `merIntMap` in either direction.
+//
+// For each of the four bases, the k-mer is shifted by that base (left for the
+// forward direction, right for the backward one), canonicalized and looked up
+// in the hash. Returns true as soon as a second neighbour is found on one
+// side; the forward side is examined first. Returns false when both sides
+// have at most one neighbour.
+//
+// As the original author notes, there may be a better way to do this; here
+// the possible neighbours are simply enumerated.
+bool isBreakpoint(MerMapT& merIntMap, rapmap::utils::my_mer canonicalMer) {
+    const auto& ary = merIntMap.ary();
+    static const char nucleotides[] = {'A', 'C', 'G', 'T'};
+
+    // True when more than one single-base extension in the given direction
+    // is a key of the hash.
+    auto hasMultipleNeighbors = [&](bool forward) -> bool {
+        uint32_t degree = 0;
+        for (char base : nucleotides) {
+            rapmap::utils::my_mer neighbor = canonicalMer;
+            if (forward) {
+                neighbor.shift_left(base);
+            } else {
+                neighbor.shift_right(base);
+            }
+            neighbor.canonicalize();
+            if (ary->has_key(neighbor)) {
+                ++degree;
+            }
+            if (degree > 1) {
+                return true;
+            }
+        }
+        return false;
+    };
+
+    // extend forward, then backward
+    return hasMultipleNeighbors(true) or hasMultipleNeighbors(false);
+}
+
+// Drains `jumpQueue`, recording for every queued k-mer how far it lies from
+// the surrounding breakpoints.
+//
+// For each cell (processed from the back of the queue to the front):
+//   - the forward distance is pos - cell.pos + 1,
+//   - the reverse distance is cell.pos - lastBreak + 1,
+// each capped at the largest value the jump-table element type can hold and
+// then narrowed to uint8_t. The entry for cell.merIdx in `fwdJump` /
+// `revJump` is lowered to that distance if the distance is smaller than the
+// value already stored. `jumpQueue` is empty on return.
+//
+// `merIntMap` is accepted but not used.
+void emptyJumpQueue(std::vector<JumpCell>& jumpQueue, int32_t lastBreak,
+                    int32_t pos,
+                    MerMapT& merIntMap,
+                    std::vector<uint8_t>& fwdJump,
+                    std::vector<uint8_t>& revJump) {
+
+    // The maximum representable jump
+    using JumpT = std::remove_reference<decltype(fwdJump[0])>::type;
+    constexpr int32_t jumpCap = std::numeric_limits<JumpT>::max();
+
+    for (auto cell = jumpQueue.rbegin(); cell != jumpQueue.rend(); ++cell) {
+        const int32_t distBack = cell->pos - lastBreak + 1;
+        const int32_t distAhead = pos - cell->pos + 1;
+
+        const uint8_t revJumpDist =
+            static_cast<uint8_t>(jumpCap < distBack ? jumpCap : distBack);
+        const uint8_t fwdJumpDist =
+            static_cast<uint8_t>(jumpCap < distAhead ? jumpCap : distAhead);
+
+        // Keep the smallest jump seen so far for this k-mer.
+        uint8_t& fwdSlot = fwdJump[cell->merIdx];
+        if (fwdJumpDist < fwdSlot) {
+            fwdSlot = fwdJumpDist;
+        }
+        uint8_t& revSlot = revJump[cell->merIdx];
+        if (revJumpDist < revSlot) {
+            revSlot = revJumpDist;
+        }
+    }
+    // Every cell has been recorded, so none needs to be kept around.
+    jumpQueue.clear();
+}
+
+// To use the parser in the following, we get "jobs" until none is
+// available. A job behaves like a pointer to the type
+// jellyfish::sequence_list (see whole_sequence_parser.hpp).
+template <typename ParserT>//, typename CoverageCalculator>
+void processTranscripts(ParserT* parser,
+			std::string& outputDir,
+                        std::mutex& iomutex) {
+    // Seed with a real random value, if available
+    std::random_device rd;
+
+    // Create a random uniform distribution
+    std::default_random_engine eng(rd());
+
+    std::uniform_int_distribution<> dis(0, 3);
+
+    uint32_t n{0};
+    uint32_t k = rapmap::utils::my_mer::k();
+    std::vector<std::string> transcriptNames;
+    std::vector<uint32_t> transcriptLengths;
+    constexpr char bases[] = {'A', 'C', 'G', 'T'};
+    uint32_t polyAClipLength{10};
+    uint32_t numPolyAsClipped{0};
+    std::string polyA(polyAClipLength, 'A');
+
+    using TranscriptList = std::vector<uint32_t>;
+    using eager_iterator = MerMapT::array::eager_iterator;
+    using KmerBinT = uint64_t;
+    //create the hash
+    size_t hashSize = 100000000;
+    MerMapT merIntMap(hashSize, rapmap::utils::my_mer::k()*2, 32, 1, 126);
+    std::vector<rapmap::utils::KmerInfo> kmerInfos;
+
+    std::vector<std::string> transcriptSeqs;
+    size_t numDistinctKmers{0};
+    size_t numKmers{0};
+
+    std::cerr << "\n[Step 1 of 4] : counting k-mers\n";
+
+    {
+        ScopedTimer timer;
+        while(true) {
+            typename ParserT::job j(*parser);
+            if(j.is_empty()) break;
+
+            for(size_t i = 0; i < j->nb_filled; ++i) { // For each sequence
+                std::string& readStr = j->data[i].seq;
+
+		// Do Kallisto-esque clipping of polyA tails
+		if (readStr.size() > polyAClipLength and
+		    readStr.substr(readStr.length() - polyAClipLength) == polyA) {
+
+		    auto newEndPos = readStr.find_last_not_of("Aa");
+		    // If it was all As
+		    if (newEndPos == std::string::npos) {
+			readStr.resize(0);
+		    } else {
+			readStr.resize(newEndPos + 1);
+		    }
+		    ++numPolyAsClipped;
+		}
+
+                uint32_t readLen  = readStr.size();
+                uint32_t txpIndex = n++;
+                transcriptLengths.push_back(readLen);
+                auto& recHeader = j->data[i].header;
+                transcriptNames.emplace_back(recHeader.substr(0, recHeader.find_first_of(" \t")));
+
+                rapmap::utils::my_mer mer;
+                mer.polyT();
+                for (size_t b = 0; b < readLen; ++b) {
+                    int c = jellyfish::mer_dna::code(readStr[b]);
+                    if (jellyfish::mer_dna::not_dna(c)) {
+                        char rbase = bases[dis(eng)];
+                        c = jellyfish::mer_dna::code(rbase);
+                        readStr[b] = rbase;
+                    }
+                    mer.shift_left(c);
+                    if (b >= k) {
+                        auto canonicalMer = mer.get_canonical();
+                        auto key = canonicalMer.get_bits(0, 2*k);
+
+                        uint64_t val;
+                        auto found = merIntMap.ary()->get_val_for_key(canonicalMer, &val);
+                        if (!found) {
+                            merIntMap.add(canonicalMer, numDistinctKmers);
+                            kmerInfos.emplace_back(txpIndex, 0, 1);
+                            ++numDistinctKmers;
+                        } else {
+                            kmerInfos[val].count++;
+                        }
+
+                        /*
+                           start = std::chrono::high_resolution_clock::now();
+                           auto it = cmap.find(key);
+                           end = std::chrono::high_resolution_clock::now();
+                           elapsedNS += end - start;
+                        // If we found the k-mer, increment the count
+                        if (it != cmap.end()) {
+                        it->second.count++;
+                        } else { // Otherwise, add it
+                        cmap[key] = KmerInfo(txpIndex, 0, 1);
+                        }
+                        */
+                        // No matter what, our k-mer count increased
+                        numKmers++;
+                    }
+                }
+                transcriptSeqs.push_back(j->data[i].seq);
+                if (n % 10000 == 0) {
+                    std::cerr << "\r\rcounted k-mers for " << n << " transcripts";
+                }
+            }
+        }
+        std::cerr << "\n";
+    }
+
+    std::cerr << "Clipped poly-A tails from " << numPolyAsClipped << " transcripts\n";
+
+    std::ofstream txpLenStream(outputDir + "txplens.bin", std::ios::binary);
+    {
+        cereal::BinaryOutputArchive txpLenArchive(txpLenStream);
+        txpLenArchive(transcriptLengths);
+    }
+    txpLenStream.close();
+    transcriptLengths.clear();
+    transcriptLengths.shrink_to_fit();
+
+    constexpr uint32_t uint32Invalid = std::numeric_limits<uint32_t>::max();
+    std::vector<uint32_t> transcriptIDs(numKmers, uint32Invalid);
+
+    std::cerr << "\n[Step 2 of 4] : marking k-mers\n";
+    // Mark the transcript in which each occurence oc a k-mer appears
+    // in the transcriptIDs vector.
+
+
+    bool isRC{false};
+    int32_t pos{0};
+    uint32_t offset{0};
+    uint32_t transcriptID{0};
+    {
+    ScopedTimer timer;
+
+
+    for (auto& transcriptSeq : transcriptSeqs) {
+        auto readLen = transcriptSeq.length();
+        rapmap::utils::my_mer mer;
+        mer.polyT();
+        std::vector<KmerBinT> kmers;
+        for (size_t b = 0; b < readLen; ++b) {
+            int c = jellyfish::mer_dna::code(transcriptSeq[b]);
+            mer.shift_left(c);
+            if (b >= k) {
+                auto canonicalMer = mer.get_canonical();
+                uint64_t kmerIndex;
+                auto found = merIntMap.ary()->get_val_for_key(canonicalMer, &kmerIndex);
+                // Should ALWAYS find the key
+                assert(found);
+
+
+                auto& v = kmerInfos[kmerIndex];
+                // use the highest bit to mark if we've seen this k-mer yet or not
+                // If we haven't seen this k-mer yet
+                if (!wasSeen(v.count)) {
+                   // Where we start looking for transcripts for this k-mer
+                   v.offset = offset;
+                   offset += v.count;
+                   // The number of transcripts we've currently added
+                   v.count = 0;
+                   markSeen(v.count);
+                }
+
+                // Note: We allow duplicate transcripts here --- they will always be adjacent
+                auto lastOffset = v.offset + unmarked(v.count);
+                transcriptIDs[lastOffset] = transcriptID;
+                v.count++;
+            }
+
+
+        }
+        if (transcriptID % 10000 == 0) {
+            std::cerr << "\r\rmarked kmers for " << transcriptID << " transcripts";
+        }
+        ++transcriptID;
+    }
+    	std::cerr << "\n";
+    }
+
+    //printVec(transcriptIDs);
+    // A hash to quickly and easily determine the equivalence classes
+    std::unordered_map<std::vector<uint32_t>, uint32_t, VectorHasher> eqClassMap;
+    // Holds the members of each equivalence class in the order in which classes
+    // are assigned.  The final size should be \sum_{c \in eqclasses} |c|.
+    std::vector<uint32_t> eqClassLabelVec;
+    // Holds pointer information about a k-mer's equivalence class.
+    // Specifically, where the label for the eq can be found
+    // as an offset and length into eqClassTxpVec, and where the
+    std::vector<rapmap::utils::EqClass> eqClasses;
+
+    uint32_t eqClassVecSize{0};
+
+    std::cerr << "\n[Step 3 of 4] : building k-mers equivalence classes\n";
+    // Compute the equivalence classes for the k-mers
+    {
+        ScopedTimer timer;
+        const auto ary = merIntMap.ary();
+        auto hashIt = ary->iterator_all<eager_iterator>();
+        std::vector<uint32_t> tlist;
+        while (hashIt.next()) {
+            auto& key = hashIt.key();
+            auto& val = kmerInfos[hashIt.val()];
+            //auto& val = kv.second;
+            auto offset = val.offset;
+            auto num = unmarked(val.count);
+
+            tlist.clear();
+            tlist.reserve(num);
+
+            for (size_t idx = offset; idx < offset + num; ++idx) {
+                auto tid = transcriptIDs[idx];
+                // We won't consider duplicate transcript IDs when building the
+                // equivalence classes
+		//
+		if (tlist.size() > 0 and tlist.back() > tid) {
+		    std::cerr << "Non monotnoically increasing transcript id!\n";
+		}
+                if (tlist.size() == 0 or tlist.back() != tid) {
+                    tlist.push_back(tid);
+                }
+            }
+
+            auto eqIt = eqClassMap.find(tlist);
+            uint32_t eqId{0};
+            // If there is no such equivalence class yet, then add it
+            if (eqIt == eqClassMap.end()) {
+                eqId = eqClassMap.size();
+                eqClassMap[tlist] = eqId;
+                // The label of this eq-class starts at eqClassVecSize and is
+                // tlist.size() elements long.
+                eqClasses.emplace_back(eqClassVecSize, tlist.size());
+                // Insert the label information into eqClassTxpVec
+                eqClassLabelVec.insert(eqClassLabelVec.end(), tlist.begin(), tlist.end());
+                eqClassVecSize += tlist.size();
+            } else {
+                eqId = eqIt->second;
+            }
+            // Set the equivalence class ID here for this transcript
+            val.eqId = eqId;
+            val.count = 0;
+        }
+    	std::cerr << "done! There were " << eqClassMap.size() << " classes\n";
+    }
+    // reuse the transcript IDs vector
+    auto& posVec = transcriptIDs;
+
+
+    constexpr uint8_t maxJump = std::numeric_limits<uint8_t>::max();
+    // Also, attempt to build *jump* tables here!
+    // How far we can move "forward" before hitting a new eq. class
+    std::vector<uint8_t> fwdJump(numDistinctKmers, maxJump);
+    // How far we can move "forward" backward hitting a new eq. class
+    std::vector<uint8_t> revJump(numDistinctKmers, maxJump);
+    int32_t lastBreak{0};
+
+    std::cerr << "\n[Step 4 of 4] : finalizing index\n";
+    transcriptID = 0;
+    {
+        ScopedTimer finalizeTimer;
+        // Local vector to hold k-mers per transcript
+        btree::btree_map<rapmap::utils::my_mer,
+                         std::vector<PosInfo>> posHash;//std::vector<PosInfo> posInfos;
+
+        // k-mers in the forward orientation w.r.t the reference txp
+        std::vector<JumpCell> fwdJumpQueue;
+        // k-mers in the reverse orientation w.r.t the reference txp
+        std::vector<JumpCell> revJumpQueue;
+
+        for (auto& transcriptSeq : transcriptSeqs) {
+	    // We can always jump to the beginning of a
+	    // new transcript
+	    lastBreak = 0;
+
+            fwdJumpQueue.clear();
+            revJumpQueue.clear();
+            posHash.clear();
+            auto readLen = transcriptSeq.length();
+            uint32_t currEqClass;
+            uint32_t prevEqClass = std::numeric_limits<uint32_t>::max();
+            rapmap::utils::my_mer mer;
+            mer.polyT();
+            for (size_t b = 0; b < readLen; ++b) {
+                int c = jellyfish::mer_dna::code(transcriptSeq[b]);
+                mer.shift_left(c);
+                if (b >= k) {
+                    auto canonicalMer = mer.get_canonical();
+                    bool isRC = (mer != canonicalMer);
+
+                    uint64_t kmerIndex;
+                    auto found = merIntMap.ary()->get_val_for_key(
+                                            canonicalMer, &kmerIndex);
+
+                    auto& val = kmerInfos[kmerIndex];
+                    currEqClass = val.eqId;
+
+                    // Record the position of this k-mer in the transcript
+                    uint32_t pos = b - k;
+                    if (pos > readLen) {
+                        std::cerr << "Pos is " << pos << ", but transcript length is " << readLen << "\n";
+                    }
+
+                    // === Jumping
+                    // if we hit a node with in-degree > 1 or out-degree > 1
+                    // then this defines the new breakpoint.  At this time, we
+                    // clear out the queues and mark the appropriate skips for
+                    // each k-mer we encountered.
+                    if ( currEqClass != prevEqClass ) {
+                        // For each k-mer in the forward direction, it can
+                        // skip forward to this breakpoint, which is at
+                        // position pos.
+                        emptyJumpQueue(fwdJumpQueue, lastBreak, pos, merIntMap,
+                                fwdJump, revJump);
+                        // The only difference here is that we reverse the
+                        // revJump and fwdJump arguments, since these are RC
+                        // mers.
+                        emptyJumpQueue(revJumpQueue, lastBreak, pos, merIntMap,
+                                revJump, fwdJump);
+                        lastBreak = pos;
+                    }
+                    // Does this k-mer exists in the table in the forward
+                    // or reverse complement direction.
+                    if (isRC) {
+                        revJumpQueue.emplace_back(kmerIndex, pos, isRC);
+                    } else {
+                        fwdJumpQueue.emplace_back(kmerIndex, pos, isRC);
+                    }
+                    prevEqClass = currEqClass;
+                    // === Jumping
+
+                    //posInfos.emplace_back(canonicalMer, isRC, pos);
+                    if (posHash[canonicalMer].size() > 0) {
+                        if (pos < posHash[canonicalMer].back().pos) {
+                            std::cerr << "NON-MONOTONIC POS\n";
+                        }
+                    }
+                    posHash[canonicalMer].emplace_back(kmerIndex, isRC, pos);
+                }
+            }
+
+            // === Jumping
+            // Empty anything remaining out of the jump queues
+            //
+            // The last k-mer in the transcript is, by definition a breakpoint.
+            // So, empty the queues.
+            emptyJumpQueue(fwdJumpQueue, lastBreak, pos, merIntMap,
+                    fwdJump, revJump);
+            // The only difference here is that we reverse the
+            // revJump and fwdJump arguments, since these are RC
+            // mers.
+            emptyJumpQueue(revJumpQueue, lastBreak, pos, merIntMap,
+                    revJump, fwdJump);
+            // === Jumping
+
+            for (auto kv = posHash.begin(); kv != posHash.end(); ++kv) {
+                    auto mer = kv->first;
+                    auto& list = kv->second;
+                    // Should ALWAYS find the key
+                    assert(found);
+                    auto& val = kmerInfos[list.front().merID];
+                    uint32_t offset;
+                    markNewTxpBit(list.front().pos);
+                    for (auto& pi : list) {
+                        offset = val.offset + val.count;
+                        if (pi.isRC) {
+                            markRCBit(pi.pos);
+                        }
+                        transcriptIDs[offset] = pi.pos;
+                        ++val.count;
+                    }
+                }
+            /*
+            std::sort(posInfos.begin(), posInfos.end(),
+                      [] (const PosInfo& a, const PosInfo& b) -> bool {
+                        if (a.mer < b.mer) {
+                            return true;
+                        } else if (a.mer == b.mer) {
+                            return a.pos < b.pos;
+                        } else {
+                            return false;
+                        }
+                      });
+            PosInfo* prev{nullptr};
+            for (auto& posInfo : posInfos) {
+                    uint64_t kmerIndex;
+                    auto found = merIntMap.ary()->get_val_for_key(
+                                    posInfo.mer, &kmerIndex);
+                    // Should ALWAYS find the key
+                    assert(found);
+                    auto& val = kmerInfos[kmerIndex];
+                    auto offset = val.offset + val.count;
+                    // Check if this offset is for a new transcript in the
+                    // position list and, if so, set the proper flag bit.
+                    if ( prev == nullptr or prev->mer != posInfo.mer) {
+                        markNewTxpBit(posInfo.pos);
+                    } else {
+                        // These are the same k-mer so the pos better
+                        // be increasing!
+                        if (prev != nullptr) {
+                            if ((prev->pos & uint30LowBitMask) >= posInfo.pos) {
+                                std::cerr << "prev pos = " << (prev->pos & uint30LowBitMask)
+                                          << ", but curr pos = " << posInfo.pos
+                                          << "\n";
+                            }
+                        }
+                    }
+                    if ( posInfo.isRC ) {
+                        markRCBit(posInfo.pos);
+                    }
+        		    prev = &posInfo;
+                    transcriptIDs[offset] = posInfo.pos;
+                    ++val.count;
+                }
+            */
+            if (transcriptID % 10000 == 0) {
+                std::cerr << "\r\rfinalized kmers for " << transcriptID << " transcripts";
+            }
+            ++transcriptID;
+        }
+	std::cerr << "\n";
+    }
+
+    /*
+    std::cerr << "[DEBUG]: Verifying Index\n";
+    {
+        ScopedTimer st;
+    auto hashIt = merIntMap.ary()->iterator_all<eager_iterator>();
+    std::vector<uint32_t> tlist;
+    while (hashIt.next()) {
+        auto& key = hashIt.key();
+        auto& val = kmerInfos[hashIt.val()];
+        if (!(*(transcriptIDs.begin() + val.offset) & uint32HighestBitMask)) {
+            std::cerr << "found un-marked k-mer position at beginning of list!\n";
+        }
+    }
+    }
+    */
+    // merIntMap --- jf hash
+    // kmerInfos --- info for each k-mer
+    // std::vector<uint32_t> eqClassLabelVec --- transcripts for each eq class
+    // std::vector<EqClass> eqClasses --- where each eq class starts and ends
+    // transcriptIDs --- position vec
+
+    std::cerr << "Writing the index to " << outputDir << "\n";
+
+    using JFFileHeader = jellyfish::file_header;
+    using JFDumper = jellyfish::binary_dumper<MerMapT::array>;
+
+    SpecialHeader fh;
+    fh.update_from_ary(*merIntMap.ary());
+    fh.canonical(true);
+    fh.format("gus/special"); // Thanks, Guillaume
+    fh.counter_len(8*sizeof(uint32_t)); // size of counter in bits
+    fh.fill_standard();
+    //fh.set_cmdline(argc, argv);
+
+    std::ofstream jfos(outputDir + "rapidx.jfhash");
+    fh.write(jfos);
+    merIntMap.ary()->write(jfos);
+    jfos.close();
+
+    // === Dump the jump tables to disk and reclaim the space
+    std::ofstream fwdJumpStream(outputDir + "fwdjump.bin", std::ios::binary);
+    {
+        cereal::BinaryOutputArchive fwdJumpArchive(fwdJumpStream);
+        fwdJumpArchive(fwdJump);
+    }
+    fwdJumpStream.close();
+    fwdJump.clear();
+    fwdJump.shrink_to_fit();
+
+    std::ofstream revJumpStream(outputDir + "revjump.bin", std::ios::binary);
+    {
+        cereal::BinaryOutputArchive revJumpArchive(revJumpStream);
+        revJumpArchive(revJump);
+    }
+    revJumpStream.close();
+    revJump.clear();
+    revJump.shrink_to_fit();
+    // === Done dumping the jump tables
+
+
+
+
+
+    std::ofstream kinfoStream(outputDir + "kinfo.bin", std::ios::binary);
+    {
+        cereal::BinaryOutputArchive kinfoArchive(kinfoStream);
+        kinfoArchive(kmerInfos);
+    }
+    kinfoStream.close();
+
+  std::ofstream eqLabelStream(outputDir + "eqlab.bin", std::ios::binary);
+  {
+      cereal::BinaryOutputArchive eqLabelArchive(eqLabelStream);
+      eqLabelArchive(eqClassLabelVec);
+  }
+  eqLabelStream.close();
+
+  std::ofstream eqClassStream(outputDir + "eqclass.bin", std::ios::binary);
+  {
+      cereal::BinaryOutputArchive eqClassArchive(eqClassStream);
+      eqClassArchive(eqClasses);
+  }
+  eqClassStream.close();
+
+  std::ofstream posStream(outputDir + "pos.bin", std::ios::binary);
+  {
+      cereal::BinaryOutputArchive posArchive(posStream);
+      posArchive(transcriptIDs);
+  }
+  posStream.close();
+  std::ofstream txpStream(outputDir + "txpnames.bin", std::ios::binary);
+  {
+    cereal::BinaryOutputArchive txpArchive(txpStream);
+    txpArchive(transcriptNames);
+  }
+  txpStream.close();
+
+  std::string indexVersion = "p0";
+  IndexHeader header(IndexType::PSEUDO, indexVersion, true, k);
+  // Finally (since everything presumably succeeded) write the header
+  std::ofstream headerStream(outputDir + "header.json");
+  {
+    cereal::JSONOutputArchive archive(headerStream);
+    archive(header);
+  }
+  headerStream.close();
+
+
+  std::cerr << "transcriptIDs.size() = " << transcriptIDs.size() << "\n";
+  std::cerr << "parsed " << transcriptNames.size() << " transcripts\n";
+  std::cerr << "There were " << numDistinctKmers << " distinct k-mers (canonicalized)\n";
+
+  // Data structure idea:
+  // k-mer => equivalence class, position array offset
+  // equivalence class = ordered (unique) list of transcripts
+  // position array = *self-delimited* list of positions with the same order as txps in the equivalence class
+}
+
+
+
+/**
+ * Command-line entry point of the RapMap indexer.
+ *
+ * Parses the `-t/--transcripts`, `-i/--index` and `-k/--klen` options,
+ * validates k (must be odd and at most 31), makes sure the output directory
+ * exists (creating it if needed) and then hands a sequence parser over the
+ * transcript file to processTranscripts(), which builds and writes the index.
+ *
+ * Exits the process with status 1 on invalid arguments; returns 0 otherwise.
+ */
+int rapMapIndex(int argc, char* argv[]) {
+    std::cerr << "RapMap Indexer\n";
+
+    TCLAP::CmdLine cmd("RapMap Indexer");
+    TCLAP::ValueArg<std::string> transcripts("t", "transcripts", "The transcript file to be indexed", true, "", "path");
+    TCLAP::ValueArg<std::string> index("i", "index", "The location where the index should be written", true, "", "path");
+    TCLAP::ValueArg<uint32_t> kval("k", "klen", "The length of k-mer to index", false, 31, "positive integer less than 32");
+    cmd.add(transcripts);
+    cmd.add(index);
+    cmd.add(kval);
+
+    cmd.parse(argc, argv);
+
+    // stupid parsing for now: a single transcript file
+    std::string transcriptFile(transcripts.getValue());
+    std::vector<std::string> transcriptFiles({ transcriptFile });
+
+    uint32_t k = kval.getValue();
+    if (k % 2 == 0) {
+        std::cerr << "Error: k must be an odd value, you chose " << k << '\n';
+        std::exit(1);
+    } else if (k > 31) {
+        std::cerr << "Error: k must not be larger than 31, you chose " << k << '\n';
+        std::exit(1);
+    }
+    rapmap::utils::my_mer::k(k);
+
+    std::string indexDir = index.getValue();
+    // std::string::back() on an empty string is undefined behaviour, so an
+    // explicitly empty `-i ""` has to be rejected before we look at it.
+    if (indexDir.empty()) {
+        std::cerr << "Error: the index directory path must not be empty\n";
+        std::exit(1);
+    }
+    // All output file names are built as indexDir + "<name>", so the path
+    // must end with a separator.
+    if (indexDir.back() != '/') {
+        indexDir += '/';
+    }
+    bool dirExists = rapmap::fs::DirExists(indexDir.c_str());
+    bool dirIsFile = rapmap::fs::FileExists(indexDir.c_str());
+    if (dirIsFile) {
+        std::cerr << "The requested index directory already exists as a file.\n";
+        std::exit(1);
+    }
+    if (!dirExists) {
+        rapmap::fs::MakeDir(indexDir.c_str());
+    }
+
+    size_t maxReadGroup{1000}; // Number of reads in each "job"
+    size_t concurrentFile{2}; // Number of files to read simultaneously
+    size_t numThreads{2};
+    stream_manager streams(transcriptFiles.begin(), transcriptFiles.end(), concurrentFile);
+    std::unique_ptr<single_parser> transcriptParserPtr(
+            new single_parser(4 * numThreads, maxReadGroup,
+                              concurrentFile, streams));
+    std::mutex iomutex;
+    processTranscripts(transcriptParserPtr.get(), indexDir, iomutex);
+    return 0;
+}
+
+
diff --git a/src/RapMapMapper.cpp b/src/RapMapMapper.cpp
new file mode 100644
index 0000000..273261c
--- /dev/null
+++ b/src/RapMapMapper.cpp
@@ -0,0 +1,1427 @@
+#include <iostream>
+#include <mutex>
+#include <vector>
+#include <random>
+#include <unordered_map>
+#include <fstream>
+#include <algorithm>
+#include <iterator>
+#include <cstdio>
+#include <cstring>
+#include <cstdlib>
+#include <thread>
+#include <tuple>
+#include <sstream>
+
+#include <cereal/types/unordered_map.hpp>
+#include <cereal/types/vector.hpp>
+#include <cereal/types/string.hpp>
+#include <cereal/archives/binary.hpp>
+#include <cereal/archives/json.hpp>
+
+#include "IndexHeader.hpp"
+#include "HitManager.hpp"
+//#include "SIMDCompressionAndIntersection/intersection.h"
+#include "xxhash.h"
+
+#include "spdlog/spdlog.h"
+#include "spdlog/details/format.h"
+
+// Jellyfish 2 include
+#include "jellyfish/mer_dna.hpp"
+#include "jellyfish/stream_manager.hpp"
+#include "jellyfish/whole_sequence_parser.hpp"
+#include "jellyfish/hash_counter.hpp"
+
+#include "tclap/CmdLine.h"
+
+/*extern "C" {
+#include "kseq.h"
+}
+*/
+#include "stringpiece.h"
+
+#include "PairAlignmentFormatter.hpp"
+#include "SingleAlignmentFormatter.hpp"
+#include "PairSequenceParser.hpp"
+#include "RapMapUtils.hpp"
+#include "RapMapIndex.hpp"
+#include "RapMapFileSystem.hpp"
+#include "RapMapConfig.hpp"
+#include "ScopedTimer.hpp"
+#include "SpinLock.hpp"
+
+// #define __DEBUG__
+// #define __TRACK_CORRECT__
+
+
+// STEP 1: declare the type of file handler and the read() function
+// KSEQ_INIT(int, read)
+
+// --- Parser types -----------------------------------------------------------
+// Parser over paired read files and parser over single read files (the latter
+// is jellyfish's whole-sequence parser fed by a stream manager that iterates
+// over a vector of file names).
+using paired_parser = pair_sequence_parser<char**>;
+using stream_manager = jellyfish::stream_manager<std::vector<std::string>::const_iterator>;
+using single_parser = jellyfish::whole_sequence_parser<stream_manager>;
+// --- Index-related container aliases -----------------------------------------
+using TranscriptID = uint32_t;
+using TranscriptIDVector = std::vector<TranscriptID>;
+using KmerIDMap = std::vector<TranscriptIDVector>;
+using TranscriptList = std::vector<uint32_t>;
+using PositionList = std::vector<uint32_t>;
+// Hash maps keyed by a 64-bit value and hashed with rapmap's KmerKeyHasher.
+using KmerIndex = std::unordered_map<uint64_t, TranscriptList, rapmap::utils::KmerKeyHasher>;
+using IntervalIndex = std::unordered_map<uint64_t, rapmap::utils::KmerInterval, rapmap::utils::KmerKeyHasher>;
+using OccList = std::vector<uint64_t>;
+using KmerInfoList = std::vector<rapmap::utils::KmerInfo>;
+using EqClassList = std::vector<rapmap::utils::EqClass>;
+using EqClassLabelVec = std::vector<uint32_t>;
+using PositionListHelper = rapmap::utils::PositionListHelper;
+// Lock type alias: the project's SpinLock on Apple platforms, std::mutex
+// everywhere else.
+#if defined __APPLE__
+using SpinLockT = SpinLock;
+#else
+using SpinLockT = std::mutex;
+#endif
+
+// No-op stand-in for a mutex, meant for single-threaded execution: lock()
+// and unlock() do nothing and try_lock() always reports success.
+class NullMutex {
+public:
+    void lock() {}
+    bool try_lock() { return true; }
+    void unlock() {}
+};
+
+
+
+constexpr char bases[] = {'A', 'C', 'G', 'T'};
+
+// Upper 32 bits of a packed 64-bit value, returned as int32_t.
+inline int32_t tid(uint64_t x) {
+    const uint32_t upper = static_cast<uint32_t>(x >> 32);
+    return upper;
+}
+// Lower 32 bits of a packed 64-bit value, returned as int32_t.
+inline int32_t pos(uint64_t x) {
+    const uint32_t lower = static_cast<uint32_t>(x);
+    return lower;
+}
+
+// File-scope random number machinery.
+// Seed source: a real random value, if the platform provides one.
+std::random_device rd;
+
+// Default engine, seeded once from `rd` during static initialization.
+std::default_random_engine eng(rd());
+
+// Uniform integer distribution over the closed range [0, 3]
+// (presumably used to index `bases` -- no use is visible in this part of the file).
+std::uniform_int_distribution<> dis(0, 3);
+
+using HitCounters = rapmap::utils::HitCounters;
+using MateStatus = rapmap::utils::MateStatus;
+using HitInfo = rapmap::utils::HitInfo;
+using ProcessedHit = rapmap::utils::ProcessedHit;
+using QuasiAlignment = rapmap::utils::QuasiAlignment;
+using FixedWriter = rapmap::utils::FixedWriter;
+
+
+
+
+// Appends at most one hit for transcript `tid` to `hits`.
+//
+// If the position-list helper `ph` is not exhausted, the position it
+// currently points at is turned into a QuasiAlignment (tagged with
+// `mateStatus`) and `ph` is moved on via advanceToNextTranscript().
+// Only the first position is reported for now.
+//
+// Returns true iff a hit was appended; when `ph` is already done, neither
+// `hits` nor `ph` is touched.
+bool collectAllHits(uint32_t tid,
+        uint32_t readLen,
+        bool hitRC,
+        PositionListHelper& ph,
+        std::vector<QuasiAlignment>& hits,
+        MateStatus mateStatus) {
+    if (ph.done()) { return false; }
+
+    const bool kmerIsRC = ph.isRC();
+    const int32_t hitPos = ph.pos();
+    // The read is flagged reverse-complement when the orientation stored in
+    // the position list disagrees with the orientation of the query k-mer.
+    const bool readIsRC = (kmerIsRC != hitRC);
+
+    hits.emplace_back(tid, hitPos, readIsRC, readLen);
+    hits.back().mateStatus = mateStatus;
+
+    ph.advanceToNextTranscript();
+    return true;
+}
+
+
+// Looks, within a single transcript, for a pair of positions -- one from the
+// position list walked by leftPH and one from the list walked by rightPH --
+// such that abs(leftPos - rightPos) < maxDist.  If such a pair is found, one
+// hit is appended to hits (tagged with mateStatus) and the function returns
+// true --- otherwise hits remains unchanged and the function returns false.
+//
+// The two lists are walked like a merge: whichever side currently has the
+// smaller position is advanced.  Before returning, each helper that is still
+// flagged as advanceable is moved on with advanceToNextTranscript().
+//
+// NOTE(review): rightHitRC and several locals (nextTxpLeft/Right,
+// readStrandLeft/Right, isRCRight, leftPosQueue, rightPosQueue) are never
+// read in this function.
+bool collectHitsWithPositionConstraint(uint32_t tid,
+		uint32_t readLen,
+		bool leftHitRC,
+		bool rightHitRC,
+		uint32_t leftQueryPos,
+		uint32_t rightQueryPos,
+		PositionListHelper& leftPH,
+		PositionListHelper& rightPH,
+		uint32_t maxDist,
+		std::vector<QuasiAlignment>& hits,
+		MateStatus mateStatus){
+	bool foundHit{false};
+	// A side may only be advanced while its helper has not run out.
+	bool canAdvance = true, canAdvanceLeft = !leftPH.done(), canAdvanceRight = !rightPH.done();
+	// The first position should always be a nextTxp, but we don't care
+	bool nextTxpLeft = false, nextTxpRight = false;
+	bool isRCLeft, isRCRight;
+	// True if the k-mer thinks the read is from fwd, false if from rc
+	bool readStrandLeft, readStrandRight;
+	int32_t leftPos, rightPos;
+	std::vector<PositionListHelper> leftPosQueue, rightPosQueue;
+
+#ifdef __DEBUG__
+	if (!leftPH.isNewTxp()) {
+		std::cerr << "leftPH = (" << leftPH.pos()
+			<< ", " << leftPH.isNewTxp() << "), but should be start of "
+			"new txp list";
+	}
+
+	if (!rightPH.isNewTxp()) {
+		std::cerr << "rightPH = (" << rightPH.pos()
+			<< ", " << rightPH.isNewTxp() << "), but should be start of "
+			"new txp list";
+	}
+#endif // __DEBUG__
+
+	// NOTE(review): the first iteration reads pos()/isRC() from both helpers
+	// without checking done() -- presumably callers only pass helpers that
+	// still point at a valid entry; confirm.
+	while (canAdvance) {
+		leftPos = leftPH.pos();
+		rightPos = rightPH.pos();
+		isRCLeft = leftPH.isRC();
+		isRCRight = rightPH.isRC();
+
+		int32_t posDiff = rightPos - leftPos;
+		uint32_t fragLen = std::abs(posDiff);
+		// We found a hit (potentially -- what do we do about RCs here?)
+		// I think we need to know if the k-mer from the *read* was fw or rc
+		if (fragLen < maxDist) {
+			// Orientation is decided from the left k-mer only.
+			bool isRC = (leftHitRC != isRCLeft);
+			// Report the smaller of the two positions, shifted back by the
+			// offset of the corresponding k-mer inside the read.
+			int32_t hitPos = (leftPos < rightPos) ? leftPos - leftQueryPos :
+				rightPos - rightQueryPos;
+			hits.emplace_back(tid, hitPos, isRC, readLen);
+			hits.back().mateStatus = mateStatus;
+			foundHit = true;
+			break;
+		}
+		// rightPos >= leftPos (advance left)
+		if (posDiff > 0) {
+			// If we can't advance the left but we need to, we're done
+			if (!canAdvanceLeft) { break; }
+			++(leftPH.it_);
+			// Stepping onto the next transcript's entries (or off the end)
+			// ends the search for this transcript.
+			if (leftPH.isNewTxp() or leftPH.done()) {
+				canAdvanceLeft = false;
+				break;
+			}
+		} else if (posDiff < 0) { // leftPos > rightPos (advance right)
+			// If we can't advance the right but we need to, we're done
+			if (!canAdvanceRight) { break; }
+			++(rightPH.it_);
+			if (rightPH.isNewTxp() or rightPH.done()) {
+				canAdvanceRight = false;
+				break;
+			}
+		} else { // posDiff == 0 (advance both)
+			/**
+			 * This is a strange case --- both k-mers (from different places)
+			 * in the read map to the same position.  First, this should
+			 * probably be a hit (i.e. < maxDist).  If not, is advancing
+			 * both the right thing to do?
+			 */
+			// If we can't advance the left but we need to, we're done
+			if (!canAdvanceLeft) { break; }
+			++(leftPH.it_);
+			if (leftPH.isNewTxp() or leftPH.done()) {
+				canAdvanceLeft = false;
+				break;
+			}
+			// NOTE(review): appears unreachable -- the check just above
+			// already breaks out whenever isNewTxp() is true.
+			if (leftPH.isNewTxp()) { --(leftPH.it_); }
+			// If we can't advance the right but we need to, we're done
+			if (!canAdvanceRight) { break; }
+			++(rightPH.it_);
+			if (rightPH.isNewTxp() or rightPH.done()) {
+				canAdvanceRight = false;
+				break;
+			}
+		}
+
+		// We can continue if we can advance either the left or right position
+		canAdvance = (canAdvanceLeft or canAdvanceRight);
+	}
+	// Advance left and right until next txp
+	// (skipped for a side whose flag was cleared above, i.e. one that already
+	// sits on a new-transcript entry or at the end of its list).
+	if ( canAdvanceLeft ) {
+		leftPH.advanceToNextTranscript();
+	}
+	if ( canAdvanceRight ) {
+		rightPH.advanceToNextTranscript();
+	}
+	return foundHit;
+}
+
+
+// Cursor over the valid k-mers of a query string.
+//
+// The searcher keeps one "current" k-mer (`mer`) together with the index of
+// its first base in the query (`startPos`).  It can step to the next valid
+// k-mer (next()), jump ahead by a given amount (skipForward()) or look for a
+// valid k-mer at / before a given position (backwardSearch()).
+//
+// Invariant maintained by every method that leaves the searcher valid:
+//   nextBaseIndex == startPos + k
+// i.e. `nextBaseIndex` is the first base *after* the current k-mer and is
+// the base next() will shift in.  When no valid k-mer is available,
+// startPos == invalidIndex and isValid() returns false.
+//
+// The query string must outlive the searcher (a pointer to it is kept).
+class SkippingKmerSearcher{
+    private:
+        std::string* qstr;
+        const char* qCharArray;
+        uint32_t qlen;
+        uint32_t k;
+        uint32_t klen;           // # of valid bases currently shifted into `mer`
+        uint32_t startPos;       // query index of the current k-mer, or invalidIndex
+        uint32_t nextBaseIndex;  // query index of the next base to shift in
+        rapmap::utils::my_mer mer;
+        rapmap::utils::my_mer rcmer;
+        rapmap::utils::my_mer tempMer;
+        static constexpr uint32_t invalidIndex = std::numeric_limits<uint32_t>::max();
+
+    public:
+        // Positions the searcher on the first valid k-mer of `queryStr`
+        // (if there is none, isValid() is false afterwards).
+        SkippingKmerSearcher(std::string& queryStr) :
+            qstr(&queryStr),
+            qCharArray(queryStr.c_str()),
+            qlen(queryStr.length()),
+            k(rapmap::utils::my_mer::k()),
+            klen(0),
+            startPos(0),
+            nextBaseIndex(0) {
+                next();
+        }
+
+        // return the index of the current k-mer (start) in the query string
+        uint32_t queryIndex() { return startPos; }
+
+        // Returns the canonical form of the current k-mer; `isRC` is set to
+        // true when the canonical form differs from the k-mer as read.
+        rapmap::utils::my_mer getMer(bool& isRC) {
+            tempMer = mer.get_canonical();
+            isRC = (mer != tempMer);
+            return tempMer;
+        }
+
+        // Looks for a valid k-mer starting at `searchPos` (clamped so that it
+        // fits in the query) and, failing that, progressively further to the
+        // left.  On success the k-mer becomes the current one and true is
+        // returned; otherwise the searcher is left unchanged and false is
+        // returned.
+        bool backwardSearch(uint32_t searchPos) {
+            // If we're not at least k bases in, we can't do a backward
+            // search (this also rules out queries shorter than k).
+            if (searchPos < k or qlen < k) {
+                return false;
+            }
+
+            // We can't search off the end
+            if (searchPos + k > qlen) {
+                searchPos = qlen - k;
+            }
+
+            while (!tempMer.from_chars(&qCharArray[searchPos])) {
+                // It wasn't a valid k-mer: find the offending base and try
+                // to start far enough before it.  Keep the result in a
+                // size_t so that npos is not silently truncated.
+                size_t invalidLoc = qstr->find_last_not_of("aAcCgGtT", searchPos + k);
+                // Make sure we don't fall off the front of the query
+                if (invalidLoc == std::string::npos or invalidLoc < k + 1) {
+                    return false;
+                }
+                searchPos = static_cast<uint32_t>(invalidLoc - k - 1);
+            }
+
+            // we found a hit, so make it the current k-mer
+            klen = k;
+            startPos = searchPos;
+            // from_chars() consumed bases [searchPos, searchPos + k), so the
+            // next base to shift in is searchPos + k (not + k + 1, which
+            // would make next() skip a base and build a k-mer that does not
+            // occur in the query).
+            nextBaseIndex = searchPos + k;
+            mer = tempMer;
+            return true;
+        }
+
+        // True if a k-mer starting `jumpLen` bases after the current one
+        // would not fit inside the query.
+        bool isOutsideQuery(uint32_t jumpLen) {
+            uint32_t jumpPos = startPos + jumpLen;
+            return (jumpPos > qlen - k);
+        }
+
+        // move to the next valid k-mer that is at least skipVal past
+        // the current position.  If skipVal positions forward is past
+        // the end of the query, try and move to the last k-mer.
+        //
+        // If a valid k-mer is found, make it the current k-mer and
+        // return (true, k-mer position).  Otherwise the searcher is restored
+        // to its previous state and (false, p) is returned, where p is the
+        // position a backward search should start from (or 0 if the target
+        // position equals the current one).
+        std::tuple<bool, uint32_t> skipForward(uint32_t skipVal) {
+            // Snapshot, so the state can be rolled back on failure.
+            tempMer = mer;
+            uint32_t tempKLen = klen;
+            uint32_t tempStartPos = startPos;
+            uint32_t tempNextBaseIndex = nextBaseIndex;
+
+            uint32_t jumpPos = startPos + skipVal;
+            // Would we jump past the end of the read?
+            // If so, just jump to the end.
+            if (jumpPos > qlen - k) {
+                jumpPos = qlen - k;
+            }
+            uint32_t initJumpPos = jumpPos;
+            if (jumpPos == startPos) { return std::make_tuple(false, uint32_t{0}); }
+
+            bool reachedGoal{false};
+            if (jumpPos - startPos < k) {
+                // if the jumpPos is < k away, it's more efficient to just
+                // walk there b/c it overlaps the current k-mer
+                while (!reachedGoal and next()) {
+                    reachedGoal = (startPos >= jumpPos);
+                }
+            } else {
+                // otherwise start a new k-mer at the jump position
+                while (! (reachedGoal = mer.from_chars(&qCharArray[jumpPos])) ) {
+                    // If it wasn't a valid k-mer, find the offending base
+                    // and try to start after it
+                    size_t badPos = qstr->find_first_not_of("aAcCgGtT", jumpPos);
+                    // No offending base found, or restarting after it would
+                    // fall off the end: give up.  (Checking npos first
+                    // avoids npos + 1 wrapping around to 0.)
+                    if (badPos == std::string::npos or badPos + 1 > qlen - k) {
+                        break;
+                    }
+                    jumpPos = static_cast<uint32_t>(badPos + 1);
+                }
+                // If the search was successful
+                if (reachedGoal) {
+                    // set startPos to the position we ended up jumping to
+                    klen = k;
+                    startPos = jumpPos;
+                    // Next base to shift in is the one right after the
+                    // k-mer (see the class invariant).
+                    nextBaseIndex = startPos + k;
+                }
+            }
+            // If the search was unsuccessful, return the searcher to its
+            // previous state and report the failure and the position where
+            // the backward search should begin.
+            if (!reachedGoal) {
+                mer = tempMer;
+                klen = tempKLen;
+                startPos = tempStartPos;
+                nextBaseIndex = tempNextBaseIndex;
+                return std::make_tuple(false, initJumpPos);
+            }
+            return std::make_tuple(true, startPos);
+        }
+
+        // Move to the next *valid* k-mer (k valid nucleotides, not a
+        // homopolymer).  If we found a k-mer, return true; if we can't move
+        // forward anymore, mark the searcher invalid and return false.
+        bool next() {
+            bool valid{false};
+            while (!valid and nextBaseIndex < qlen) {
+                int c = jellyfish::mer_dna::code(qCharArray[nextBaseIndex]);
+                // If the next base isn't a valid nucleotide
+                if (jellyfish::mer_dna::not_dna(c)) {
+                    // reset the k-mer
+                    klen = 0;
+                    ++nextBaseIndex;
+                    // Fewer than k bases left: no further k-mer is possible.
+                    if (qlen - nextBaseIndex < k) {
+                        startPos = invalidIndex;
+                        return false;
+                    } else {
+                        continue;
+                    }
+                }
+                mer.shift_left(c);
+                ++klen;
+                ++nextBaseIndex;
+                if (klen >= k and !mer.is_homopolymer()) {
+                    startPos = nextBaseIndex - k;
+                    valid = true;
+                }
+            }
+            if (!valid) { startPos = invalidIndex; }
+            return valid;
+        }
+
+        // False once the searcher has run out of valid k-mers.
+        bool isValid() { return startPos != invalidIndex; }
+};
+
+// Pair of atomic 64-bit counters for jump statistics; both start at zero.
+struct JumpStats {
+    std::atomic<uint64_t> jumpSizes{0}; // presumably the sum of all jump lengths taken
+    std::atomic<uint64_t> numJumps{0};  // presumably the number of jumps taken
+};
+
+
+// Hit collector that walks a read with a SkippingKmerSearcher: every k-mer
+// found in the index hash becomes an anchor, and the per-k-mer forward /
+// reverse jump tables are used to decide how far to skip before the next
+// lookup.  The anchors are then turned into QuasiAlignments.
+class SkippingCollector {
+    private:
+	RapMapIndex* rmi_; // index to search; not owned by this object as far as this class shows
+    public:
+	SkippingCollector(RapMapIndex* rmiIn) : rmi_(rmiIn) {}
+
+	// Collects hits for `readStr` and appends them to `hits`, tagging each
+	// with `mateStatus`.  `hits` is left untouched when no k-mer of the
+	// read is found in the index.
+	void operator()(std::string& readStr,
+		std::vector<QuasiAlignment>& hits,
+		MateStatus mateStatus) {
+
+	    auto jfhash = rmi_->merHash.get();
+	    auto& kmerInfos = rmi_->kmerInfos;
+	    auto& eqClasses = rmi_->eqClassList;
+	    auto& eqClassLabels = rmi_->eqLabelList;
+	    auto& posList = rmi_->posList;
+	    auto& fwdJumpTable = rmi_->fwdJumpTable;
+	    auto& revJumpTable = rmi_->revJumpTable;
+	    auto posEnd = posList.end();
+
+	    auto k = rapmap::utils::my_mer::k();
+	    auto readLen = readStr.length();
+	    // Distance bound handed to collectHitsSimple: 1.5x the read length.
+	    uint32_t maxDist = static_cast<uint32_t>(readLen) * 1.5;
+
+	    auto endIt = kmerInfos.end();
+
+	    std::vector<HitInfo> kmerHits;
+	    uint64_t merID;
+	    size_t kID;
+	    rapmap::utils::my_mer searchBuffer;
+
+	    bool terminateSearch{false}; // terminate search after checking the *next* hit
+	    bool validKmer{false};
+	    uint32_t numAnchors{0};
+	    uint32_t searchPos{0};
+	    uint32_t jumpLen{0};
+	    bool isRC;
+	    SkippingKmerSearcher ksearch(readStr);
+
+	    // Phase 1: gather anchor k-mers.
+	    while (ksearch.isValid()) {
+		auto searchMer = ksearch.getMer(isRC);
+		bool foundMer = jfhash->get_val_for_key(searchMer, &merID,
+			searchBuffer, &kID);
+		// OK --- we found a hit
+		if (foundMer) {
+
+		    // merID indexes kmerInfos; the last argument is true when
+		    // the k-mer as read was already in canonical form.
+		    kmerHits.emplace_back(kmerInfos.begin() + merID,
+			    merID, ksearch.queryIndex(), !isRC);
+		    // Increment the # of anchor k-mers we found
+		    ++numAnchors;
+
+		    // If we decided to terminate the search in the last loop, then we're done.
+		    if (terminateSearch) { break; }
+
+		    // Pick the jump table matching the orientation of the k-mer.
+		    jumpLen = isRC ? revJumpTable[merID] :
+			      fwdJumpTable[merID];
+		    //js.jumpSizes += jumpLen;
+		    //++js.numJumps;
+
+		    if (jumpLen > 1) { // only jump if it's worth it
+			if (ksearch.isOutsideQuery(jumpLen)) {
+			    // NOTE(review): searchPos here is whatever the last
+			    // skipForward() returned (0 before the first skip,
+			    // in which case backwardSearch() fails) -- confirm
+			    // this is the intended start position.
+			    if (ksearch.backwardSearch(searchPos)) {
+				terminateSearch = true;
+				continue;
+			    } else {
+				break;
+			    }
+			}
+
+			std::tie(validKmer, searchPos) = ksearch.skipForward(jumpLen);
+			if (!validKmer) {
+			    // There was no valid k-mer from the skip position to the end
+			    // of the read --- execute reverse search
+
+			    // But -- only do so if we don't have at least 2 anchors
+			    if (numAnchors >= 2) { break; }
+
+			    if (ksearch.backwardSearch(searchPos)) {
+				// If we find something in the reverse search, it will
+				// be the last thing we check.
+				terminateSearch = true;
+				continue;
+			    } else {
+				// Otherwise, if we don't find anything in the reverse
+				// search --- just give up and take what we have.
+				break;
+			    }
+			} else {
+			    // Landed on the last possible k-mer of the read:
+			    // look it up once more, then stop.
+			    if (searchPos == readLen - k) {
+				terminateSearch = true;
+			    }
+			    // The skip was successful --- continue the search normally
+			    // from here.
+			    continue;
+			}
+		    }
+		} else {
+		    if (terminateSearch) { break; }
+		}
+		// No jump was taken: step to the next valid k-mer.
+		ksearch.next();
+	    }
+
+	    // found no hits in the entire read
+	    if (kmerHits.size() == 0) { return; }
+
+	    // Phase 2: turn anchors into hits.
+	    if (kmerHits.size() > 0) {
+		if (kmerHits.size() > 1) {
+		    // Several anchors: intersect them and collect the result.
+		    //std::cerr << "kmerHits.size() = " << kmerHits.size() << "\n";
+		    auto processedHits = rapmap::hit_manager::intersectHits(kmerHits, *rmi_);
+		    rapmap::hit_manager::collectHitsSimple(processedHits, readLen, maxDist, hits, mateStatus);
+		} else {
+		    // A single anchor: report one hit per transcript in the
+		    // anchor's equivalence class.
+		    // std::cerr << "kmerHits.size() = " << kmerHits.size() << "\n";
+		    auto& kinfo = *kmerHits[0].kinfo;
+		    hits.reserve(kinfo.count);
+		    // Iterator into, length of and end of the transcript list
+		    auto& eqClassLeft = eqClasses[kinfo.eqId];
+		    // Iterator into, length of and end of the position list
+		    auto leftPosIt = posList.begin() + kinfo.offset;
+		    auto leftPosLen = kinfo.count;
+		    auto leftPosEnd = leftPosIt + leftPosLen;
+		    PositionListHelper leftPosHelper(leftPosIt, posList.end());
+		    bool leftHitRC = kmerHits[0].queryRC;
+
+		    auto leftTxpIt = eqClassLabels.begin() + eqClassLeft.txpListStart;
+		    auto leftTxpListLen = eqClassLeft.txpListLen;
+		    auto leftTxpEnd = leftTxpIt + leftTxpListLen;
+
+		    // collectAllHits() advances leftPosHelper to the next
+		    // transcript's positions after each call.
+		    for (auto it = leftTxpIt; it < leftTxpEnd; ++it) {
+			collectAllHits(*it, readLen, leftHitRC, leftPosHelper, hits, mateStatus);
+		    }
+		}
+	    }
+
+	}
+};
+
+
+
+// Hit collector that anchors a read at its two ends: it scans left-to-right
+// for the first k-mer present in the index, then right-to-left for the first
+// k-mer present in the index that starts after the left anchor, and turns
+// the (one or two) anchors into QuasiAlignments.
+class EndCollector {
+    private:
+        RapMapIndex* rmi_;
+    public:
+        EndCollector(RapMapIndex* rmiIn) : rmi_(rmiIn) {}
+
+        // Collects hits for `readStr` and appends them to `hits`, tagging
+        // each with `mateStatus`.  `hits` is left untouched when no k-mer of
+        // the read is found in the index.
+        void operator()(std::string& readStr,
+                std::vector<QuasiAlignment>& hits,
+                MateStatus mateStatus) {
+
+            auto jfhash = rmi_->merHash.get();
+            auto& kmerInfos = rmi_->kmerInfos;
+            auto& eqClasses = rmi_->eqClassList;
+            auto& eqClassLabels = rmi_->eqLabelList;
+            auto& posList = rmi_->posList;
+
+            rapmap::utils::my_mer mer;
+            rapmap::utils::my_mer rcmer;
+            auto k = rapmap::utils::my_mer::k();
+            auto readLen = readStr.length();
+            // Distance bound handed to collectHitsSimple: 1.5x the read length.
+            uint32_t maxDist = static_cast<uint32_t>(readLen) * 1.5;
+            size_t leftQueryPos = std::numeric_limits<size_t>::max();
+
+            std::vector<HitInfo> kmerHits;
+
+            uint64_t merID;
+            size_t kID;
+            rapmap::utils::my_mer searchBuffer;
+
+            // --- Left-to-right scan: stop at the first k-mer in the index.
+            size_t klen{0};
+            for (size_t i = 0; i < readLen; ++i) {
+                int c = jellyfish::mer_dna::code(readStr[i]);
+                // If the next base isn't a valid nucleotide
+                if (jellyfish::mer_dna::not_dna(c)) {
+                    // reset the k-mer
+                    klen = 0;
+                    continue;
+                }
+                mer.shift_left(c);
+                rcmer.shift_right(jellyfish::mer_dna::complement(c));
+                ++klen;
+                if (klen >= k) {
+                    // Look up the canonical (smaller) of the two strands.
+                    auto& searchMer = (mer < rcmer) ? mer : rcmer;
+                    bool foundMer = jfhash->get_val_for_key(searchMer, &merID,
+                            searchBuffer, &kID);
+                    if (foundMer) {
+                        // The k-mer ending at base i starts at i - k + 1.
+                        // (Using i - k is off by one and wraps around for
+                        // the very first k-mer, i == k - 1, which also
+                        // prevented the right-to-left scan from running.)
+                        leftQueryPos = i + 1 - k;
+                        kmerHits.emplace_back(kmerInfos.begin() + merID,
+                                merID,
+                                leftQueryPos,
+                                searchMer == rcmer);
+                        break;
+                    }
+                }
+            }
+
+            // found no hits in the entire read
+            if (kmerHits.size() == 0) { return; }
+
+            // --- Right-to-left scan: k-mers starting after the left anchor.
+            // After k valid bases, `mer` holds readStr[i .. i+k-1].
+            klen = 0;
+            for (size_t i = readLen - 1; i > leftQueryPos; --i) {
+                int c = jellyfish::mer_dna::code(readStr[i]);
+                // If the next base isn't a valid nucleotide
+                if (jellyfish::mer_dna::not_dna(c)) {
+                    klen = 0;
+                    continue;
+                }
+                mer.shift_right(c);
+                rcmer.shift_left(jellyfish::mer_dna::complement(c));
+                ++klen;
+                if (klen >= k) {
+                    auto& searchMer = (mer < rcmer) ? mer : rcmer;
+                    bool foundMer = jfhash->get_val_for_key(searchMer, &merID,
+                            searchBuffer, &kID);
+                    if (foundMer) {
+                        // NOTE(review): this records the distance of the
+                        // k-mer's end from the end of the read, whereas the
+                        // left anchor records the k-mer's start index --
+                        // confirm which convention the hit manager expects.
+                        kmerHits.emplace_back(kmerInfos.begin() + merID,
+                                merID,
+                                readLen - (i + k),
+                                searchMer == rcmer);
+                        break;
+                    }
+                }
+            }
+
+            if (kmerHits.size() > 1) {
+                // Two anchors: intersect them and collect the result.
+                auto processedHits = rapmap::hit_manager::intersectHits(kmerHits, *rmi_);
+                rapmap::hit_manager::collectHitsSimple(processedHits, readLen, maxDist, hits, mateStatus);
+            } else {
+                // A single anchor: report one hit per transcript in the
+                // anchor's equivalence class.
+                auto& kinfo = *kmerHits[0].kinfo;
+                hits.reserve(kinfo.count);
+                auto& eqClassLeft = eqClasses[kinfo.eqId];
+                // Start of this k-mer's entries in the position list
+                auto leftPosIt = posList.begin() + kinfo.offset;
+                PositionListHelper leftPosHelper(leftPosIt, posList.end());
+                bool leftHitRC = kmerHits[0].queryRC;
+
+                // Range of transcript ids belonging to the equivalence class
+                auto leftTxpIt = eqClassLabels.begin() + eqClassLeft.txpListStart;
+                auto leftTxpEnd = leftTxpIt + eqClassLeft.txpListLen;
+
+                // collectAllHits() advances leftPosHelper to the next
+                // transcript's positions after each call.
+                for (auto it = leftTxpIt; it < leftTxpEnd; ++it) {
+                    collectAllHits(*it, readLen, leftHitRC, leftPosHelper, hits, mateStatus);
+                }
+            }
+        }
+};
+
+// Worker loop for single-end reads.
+//
+// Repeatedly pulls a job (a batch of reads) from `parser` until none is
+// left.  For every read, `hitCollector` is run and, unless `noOutput` is set
+// or the read has no hits / more than `maxNumHits` hits, its alignments are
+// formatted into a per-job buffer.  After each job the buffer (minus its
+// trailing newline) is handed to `outQueue` and cleared.  `hctr` accumulates
+// read and hit counts; a progress report is printed to stderr roughly every
+// million reads by whichever caller manages to try_lock `iomutex`.
+template <typename CollectorT, typename MutexT>
+void processReadsSingle(single_parser* parser,
+        RapMapIndex& rmi,
+        CollectorT& hitCollector,
+        MutexT* iomutex,
+        std::shared_ptr<spdlog::logger> outQueue,
+        HitCounters& hctr,
+        uint32_t maxNumHits,
+        bool noOutput) {
+
+    fmt::MemoryWriter sstream;
+    std::vector<QuasiAlignment> hits;
+    SingleAlignmentFormatter<RapMapIndex*> formatter(&rmi);
+
+    for (;;) {
+        // Get a job from the parser: a bunch of reads (at most max_read_group)
+        typename single_parser::job j(*parser);
+        // If we got nothing, we're done
+        if (j.is_empty()) { break; }
+
+        for (size_t i = 0; i < j->nb_filled; ++i) {
+            auto& read = j->data[i];
+            ++hctr.numReads;
+
+            hits.clear();
+            hitCollector(read.seq, hits, MateStatus::SINGLE_END);
+
+            const auto numHits = hits.size();
+            hctr.totHits += numHits;
+
+            const bool shouldWrite =
+                (numHits > 0) and !noOutput and (numHits <= maxNumHits);
+            if (shouldWrite) {
+                rapmap::utils::writeAlignmentsToStream(read, formatter,
+                        hctr, hits, sstream);
+            }
+
+            // Periodic progress report
+            if (hctr.numReads > hctr.lastPrint + 1000000) {
+                hctr.lastPrint.store(hctr.numReads.load());
+                if (iomutex->try_lock()) {
+                    // Move the cursor back up over the previous report
+                    if (hctr.numReads > 0) {
+#if defined(__DEBUG__) || defined(__TRACK_CORRECT__)
+                        std::cerr << "\033[F\033[F\033[F";
+#else
+                        std::cerr << "\033[F\033[F";
+#endif // __DEBUG__
+                    }
+                    std::cerr << "saw " << hctr.numReads << " reads\n";
+                    std::cerr << "# hits per read = "
+                        << hctr.totHits / static_cast<float>(hctr.numReads) << "\n";
+#if defined(__DEBUG__) || defined(__TRACK_CORRECT__)
+                    std::cerr << "The true hit was in the returned set of hits "
+                        << 100.0 * (hctr.trueHits / static_cast<float>(hctr.numReads))
+                        <<  "% of the time\n";
+#endif // __DEBUG__
+                    iomutex->unlock();
+                }
+            }
+        } // for all reads in this job
+
+        if (!noOutput) {
+            std::string outStr(sstream.str());
+            // Get rid of last newline
+            if (!outStr.empty()) {
+                outStr.pop_back();
+                outQueue->info() << std::move(outStr);
+            }
+            sstream.clear();
+        }
+    } // processed all reads
+}
+
+// Worker routine for paired-end mapping. rapMapMap() starts one instance per
+// thread; all instances share the parser, index, collector and counters.
+//
+// To use the parser in the following, we get "jobs" until none is
+// available. A job behaves like a pointer to the type
+// jellyfish::sequence_list (see whole_sequence_parser.hpp).
+//
+// For every read pair of a job, hitCollector gathers the hits of each mate and
+// mergeLeftRightHits() combines them into joint hits. When output is enabled
+// and the pair has between 1 and maxNumHits joint hits, its alignments are
+// appended to a buffer local to this call; the buffer is handed to outQueue
+// once per job.
+//
+//   parser       source of read-pair jobs
+//   rmi          index; used here only to construct the alignment formatter
+//   hitCollector callable invoked as hitCollector(sequence, hits, mateStatus)
+//   iomutex      guards the progress report written to std::cerr
+//   outQueue     logger that receives the formatted alignments
+//   hctr         shared counters (reads seen, hit totals, last report)
+//   maxNumHits   pairs with more joint hits than this are not written
+//   noOutput     when true, nothing is formatted or sent to outQueue
+//
+// iomutex is only try-locked, so a worker never blocks on the report.
+template <typename CollectorT, typename MutexT>
+void processReadsPair(paired_parser* parser,
+        RapMapIndex& rmi,
+        CollectorT& hitCollector,
+        MutexT* iomutex,
+        std::shared_ptr<spdlog::logger> outQueue,
+        HitCounters& hctr,
+        uint32_t maxNumHits,
+        bool noOutput) {
+    // Formatted alignments of the job currently being processed.
+    fmt::MemoryWriter sstream;
+    // Hit lists, cleared and refilled for every read pair.
+    std::vector<QuasiAlignment> leftHits;
+    std::vector<QuasiAlignment> rightHits;
+    std::vector<QuasiAlignment> jointHits;
+
+    PairAlignmentFormatter<RapMapIndex*> formatter(&rmi);
+
+    size_t readLen{0};
+    // NOTE(review): passed to mergeLeftRightHits(), presumably as an out-flag.
+    bool tooManyHits{false};
+
+    while(true) {
+        typename paired_parser::job j(*parser); // Get a job from the parser: a bunch of read (at most max_read_group)
+        if(j.is_empty()) break;           // If got nothing, quit
+        for(size_t i = 0; i < j->nb_filled; ++i) { // For each sequence
+            tooManyHits = false;
+            // Only the length of the left mate is passed on to the merge step.
+            readLen = j->data[i].first.seq.length();
+            ++hctr.numReads;
+            jointHits.clear();
+            leftHits.clear();
+            rightHits.clear();
+            hitCollector(j->data[i].first.seq,
+                        leftHits, MateStatus::PAIRED_END_LEFT);
+            hitCollector(j->data[i].second.seq,
+                        rightHits, MateStatus::PAIRED_END_RIGHT);
+
+            rapmap::utils::mergeLeftRightHits(
+                    leftHits, rightHits, jointHits,
+                    readLen, maxNumHits, tooManyHits, hctr);
+
+            if (jointHits.size() > 0 and !noOutput and jointHits.size() <= maxNumHits) {
+                rapmap::utils::writeAlignmentsToStream(j->data[i], formatter,
+                                                       hctr, jointHits, sstream);
+            }
+
+            // Progress report: about once per million reads, and only by a
+            // thread that obtains the lock without waiting.
+            // NOTE(review): the threshold test and the lastPrint update are
+            // separate operations, so two threads can both get here; try_lock
+            // keeps their reports from interleaving.
+            if (hctr.numReads > hctr.lastPrint + 1000000) {
+                hctr.lastPrint.store(hctr.numReads.load());
+                if (iomutex->try_lock()) {
+                    if (hctr.numReads > 0) {
+                        // "\033[F" moves the cursor up one line: rewind over the
+                        // lines printed below so the report is overwritten.
+#if defined(__DEBUG__) || defined(__TRACK_CORRECT__)
+                        std::cerr << "\033[F\033[F\033[F\033[F";
+#else
+                        std::cerr << "\033[F\033[F\033[F";
+#endif // __DEBUG__
+                    }
+                    std::cerr << "saw " << hctr.numReads << " reads\n";
+                    std::cerr << "# pe hits per read = "
+                        << hctr.peHits / static_cast<float>(hctr.numReads) << "\n";
+                    std::cerr << "# se hits per read = "
+                        << hctr.seHits / static_cast<float>(hctr.numReads) << "\n";
+#if defined(__DEBUG__) || defined(__TRACK_CORRECT__)
+                    std::cerr << "The true hit was in the returned set of hits "
+                        << 100.0 * (hctr.trueHits / static_cast<float>(hctr.numReads))
+                        <<  "% of the time\n";
+#endif // __DEBUG__
+                    iomutex->unlock();
+                }
+            }
+        } // for all reads in this job
+
+        // Hand this job's alignments to the output logger, then reuse the buffer.
+        if (!noOutput) {
+            std::string outStr(sstream.str());
+            // Get rid of last newline
+            if (!outStr.empty()){
+                outStr.pop_back();
+                outQueue->info() << std::move(outStr);
+            }
+            sstream.clear();
+        }
+
+    } // processed all reads
+
+}
+
+
+
+// Entry point of the pseudo-index mapper.
+//
+// Parses the command line, loads the index found under -i, maps either
+// paired-end (-1/-2) or unmated (-r) reads with the requested number of worker
+// threads and sends the results through an asynchronous logger to the output
+// file (or stdout when -o is not given).
+//
+// Returns 0 on success and 1 when TCLAP rejects the arguments; inconsistent
+// options or unusable inputs are reported on stderr and end in std::exit(1).
+int rapMapMap(int argc, char* argv[]) {
+    std::cerr << "RapMap Mapper\n";
+
+    std::string versionString = rapmap::version;
+    TCLAP::CmdLine cmd(
+            "RapMap Mapper",
+            ' ',
+            versionString);
+    cmd.getProgramName() = "rapmap";
+
+    TCLAP::ValueArg<std::string> index("i", "index", "The location of the pseudoindex", true, "", "path");
+    TCLAP::ValueArg<std::string> read1("1", "leftMates", "The location of the left paired-end reads", false, "", "path");
+    TCLAP::ValueArg<std::string> read2("2", "rightMates", "The location of the right paired-end reads", false, "", "path");
+    TCLAP::ValueArg<std::string> unmatedReads("r", "unmatedReads", "The location of single-end reads", false, "", "path");
+    TCLAP::ValueArg<uint32_t> numThreads("t", "numThreads", "Number of threads to use", false, 1, "positive integer");
+    TCLAP::ValueArg<uint32_t> maxNumHits("m", "maxNumHits", "Reads mapping to more than this many loci are discarded", false, 200, "positive integer");
+    TCLAP::ValueArg<std::string> outname("o", "output", "The output file (default: stdout)", false, "", "path");
+    TCLAP::SwitchArg endCollectorSwitch("e", "endCollector", "Use the simpler (and faster) \"end\" collector as opposed to the more sophisticated \"skipping\" collector", false);
+    TCLAP::SwitchArg noout("n", "noOutput", "Don't write out any alignments (for speed testing purposes)", false);
+    cmd.add(index);
+    cmd.add(noout);
+
+    cmd.add(read1);
+    cmd.add(read2);
+    cmd.add(unmatedReads);
+    cmd.add(outname);
+    cmd.add(numThreads);
+    cmd.add(maxNumHits);
+    cmd.add(endCollectorSwitch);
+
+    auto consoleSink = std::make_shared<spdlog::sinks::stderr_sink_mt>();
+    auto consoleLog = spdlog::create("stderrLog", {consoleSink});
+
+    try {
+        cmd.parse(argc, argv);
+        bool pairedEnd = (read1.isSet() or read2.isSet());
+        if (pairedEnd and (read1.isSet() != read2.isSet())) {
+            consoleLog->error("You must set both the -1 and -2 arguments to align "
+                    "paired end reads!");
+            std::exit(1);
+        }
+
+        if (pairedEnd and unmatedReads.isSet()) {
+            consoleLog->error("You cannot specify both paired-end and unmated "
+                    "reads in the input!");
+            std::exit(1);
+        }
+
+        if (!pairedEnd and !unmatedReads.isSet()) {
+            consoleLog->error("You must specify input; either both paired-end "
+                    "or unmated reads!");
+            std::exit(1);
+        }
+
+        std::string indexPrefix(index.getValue());
+        if (indexPrefix.back() != '/') {
+            indexPrefix += "/";
+        }
+
+        if (!rapmap::fs::DirExists(indexPrefix.c_str())) {
+            consoleLog->error("It looks like the index you provided [{}] "
+                    "doesn't exist", indexPrefix);
+            std::exit(1);
+        }
+
+        IndexHeader h;
+        std::ifstream indexStream(indexPrefix + "header.json");
+        if (!indexStream.is_open()) {
+            consoleLog->error("Couldn't open the index header [{}header.json]", indexPrefix);
+            std::exit(1);
+        }
+        {
+            cereal::JSONInputArchive ar(indexStream);
+            ar(h);
+        }
+        indexStream.close();
+
+        if (h.indexType() != IndexType::PSEUDO) {
+            consoleLog->error("The index {} does not appear to be of the "
+                    "appropriate type (pseudo)", indexPrefix);
+            std::exit(1);
+        }
+
+        RapMapIndex rmi;
+        rmi.load(indexPrefix);
+
+        std::cerr << "\n\n\n\n";
+
+        // from: http://stackoverflow.com/questions/366955/obtain-a-stdostream-either-from-stdcout-or-stdofstreamfile
+        // set either a file or cout as the output stream
+        std::streambuf* outBuf;
+        std::ofstream outFile;
+        bool haveOutputFile{false};
+        if (outname.getValue() == "") {
+            outBuf = std::cout.rdbuf();
+        } else {
+            outFile.open(outname.getValue());
+            if (!outFile.is_open()) {
+                consoleLog->error("Couldn't open the output file [{}]", outname.getValue());
+                std::exit(1);
+            }
+            outBuf = outFile.rdbuf();
+            haveOutputFile = true;
+        }
+        // Now set the output stream to the buffer, which is
+        // either std::cout, or a file.
+        std::ostream outStream(outBuf);
+
+        // Must be a power of 2
+        size_t queueSize{268435456};
+        spdlog::set_async_mode(queueSize);
+        auto outputSink = std::make_shared<spdlog::sinks::ostream_sink_mt>(outStream);
+        auto outLog = std::make_shared<spdlog::logger>("outLog", outputSink);
+        outLog->set_pattern("%v");
+
+        uint32_t nthread = numThreads.getValue();
+        std::unique_ptr<paired_parser> pairParserPtr{nullptr};
+        std::unique_ptr<single_parser> singleParserPtr{nullptr};
+
+        if (!noout.getValue()) {
+            rapmap::utils::writeSAMHeader(rmi, outLog);
+        }
+
+        SpinLockT iomutex;
+        {
+            ScopedTimer timer;
+            HitCounters hctrs;
+            consoleLog->info("mapping reads . . . \n\n\n");
+            if (pairedEnd) {
+                std::vector<std::thread> threads;
+                std::vector<std::string> read1Vec = rapmap::utils::tokenize(read1.getValue(), ',');
+                std::vector<std::string> read2Vec = rapmap::utils::tokenize(read2.getValue(), ',');
+
+                if (read1Vec.size() != read2Vec.size()) {
+                    consoleLog->error("The number of provided files for "
+                            "-1 and -2 must be the same!");
+                    std::exit(1);
+                }
+
+                // Interleave the mate files (left, right, left, ...). The vector
+                // owns the pointer array that is handed to the parser.
+                size_t numFiles = read1Vec.size() + read2Vec.size();
+                std::vector<char*> pairFileList(numFiles);
+                for (size_t i = 0; i < read1Vec.size(); ++i) {
+                    pairFileList[2*i] = const_cast<char*>(read1Vec[i].c_str());
+                    pairFileList[2*i+1] = const_cast<char*>(read2Vec[i].c_str());
+                }
+                size_t maxReadGroup{1000}; // Number of reads in each "job"
+                size_t concurrentFile{2}; // Number of files to read simultaneously
+                pairParserPtr.reset(new paired_parser(4 * nthread, maxReadGroup,
+                            concurrentFile,
+                            pairFileList.data(), pairFileList.data() + numFiles));
+
+                /** Create the threads depending on the collector type **/
+                // The workers hold a reference to the collector, so they are
+                // joined before the collector goes out of scope.
+                if (endCollectorSwitch.getValue()) {
+                    EndCollector endCollector(&rmi);
+                    for (size_t i = 0; i < nthread; ++i) {
+                        threads.emplace_back(processReadsPair<EndCollector, SpinLockT>,
+                                pairParserPtr.get(), std::ref(rmi), std::ref(endCollector),
+                                &iomutex, outLog, std::ref(hctrs),
+                                maxNumHits.getValue(), noout.getValue());
+                    }
+                    for (auto& t : threads) { t.join(); }
+                } else {
+                    SkippingCollector skippingCollector(&rmi);
+                    for (size_t i = 0; i < nthread; ++i) {
+                        threads.emplace_back(processReadsPair<SkippingCollector, SpinLockT>,
+                                pairParserPtr.get(), std::ref(rmi), std::ref(skippingCollector),
+                                &iomutex, outLog, std::ref(hctrs),
+                                maxNumHits.getValue(), noout.getValue());
+                    }
+                    for (auto& t : threads) { t.join(); }
+                }
+            } else {
+                std::vector<std::thread> threads;
+                std::vector<std::string> unmatedReadVec = rapmap::utils::tokenize(unmatedReads.getValue(), ',');
+                size_t maxReadGroup{1000}; // Number of reads in each "job"
+                size_t concurrentFile{1};
+                stream_manager streams( unmatedReadVec.begin(), unmatedReadVec.end(),
+                        concurrentFile);
+                singleParserPtr.reset(new single_parser(4 * nthread,
+                            maxReadGroup,
+                            concurrentFile,
+                            streams));
+
+                /** Create the threads depending on the collector type **/
+                // The workers hold a reference to the collector, so they are
+                // joined before the collector goes out of scope.
+                if (endCollectorSwitch.getValue()) {
+                    EndCollector endCollector(&rmi);
+                    for (size_t i = 0; i < nthread; ++i) {
+                        threads.emplace_back(processReadsSingle<EndCollector, SpinLockT>,
+                                singleParserPtr.get(), std::ref(rmi), std::ref(endCollector),
+                                &iomutex, outLog, std::ref(hctrs),
+                                maxNumHits.getValue(), noout.getValue());
+                    }
+                    for (auto& t : threads) { t.join(); }
+                } else {
+                    SkippingCollector skippingCollector(&rmi);
+                    for (size_t i = 0; i < nthread; ++i) {
+                        threads.emplace_back(processReadsSingle<SkippingCollector, SpinLockT>,
+                                singleParserPtr.get(), std::ref(rmi), std::ref(skippingCollector),
+                                &iomutex, outLog, std::ref(hctrs),
+                                maxNumHits.getValue(), noout.getValue());
+                    }
+                    for (auto& t : threads) { t.join(); }
+                }
+            }
+            consoleLog->info("Done mapping reads.");
+            consoleLog->info("In total saw {} reads.", hctrs.numReads);
+            consoleLog->info("Final # hits per read = {}", hctrs.totHits / static_cast<float>(hctrs.numReads));
+            consoleLog->info("Discarded {} reads because they had > {} alignments",
+                    hctrs.tooManyHits, maxNumHits.getValue());
+
+            consoleLog->info("flushing output");
+            outLog->flush();
+        }
+
+        if (haveOutputFile) {
+            outFile.close();
+        }
+        return 0;
+    } catch (TCLAP::ArgException& e) {
+        consoleLog->error("Exception [{}] when parsing argument {}", e.error(), e.argId());
+        return 1;
+    }
+
+}
+
+
+/*
+template <typename ParserT>//, typename CoverageCalculator>
+void processReadsKSeq(ParserT* lseq,
+                      ParserT* rseq,
+                      RapMapIndex& rmi,
+                      std::mutex& iomutex) {
+
+    auto& txpNames = rmi.txpNames;
+    uint32_t n{0};
+    uint32_t k = rapmap::utils::my_mer::k();
+    std::vector<std::string> transcriptNames;
+    constexpr char bases[] = {'A', 'C', 'G', 'T'};
+
+    size_t batchSize{1000};
+    std::vector<QuasiAlignment> leftHits;
+    std::vector<QuasiAlignment> rightHits;
+    std::vector<std::vector<QuasiAlignment>> jointHits(batchSize);
+
+   int l1=0;
+   int l2=0;
+   size_t peHits{0};
+   size_t seHits{0};
+   size_t readLen{0};
+   while ( (l1 = kseq_read(lseq)) > 0 and (l2 = kseq_read(rseq)) > 0 ) {
+       readLen = lseq->seq.l;
+        n++;
+        jointHits.clear();
+        leftHits.clear();
+        rightHits.clear();
+        collectHits(rmi, lseq->seq.s, lseq->seq.l, leftHits);
+        collectHits(rmi, rseq->seq.s, rseq->seq.l, rightHits);
+
+//        jointHits.resize(std::min(leftHits.size(), rightHits.size()));
+//        size_t intSize = SIMDCompressionLib::SIMDintersection(&leftHits[0], leftHits.size(),
+//                                                              &rightHits[0], rightHits.size(),
+//                                                              &jointHits[0]);
+//        jointHits.resize(intSize);
+//        std::set_intersection(leftHits.begin(), leftHits.end(),
+//                              rightHits.begin(), rightHits.end(),
+//                              std::back_inserter(jointHits));
+//
+        if (leftHits.size() > 0) {
+            auto leftIt = leftHits.begin();
+            auto leftEnd = leftHits.end();
+            auto leftLen = std::distance(leftIt, leftEnd);
+            if (rightHits.size() > 0) {
+                auto rightIt = rightHits.begin();
+                auto rightEnd = rightHits.end();
+                auto rightLen = std::distance(rightIt, rightEnd);
+                jointHits.reserve(std::min(leftLen, rightLen));
+                while (leftIt != leftEnd && rightIt != rightEnd) {
+                    uint32_t leftTxp = leftIt->tid;
+                    uint32_t rightTxp = rightIt->tid;
+                    if (leftTxp < rightTxp) {
+                        ++leftIt;
+                    } else {
+                        if (!(rightTxp < leftTxp)) {
+                            auto startPos = std::min(leftIt->pos, rightIt->pos);
+                            auto endPos = std::max(leftIt->pos, rightIt->pos) + readLen;
+                            jointHits.emplace_back(leftTxp, startPos, leftIt->fwd, static_cast<uint16_t>(endPos - startPos));
+                            ++leftIt;
+                        }
+                        ++rightIt;
+                    }
+                }
+            }
+        }
+
+        if (jointHits.size() > 0) {
+           peHits += jointHits.size();
+        } else if (leftHits.size() + rightHits.size() > 0) {
+           seHits += leftHits.size() + rightHits.size();
+        }
+
+        if (n % 1000 == 0) {
+            std::cerr << "saw " << n << " reads\n";
+            std::cerr << "# pe hits per read = " << peHits / static_cast<float>(n) << "\n";
+            std::cerr << "# se hits per read = " << seHits / static_cast<float>(n) << "\n";
+        }
+    }
+  }
+
+
+void collectHits(RapMapIndex& rmi, std::string& readStr,
+                 std::vector<QuasiAlignment>& hits,
+                 MateStatus mateStatus) {
+
+    auto jfhash = rmi.merHash.get();
+    auto& kmerInfos = rmi.kmerInfos;
+    auto& eqClasses = rmi.eqClassList;
+    auto& eqClassLabels = rmi.eqLabelList;
+    auto& posList = rmi.posList;
+    auto posEnd = posList.end();
+
+    rapmap::utils::my_mer mer;
+    rapmap::utils::my_mer rcmer;
+    auto k = rapmap::utils::my_mer::k();
+    auto kbits = 2*k;
+    auto readLen = readStr.length();
+    uint32_t maxDist = static_cast<uint32_t>(readLen) * 1.5;
+    size_t leftQueryPos = std::numeric_limits<size_t>::max();
+    size_t rightQueryPos = std::numeric_limits<size_t>::max();
+    bool leftHitRC = false, rightHitRC = false;
+
+    auto endIt = kmerInfos.end();
+
+    KmerInfoList::iterator miniLeftHits = endIt;
+    KmerInfoList::iterator miniRightHits = endIt;
+
+    bool leftFwd{true};
+    bool rightFwd{true};
+
+    uint64_t merID;
+    size_t kID;
+	uint32_t klen{0};
+    rapmap::utils::my_mer searchBuffer;
+
+    for (size_t i = 0; i < readLen; ++i) {
+        int c = jellyfish::mer_dna::code(readStr[i]);
+        if (jellyfish::mer_dna::not_dna(c)) {
+		    klen = 0;
+			continue;
+        }
+        mer.shift_left(c);
+        rcmer.shift_right(jellyfish::mer_dna::complement(c));
+		++klen;
+        if (klen >= k) {
+            auto& searchMer = (mer < rcmer) ? mer : rcmer;
+            bool foundMer = jfhash->get_val_for_key(searchMer, &merID,
+                                                    searchBuffer, &kID);
+            if (foundMer) {
+                miniLeftHits = kmerInfos.begin() + merID;
+                leftHitRC = (searchMer == rcmer);
+                leftQueryPos = i - k;
+                break;
+            }
+        }
+    }
+
+    // found no hits in the entire read
+    if (miniLeftHits == endIt) { return; }
+
+    // Now, start from the right and move left
+	klen = 0;
+    for (size_t i = readLen - 1; i > leftQueryPos; --i) {
+        int c = jellyfish::mer_dna::code(readStr[i]);
+        if (jellyfish::mer_dna::not_dna(c)) {
+		  //c = jellyfish::mer_dna::code('G');
+		  klen = 0;
+		  continue;
+        }
+        mer.shift_right(c);
+        rcmer.shift_left(jellyfish::mer_dna::complement(c));
+		++klen;
+        if (klen >= k) {
+            auto& searchMer = (mer < rcmer) ? mer : rcmer;
+            bool foundMer = jfhash->get_val_for_key(searchMer, &merID,
+                                                    searchBuffer, &kID);
+            if (foundMer) {
+                miniRightHits = kmerInfos.begin() + merID;
+				//if (miniLeftHits == miniRightHits) { continue; }
+                rightHitRC = (searchMer == rcmer);
+                // distance from the right end
+                rightQueryPos = readLen - (i + k);
+                break;
+            }
+        }
+    }
+
+    // Take the intersection of these two hit lists
+    // Adapted from : http://en.cppreference.com/w/cpp/algorithm/set_intersection
+    if (miniLeftHits != endIt) {
+	    // Equiv. class for left hit
+	    auto& eqClassLeft = eqClasses[miniLeftHits->eqId];
+	    // Iterator into, length of and end of the position list
+	    auto leftPosIt = posList.begin() + miniLeftHits->offset;
+	    auto leftPosLen = miniLeftHits->count;
+	    auto leftPosEnd = leftPosIt + leftPosLen;
+	    PositionListHelper leftPosHelper(leftPosIt, posList.end());
+#ifdef __DEBUG__
+	    if (!leftPosHelper.isNewTxp()) {
+		    std::cerr << "\n Should definitely be new txp but "
+			    << "leftPosHelper = ( "
+			    << leftPosHelper.pos() << ", "
+			    << leftPosHelper.isNewTxp() << ")\n";
+	    }
+#endif
+	    // Iterator into, length of and end of the transcript list
+	    auto leftTxpIt = eqClassLabels.begin() + eqClassLeft.txpListStart;
+	    auto leftTxpListLen = eqClassLeft.txpListLen;
+	    auto leftTxpEnd = leftTxpIt + leftTxpListLen;
+
+	    if (miniRightHits != endIt) {
+		    // Equiv. class for right hit
+		    auto& eqClassRight = eqClasses[miniRightHits->eqId];
+		    // Iterator into, length of and end of the position list
+		    auto rightPosIt = posList.begin() + miniRightHits->offset;
+		    auto rightPosLen = miniRightHits->count;
+		    auto rightPosEnd = rightPosIt + rightPosLen;
+		    PositionListHelper rightPosHelper(rightPosIt, posList.end());
+#ifdef __DEBUG__
+		    if (!rightPosHelper.isNewTxp()) {
+			    std::cerr << "\n Should definitely be new txp but "
+				    << "rightPosHelper = ( "
+				    << rightPosHelper.pos() << ", "
+				    << rightPosHelper.isNewTxp() << ")\n";
+		    }
+#endif
+		    // Iterator into, length of and end of the transcript list
+		    auto rightTxpIt = eqClassLabels.begin() + eqClassRight.txpListStart;
+		    auto rightTxpListLen = eqClassRight.txpListLen;
+		    auto rightTxpEnd = rightTxpIt + rightTxpListLen;
+
+		    //hits.resize(std::min(leftLen, rightLen));
+		    //size_t intSize = SIMDCompressionLib::SIMDintersection(&tidList[leftIt], leftLen,
+		    //                                                      &tidList[rightIt], rightLen,
+		    //                                                      &hits[0]);
+		    //hits.resize(intSize);
+
+
+		    hits.reserve(std::min(leftPosLen, rightPosLen));
+		    uint32_t leftTxp, rightTxp;
+		    while (leftTxpIt != leftTxpEnd and rightTxpIt != rightTxpEnd) {
+			    // Get the current transcript ID for the left and right eq class
+			    leftTxp = *leftTxpIt;
+			    rightTxp = *rightTxpIt;
+			    // If we need to advance the left txp, do it
+			    if (leftTxp < rightTxp) {
+				    // Advance to the next transcript in the
+				    // equivalence class label
+				    ++leftTxpIt;
+				    // Advance in the position array to the next transcript
+				    leftPosHelper.advanceToNextTranscript();
+			    } else {
+				    // If the transcripts are equal (i.e. leftTxp >= rightTxp and !(rightTxp < leftTxp))
+				    // Then see if there are any hits here.
+				    if (!(rightTxp < leftTxp)) {
+					    // If the hits are on the same transcript, look for
+					    // a mapping position where they appear the appropriate
+					    // distance apart.
+					    // Note: The iterators into the *position* vector will
+					    // be advanced, and should be at the start of the
+					    // positions for the *next* transcript when this function
+					    // returns.
+					    collectHitsWithPositionConstraint(leftTxp, readLen,
+							    leftHitRC, rightHitRC,
+							    leftQueryPos, rightQueryPos,
+							    leftPosHelper, rightPosHelper,
+							    maxDist, hits, mateStatus);
+					    ++leftTxpIt;
+					    // advance pos
+					    // leftPosHelper.advanceToNextTranscript();
+				    } else {
+					    // If the right transcript id was less than the left
+					    // transcript id, then advance the right position
+					    // iterator to the next transcript.
+					    rightPosHelper.advanceToNextTranscript();
+				    }
+				    // Advance the right transcript id regardless of whether
+				    // we looked for a hit or not.
+				    ++rightTxpIt;
+			    }
+		    }
+	    } else { // If we had only hits from the left, then map this as an orphan
+		    hits.reserve(miniLeftHits->count);
+		    for (auto it = leftTxpIt; it < leftTxpEnd; ++it) {
+			    collectAllHits(*it, readLen, leftHitRC, leftPosHelper, hits, mateStatus);
+		    }
+	    }
+    }
+
+}
+
+*/
diff --git a/src/RapMapSAIndex.cpp b/src/RapMapSAIndex.cpp
new file mode 100644
index 0000000..2e97122
--- /dev/null
+++ b/src/RapMapSAIndex.cpp
@@ -0,0 +1,177 @@
+#include "BooMap.hpp"
+#include "RapMapSAIndex.hpp"
+#include "IndexHeader.hpp"
+#include <cereal/types/unordered_map.hpp>
+#include <cereal/types/vector.hpp>
+#include <cereal/types/string.hpp>
+#include <cereal/archives/binary.hpp>
+#include <cereal/archives/json.hpp>
+
+
+#include <future>
+#include <thread>
+
+// These are **free** functions that are used for loading the appropriate type
+// of hash. This overload fails if hash.bin can't be opened or unserialized.
+template <typename IndexT>
+bool loadHashFromIndex(const std::string& indexDir,
+                       google::dense_hash_map<uint64_t,
+                       rapmap::utils::SAInterval<IndexT>,
+                       rapmap::utils::KmerKeyHasher>& khash) {
+      khash.set_empty_key(std::numeric_limits<uint64_t>::max());
+      std::ifstream hashStream(indexDir + "hash.bin", std::ios::binary);
+      if (!hashStream.is_open()) { return false; }
+      return khash.unserialize(typename google::dense_hash_map<uint64_t,
+                      rapmap::utils::SAInterval<IndexT>,
+                      rapmap::utils::KmerKeyHasher>::NopointerSerializer(), &hashStream);
+}
+
+template <typename IndexT>
+bool loadHashFromIndex(const std::string& indexDir,
+                       BooMap<uint64_t, rapmap::utils::SAInterval<IndexT>>& h) {
+    std::string hashPrefix(indexDir + "hash_info"); // prefix handed to h.load()
+    h.load(hashPrefix);
+    return true; // this overload always reports success
+}
+
+template <typename IndexT, typename HashT>
+RapMapSAIndex<IndexT, HashT>::RapMapSAIndex() {} // empty body: index files are only read by load()
+
+// Given a position, p, in the concatenated text, return the corresponding
+// transcript (rankDict->rank(p)). NOTE: rankDict is created in load().
+template <typename IndexT, typename HashT>
+IndexT RapMapSAIndex<IndexT, HashT>::transcriptAtPosition(IndexT p) {
+    return rankDict->rank(p);
+}
+
+// Loads the suffix-array index stored under indDir. indDir is used as a plain
+// string prefix for every file name, so it has to end with a path separator.
+//
+// Files read, in this order: header.json (k-mer length), sa.bin (suffix
+// array), txpInfo.bin (txpNames, txpOffsets and seq) and rsd.bin (bit array
+// behind the rank dictionary). The k-mer hash is loaded concurrently by
+// loadHashFromIndex() in its own task.
+// Transcript lengths are then derived from the offsets, and my_mer::k is set
+// from the header once the hash has been loaded.
+//
+// Returns true on success. A file that cannot be opened, an unreadable bit
+// array or a failed hash load is logged to "stderrLog" and ends the process
+// with std::exit(1).
+template <typename IndexT, typename HashT>
+bool RapMapSAIndex<IndexT, HashT>::load(const std::string& indDir) {
+
+    auto logger = spdlog::get("stderrLog");
+
+    IndexHeader h;
+    std::ifstream indexStream(indDir + "header.json");
+    if (!indexStream.is_open()) {
+        logger->error("Couldn't open index header {}header.json!", indDir);
+        std::exit(1);
+    }
+    {
+      cereal::JSONInputArchive ar(indexStream);
+      ar(h);
+    }
+    indexStream.close();
+    uint32_t idxK = h.kmerLen();
+
+    // This part takes the longest, so do it in its own asynchronous task
+    std::future<bool> loadingHash = std::async(std::launch::async, [this, logger, indDir]() -> bool {
+        // khash is filled by this task; loadingHash.get() below waits for it
+        // before load() returns.
+        if (loadHashFromIndex(indDir, khash)) {
+            logger->info("Successfully loaded position hash");
+            return true;
+        } else {
+            logger->error("Failed to load position hash!");
+            return false;
+        }
+    });
+
+    // The index files hold binary archives, so open them in binary mode.
+    std::ifstream saStream(indDir + "sa.bin", std::ios::binary);
+    if (!saStream.is_open()) {
+        logger->error("Couldn't open suffix array file {}sa.bin!", indDir);
+        std::exit(1);
+    }
+    {
+        logger->info("Loading Suffix Array ");
+        cereal::BinaryInputArchive saArchive(saStream);
+        saArchive(SA);
+    }
+    saStream.close();
+
+    std::ifstream seqStream(indDir + "txpInfo.bin", std::ios::binary);
+    if (!seqStream.is_open()) {
+        logger->error("Couldn't open transcript info file {}txpInfo.bin!", indDir);
+        std::exit(1);
+    }
+    {
+        logger->info("Loading Transcript Info ");
+        cereal::BinaryInputArchive seqArchive(seqStream);
+        seqArchive(txpNames);
+        seqArchive(txpOffsets);
+        seqArchive(seq);
+    }
+    seqStream.close();
+
+    std::string rsFileName = indDir + "rsd.bin";
+    FILE* rsFile = fopen(rsFileName.c_str(), "rb");
+    // bit_array_load() and fclose() must not be handed a null FILE*.
+    if (rsFile == nullptr) {
+        logger->error("Couldn't open bit array file {}!", rsFileName);
+        std::exit(1);
+    }
+    {
+        logger->info("Loading Rank-Select Bit Array");
+        bitArray.reset(bit_array_create(0));
+        if (!bit_array_load(bitArray.get(), rsFile)) {
+            logger->error("Couldn't load bit array from {}!", rsFileName);
+            std::exit(1);
+        }
+        logger->info("There were {} set bits in the bit array", bit_array_num_bits_set(bitArray.get()));
+        // NOTE(review): rank9b is handed bitArray's word buffer; presumably bitArray must outlive rankDict.
+        rankDict.reset(new rank9b(bitArray->words, bitArray->num_of_bits));
+    }
+    fclose(rsFile);
+
+    {
+        logger->info("Computing transcript lengths");
+        txpLens.resize(txpOffsets.size());
+        if (txpOffsets.size() > 1) {
+            for(size_t i = 0; i < txpOffsets.size() - 1; ++i) {
+                auto nextOffset = txpOffsets[i+1];
+                auto currentOffset = txpOffsets[i];
+                // NOTE(review): the -1 presumably skips a separator between transcripts
+                txpLens[i] = (nextOffset - 1) - currentOffset;
+            }
+        }
+        // The last length is just the length of the suffix array - the last offset
+        // (guarded: an index without transcripts has no last entry to fill in).
+        if (!txpOffsets.empty()) {
+            txpLens[txpOffsets.size()-1] = (SA.size() - 1) - txpOffsets[txpOffsets.size() - 1];
+        }
+    }
+
+    logger->info("Waiting to finish loading hash");
+    loadingHash.wait();
+    auto hashLoadRes = loadingHash.get();
+    if (!hashLoadRes) {
+        logger->error("Failed to load hash!");
+        std::exit(1);
+    }
+    // Set the k-mer length used by my_mer to the one recorded in the index header.
+    rapmap::utils::my_mer::k(idxK);
+
+    logger->info("Done loading index");
+    return true;
+}
+
+// Explicit instantiations: the member functions above are defined in this
+// translation unit, one instantiation per supported IndexT/HashT pairing.
+template class RapMapSAIndex<int32_t, google::dense_hash_map<uint64_t,
+        rapmap::utils::SAInterval<int32_t>, rapmap::utils::KmerKeyHasher>>;
+template class RapMapSAIndex<int64_t, google::dense_hash_map<uint64_t,
+        rapmap::utils::SAInterval<int64_t>, rapmap::utils::KmerKeyHasher>>;
+template class RapMapSAIndex<int32_t, BooMap<uint64_t, rapmap::utils::SAInterval<int32_t>>>;
+template class RapMapSAIndex<int64_t, BooMap<uint64_t, rapmap::utils::SAInterval<int64_t>>>;
diff --git a/src/RapMapSAIndexer.cpp b/src/RapMapSAIndexer.cpp
new file mode 100644
index 0000000..83a1491
--- /dev/null
+++ b/src/RapMapSAIndexer.cpp
@@ -0,0 +1,731 @@
+#include <algorithm>
+#include <cctype>
+#include <cstdio>
+#include <fstream>
+#include <iostream>
+#include <iterator>
+#include <memory>
+#include <mutex>
+#include <random>
+#include <type_traits>
+#include <unordered_map>
+#include <vector>
+
+#include "tclap/CmdLine.h"
+
+#include <cereal/archives/binary.hpp>
+#include <cereal/archives/json.hpp>
+#include <cereal/types/string.hpp>
+#include <cereal/types/unordered_map.hpp>
+#include <cereal/types/utility.hpp>
+#include <cereal/types/vector.hpp>
+
+#include "BooMap.hpp"
+#include "xxhash.h"
+
+#include "spdlog/spdlog.h"
+
+// Jellyfish 2 include
+#include "jellyfish/mer_dna.hpp"
+#include "jellyfish/stream_manager.hpp"
+#include "jellyfish/whole_sequence_parser.hpp"
+
+#include "divsufsort.h"
+#include "divsufsort64.h"
+
+#include "RapMapFileSystem.hpp"
+#include "RapMapUtils.hpp"
+#include "ScopedTimer.hpp"
+#include "bit_array.h"
+
+#include "JFRaw.hpp"
+#include "jellyfish/binary_dumper.hpp"
+#include "jellyfish/file_header.hpp"
+#include "jellyfish/hash_counter.hpp"
+#include "jellyfish/mer_iterator.hpp"
+#include "jellyfish/mer_overlap_sequence_parser.hpp"
+#include "jellyfish/thread_exec.hpp"
+#include "rank9b.h"
+
+#include "sparsehash/dense_hash_map"
+
+#include "IndexHeader.hpp"
+
+#include <chrono>
+
+// File-local type aliases.
+// stream_manager iterates over a std::vector<std::string> (the list of input
+// files handed to it in rapMapSAIndex); single_parser is the Jellyfish
+// whole-sequence parser layered on top of that stream manager.
+using stream_manager =
+    jellyfish::stream_manager<std::vector<std::string>::const_iterator>;
+using single_parser = jellyfish::whole_sequence_parser<stream_manager>;
+// Transcript identifiers and containers of them.
+using TranscriptID = uint32_t;
+using TranscriptIDVector = std::vector<TranscriptID>;
+using KmerIDMap = std::vector<TranscriptIDVector>;
+// Jellyfish cooperative hash counter keyed by RapMap's k-mer type.
+using MerMapT = jellyfish::cooperative::hash_counter<rapmap::utils::my_mer>;
+
+// Build the suffix array of concatText with divsufsort64 and serialize it to
+// <outputDir>sa.bin as a cereal binary archive.
+//
+//   outputDir  - prefix to which "sa.bin" is appended verbatim
+//   concatText - the text to index
+//   tlen       - length handed to divsufsort64; SA is resized to this size
+//   SA         - receives the suffix array
+//
+// Returns true on success and false if sa.bin could not be opened or written.
+// A non-zero return code from divsufsort64 terminates the process with
+// exit status 1.
+bool buildSA(const std::string& outputDir, std::string& concatText, size_t tlen,
+             std::vector<int64_t>& SA) {
+  // IndexT is the signed index type
+  using IndexT = int64_t;
+  bool success{false};
+
+  std::ofstream saStream(outputDir + "sa.bin", std::ios::binary);
+  if (!saStream.is_open()) {
+    // Fail before the expensive sort if the result cannot be stored anyway.
+    std::cerr << "FAILURE: could not open " << outputDir
+              << "sa.bin for writing\n";
+    return false;
+  }
+  {
+    ScopedTimer timer;
+    SA.resize(tlen, 0);
+    IndexT textLen = static_cast<IndexT>(tlen);
+    std::cerr << "Building suffix array . . . ";
+    auto ret = divsufsort64(
+        reinterpret_cast<unsigned char*>(const_cast<char*>(concatText.data())),
+        SA.data(), textLen);
+
+    success = (ret == 0);
+    if (success) {
+      std::cerr << "success\n";
+      {
+        ScopedTimer timer2;
+        std::cerr << "saving to disk . . . ";
+        cereal::BinaryOutputArchive saArchive(saStream);
+        saArchive(SA);
+        // Flush so that a failed write is visible in the stream state.
+        saStream.flush();
+        if (saStream) {
+          std::cerr << "done\n";
+        } else {
+          std::cerr << "FAILURE: could not write " << outputDir << "sa.bin\n";
+          success = false;
+        }
+      }
+    } else {
+      std::cerr << "FAILURE: return code from libdivsufsort64() was " << ret
+                << "\n";
+      saStream.close();
+      std::exit(1);
+    }
+    std::cerr << "done\n";
+  }
+  saStream.close();
+  return success;
+}
+
+// Walk the suffix array once, group consecutive suffixes that share the same
+// valid k-mer prefix, and record each group's half-open interval
+// [start, stop) in a BooMap keyed by the k-mer's 2*k-bit encoding.  The map is
+// then built with numHashThreads threads and saved under <outputDir>hash_info.
+//
+// A k-mer is "valid" when it has exactly k characters and contains no '$'.
+//
+// IndexT is the index type.
+// int32_t for "small" suffix arrays
+// int64_t for "large" ones
+//
+// Always returns true.
+template <typename IndexT>
+bool buildPerfectHash(const std::string& outputDir, std::string& concatText,
+                      size_t tlen, uint32_t k, std::vector<IndexT>& SA,
+                      uint32_t numHashThreads) {
+  BooMap<uint64_t, rapmap::utils::SAInterval<IndexT>> intervals;
+
+  // The start and stop of the current interval
+  IndexT start = 0, stop = 0;
+  rapmap::utils::my_mer mer;
+  // The k-mer owning the current interval, and the one at position `stop`
+  std::string currentKmer;
+  std::string nextKmer;
+
+  auto isValidKmer = [k](const std::string& kmer) -> bool {
+    return kmer.length() == k and kmer.find_first_of('$') == std::string::npos;
+  };
+
+  // If the current interval belongs to a valid k-mer, put it in the hash.
+  auto flushCurrent = [&]() {
+    if (isValidKmer(currentKmer)) {
+      mer = rapmap::utils::my_mer(currentKmer);
+      auto bits = mer.get_bits(0, 2 * k);
+      intervals.add(std::move(bits), {start, stop});
+    }
+  };
+
+  while (stop < tlen) {
+    // Take the (at most) k characters starting at this suffix.  The result
+    // is shorter than k when the suffix is within k characters of the end
+    // of the text.
+    nextKmer = concatText.substr(SA[stop], k);
+    // A new interval begins whenever this suffix is invalid or starts with
+    // a k-mer different from the current one.
+    if (!isValidKmer(nextKmer) or nextKmer != currentKmer) {
+      flushCurrent();
+      currentKmer = nextKmer;
+      start = stop;
+    }
+    if (stop % 1000000 == 0) {
+      std::cerr << "\r\rprocessed " << stop << " positions";
+    }
+    // We always update the end position
+    ++stop;
+  }
+  // The interval still open when the loop ends has not been recorded yet.
+  // It must be stored when its k-mer is valid; the previous test
+  // (find_first_of('$') != npos) was inverted and dropped it.
+  if (start < tlen) {
+    flushCurrent();
+  }
+
+  std::cout << "building perfect hash function\n";
+  intervals.build(numHashThreads);
+  std::cout << "\ndone.\n";
+  std::string outputPrefix = outputDir + "hash_info";
+  std::cout << "saving the perfect hash and SA intervals to disk ... ";
+  intervals.save(outputPrefix);
+  std::cout << "done.\n";
+
+  return true;
+}
+
+// Build the suffix array of concatText with divsufsort and serialize it to
+// <outputDir>sa.bin as a cereal binary archive.
+//
+//   outputDir  - prefix to which "sa.bin" is appended verbatim
+//   concatText - the text to index
+//   tlen       - length handed to divsufsort; SA is resized to this size.
+//                Must be representable as int32_t.
+//   SA         - receives the suffix array
+//
+// Returns true on success and false if tlen does not fit the 32-bit index
+// type or sa.bin could not be opened or written.  A non-zero return code
+// from divsufsort terminates the process with exit status 1.
+bool buildSA(const std::string& outputDir, std::string& concatText, size_t tlen,
+             std::vector<int32_t>& SA) {
+  // IndexT is the signed index type
+  using IndexT = int32_t;
+  bool success{false};
+
+  // The length is passed to divsufsort as a 32-bit value; refuse to truncate.
+  if (tlen > static_cast<size_t>(std::numeric_limits<IndexT>::max())) {
+    std::cerr << "FAILURE: text of length " << tlen
+              << " is too long for a 32-bit suffix array\n";
+    return false;
+  }
+
+  std::ofstream saStream(outputDir + "sa.bin", std::ios::binary);
+  if (!saStream.is_open()) {
+    // Fail before the expensive sort if the result cannot be stored anyway.
+    std::cerr << "FAILURE: could not open " << outputDir
+              << "sa.bin for writing\n";
+    return false;
+  }
+  {
+    ScopedTimer timer;
+    SA.resize(tlen, 0);
+    IndexT textLen = static_cast<IndexT>(tlen);
+    std::cerr << "Building suffix array . . . ";
+    auto ret = divsufsort(
+        reinterpret_cast<unsigned char*>(const_cast<char*>(concatText.data())),
+        SA.data(), textLen);
+
+    success = (ret == 0);
+    if (success) {
+      std::cerr << "success\n";
+      {
+        ScopedTimer timer2;
+        std::cerr << "saving to disk . . . ";
+        cereal::BinaryOutputArchive saArchive(saStream);
+        saArchive(SA);
+        // Flush so that a failed write is visible in the stream state.
+        saStream.flush();
+        if (saStream) {
+          std::cerr << "done\n";
+        } else {
+          std::cerr << "FAILURE: could not write " << outputDir << "sa.bin\n";
+          success = false;
+        }
+      }
+    } else {
+      std::cerr << "FAILURE: return code from libdivsufsort() was " << ret
+                << "\n";
+      saStream.close();
+      std::exit(1);
+    }
+    std::cerr << "done\n";
+  }
+  saStream.close();
+  return success;
+}
+
+// Walk the suffix array once, group consecutive suffixes that share the same
+// valid k-mer prefix, and store each group's half-open interval
+// [start, stop) in a google::dense_hash_map keyed by the k-mer's 2*k-bit
+// encoding.  The map is then serialized to <outputDir>hash.bin.
+//
+// A k-mer is "valid" when it has exactly k characters and contains no '$'.
+//
+// IndexT is the index type.
+// int32_t for "small" suffix arrays
+// int64_t for "large" ones
+//
+// Returns true when the hash was written, false if hash.bin could not be
+// opened or the serialization reported failure.  An internal consistency
+// failure (two adjacent intervals with the same k-mer) exits the process.
+template <typename IndexT>
+bool buildHash(const std::string& outputDir, std::string& concatText,
+               size_t tlen, uint32_t k, std::vector<IndexT>& SA) {
+  // Now, build the k-mer lookup table
+  google::dense_hash_map<uint64_t, rapmap::utils::SAInterval<IndexT>,
+                         rapmap::utils::KmerKeyHasher>
+      khash;
+  khash.set_empty_key(std::numeric_limits<uint64_t>::max());
+
+  // The start and stop of the current interval
+  IndexT start = 0, stop = 0;
+  // The current k-mer as a string
+  rapmap::utils::my_mer mer;
+  std::string currentKmer;
+  std::string nextKmer;
+  while (stop < tlen) {
+    // Take the (at most) k characters starting at this suffix and check
+    // that they form a valid k-mer (full length, no '$').
+    nextKmer = concatText.substr(SA[stop], k);
+    if (nextKmer.length() == k and
+        nextKmer.find_first_of('$') == std::string::npos) {
+      // If this is a new k-mer, then hash the current k-mer
+      if (nextKmer != currentKmer) {
+        if (currentKmer.length() == k and
+            currentKmer.find_first_of('$') == std::string::npos) {
+          mer = rapmap::utils::my_mer(currentKmer);
+          auto bits = mer.get_bits(0, 2 * k);
+          auto hashIt = khash.find(bits);
+          if (hashIt == khash.end()) {
+            // Sanity check: the suffix just before this interval must not
+            // start with the same k-mer.
+            if (start > 1) {
+              if (concatText.substr(SA[start - 1], k) ==
+                  concatText.substr(SA[start], k)) {
+                std::cerr << "T[SA[" << start - 1 << "]:" << k
+                          << "] = " << concatText.substr(SA[start - 1], k)
+                          << " = T[SA[" << start << "]:" << k << "]\n";
+                std::cerr << "start = " << start << ", stop = " << stop << "\n";
+                std::cerr << "[fatal (1)] THIS SHOULD NOT HAPPEN\n";
+                std::exit(1);
+              }
+            }
+            if (start == stop) {
+              std::cerr << "[fatal (2)] Interval is empty! (start = " << start
+                        << ") = (stop =  " << stop << ")\n";
+            }
+            // NOTE(review): same test as (2) above; kept so the diagnostic
+            // output is unchanged.
+            if (start == stop) {
+              std::cerr << "[fatal (3)] Interval is empty! (start = " << start
+                        << ") = (stop =  " << stop << ")\n";
+            }
+
+            khash[bits] = {start, stop};
+          } else {
+            // The same k-mer was already stored: dump both intervals.
+            std::cerr << "\nERROR (1): trying to add same suffix "
+                      << currentKmer << " (len = " << currentKmer.length()
+                      << ") multiple times!\n";
+            auto prevInt = hashIt->second;
+            std::cerr << "existing interval is [" << prevInt.begin << ", "
+                      << prevInt.end << ")\n";
+            for (auto x = prevInt.begin; x < prevInt.end; ++x) {
+              auto suff = concatText.substr(SA[x], k);
+              for (auto c : suff) {
+                std::cerr << "*" << c << "*";
+              }
+              std::cerr << " (len = " << suff.length() << ")\n";
+            }
+            std::cerr << "new interval is [" << start << ", " << stop << ")\n";
+            for (auto x = start; x < stop; ++x) {
+              auto suff = concatText.substr(SA[x], k);
+              for (auto c : suff) {
+                std::cerr << "*" << c << "*";
+              }
+              std::cerr << "\n";
+            }
+          }
+        }
+        currentKmer = nextKmer;
+        start = stop;
+      }
+    } else {
+      // If this isn't a valid suffix (contains a $)
+
+      // If the previous interval was valid, put it
+      // in the hash.
+      if (currentKmer.length() == k and
+          currentKmer.find_first_of('$') == std::string::npos) {
+        mer = rapmap::utils::my_mer(currentKmer);
+        auto bits = mer.get_bits(0, 2 * k);
+        auto hashIt = khash.find(bits);
+        if (hashIt == khash.end()) {
+          if (start > 2) {
+            if (concatText.substr(SA[start - 1], k) ==
+                concatText.substr(SA[start], k)) {
+              std::cerr << "T[SA[" << start - 1 << "]:" << k
+                        << "] = " << concatText.substr(SA[start - 1], k)
+                        << " = T[SA[" << start << "]:" << k << "]\n";
+              std::cerr << "start = " << start << ", stop = " << stop << "\n";
+              std::cerr << "[fatal (4)] THIS SHOULD NOT HAPPEN\n";
+              std::exit(1);
+            }
+          }
+          khash[bits] = {start, stop};
+        } else {
+          std::cerr << "\nERROR (2): trying to add same suffix " << currentKmer
+                    << "multiple times!\n";
+          auto prevInt = hashIt->second;
+          std::cerr << "existing interval is [" << prevInt.begin << ", "
+                    << prevInt.end << ")\n";
+          for (auto x = prevInt.begin; x < prevInt.end; ++x) {
+            std::cerr << concatText.substr(SA[x], k) << "\n";
+          }
+          std::cerr << "new interval is [" << start << ", " << stop << ")\n";
+          for (auto x = start; x < stop; ++x) {
+            std::cerr << concatText.substr(SA[x], k) << "\n";
+          }
+        }
+      }
+      // The current interval is invalid and empty
+      currentKmer = nextKmer;
+      start = stop;
+    }
+    if (stop % 1000000 == 0) {
+      std::cerr << "\r\rprocessed " << stop << " positions";
+    }
+    // We always update the end position
+    ++stop;
+  }
+  // The interval still open when the loop ends has not been stored yet.  It
+  // must be stored when its k-mer is valid; the previous test
+  // (find_first_of('$') != npos) was inverted and dropped it.
+  if (start < tlen) {
+    if (currentKmer.length() == k and
+        currentKmer.find_first_of('$') == std::string::npos) {
+      mer = rapmap::utils::my_mer(currentKmer);
+      khash[mer.get_bits(0, 2 * k)] = {start, stop};
+    }
+  }
+  std::cerr << "\nkhash had " << khash.size() << " keys\n";
+  std::ofstream hashStream(outputDir + "hash.bin", std::ios::binary);
+  if (!hashStream.is_open()) {
+    std::cerr << "could not open " << outputDir << "hash.bin for writing\n";
+    return false;
+  }
+  bool saved{false};
+  {
+    ScopedTimer timer;
+    std::cerr << "saving hash to disk . . . ";
+    saved = khash.serialize(
+        typename google::dense_hash_map<
+            uint64_t, rapmap::utils::SAInterval<IndexT>,
+            rapmap::utils::KmerKeyHasher>::NopointerSerializer(),
+        &hashStream);
+    std::cerr << (saved ? "done\n" : "FAILED\n");
+  }
+  hashStream.close();
+  return saved;
+}
+
+// Read every sequence from `parser`, concatenate the retained ones into one
+// '$'-separated text, and write the index files below `outputDir`:
+//   rsd.bin     - bit array with a 1 at the position of every '$'
+//   txpInfo.bin - transcript names, transcript start offsets, and the text
+//   sa.bin      - suffix array of the text (32- or 64-bit, see below)
+//   hash.bin or hash_info* - k-mer -> suffix-array-interval map
+//   header.json - index header, written last
+//
+// To use the parser in the following, we get "jobs" until none is
+// available. A job behaves like a pointer to the type
+// jellyfish::sequence_list (see whole_sequence_parser.hpp).
+//
+//   parser         - source of (header, seq) records
+//   outputDir      - prefix for every output file; names are appended verbatim
+//   noClipPolyA    - when false, a sequence ending in at least 10 A's has
+//                    all of its trailing A's removed
+//   usePerfectHash - build the BooMap-based hash instead of the dense hash
+//   numHashThreads - thread count forwarded to buildPerfectHash
+//   iomutex        - not used by this function
+//   log            - logger that receives warnings and errors
+//
+// Sequences shorter than k after clipping are discarded with a warning.
+// Any failure to build or store a component exits the process.
+template <typename ParserT> //, typename CoverageCalculator>
+void indexTranscriptsSA(ParserT* parser, std::string& outputDir,
+                        bool noClipPolyA, bool usePerfectHash,
+                        uint32_t numHashThreads, std::mutex& iomutex,
+                        std::shared_ptr<spdlog::logger> log) {
+  // Seed with a real random value, if available
+  std::random_device rd;
+
+  // Create a random uniform distribution
+  std::default_random_engine eng(rd());
+
+  std::uniform_int_distribution<> dis(0, 3);
+
+  uint32_t n{0};
+  uint32_t k = rapmap::utils::my_mer::k();
+  std::vector<std::string> transcriptNames;
+  std::vector<int64_t> transcriptStarts;
+  constexpr char bases[] = {'A', 'C', 'G', 'T'};
+  uint32_t polyAClipLength{10};
+  uint32_t numPolyAsClipped{0};
+  uint32_t numNucleotidesReplaced{0};
+  std::string polyA(polyAClipLength, 'A');
+
+  bool clipPolyA = !noClipPolyA;
+
+  // http://biology.stackexchange.com/questions/21329/whats-the-longest-transcript-known
+  // longest human transcript is Titin (108861), so this gives us a *lot* of
+  // leeway before
+  // we issue any warning.
+  size_t tooLong = 200000;
+  size_t currIndex{0};
+  std::cerr << "\n[Step 1 of 4] : counting k-mers\n";
+
+  std::vector<uint64_t>
+      onePos; // Positions in the bit array where we should write a '1'
+  fmt::MemoryWriter txpSeqStream;
+  {
+    ScopedTimer timer;
+    while (true) {
+      typename ParserT::job j(*parser);
+      if (j.is_empty())
+        break;
+      for (size_t i = 0; i < j->nb_filled; ++i) { // For each sequence
+        std::string& readStr = j->data[i].seq;
+        // Drop non-printable characters.  The <cctype> functions require a
+        // value representable as unsigned char, so convert before calling.
+        readStr.erase(
+            std::remove_if(readStr.begin(), readStr.end(),
+                           [](const char a) -> bool {
+                             return !(isprint(static_cast<unsigned char>(a)));
+                           }),
+            readStr.end());
+
+        size_t readLen = readStr.size();
+        // First, replace non ATCG nucleotides
+        for (size_t b = 0; b < readLen; ++b) {
+          readStr[b] = static_cast<char>(
+              ::toupper(static_cast<unsigned char>(readStr[b])));
+          int c = jellyfish::mer_dna::code(readStr[b]);
+          // Replace non-ACGT bases with pseudo-random bases
+          if (jellyfish::mer_dna::not_dna(c)) {
+            readStr[b] = bases[dis(eng)];
+            ++numNucleotidesReplaced;
+          }
+        }
+
+        // Now, do Kallisto-esque clipping of polyA tails
+        if (clipPolyA) {
+          if (readStr.size() > polyAClipLength and
+              readStr.substr(readStr.length() - polyAClipLength) == polyA) {
+
+            auto newEndPos = readStr.find_last_not_of("Aa");
+            // If it was all As
+            if (newEndPos == std::string::npos) {
+              log->warn("Entry with header [{}] appeared to be all A's; it "
+                        "will be removed from the index!",
+                        j->data[i].header);
+              readStr.resize(0);
+            } else {
+              readStr.resize(newEndPos + 1);
+            }
+            ++numPolyAsClipped;
+          }
+        }
+
+        readLen = readStr.size();
+        // If the transcript was completely removed during clipping, don't
+        // include it in the index.
+        if (readStr.size() >= k) {
+          // If we're suspicious the user has fed in a *genome* rather
+          // than a transcriptome, say so here.
+          if (readStr.size() >= tooLong) {
+            log->warn("Entry with header [{}] was longer than {} nucleotides.  "
+                      "Are you certain that "
+                      "we are indexing a transcriptome and not a genome?",
+                      j->data[i].header, tooLong);
+          }
+
+          ++n;
+
+          // The name of the current transcript: the header up to the first
+          // space or tab.
+          auto& recHeader = j->data[i].header;
+          transcriptNames.emplace_back(
+              recHeader.substr(0, recHeader.find_first_of(" \t")));
+
+          // The position at which this transcript starts
+          transcriptStarts.push_back(currIndex);
+
+          txpSeqStream << readStr;
+          txpSeqStream << '$';
+          currIndex += readLen + 1;
+          // Remember where the '$' separator was written.
+          onePos.push_back(currIndex - 1);
+        } else {
+            log->warn("Discarding entry with header [{}], since it was shorter than "
+                      "the k-mer length of {} (perhaps after poly-A clipping)", 
+                      j->data[i].header, k);
+        }
+      }
+      if (n % 10000 == 0) {
+        std::cerr << "\r\rcounted k-mers for " << n << " transcripts";
+      }
+    }
+  }
+  std::cerr << "\n";
+
+  std::cerr << "Replaced " << numNucleotidesReplaced
+            << " non-ATCG nucleotides\n";
+  std::cerr << "Clipped poly-A tails from " << numPolyAsClipped
+            << " transcripts\n";
+
+  // Put the concatenated text in a string
+  std::string concatText = txpSeqStream.str();
+  // And clear the stream
+  txpSeqStream.clear();
+
+  // Decide between a 32-bit and a 64-bit suffix array
+  size_t tlen = concatText.length();
+  size_t maxInt = std::numeric_limits<int32_t>::max();
+  bool largeIndex = (tlen + 1 > maxInt);
+
+  // Make our dense bit array
+  BIT_ARRAY* bitArray = bit_array_create(concatText.length());
+  for (auto p : onePos) {
+    bit_array_set_bit(bitArray, p);
+  }
+
+  onePos.clear();
+  onePos.shrink_to_fit();
+
+  std::string rsFileName = outputDir + "rsd.bin";
+  // The bit array is binary data, so open the file in binary mode.
+  FILE* rsFile = fopen(rsFileName.c_str(), "wb");
+  if (rsFile == nullptr) {
+    log->error("Couldn't open {} for writing!", rsFileName);
+    bit_array_free(bitArray);
+    std::exit(1);
+  }
+  {
+    ScopedTimer timer;
+    std::cerr << "Building rank-select dictionary and saving to disk ";
+    bit_array_save(bitArray, rsFile);
+    std::cerr << "done\n";
+  }
+  fclose(rsFile);
+  bit_array_free(bitArray);
+
+  std::ofstream seqStream(outputDir + "txpInfo.bin", std::ios::binary);
+  {
+    ScopedTimer timer;
+    std::cerr << "Writing sequence data to file . . . ";
+    cereal::BinaryOutputArchive seqArchive(seqStream);
+    seqArchive(transcriptNames);
+    if (largeIndex) {
+      seqArchive(transcriptStarts);
+    } else {
+      // Store the offsets as 32-bit values for the small index.
+      std::vector<int32_t> txpStarts(transcriptStarts.size(), 0);
+      size_t numTranscriptStarts = transcriptStarts.size();
+      for (size_t i = 0; i < numTranscriptStarts; ++i) {
+        txpStarts[i] = static_cast<int32_t>(transcriptStarts[i]);
+      }
+      transcriptStarts.clear();
+      transcriptStarts.shrink_to_fit();
+      { seqArchive(txpStarts); }
+    }
+    seqArchive(concatText);
+    std::cerr << "done\n";
+  }
+  seqStream.close();
+
+  // clear stuff we no longer need
+  transcriptStarts.clear();
+  transcriptStarts.shrink_to_fit();
+  transcriptNames.clear();
+  transcriptNames.shrink_to_fit();
+  // done clearing
+
+  if (largeIndex) {
+    std::cerr << "[info] Building 64-bit suffix array "
+                 "(length of generalized text is "
+              << tlen << " )\n";
+    using IndexT = int64_t;
+    std::vector<IndexT> SA;
+    bool success = buildSA(outputDir, concatText, tlen, SA);
+    if (!success) {
+      std::cerr << "[fatal] Could not build the suffix array!\n";
+      std::exit(1);
+    }
+
+    if (usePerfectHash) {
+      success = buildPerfectHash<IndexT>(outputDir, concatText, tlen, k, SA,
+                                         numHashThreads);
+    } else {
+      success = buildHash<IndexT>(outputDir, concatText, tlen, k, SA);
+    }
+    if (!success) {
+      std::cerr << "[fatal] Could not build the suffix interval hash!\n";
+      std::exit(1);
+    }
+  } else {
+    std::cerr << "[info] Building 32-bit suffix array "
+                 "(length of generalized text is "
+              << tlen << ")\n";
+    using IndexT = int32_t;
+    std::vector<IndexT> SA;
+    bool success = buildSA(outputDir, concatText, tlen, SA);
+    if (!success) {
+      std::cerr << "[fatal] Could not build the suffix array!\n";
+      std::exit(1);
+    }
+
+    if (usePerfectHash) {
+      success = buildPerfectHash<IndexT>(outputDir, concatText, tlen, k, SA,
+                                         numHashThreads);
+    } else {
+      success = buildHash<IndexT>(outputDir, concatText, tlen, k, SA);
+    }
+    if (!success) {
+      std::cerr << "[fatal] Could not build the suffix interval hash!\n";
+      std::exit(1);
+    }
+  }
+
+  std::string indexVersion = "q3";
+  IndexHeader header(IndexType::QUASI, indexVersion, true, k, largeIndex,
+                     usePerfectHash);
+  // Finally (since everything presumably succeeded) write the header
+  std::ofstream headerStream(outputDir + "header.json");
+  {
+    cereal::JSONOutputArchive archive(headerStream);
+    archive(header);
+  }
+  headerStream.close();
+}
+
+// Entry point of the "index" command: parse the command line, validate k and
+// the output directory, set up logging, and build the suffix-array index for
+// the given transcript file via indexTranscriptsSA.
+//
+// Options: -t/--transcripts (required), -i/--index (required), -k/--klen
+// (default 31; must be odd and at most 31), -n/--noClip, -p/--perfectHash,
+// -x/--numThreads (default 4).
+//
+// Returns 0 on success; invalid arguments exit the process with status 1.
+int rapMapSAIndex(int argc, char* argv[]) {
+  std::cerr << "RapMap Indexer\n";
+
+  TCLAP::CmdLine cmd("RapMap Indexer");
+  TCLAP::ValueArg<std::string> transcripts("t", "transcripts",
+                                           "The transcript file to be indexed",
+                                           true, "", "path");
+  TCLAP::ValueArg<std::string> index(
+      "i", "index", "The location where the index should be written", true, "",
+      "path");
+  TCLAP::ValueArg<uint32_t> kval("k", "klen", "The length of k-mer to index",
+                                 false, 31, "positive integer less than 32");
+  TCLAP::SwitchArg noClip(
+      "n", "noClip",
+      "Don't clip poly-A tails from the ends of target sequences", false);
+  TCLAP::SwitchArg perfectHash(
+      "p", "perfectHash", "Use a perfect hash instead of dense hash --- "
+                          "somewhat slows construction, but uses less memory",
+      false);
+  TCLAP::ValueArg<uint32_t> numHashThreads(
+      "x", "numThreads",
+      "Use this many threads to build the perfect hash function", false, 4,
+      "positive integer <= # cores");
+  cmd.add(transcripts);
+  cmd.add(index);
+  cmd.add(kval);
+  cmd.add(noClip);
+  cmd.add(perfectHash);
+  cmd.add(numHashThreads);
+  cmd.parse(argc, argv);
+
+  // stupid parsing for now
+  std::string transcriptFile(transcripts.getValue());
+  std::vector<std::string> transcriptFiles({transcriptFile});
+
+  uint32_t k = kval.getValue();
+  if (k % 2 == 0) {
+    std::cerr << "Error: k must be an odd value, you chose " << k << '\n';
+    std::exit(1);
+  } else if (k > 31) {
+    std::cerr << "Error: k must not be larger than 31, you chose " << k << '\n';
+    std::exit(1);
+  }
+  rapmap::utils::my_mer::k(k);
+
+  std::string indexDir = index.getValue();
+  // back() on an empty string is undefined, so reject "-i ''" explicitly.
+  if (indexDir.empty()) {
+    std::cerr << "Error: the index directory must not be empty\n";
+    std::exit(1);
+  }
+  // Look for a clashing file using the path without trailing slashes; with a
+  // trailing '/' the path can only ever name a directory.
+  // NOTE(review): assumes FileExists does not report true for a path that
+  // DirExists accepts -- confirm against RapMapFileSystem.
+  std::string indexPath = indexDir;
+  while (indexPath.size() > 1 and indexPath.back() == '/') {
+    indexPath.pop_back();
+  }
+  if (indexDir.back() != '/') {
+    indexDir += '/';
+  }
+  bool dirExists = rapmap::fs::DirExists(indexDir.c_str());
+  bool dirIsFile = rapmap::fs::FileExists(indexPath.c_str());
+  if (dirIsFile) {
+    std::cerr << "The requested index directory already exists as a file.\n";
+    std::exit(1);
+  }
+  if (!dirExists) {
+    rapmap::fs::MakeDir(indexDir.c_str());
+  }
+
+  // Log to both <indexDir>quasi_index.log and stderr.
+  std::string logPath = indexDir + "quasi_index.log";
+  auto fileSink = std::make_shared<spdlog::sinks::simple_file_sink_st>(logPath);
+  auto consoleSink = std::make_shared<spdlog::sinks::stderr_sink_st>();
+  auto consoleLog = spdlog::create("stderrLog", {consoleSink});
+  auto fileLog = spdlog::create("fileLog", {fileSink});
+  auto jointLog = spdlog::create("jointLog", {fileSink, consoleSink});
+
+  size_t maxReadGroup{1000}; // Number of reads in each "job"
+  size_t concurrentFile{2};  // Number of files to read simultaneously
+  size_t numThreads{2};
+  stream_manager streams(transcriptFiles.begin(), transcriptFiles.end(),
+                         concurrentFile);
+  std::unique_ptr<single_parser> transcriptParserPtr{nullptr};
+  transcriptParserPtr.reset(
+      new single_parser(4 * numThreads, maxReadGroup, concurrentFile, streams));
+
+  bool noClipPolyA = noClip.getValue();
+  bool usePerfectHash = perfectHash.getValue();
+  uint32_t numPerfectHashThreads = numHashThreads.getValue();
+  std::mutex iomutex;
+  indexTranscriptsSA(transcriptParserPtr.get(), indexDir, noClipPolyA,
+                     usePerfectHash, numPerfectHashThreads, iomutex, jointLog);
+  return 0;
+}
diff --git a/src/RapMapSAMapper.cpp b/src/RapMapSAMapper.cpp
new file mode 100644
index 0000000..7b265d2
--- /dev/null
+++ b/src/RapMapSAMapper.cpp
@@ -0,0 +1,638 @@
+#include <iostream>
+#include <mutex>
+#include <vector>
+#include <random>
+#include <unordered_map>
+#include <fstream>
+#include <algorithm>
+#include <iterator>
+#include <cstdio>
+#include <cstring>
+#include <cstdlib>
+#include <thread>
+#include <tuple>
+#include <sstream>
+#include <fstream>
+#include <iostream>
+#include <tuple>
+#include <memory>
+#include <cstring>
+
+#include "ScopedTimer.hpp"
+
+#include <cereal/types/unordered_map.hpp>
+#include <cereal/types/vector.hpp>
+#include <cereal/types/string.hpp>
+#include <cereal/archives/binary.hpp>
+#include <cereal/archives/json.hpp>
+
+#include "HitManager.hpp"
+//#include "SIMDCompressionAndIntersection/intersection.h"
+#include "xxhash.h"
+
+#include "spdlog/spdlog.h"
+#include "spdlog/sinks/ostream_sink.h"
+#include "spdlog/details/format.h"
+
+// Jellyfish 2 include
+#include "jellyfish/mer_dna.hpp"
+#include "jellyfish/stream_manager.hpp"
+#include "jellyfish/whole_sequence_parser.hpp"
+#include "jellyfish/hash_counter.hpp"
+
+#include "tclap/CmdLine.h"
+
+/*extern "C" {
+#include "kseq.h"
+}
+*/
+#include "stringpiece.h"
+#include "BooMap.hpp"
+#include "PairSequenceParser.hpp"
+#include "PairAlignmentFormatter.hpp"
+#include "SingleAlignmentFormatter.hpp"
+#include "RapMapUtils.hpp"
+#include "RapMapSAIndex.hpp"
+#include "RapMapFileSystem.hpp"
+#include "RapMapConfig.hpp"
+#include "ScopedTimer.hpp"
+#include "SpinLock.hpp"
+#include "IndexHeader.hpp"
+#include "SASearcher.hpp"
+#include "SACollector.hpp"
+
+//#define __TRACK_CORRECT__
+
+using paired_parser = pair_sequence_parser<char**>; // paired-end parser; mapReads hands it a char** range of file names
+using stream_manager = jellyfish::stream_manager<std::vector<std::string>::const_iterator>; // built from a begin/end range over a vector of file names
+using single_parser = jellyfish::whole_sequence_parser<stream_manager>; // single-end parser reading through a stream_manager
+using TranscriptID = uint32_t;
+using TranscriptIDVector = std::vector<TranscriptID>;
+using KmerIDMap = std::vector<TranscriptIDVector>;
+using TranscriptList = std::vector<uint32_t>;
+using PositionList = std::vector<uint32_t>;
+using KmerIndex = std::unordered_map<uint64_t, TranscriptList, rapmap::utils::KmerKeyHasher>; // 64-bit key -> transcript list
+using IntervalIndex = std::unordered_map<uint64_t, rapmap::utils::KmerInterval, rapmap::utils::KmerKeyHasher>; // 64-bit key -> KmerInterval
+using OccList = std::vector<uint64_t>;
+using KmerInfoList = std::vector<rapmap::utils::KmerInfo>;
+using EqClassList = std::vector<rapmap::utils::EqClass>;
+using EqClassLabelVec = std::vector<uint32_t>;
+using PositionListHelper = rapmap::utils::PositionListHelper;
+#if defined __APPLE__
+using SpinLockT = SpinLock; // Apple builds guard progress output with the project's SpinLock
+#else
+using SpinLockT = std::mutex; // every other platform uses a plain std::mutex
+#endif
+// Short names for the rapmap::utils types used by the mapping code below.
+using HitCounters = rapmap::utils::HitCounters;
+using MateStatus = rapmap::utils::MateStatus;
+using HitInfo = rapmap::utils::HitInfo;
+using ProcessedHit = rapmap::utils::ProcessedHit;
+using QuasiAlignment = rapmap::utils::QuasiAlignment;
+using FixedWriter = rapmap::utils::FixedWriter;
+
+
+
+/**
+ *  Map reads from a collection of single-end (unmated) files.
+ *
+ *  Repeatedly pulls a job (a batch of reads) from `parser`, collects the
+ *  quasi-alignments of every read with `hitCollector`, formats them into a
+ *  local buffer and hands that buffer to `outQueue` once per job.  The
+ *  function returns when the parser yields an empty job.
+ *
+ *  @param parser         source of read batches
+ *  @param rmi            index given to the formatter and the SA searcher
+ *  @param hitCollector   callable that fills the hit vector for one read
+ *  @param iomutex        guards the progress report on std::cerr; it is taken
+ *                        with try_lock, so a busy lock just skips the report
+ *  @param outQueue       logger that receives the formatted alignments
+ *  @param hctr           counters updated here (numReads, totHits, lastPrint)
+ *  @param maxNumHits     reads with more hits than this are not written
+ *  @param noOutput       when true, nothing is formatted or written
+ *  @param strictCheck    forwarded unchanged to `hitCollector`
+ *  @param consistentHits forwarded unchanged to `hitCollector`
+ */
+template <typename RapMapIndexT, typename CollectorT, typename MutexT>
+void processReadsSingleSA(single_parser * parser,
+                          RapMapIndexT& rmi,
+                          CollectorT& hitCollector,
+                          MutexT* iomutex,
+                          std::shared_ptr<spdlog::logger> outQueue,
+                          HitCounters& hctr,
+                          uint32_t maxNumHits,
+                          bool noOutput,
+                          bool strictCheck,
+                          bool consistentHits) {
+
+    // Alignments of the current job are accumulated here and flushed to
+    // outQueue after the whole job has been processed.
+    fmt::MemoryWriter sstream;
+    std::vector<QuasiAlignment> hits;
+
+    SingleAlignmentFormatter<RapMapIndexT*> formatter(&rmi);
+    SASearcher<RapMapIndexT> saSearcher(&rmi);
+
+    while(true) {
+        typename single_parser::job j(*parser); // Get a job from the parser: a bunch of reads (at most max_read_group)
+        if(j.is_empty()) break;                 // If we got nothing, then quit.
+        for(size_t i = 0; i < j->nb_filled; ++i) { // For each sequence
+            ++hctr.numReads;
+            hits.clear();
+            hitCollector(j->data[i].seq, hits, saSearcher, MateStatus::SINGLE_END, strictCheck, consistentHits);
+            auto numHits = hits.size();
+            hctr.totHits += numHits;
+
+            // Write only reads that have at least one hit and no more than
+            // maxNumHits of them.
+            if (numHits > 0 and !noOutput and numHits <= maxNumHits) {
+                rapmap::utils::writeAlignmentsToStream(j->data[i], formatter,
+                                                       hctr, hits, sstream);
+            }
+
+            // Progress report: printed once the read counter has advanced by
+            // more than 1,000,000 since the last recorded report.
+            if (hctr.numReads > hctr.lastPrint + 1000000) {
+                hctr.lastPrint.store(hctr.numReads.load());
+                if (iomutex->try_lock()){
+                    if (hctr.numReads > 0) {
+                        // "\033[F" moves the cursor up one line, so the new
+                        // report overwrites the previous one.
+#if defined(__DEBUG__) || defined(__TRACK_CORRECT__)
+                        std::cerr << "\033[F\033[F\033[F";
+#else
+                        std::cerr << "\033[F\033[F";
+#endif // __DEBUG__
+                    }
+                    std::cerr << "saw " << hctr.numReads << " reads\n";
+                    std::cerr << "# hits per read = "
+                        << hctr.totHits / static_cast<float>(hctr.numReads) << "\n";
+#if defined(__DEBUG__) || defined(__TRACK_CORRECT__)
+                    std::cerr << "The true hit was in the returned set of hits "
+                        << 100.0 * (hctr.trueHits / static_cast<float>(hctr.numReads))
+                        <<  "% of the time\n";
+#endif // __DEBUG__
+                    iomutex->unlock();
+                }
+            }
+        } // for all reads in this job
+
+        // DUMP OUTPUT
+        if (!noOutput) {
+            std::string outStr(sstream.str());
+            // Get rid of last newline
+            if (!outStr.empty()) {
+                outStr.pop_back();
+                outQueue->info() << std::move(outStr);
+            }
+            sstream.clear();
+        }
+
+    } // processed all reads
+
+}
+
+/**
+ *  Map reads from a collection of paired-end files.
+ *
+ *  Repeatedly pulls a job (a batch of read pairs) from `parser`.  For every
+ *  pair the two mates are mapped separately with `hitCollector`, the two hit
+ *  lists are merged into joint hits, and the joint hits are formatted into a
+ *  local buffer that is handed to `outQueue` once per job.  The function
+ *  returns when the parser yields an empty job.
+ *
+ *  @param parser         source of read-pair batches
+ *  @param rmi            index given to the formatter and the SA searcher
+ *  @param hitCollector   callable that fills a hit vector for one mate
+ *  @param iomutex        guards the progress report on std::cerr; it is taken
+ *                        with try_lock, so a busy lock just skips the report
+ *  @param outQueue       logger that receives the formatted alignments
+ *  @param hctr           counters updated here and by the merge helpers
+ *  @param maxNumHits     pairs with more joint hits than this are not written
+ *  @param noOutput       when true, nothing is formatted or written
+ *  @param strictCheck    forwarded unchanged to `hitCollector`
+ *  @param nonStrictMerge selects mergeLeftRightHitsFuzzy instead of
+ *                        mergeLeftRightHits
+ *  @param consistentHits forwarded unchanged to `hitCollector`
+ */
+template <typename RapMapIndexT, typename CollectorT, typename MutexT>
+void processReadsPairSA(paired_parser* parser,
+                        RapMapIndexT& rmi,
+                        CollectorT& hitCollector,
+                        MutexT* iomutex,
+                        std::shared_ptr<spdlog::logger> outQueue,
+                        HitCounters& hctr,
+                        uint32_t maxNumHits,
+                        bool noOutput,
+                        bool strictCheck,
+                        bool nonStrictMerge,
+                        bool consistentHits) {
+
+    // Alignments of the current job are accumulated here and flushed to
+    // outQueue after the whole job has been processed.
+    fmt::MemoryWriter sstream;
+    std::vector<QuasiAlignment> leftHits;
+    std::vector<QuasiAlignment> rightHits;
+    std::vector<QuasiAlignment> jointHits;
+
+    // Create a formatter for alignments
+    PairAlignmentFormatter<RapMapIndexT*> formatter(&rmi);
+
+    SASearcher<RapMapIndexT> saSearcher(&rmi);
+
+    while(true) {
+        typename paired_parser::job j(*parser); // Get a job from the parser: a bunch of reads (at most max_read_group)
+        if(j.is_empty()) break;                 // If we got nothing, quit
+        for(size_t i = 0; i < j->nb_filled; ++i) { // For each sequence
+            // Per-pair state handed to the merge helpers below.
+            bool tooManyHits{false};
+            size_t readLen = j->data[i].first.seq.length();
+            ++hctr.numReads;
+            jointHits.clear();
+            leftHits.clear();
+            rightHits.clear();
+
+            bool lh = hitCollector(j->data[i].first.seq,
+                                   leftHits, saSearcher,
+                                   MateStatus::PAIRED_END_LEFT,
+                                   strictCheck,
+                                   consistentHits);
+
+            bool rh = hitCollector(j->data[i].second.seq,
+                                   rightHits, saSearcher,
+                                   MateStatus::PAIRED_END_RIGHT,
+                                   strictCheck,
+                                   consistentHits);
+
+            if (nonStrictMerge) {
+                rapmap::utils::mergeLeftRightHitsFuzzy(
+                        lh, rh,
+                        leftHits, rightHits, jointHits,
+                        readLen, maxNumHits, tooManyHits, hctr);
+
+            } else {
+                rapmap::utils::mergeLeftRightHits(
+                        leftHits, rightHits, jointHits,
+                        readLen, maxNumHits, tooManyHits, hctr);
+            }
+
+            // If we have reads to output, and we're writing output.
+            if (jointHits.size() > 0 and !noOutput and jointHits.size() <= maxNumHits) {
+                rapmap::utils::writeAlignmentsToStream(j->data[i], formatter,
+                                                       hctr, jointHits, sstream);
+            }
+
+            // Progress report: printed once the read counter has advanced by
+            // more than 1,000,000 since the last recorded report.
+            if (hctr.numReads > hctr.lastPrint + 1000000) {
+                hctr.lastPrint.store(hctr.numReads.load());
+                if (iomutex->try_lock()) {
+                    if (hctr.numReads > 0) {
+                        // Carriage return: rewrite the status line in place.
+                        std::cerr << "\r\r";
+                    }
+                    std::cerr << "saw " << hctr.numReads << " reads : "
+                              << "pe / read = " << hctr.peHits / static_cast<float>(hctr.numReads)
+                              << " : se / read = " << hctr.seHits / static_cast<float>(hctr.numReads) << ' ';
+#if defined(__DEBUG__) || defined(__TRACK_CORRECT__)
+                    // A plain '%' is used here: "\%" is not a standard
+                    // escape sequence.
+                    std::cerr << ": true hit % = "
+                        << (100.0 * (hctr.trueHits / static_cast<float>(hctr.numReads)));
+#endif // __DEBUG__
+                    iomutex->unlock();
+                }
+            }
+        } // for all reads in this job
+
+        // DUMP OUTPUT
+        if (!noOutput) {
+            std::string outStr(sstream.str());
+            // Get rid of last newline
+            if (!outStr.empty()) {
+                outStr.pop_back();
+                outQueue->info() << std::move(outStr);
+            }
+            sstream.clear();
+        }
+
+    } // processed all reads
+
+}
+
+/**
+ *  Paired-end overload: start `nthread` workers that each run
+ *  processReadsPairSA on the shared parser, index, collector and counters,
+ *  then block until every worker has finished.  Always returns true.
+ */
+template <typename RapMapIndexT, typename MutexT>
+bool spawnProcessReadsThreads(
+                              uint32_t nthread,
+                              paired_parser* parser,
+                              RapMapIndexT& rmi,
+                              MutexT& iomutex,
+                              std::shared_ptr<spdlog::logger> outQueue,
+                              HitCounters& hctr,
+                              uint32_t maxNumHits,
+                              bool noOutput,
+                              bool strictCheck,
+                              bool fuzzy,
+                              bool consistentHits) {
+
+    using CollectorT = SACollector<RapMapIndexT>;
+
+    std::vector<std::thread> workers;
+    // A single collector is handed by reference to every worker.
+    CollectorT collector(&rmi);
+
+    while (workers.size() < nthread) {
+        workers.emplace_back(processReadsPairSA<RapMapIndexT, CollectorT, MutexT>,
+                             parser,
+                             std::ref(rmi),
+                             std::ref(collector),
+                             &iomutex,
+                             outQueue,
+                             std::ref(hctr),
+                             maxNumHits,
+                             noOutput,
+                             strictCheck,
+                             fuzzy,
+                             consistentHits);
+    }
+
+    // Wait for all workers before the collector goes out of scope.
+    for (std::thread& worker : workers) {
+        worker.join();
+    }
+    return true;
+}
+
+/**
+ *  Single-end overload: start `nthread` workers that each run
+ *  processReadsSingleSA on the shared parser, index, collector and counters,
+ *  then block until every worker has finished.  Always returns true.
+ */
+template <typename RapMapIndexT, typename MutexT>
+bool spawnProcessReadsThreads(
+                              uint32_t nthread,
+                              single_parser* parser,
+                              RapMapIndexT& rmi,
+                              MutexT& iomutex,
+                              std::shared_ptr<spdlog::logger> outQueue,
+                              HitCounters& hctr,
+                              uint32_t maxNumHits,
+                              bool noOutput,
+                              bool strictCheck,
+                              bool consistentHits) {
+
+    using CollectorT = SACollector<RapMapIndexT>;
+
+    std::vector<std::thread> workers;
+    // A single collector is handed by reference to every worker.
+    CollectorT collector(&rmi);
+
+    while (workers.size() < nthread) {
+        workers.emplace_back(processReadsSingleSA<RapMapIndexT, CollectorT, MutexT>,
+                             parser,
+                             std::ref(rmi),
+                             std::ref(collector),
+                             &iomutex,
+                             outQueue,
+                             std::ref(hctr),
+                             maxNumHits,
+                             noOutput,
+                             strictCheck,
+                             consistentHits);
+    }
+
+    // Wait for all workers before the collector goes out of scope.
+    for (std::thread& worker : workers) {
+        worker.join();
+    }
+    return true;
+}
+
+/**
+ *  Map all reads named on the command line against an already loaded index.
+ *
+ *  Alignments go to the file named by `outname`, or to std::cout when that
+ *  value is empty, through an asynchronous spdlog logger.  Paired-end mode is
+ *  selected when either `read1` or `read2` is set; otherwise the files in
+ *  `unmatedReads` are mapped as single-end reads.  File lists are
+ *  comma-separated.
+ *
+ *  @param rmi          the loaded index
+ *  @param consoleLog   logger for status and error messages
+ *  @param index        not used in this function
+ *  @param read1        comma-separated left-mate files
+ *  @param read2        comma-separated right-mate files
+ *  @param unmatedReads comma-separated single-end files
+ *  @param numThreads   number of worker threads to spawn
+ *  @param maxNumHits   forwarded to the workers
+ *  @param outname      output path; empty means std::cout
+ *  @param noout        suppress the SAM header and all alignment output
+ *  @param strict       forwarded to the workers as strictCheck
+ *  @param fuzzy        forwarded to the paired-end workers
+ *  @param consistent   forwarded to the workers as consistentHits
+ *  @return true on success; false if the output file cannot be opened.
+ *          Exits the process if the -1 and -2 file counts differ.
+ */
+template <typename RapMapIndexT>
+bool mapReads(RapMapIndexT& rmi,
+              std::shared_ptr<spdlog::logger> consoleLog,
+              TCLAP::ValueArg<std::string>& index,
+              TCLAP::ValueArg<std::string>& read1,
+              TCLAP::ValueArg<std::string>& read2,
+              TCLAP::ValueArg<std::string>& unmatedReads,
+              TCLAP::ValueArg<uint32_t>& numThreads,
+              TCLAP::ValueArg<uint32_t>& maxNumHits,
+              TCLAP::ValueArg<std::string>& outname,
+              TCLAP::SwitchArg& noout,
+              TCLAP::SwitchArg& strict,
+              TCLAP::SwitchArg& fuzzy,
+              TCLAP::SwitchArg& consistent) {
+
+    std::cerr << "\n\n\n\n";
+
+    bool pairedEnd = (read1.isSet() or read2.isSet());
+    // from: http://stackoverflow.com/questions/366955/obtain-a-stdostream-either-from-stdcout-or-stdofstreamfile
+    // set either a file or cout as the output stream
+    std::streambuf* outBuf;
+    std::ofstream outFile;
+    bool haveOutputFile{false};
+    if (outname.getValue() == "") {
+        outBuf = std::cout.rdbuf();
+    } else {
+        outFile.open(outname.getValue());
+        // Without this check an unwritable path would silently discard
+        // every alignment.
+        if (!outFile.is_open()) {
+            consoleLog->error("Could not open the output file [{}] for writing",
+                              outname.getValue());
+            return false;
+        }
+        outBuf = outFile.rdbuf();
+        haveOutputFile = true;
+    }
+    // Now set the output stream to the buffer, which is
+    // either std::cout, or a file.
+    std::ostream outStream(outBuf);
+
+    // Must be a power of 2
+    size_t queueSize{268435456};
+    spdlog::set_async_mode(queueSize);
+    auto outputSink = std::make_shared<spdlog::sinks::ostream_sink_mt>(outStream);
+    std::shared_ptr<spdlog::logger> outLog = std::make_shared<spdlog::logger>("outLog", outputSink);
+    outLog->set_pattern("%v");
+
+    uint32_t nthread = numThreads.getValue();
+    std::unique_ptr<paired_parser> pairParserPtr{nullptr};
+    std::unique_ptr<single_parser> singleParserPtr{nullptr};
+
+    if (!noout.getValue()) {
+        rapmap::utils::writeSAMHeader(rmi, outLog);
+    }
+
+    bool strictCheck = strict.getValue();
+    bool fuzzyIntersection = fuzzy.getValue();
+    bool consistentHits = consistent.getValue();
+    SpinLockT iomutex;
+    {
+        ScopedTimer timer;
+        HitCounters hctrs;
+        consoleLog->info("mapping reads . . . \n\n\n");
+        if (pairedEnd) {
+            std::vector<std::string> read1Vec = rapmap::utils::tokenize(read1.getValue(), ',');
+            std::vector<std::string> read2Vec = rapmap::utils::tokenize(read2.getValue(), ',');
+
+            if (read1Vec.size() != read2Vec.size()) {
+                consoleLog->error("The number of provided files for "
+                                  "-1 and -2 must be the same!");
+                std::exit(1);
+            }
+
+            // Interleave the file names as left0, right0, left1, right1, ...
+            // The vector owns the pointer array, so it is released even if
+            // the parser or the worker threads throw.  The strings
+            // themselves stay owned by read1Vec / read2Vec.
+            std::vector<char*> pairFileList;
+            pairFileList.reserve(read1Vec.size() + read2Vec.size());
+            for (size_t i = 0; i < read1Vec.size(); ++i) {
+                pairFileList.push_back(const_cast<char*>(read1Vec[i].c_str()));
+                pairFileList.push_back(const_cast<char*>(read2Vec[i].c_str()));
+            }
+            char** fileBegin = pairFileList.data();
+            char** fileEnd = fileBegin + pairFileList.size();
+
+            size_t maxReadGroup{1000}; // Number of reads in each "job"
+            size_t concurrentFile{2}; // Number of files to read simultaneously
+            pairParserPtr.reset(new paired_parser(4 * nthread, maxReadGroup,
+                        concurrentFile,
+                        fileBegin, fileEnd));
+
+            spawnProcessReadsThreads(nthread, pairParserPtr.get(), rmi, iomutex,
+                                     outLog, hctrs, maxNumHits.getValue(), noout.getValue(), strictCheck,
+                                     fuzzyIntersection, consistentHits);
+        } else {
+            std::vector<std::string> unmatedReadVec = rapmap::utils::tokenize(unmatedReads.getValue(), ',');
+            size_t maxReadGroup{1000}; // Number of reads in each "job"
+            size_t concurrentFile{1};
+            stream_manager streams( unmatedReadVec.begin(), unmatedReadVec.end(),
+                    concurrentFile);
+            singleParserPtr.reset(new single_parser(4 * nthread,
+                        maxReadGroup,
+                        concurrentFile,
+                        streams));
+
+            /** Create the threads depending on the collector type **/
+            spawnProcessReadsThreads(nthread, singleParserPtr.get(), rmi, iomutex,
+                                     outLog, hctrs, maxNumHits.getValue(), noout.getValue(),
+                                     strictCheck, consistentHits);
+        }
+        std::cerr << "\n\n";
+
+
+        consoleLog->info("Done mapping reads.");
+        consoleLog->info("In total saw {} reads.", hctrs.numReads);
+        consoleLog->info("Final # hits per read = {}", hctrs.totHits / static_cast<float>(hctrs.numReads));
+        consoleLog->info("flushing output queue.");
+        outLog->flush();
+        /*
+            consoleLog->info("Discarded {} reads because they had > {} alignments",
+                    hctrs.tooManyHits, maxNumHits.getValue());
+                    */
+
+    }
+
+    if (haveOutputFile) {
+        outFile.close();
+    }
+    return true;
+}
+
+
+/**
+ *  Entry point of the SA-based mapper.
+ *
+ *  Parses the command line, validates the combination of read arguments,
+ *  reads `header.json` from the index directory, loads the index variant the
+ *  header describes (32- or 64-bit suffix array, perfect hash or dense hash
+ *  map) and hands it to mapReads.
+ *
+ *  @param argc  argument count, passed to TCLAP
+ *  @param argv  argument vector, passed to TCLAP
+ *  @return 0 when mapReads succeeds, 1 when it fails or when argument parsing
+ *          throws.  Invalid input detected here ends the process through
+ *          std::exit(1).
+ */
+int rapMapSAMap(int argc, char* argv[]) {
+  std::cerr << "RapMap Mapper (SA-based)\n";
+
+  std::string versionString = rapmap::version;
+  TCLAP::CmdLine cmd(
+		     "RapMap Mapper",
+		     ' ',
+		     versionString);
+  cmd.getProgramName() = "rapmap";
+
+  TCLAP::ValueArg<std::string> index("i", "index", "The location of the quasiindex", true, "", "path");
+  TCLAP::ValueArg<std::string> read1("1", "leftMates", "The location of the left paired-end reads", false, "", "path");
+  TCLAP::ValueArg<std::string> read2("2", "rightMates", "The location of the right paired-end reads", false, "", "path");
+  TCLAP::ValueArg<std::string> unmatedReads("r", "unmatedReads", "The location of single-end reads", false, "", "path");
+  TCLAP::ValueArg<uint32_t> numThreads("t", "numThreads", "Number of threads to use", false, 1, "positive integer");
+  TCLAP::ValueArg<uint32_t> maxNumHits("m", "maxNumHits", "Reads mapping to more than this many loci are discarded", false, 200, "positive integer");
+  TCLAP::ValueArg<std::string> outname("o", "output", "The output file (default: stdout)", false, "", "path");
+  TCLAP::SwitchArg noout("n", "noOutput", "Don't write out any alignments (for speed testing purposes)", false);
+  TCLAP::SwitchArg strict("s", "strictCheck", "Perform extra checks to try and assure that only equally \"best\" mappings for a read are reported", false);
+  TCLAP::SwitchArg fuzzy("f", "fuzzyIntersection", "Find paired-end mapping locations using fuzzy intersection", false);
+  TCLAP::SwitchArg consistent("c", "consistentHits", "Ensure that the hits collected are consistent (co-linear)", false);
+  cmd.add(index);
+  cmd.add(noout);
+
+  cmd.add(read1);
+  cmd.add(read2);
+  cmd.add(unmatedReads);
+  cmd.add(outname);
+  cmd.add(numThreads);
+  cmd.add(maxNumHits);
+  cmd.add(strict);
+  cmd.add(fuzzy);
+  cmd.add(consistent);
+
+  auto consoleSink = std::make_shared<spdlog::sinks::stderr_sink_mt>();
+  auto consoleLog = spdlog::create("stderrLog", {consoleSink});
+
+  try {
+
+    cmd.parse(argc, argv);
+    bool pairedEnd = (read1.isSet() or read2.isSet());
+    if (pairedEnd and (read1.isSet() != read2.isSet())) {
+      consoleLog->error("You must set both the -1 and -2 arguments to align "
+			"paired end reads!");
+      std::exit(1);
+    }
+
+    if (pairedEnd and unmatedReads.isSet()) {
+      consoleLog->error("You cannot specify both paired-end and unmated "
+			"reads in the input!");
+      std::exit(1);
+    }
+
+    if (!pairedEnd and !unmatedReads.isSet()) {
+      consoleLog->error("You must specify input; either both paired-end "
+			"or unmated reads!");
+      std::exit(1);
+
+    }
+
+    std::string indexPrefix(index.getValue());
+    // back() on an empty string is undefined behaviour, so reject `-i ""`
+    // before looking at the last character.
+    if (indexPrefix.empty()) {
+      consoleLog->error("The index location given with -i must not be empty");
+      std::exit(1);
+    }
+    if (indexPrefix.back() != '/') {
+      indexPrefix += "/";
+    }
+
+    if (!rapmap::fs::DirExists(indexPrefix.c_str())) {
+      consoleLog->error("It looks like the index you provided [{}] "
+			"doesn't exist", indexPrefix);
+      std::exit(1);
+    }
+
+    IndexHeader h;
+    std::ifstream indexStream(indexPrefix + "header.json");
+    // Check the stream here: an exception thrown by the archive on an
+    // unreadable header would not be a TCLAP::ArgException, so the handler
+    // below would not catch it.
+    if (!indexStream.good()) {
+      consoleLog->error("Could not open the index header [{}header.json]",
+			indexPrefix);
+      std::exit(1);
+    }
+    {
+      cereal::JSONInputArchive ar(indexStream);
+      ar(h);
+    }
+    indexStream.close();
+
+    if (h.indexType() != IndexType::QUASI) {
+      consoleLog->error("The index {} does not appear to be of the "
+			"appropriate type (quasi)", indexPrefix);
+      std::exit(1);
+    }
+
+    // The header selects the offset width of the suffix array (bigSA) and
+    // the hash used for the k-mer lookup (perfectHash); each combination is
+    // a different RapMapSAIndex instantiation.
+    bool success{false};
+    if (h.bigSA()) {
+      // 64-bit suffix array
+      if (h.perfectHash()) {
+          RapMapSAIndex<int64_t, BooMap<uint64_t, rapmap::utils::SAInterval<int64_t>>> rmi;
+          rmi.load(indexPrefix);
+          success = mapReads(rmi, consoleLog, index, read1, read2,
+                             unmatedReads, numThreads, maxNumHits,
+                             outname, noout, strict, fuzzy, consistent);
+      } else {
+          RapMapSAIndex<int64_t,
+                        google::dense_hash_map<uint64_t, rapmap::utils::SAInterval<int64_t>,
+                                               rapmap::utils::KmerKeyHasher>> rmi;
+          rmi.load(indexPrefix);
+          success = mapReads(rmi, consoleLog, index, read1, read2,
+                             unmatedReads, numThreads, maxNumHits,
+                             outname, noout, strict, fuzzy, consistent);
+      }
+    } else {
+        // 32-bit suffix array
+        if (h.perfectHash()) {
+            RapMapSAIndex<int32_t, BooMap<uint64_t, rapmap::utils::SAInterval<int32_t>>> rmi;
+            rmi.load(indexPrefix);
+            success = mapReads(rmi, consoleLog, index, read1, read2,
+                               unmatedReads, numThreads, maxNumHits,
+                               outname, noout, strict, fuzzy, consistent);
+        } else {
+            RapMapSAIndex<int32_t,
+                          google::dense_hash_map<uint64_t, rapmap::utils::SAInterval<int32_t>,
+                                                 rapmap::utils::KmerKeyHasher>> rmi;
+            rmi.load(indexPrefix);
+            success = mapReads(rmi, consoleLog, index, read1, read2,
+                               unmatedReads, numThreads, maxNumHits,
+                               outname, noout, strict, fuzzy, consistent);
+        }
+    }
+
+    return success ? 0 : 1;
+  } catch (TCLAP::ArgException& e) {
+    consoleLog->error("Exception [{}] when parsing argument {}", e.error(), e.argId());
+    return 1;
+  }
+
+}
diff --git a/src/RapMapUtils.cpp b/src/RapMapUtils.cpp
new file mode 100644
index 0000000..7b0dee2
--- /dev/null
+++ b/src/RapMapUtils.cpp
@@ -0,0 +1,562 @@
+#include <cereal/types/vector.hpp>
+#include <cereal/types/unordered_map.hpp>
+#include <cereal/archives/binary.hpp>
+
+#include "RapMapUtils.hpp"
+#include "RapMapSAIndex.hpp"
+#include "RapMapIndex.hpp"
+#include "PairAlignmentFormatter.hpp"
+#include "SingleAlignmentFormatter.hpp"
+#include "jellyfish/whole_sequence_parser.hpp"
+#include "BooMap.hpp"
+
+namespace rapmap {
+    namespace utils {
+        // Split `s` on every occurrence of `delim` and return the pieces in
+        // their original order.  Empty fields between adjacent delimiters are
+        // kept.  Because the splitting is done with std::getline, a trailing
+        // delimiter does not produce a final empty field and an empty input
+        // produces an empty vector.
+        std::vector<std::string> tokenize(const std::string &s, char delim) {
+            std::vector<std::string> fields;
+            std::stringstream input(s);
+            for (std::string field; std::getline(input, field, delim); ) {
+                fields.push_back(field);
+            }
+            return fields;
+        }
+
+
+		// Unpack a position stored in the packed 32-bit format:
+		//   bits 0-29 : the position itself (written to pOut)
+		//   bit  30   : set when this position refers to a new transcript
+		//               (written to newTxp)
+		//   bit  31   : set when the k-mer from the hash matches this txp in
+		//               the rc rather than the forward direction (written
+		//               to isRC)
+		void decodePosition(uint32_t p, int32_t& pOut, bool& newTxp, bool& isRC) {
+			constexpr uint32_t positionMask = 0x3fffffff; // low 30 bits
+			pOut = static_cast<int32_t>(p & positionMask);
+			newTxp = ((p >> 30) & 0x1) != 0;
+			isRC = ((p >> 31) & 0x1) != 0;
+		}
+
+
+
+        // Complement lookup table indexed by 7-bit ASCII code; each entry is
+        // the ASCII code of the complementary base:
+        //   'A'/'a' (65/97)  -> 'T' (84)      'C'/'c' (67/99)   -> 'G' (71)
+        //   'G'/'g' (71/103) -> 'C' (67)      'T'/'t' (84/116)  -> 'A' (65)
+        //   'U'/'u' (85/117) -> 'A' (65)
+        // Every other code maps to 'N' (78).  Lower-case input yields an
+        // upper-case complement.  The trailing comment on each row is the
+        // index of the last entry in that row.
+        static constexpr int8_t rc_table[128] = {
+            78, 78,  78, 78,  78,  78,  78, 78,  78, 78, 78, 78,  78, 78, 78, 78, // 15
+            78, 78,  78, 78,  78,  78,  78, 78,  78, 78, 78, 78,  78, 78, 78, 78, // 31
+            78, 78,  78, 78,  78,  78,  78, 78,  78, 78, 78, 78,  78, 78, 78, 78, // 47
+            78, 78,  78, 78,  78,  78,  78, 78,  78, 78, 78, 78,  78, 78, 78, 78, // 63
+            78, 84, 78, 71, 78,  78,  78, 67, 78, 78, 78, 78,  78, 78, 78, 78, // 79
+            78, 78,  78, 78,  65, 65, 78, 78,  78, 78, 78, 78,  78, 78, 78, 78, // 95
+            78, 84, 78, 71, 78,  78,  78, 67, 78, 78, 78, 78,  78, 78, 78, 78, // 111
+            78, 78,  78, 78,  65, 65, 78, 78,  78, 78, 78, 78,  78, 78, 78, 78  // 127
+        };
+
+        // Adapted from
+        // https://github.com/mengyao/Complete-Striped-Smith-Waterman-Library/blob/8c9933a1685e0ab50c7d8b7926c9068bc0c9d7d2/src/main.c#L36
+        //
+        // Write the reverse complement of `seq` into `readWork` and the
+        // reversed quality string `qual` into `qualWork`.  `seq` and `qual`
+        // are only read.  The work buffers are resized to the length of their
+        // respective inputs and fully overwritten, so they may be reused
+        // across calls; they must be distinct objects from `seq` and `qual`.
+        //
+        // Unlike the in-place routine this was adapted from, the output goes
+        // to separate buffers, so every position (including the middle one of
+        // an odd-length read) has to be written explicitly.  `seq` and `qual`
+        // are each reversed using their own length, so a missing or shorter
+        // quality string is handled without out-of-range access.
+        void reverseRead(std::string& seq,
+                std::string& qual,
+                std::string& readWork,
+                std::string& qualWork) {
+
+            // rc_table only covers 7-bit ASCII; go through unsigned char so a
+            // byte >= 0x80 can never become a negative index, and report such
+            // bytes as 'N' like every other non-nucleotide character.
+            auto complement = [](char base) -> char {
+                const unsigned char code = static_cast<unsigned char>(base);
+                return (code < 128) ? static_cast<char>(rc_table[code]) : 'N';
+            };
+
+            const size_t seqLen = seq.length();
+            readWork.resize(seqLen);
+            for (size_t i = 0; i < seqLen; ++i) {
+                readWork[i] = complement(seq[seqLen - 1 - i]);
+            }
+
+            // Qualities are simply reversed (no complement).
+            qualWork.assign(qual.rbegin(), qual.rend());
+        }
+
+        // Single-end overload: append one SAM record per entry of `hits` to
+        // `sstream` for the read `r`, and return the number of records
+        // written.
+        //
+        //   r         - the read; its `header`, `seq` and `qual` members are
+        //               used.  NOTE: `r.header` is modified in place (a NUL is
+        //               written over the first space, see below).
+        //   formatter - supplies the index (`txpNames`, `txpLens`) and the
+        //               reusable scratch buffers `readTemp`, `qualTemp` and
+        //               `cigarStr`.
+        //   hctr      - only touched when __DEBUG__ or __TRACK_CORRECT__ is
+        //               defined (counts hits to the expected transcript).
+        //   hits      - the alignments to report.
+        //   sstream   - output buffer the SAM text is appended to.
+        template <typename ReadT, typename IndexT>
+        uint32_t writeAlignmentsToStream(
+                ReadT& r,
+                SingleAlignmentFormatter<IndexT>& formatter,
+                HitCounters& hctr,
+                std::vector<rapmap::utils::QuasiAlignment>& hits,
+                fmt::MemoryWriter& sstream
+                ) {
+                // Convenient variable name bindings
+                auto& txpNames = formatter.index->txpNames;
+                auto& txpLens = formatter.index->txpLens;
+
+                auto& readTemp = formatter.readTemp;
+                auto& qualTemp = formatter.qualTemp;
+                auto& cigarStr = formatter.cigarStr;
+
+                uint16_t flags;
+
+                auto& readName = r.header;
+#if defined(__DEBUG__) || defined(__TRACK_CORRECT__)
+                // Debug only: take the text between the 2nd and 3rd ':' of the
+                // read name; it is compared against the hit's transcript name
+                // below to count "true" hits.
+                auto before = readName.find_first_of(':');
+                before = readName.find_first_of(':', before+1);
+                auto after = readName.find_first_of(':', before+1);
+                const auto& txpName = readName.substr(before+1, after-before-1);
+#endif //__DEBUG__
+                // If the read name contains multiple space-separated parts, print
+                // only the first
+                // (done by overwriting the first space with NUL; the name is
+                // later written via c_str(), which stops at that NUL)
+                size_t splitPos = readName.find(' ');
+                if (splitPos < readName.length()) {
+                    readName[splitPos] = '\0';
+                }
+
+
+                // Same NH tag on every record: the total number of hits.
+                std::string numHitFlag = fmt::format("NH:i:{}", hits.size());
+                uint32_t alnCtr{0};
+                // Set once the reverse complement has been written to
+                // readTemp/qualTemp, so it is computed at most once per read.
+                bool haveRev{false};
+                for (auto& qa : hits) {
+                    auto& transcriptName = txpNames[qa.tid];
+                    // === SAM
+                    rapmap::utils::getSamFlags(qa, flags);
+                    // Every record after the first gets 0x900, i.e. both 0x100
+                    // (secondary) and 0x800 (supplementary) per the SAM spec.
+                    // NOTE(review): the paired-end overload sets only 0x100 on
+                    // its later records - confirm 0x800 is intended here.
+                    if (alnCtr != 0) {
+                        flags |= 0x900;
+                    }
+
+                    std::string* readSeq = &(r.seq);
+                    std::string* qstr = &(r.qual);
+
+                    // Hits that are not forward are reported with the
+                    // reverse-complemented sequence and reversed qualities.
+                    if (!qa.fwd) {
+                        if (!haveRev) {
+                            rapmap::utils::reverseRead(*readSeq, *qstr,
+                                                       readTemp, qualTemp);
+                            haveRev = true;
+                        }
+                        readSeq = &(readTemp);
+                        qstr = &(qualTemp);
+                    }
+
+                   // Presumably fills cigarStr (and may adjust qa.pos) for reads
+                   // hanging over a transcript end - TODO confirm in RapMapUtils.hpp.
+                   rapmap::utils::adjustOverhang(qa.pos, qa.readLen, txpLens[qa.tid], cigarStr);
+
+                    sstream << readName.c_str() << '\t' // QNAME
+                        << flags << '\t' // FLAGS
+                        << transcriptName << '\t' // RNAME
+                        << qa.pos + 1 << '\t' // POS (1-based)
+                        << 255 << '\t' // MAPQ
+                        << cigarStr.c_str() << '\t' // CIGAR
+                        << '*' << '\t' // MATE NAME
+                        << 0 << '\t' // MATE POS
+                        << qa.fragLen << '\t' // TLEN
+                        << *readSeq << '\t' // SEQ
+                        << *qstr << '\t' // QSTR
+                        << numHitFlag << '\n';
+                    ++alnCtr;
+                    // === SAM
+#if defined(__DEBUG__) || defined(__TRACK_CORRECT__)
+                    if (txpNames[qa.tid] == txpName) { ++hctr.trueHits; }
+#endif //__DEBUG__
+                }
+                return alnCtr;
+            }
+
+        // For reads paired *in sequencing*
+        //
+        // Paired-end overload: for every entry of `jointHits`, append two SAM
+        // records (one per end of the pair `r`) to `sstream`.  Returns the
+        // number of hits processed, i.e. half the number of records written.
+        //
+        //   r         - the read pair (`first` / `second`).  NOTE: both
+        //               headers are modified in place (NULs are written to
+        //               cut them at the first space and before a "/x" suffix).
+        //   formatter - supplies the index (`txpNames`, `txpLens`) and the
+        //               reusable per-end scratch buffers.
+        //   hctr      - only touched when __DEBUG__ or __TRACK_CORRECT__ is
+        //               defined.
+        //   jointHits - the alignments to report; `fragLen` of a paired hit
+        //               may be reduced in place (see the overhang clamp).
+        //   sstream   - output buffer the SAM text is appended to.
+        template <typename ReadPairT, typename IndexT>
+        uint32_t writeAlignmentsToStream(
+                ReadPairT& r,
+                PairAlignmentFormatter<IndexT>& formatter,
+                HitCounters& hctr,
+                std::vector<rapmap::utils::QuasiAlignment>& jointHits,
+                fmt::MemoryWriter& sstream
+                ) {
+                // Convenient variable name bindings
+                auto& txpNames = formatter.index->txpNames;
+                auto& txpLens = formatter.index->txpLens;
+
+                auto& read1Temp = formatter.read1Temp;
+                auto& read2Temp = formatter.read2Temp;
+                auto& qual1Temp = formatter.qual1Temp;
+                auto& qual2Temp = formatter.qual2Temp;
+                auto& cigarStr1 = formatter.cigarStr1;
+                auto& cigarStr2 = formatter.cigarStr2;
+
+                uint16_t flags1, flags2;
+
+                auto& readName = r.first.header;
+                // If the read name contains multiple space-separated parts,
+                // print only the first
+                // (a NUL is written over the first space; the name is later
+                // emitted via c_str(), which stops there).  splitPos ends up
+                // as the length of the name that will be printed.
+                size_t splitPos = readName.find(' ');
+                if (splitPos < readName.length()) {
+                    readName[splitPos] = '\0';
+                } else {
+                    splitPos = readName.length();
+                }
+
+                // trim /1 from the pe read
+                // (any single character after the '/' is dropped, not just '1')
+                if (splitPos > 2 and readName[splitPos - 2] == '/') {
+                    readName[splitPos - 2] = '\0';
+                }
+
+                auto& mateName = r.second.header;
+                // If the read name contains multiple space-separated parts,
+                // print only the first
+                splitPos = mateName.find(' ');
+                if (splitPos < mateName.length()) {
+                    mateName[splitPos] = '\0';
+                } else {
+                    splitPos = mateName.length();
+                }
+
+                // trim /2 from the pe read
+                if (splitPos > 2 and mateName[splitPos - 2] == '/') {
+                    mateName[splitPos - 2] = '\0';
+                }
+
+                /*
+                // trim /1 and /2 from pe read names
+                if (readName.length() > 2 and
+                        readName[readName.length() - 2] == '/') {
+                    readName[readName.length() - 2] = '\0';
+                }
+                if (mateName.length() > 2 and
+                        mateName[mateName.length() - 2] == '/') {
+                    mateName[mateName.length() - 2] = '\0';
+                }
+                */
+
+                // Same NH tag on every record: the total number of joint hits.
+                std::string numHitFlag = fmt::format("NH:i:{}", jointHits.size());
+                uint32_t alnCtr{0};
+				// trueHitCtr / firstTrueHit are only used by the debug block
+				// at the end of the loop.
+				uint32_t trueHitCtr{0};
+				QuasiAlignment* firstTrueHit{nullptr};
+                // haveRevN is set once the reverse complement of end N has
+                // been written to readNTemp/qualNTemp, so each end is
+                // reverse-complemented at most once per pair.
+                bool haveRev1{false};
+                bool haveRev2{false};
+                bool* haveRev = nullptr;
+                // NOTE(review): `i` is incremented below but never read.
+                size_t i{0};
+                for (auto& qa : jointHits) {
+
+                    ++i;
+                    auto& transcriptName = txpNames[qa.tid];
+                    // === SAM
+                    if (qa.isPaired) {
+                        // Both ends hit: one record per end, each naming the
+                        // other end's position as PNEXT.
+                        rapmap::utils::getSamFlags(qa, true, flags1, flags2);
+                        // every hit after the first is flagged secondary (0x100)
+                        if (alnCtr != 0) {
+                            flags1 |= 0x100; flags2 |= 0x100;
+                        }
+
+                        auto txpLen = txpLens[qa.tid];
+                        rapmap::utils::adjustOverhang(qa, txpLens[qa.tid], cigarStr1, cigarStr2);
+
+                        // Reverse complement the read and reverse
+                        // the quality string if we need to
+                        std::string* readSeq1 = &(r.first.seq);
+                        std::string* qstr1 = &(r.first.qual);
+                        if (!qa.fwd) {
+                            if (!haveRev1) {
+                                rapmap::utils::reverseRead(*readSeq1, *qstr1,
+                                        read1Temp, qual1Temp);
+                                haveRev1 = true;
+                            }
+                            readSeq1 = &(read1Temp);
+                            qstr1 = &(qual1Temp);
+                        }
+
+                        std::string* readSeq2 = &(r.second.seq);
+                        std::string* qstr2 = &(r.second.qual);
+                        if (!qa.mateIsFwd) {
+                            if (!haveRev2) {
+                                rapmap::utils::reverseRead(*readSeq2, *qstr2,
+                                        read2Temp, qual2Temp);
+                                haveRev2 = true;
+                            }
+                            readSeq2 = &(read2Temp);
+                            qstr2 = &(qual2Temp);
+                        }
+
+                        // If the fragment overhangs the right end of the transcript
+                        // adjust fragLen (overhanging the left end is already handled).
+                        // Note that this writes the clamped value back into qa.
+                        int32_t read1Pos = qa.pos;
+                        int32_t read2Pos = qa.matePos;
+                        const bool read1First{read1Pos < read2Pos};
+                        const int32_t minPos = read1First ? read1Pos : read2Pos;
+                        if (minPos + qa.fragLen > txpLen) { qa.fragLen = txpLen - minPos; }
+                        
+                        // get the fragment length as a signed int
+                        // (TLEN is written positive for the end with the
+                        // smaller position and negative for the other one)
+                        const int32_t fragLen = static_cast<int32_t>(qa.fragLen);
+
+
+                        sstream << readName.c_str() << '\t' // QNAME
+                                << flags1 << '\t' // FLAGS
+                                << transcriptName << '\t' // RNAME
+                                << qa.pos + 1 << '\t' // POS (1-based)
+                                << 1 << '\t' // MAPQ
+                                << cigarStr1.c_str() << '\t' // CIGAR
+                                << '=' << '\t' // RNEXT
+                                << qa.matePos + 1 << '\t' // PNEXT
+                                << ((read1First) ? fragLen : -fragLen) << '\t' // TLEN
+                                << *readSeq1 << '\t' // SEQ
+                                << *qstr1 << '\t' // QUAL
+                                << numHitFlag << '\n';
+
+                        sstream << mateName.c_str() << '\t' // QNAME
+                                << flags2 << '\t' // FLAGS
+                                << transcriptName << '\t' // RNAME
+                                << qa.matePos + 1 << '\t' // POS (1-based)
+                                << 1 << '\t' // MAPQ
+                                << cigarStr2.c_str() << '\t' // CIGAR
+                                << '=' << '\t' // RNEXT
+                                << qa.pos + 1 << '\t' // PNEXT
+                                << ((read1First) ? -fragLen : fragLen) << '\t' // TLEN
+                                << *readSeq2 << '\t' // SEQ
+                                << *qstr2 << '\t' // QUAL
+                                << numHitFlag << '\n';
+                    } else {
+                        // Only one end hit: write a record for the aligned end
+                        // followed by a placeholder record for its mate.
+                        rapmap::utils::getSamFlags(qa, true, flags1, flags2);
+                        if (alnCtr != 0) {
+                            flags1 |= 0x100; flags2 |= 0x100;
+                        }
+			/*
+			else {
+                            // If this is the first alignment for this read
+                            // If the left end is mapped, set 0x100 on the right end
+                            if (qa.mateStatus == MateStatus::PAIRED_END_LEFT) {
+                                flags2 |= 0x100;
+                            } else {
+                            // Otherwise, set 0x100 on the left end
+                                flags1 |= 0x100;
+                            }
+                        }
+			*/
+
+                        // The pointers below are bound to whichever end is
+                        // the aligned one (and its mate) by the if/else that
+                        // follows, so the output code can be shared.
+                        std::string* readSeq{nullptr};
+                        std::string* unalignedSeq{nullptr};
+
+                        uint32_t flags, unalignedFlags;
+                        std::string* qstr{nullptr};
+                        std::string* unalignedQstr{nullptr};
+                        std::string* alignedName{nullptr};
+                        std::string* unalignedName{nullptr};
+                        std::string* readTemp{nullptr};
+                        std::string* qualTemp{nullptr};
+
+                        rapmap::utils::FixedWriter* cigarStr;
+                        if (qa.mateStatus == MateStatus::PAIRED_END_LEFT) { // left read
+                            alignedName = &readName;
+                            unalignedName = &mateName;
+
+                            readSeq = &(r.first.seq);
+                            unalignedSeq = &(r.second.seq);
+
+                            qstr = &(r.first.qual);
+                            unalignedQstr = &(r.second.qual);
+
+                            flags = flags1;
+                            unalignedFlags = flags2;
+
+                            cigarStr = &cigarStr1;
+
+                            haveRev = &haveRev1;
+                            readTemp = &read1Temp;
+                            qualTemp = &qual1Temp;
+                        } else { // right read
+                            alignedName = &mateName;
+                            unalignedName = &readName;
+
+                            readSeq = &(r.second.seq);
+                            unalignedSeq = &(r.first.seq);
+
+                            qstr = &(r.second.qual);
+                            unalignedQstr = &(r.first.qual);
+
+                            flags = flags2;
+                            unalignedFlags = flags1;
+
+                            cigarStr = &cigarStr2;
+                            haveRev = &haveRev2;
+                            readTemp = &read2Temp;
+                            qualTemp = &qual2Temp;
+                        }
+
+                        // Reverse complement the read and reverse
+                        // the quality string if we need to
+                        // (shares the per-end cache with the paired branch
+                        // through the haveRev pointer)
+                        if (!qa.fwd) {
+                            if (!(*haveRev)) {
+                                rapmap::utils::reverseRead(*readSeq, *qstr,
+                                        *readTemp, *qualTemp);
+                                *haveRev = true;
+                            }
+                            readSeq = readTemp;
+                            qstr = qualTemp;
+                        }
+
+                        /*
+                        if (!qa.fwd) {
+                            rapmap::utils::reverseRead(*readSeq, *qstr,
+                                        read1Temp, qual1Temp);
+                        }
+                        */
+
+                        rapmap::utils::adjustOverhang(qa.pos, qa.readLen, txpLens[qa.tid], *cigarStr);
+                        sstream << alignedName->c_str() << '\t' // QNAME
+                                << flags << '\t' // FLAGS
+                                << transcriptName << '\t' // RNAME
+                                << qa.pos + 1 << '\t' // POS (1-based)
+                                << 1 << '\t' // MAPQ
+                                << cigarStr->c_str() << '\t' // CIGAR
+                                << '=' << '\t' // RNEXT
+                                << qa.pos+1 << '\t' // PNEXT (only 1 read in template)
+                                << 0 << '\t' // TLEN (spec says 0, not read len)
+                                << *readSeq << '\t' // SEQ
+                                << *qstr << '\t' // QUAL
+                                << numHitFlag << '\n';
+
+
+                        // Output the info for the unaligned mate.
+                        // It is placed at the aligned end's RNAME/POS, with
+                        // MAPQ 0, a full-length soft clip as CIGAR, and its
+                        // sequence/qualities in their original orientation.
+                        sstream << unalignedName->c_str() << '\t' // QNAME
+                            << unalignedFlags << '\t' // FLAGS
+                            << transcriptName << '\t' // RNAME (same as mate)
+                            << qa.pos + 1 << '\t' // POS (same as mate)
+                            << 0 << '\t' // MAPQ
+                            << unalignedSeq->length() << 'S' << '\t' // CIGAR
+                            << '=' << '\t' // RNEXT
+                            << qa.pos + 1 << '\t' // PNEXT (only 1 read in template)
+                            << 0 << '\t' // TLEN (spec says 0, not read len)
+                            << *unalignedSeq << '\t' // SEQ
+                            << *unalignedQstr << '\t' // QUAL
+                            << numHitFlag << '\n';
+                    }
+                    // counts hits, not SAM lines (each hit wrote two lines)
+                    ++alnCtr;
+                    // == SAM
+#if defined(__DEBUG__) || defined(__TRACK_CORRECT__)
+                    // NOTE(review): `trueTxpName` is not declared anywhere in
+                    // this function (the single-end overload derives a
+                    // comparable `txpName` from the read name) - confirm it
+                    // is provided elsewhere, otherwise debug builds of this
+                    // overload will not compile.
+                    if (transcriptName == trueTxpName) {
+							if (trueHitCtr == 0) {
+									++hctr.trueHits;
+									++trueHitCtr;
+									firstTrueHit = &qa;
+							} else {
+									++trueHitCtr;
+									std::cerr << "Found true hit " << trueHitCtr << " times!\n";
+									std::cerr << transcriptName << '\t' << firstTrueHit->pos
+											<< '\t' << firstTrueHit->fwd << '\t' << firstTrueHit->fragLen
+											<< '\t' << (firstTrueHit->isPaired ? "Paired" : "Orphan") << '\t';
+								    printMateStatus(firstTrueHit->mateStatus);
+								    std::cerr << '\n';
+									std::cerr << transcriptName << '\t' << qa.pos
+											  << '\t' << qa.fwd << '\t' << qa.fragLen
+										      << '\t' << (qa.isPaired ? "Paired" : "Orphan") << '\t';
+								    printMateStatus(qa.mateStatus);
+								    std::cerr << '\n';
+							}
+					}
+#endif //__DEBUG__
+                }
+                return alnCtr;
+        }
+
+
+
+        // Is there a smarter way to do save / load here?
+        /*
+        template <typename Archive, typename MerT>
+            void save(Archive& archive, const MerT& mer) const {
+                auto key = mer.get_bits(0, 2*mer.k());
+                archive(key);
+            }
+
+        template <typename Archive>
+            void load(Archive& archive, const MerT& mer) {
+                mer.polyT();
+                uint64_t bits;
+                archive(bits);
+                auto k = mer.k();
+                mer.set_bits(0, 2*k, bits);
+            }
+        */
+    }
+}
+
+// Short names for the four suffix-array index flavours: 32- or 64-bit
+// offsets, combined with either a google::dense_hash_map ("Dense") or a
+// BooMap ("Perfect") as the k-mer -> SAInterval lookup.
+using SAIndex32BitDense = RapMapSAIndex<int32_t,google::dense_hash_map<uint64_t, rapmap::utils::SAInterval<int32_t>,
+								       rapmap::utils::KmerKeyHasher>>;
+using SAIndex64BitDense = RapMapSAIndex<int64_t,google::dense_hash_map<uint64_t, rapmap::utils::SAInterval<int64_t>,
+								       rapmap::utils::KmerKeyHasher>>;
+using SAIndex32BitPerfect = RapMapSAIndex<int32_t, BooMap<uint64_t, rapmap::utils::SAInterval<int32_t>>>;
+using SAIndex64BitPerfect = RapMapSAIndex<int64_t, BooMap<uint64_t, rapmap::utils::SAInterval<int64_t>>>;
+
+// Explicit instantiations
+// The writeAlignmentsToStream templates are defined in this translation
+// unit, so every (read type, index pointer type) combination that is used
+// has to be instantiated here.
+// NOTE(review): the pair instantiations name `header_sequence_qual`
+// unqualified while the single ones use `jellyfish::header_sequence_qual` -
+// presumably a using-declaration in one of the included headers makes both
+// spellings equivalent; confirm.
+// pair parser, 32-bit, dense hash
+template uint32_t rapmap::utils::writeAlignmentsToStream<std::pair<header_sequence_qual, header_sequence_qual>, SAIndex32BitDense*>(
+                std::pair<header_sequence_qual, header_sequence_qual>& r,
+                PairAlignmentFormatter<SAIndex32BitDense*>& formatter,
+                rapmap::utils::HitCounters& hctr,
+                std::vector<rapmap::utils::QuasiAlignment>& jointHits,
+                fmt::MemoryWriter& sstream);
+
+// pair parser, 64-bit, dense hash
+template uint32_t rapmap::utils::writeAlignmentsToStream<std::pair<header_sequence_qual, header_sequence_qual>, SAIndex64BitDense*>(
+                std::pair<header_sequence_qual, header_sequence_qual>& r,
+                PairAlignmentFormatter<SAIndex64BitDense*>& formatter,
+                rapmap::utils::HitCounters& hctr,
+                std::vector<rapmap::utils::QuasiAlignment>& jointHits,
+                fmt::MemoryWriter& sstream);
+
+// pair parser, 32-bit, perfect hash
+template uint32_t rapmap::utils::writeAlignmentsToStream<std::pair<header_sequence_qual, header_sequence_qual>, SAIndex32BitPerfect*>(
+                std::pair<header_sequence_qual, header_sequence_qual>& r,
+                PairAlignmentFormatter<SAIndex32BitPerfect*>& formatter,
+                rapmap::utils::HitCounters& hctr,
+                std::vector<rapmap::utils::QuasiAlignment>& jointHits,
+                fmt::MemoryWriter& sstream);
+
+// pair parser, 64-bit, perfect hash
+template uint32_t rapmap::utils::writeAlignmentsToStream<std::pair<header_sequence_qual, header_sequence_qual>, SAIndex64BitPerfect*>(
+                std::pair<header_sequence_qual, header_sequence_qual>& r,
+                PairAlignmentFormatter<SAIndex64BitPerfect*>& formatter,
+                rapmap::utils::HitCounters& hctr,
+                std::vector<rapmap::utils::QuasiAlignment>& jointHits,
+                fmt::MemoryWriter& sstream);
+
+
+// single parser, 32-bit, dense hash
+template uint32_t rapmap::utils::writeAlignmentsToStream<jellyfish::header_sequence_qual, SAIndex32BitDense*>(
+		jellyfish::header_sequence_qual& r,
+                SingleAlignmentFormatter<SAIndex32BitDense*>& formatter,
+                rapmap::utils::HitCounters& hctr,
+                std::vector<rapmap::utils::QuasiAlignment>& jointHits,
+                fmt::MemoryWriter& sstream);
+
+// single parser, 64-bit, dense hash
+template uint32_t rapmap::utils::writeAlignmentsToStream<jellyfish::header_sequence_qual, SAIndex64BitDense*>(
+		jellyfish::header_sequence_qual& r,
+                SingleAlignmentFormatter<SAIndex64BitDense*>& formatter,
+                rapmap::utils::HitCounters& hctr,
+                std::vector<rapmap::utils::QuasiAlignment>& jointHits,
+                fmt::MemoryWriter& sstream);
+
+// single parser, 32-bit, perfect hash
+template uint32_t rapmap::utils::writeAlignmentsToStream<jellyfish::header_sequence_qual, SAIndex32BitPerfect*>(
+ 		jellyfish::header_sequence_qual& r,
+                SingleAlignmentFormatter<SAIndex32BitPerfect*>& formatter,
+                rapmap::utils::HitCounters& hctr,
+                std::vector<rapmap::utils::QuasiAlignment>& jointHits,
+                fmt::MemoryWriter& sstream);
+
+// single parser, 64-bit, perfect hash
+template uint32_t rapmap::utils::writeAlignmentsToStream<jellyfish::header_sequence_qual, SAIndex64BitPerfect*>(
+		jellyfish::header_sequence_qual& r,
+                SingleAlignmentFormatter<SAIndex64BitPerfect*>& formatter,
+                rapmap::utils::HitCounters& hctr,
+                std::vector<rapmap::utils::QuasiAlignment>& jointHits,
+                fmt::MemoryWriter& sstream);
+
+
+// pair parser, RapMapIndex
+template uint32_t rapmap::utils::writeAlignmentsToStream<std::pair<header_sequence_qual, header_sequence_qual>, RapMapIndex*>(
+                std::pair<header_sequence_qual, header_sequence_qual>& r,
+                PairAlignmentFormatter<RapMapIndex*>& formatter,
+                rapmap::utils::HitCounters& hctr,
+                std::vector<rapmap::utils::QuasiAlignment>& jointHits,
+                fmt::MemoryWriter& sstream
+                );
+
+// single parser, RapMapIndex
+template uint32_t rapmap::utils::writeAlignmentsToStream<jellyfish::header_sequence_qual, RapMapIndex*>(
+                jellyfish::header_sequence_qual& r,
+                SingleAlignmentFormatter<RapMapIndex*>& formatter,
+                rapmap::utils::HitCounters& hctr,
+                std::vector<rapmap::utils::QuasiAlignment>& jointHits,
+                fmt::MemoryWriter& sstream
+                );
diff --git a/src/UtilTest.cpp b/src/UtilTest.cpp
new file mode 100644
index 0000000..e68c7bd
--- /dev/null
+++ b/src/UtilTest.cpp
@@ -0,0 +1,55 @@
+/* 
+ *  Copyright (c) 2012 Daisuke Okanohara
+ * 
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ * 
+ *   1. Redistributions of source code must retain the above Copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *   2. Redistributions in binary form must reproduce the above Copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *
+ *   3. Neither the name of the authors nor the names of its contributors
+ *      may be used to endorse or promote products derived from this
+ *      software without specific prior written permission.
+ */
+
+#include <gtest/gtest.h>
+#include "Util.hpp"
+
+using namespace std;
+
+// Number of bits needed to write x in binary, i.e. the 1-based index of its
+// highest set bit; 0 for x == 0 and 64 when bit 63 is set.
+// x is shifted one bit at a time so the shift count never reaches the width
+// of the type (shifting a 64-bit value by 64 is undefined behaviour).
+uint64_t GetBinLen(uint64_t x){
+  uint64_t len = 0;
+  while (x != 0) {
+    x >>= 1;
+    ++len;
+  }
+  return len;
+}
+
+// Round-trip check for rsdic::Util::SetSlice / GetSlice: 10000 rand() values
+// are written back to back into a word vector, each using exactly
+// GetBinLen(value) bits, and then read back from the same offsets.
+TEST(Util, Slice){
+  const uint64_t valueCount = 10000;
+  vector<uint64_t> vals;
+  for (uint64_t n = 0; n < valueCount; ++n){
+    vals.push_back(rand());
+  }
+
+  // Total number of bits all values occupy when packed.
+  uint64_t totalBits = 0;
+  for (uint64_t i = 0; i < vals.size(); ++i){
+    totalBits += GetBinLen(vals[i]);
+  }
+  // presumably the number of 64-bit words needed for totalBits - TODO confirm
+  // against rsdic::Util::Floor
+  vector<uint64_t> bits(rsdic::Util::Floor(totalBits, 64));
+
+  // Pack every value at the running bit offset.
+  uint64_t writePos = 0;
+  for (uint64_t i = 0; i < vals.size(); ++i){
+    uint64_t width = GetBinLen(vals[i]);
+    rsdic::Util::SetSlice(bits, writePos, width, vals[i]);
+    writePos += width;
+  }
+
+  // Read every value back from the same offset and compare.
+  uint64_t readPos = 0;
+  for (uint64_t i = 0; i < vals.size(); ++i){
+    uint64_t width = GetBinLen(vals[i]);
+    ASSERT_EQ(vals[i], rsdic::Util::GetSlice(bits, readPos, width)) << i;
+    readPos += width;
+  }
+}
diff --git a/src/bit_array.c b/src/bit_array.c
new file mode 100644
index 0000000..af0bc1a
--- /dev/null
+++ b/src/bit_array.c
@@ -0,0 +1,3160 @@
+/*
+ bit_array.c
+ project: bit array C library
+ url: https://github.com/noporpoise/BitArray/
+ maintainer: Isaac Turner <turner.isaac at gmail.com>
+ license: Public Domain, no warranty
+ date: Aug 2014
+*/
+
+// 64 bit words
+// Array length can be zero
+// Unused top bits must be zero
+
+#include <stdlib.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <limits.h> // ULONG_MAX
+#include <errno.h>
+#include <signal.h> // needed for abort()
+#include <string.h> // memset()
+#include <assert.h>
+#include <time.h> // needed for seeding rand()
+#include <unistd.h>  // need for getpid() for seeding rand number
+#include <ctype.h>  // need for tolower()
+#include <errno.h>  // perror()
+#include <sys/time.h> // for seeding random
+
+// Windows includes
+#if defined(_WIN32)
+#include <intrin.h>
+#endif
+
+#include "bit_array.h"
+#include "bit_macros.h"
+
+//
+// Tables of constants
+//
+
+// byte reverse look up table
+static const word_t reverse_table[256] =
+{
+  0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0,
+  0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0,
+  0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8,
+  0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8,
+  0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4,
+  0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4,
+  0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC,
+  0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC,
+  0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2,
+  0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2,
+  0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA,
+  0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA,
+  0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6,
+  0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6,
+  0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE,
+  0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE,
+  0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1,
+  0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1,
+  0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9,
+  0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9,
+  0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5,
+  0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5,
+  0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED,
+  0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD,
+  0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3,
+  0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3,
+  0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB,
+  0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB,
+  0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7,
+  0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7,
+  0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF,
+  0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF,
+};
+
+// Morton table for interleaving bytes
+static const word_t morton_table0[256] =
+{
+  0x0000, 0x0001, 0x0004, 0x0005, 0x0010, 0x0011, 0x0014, 0x0015,
+  0x0040, 0x0041, 0x0044, 0x0045, 0x0050, 0x0051, 0x0054, 0x0055,
+  0x0100, 0x0101, 0x0104, 0x0105, 0x0110, 0x0111, 0x0114, 0x0115,
+  0x0140, 0x0141, 0x0144, 0x0145, 0x0150, 0x0151, 0x0154, 0x0155,
+  0x0400, 0x0401, 0x0404, 0x0405, 0x0410, 0x0411, 0x0414, 0x0415,
+  0x0440, 0x0441, 0x0444, 0x0445, 0x0450, 0x0451, 0x0454, 0x0455,
+  0x0500, 0x0501, 0x0504, 0x0505, 0x0510, 0x0511, 0x0514, 0x0515,
+  0x0540, 0x0541, 0x0544, 0x0545, 0x0550, 0x0551, 0x0554, 0x0555,
+  0x1000, 0x1001, 0x1004, 0x1005, 0x1010, 0x1011, 0x1014, 0x1015,
+  0x1040, 0x1041, 0x1044, 0x1045, 0x1050, 0x1051, 0x1054, 0x1055,
+  0x1100, 0x1101, 0x1104, 0x1105, 0x1110, 0x1111, 0x1114, 0x1115,
+  0x1140, 0x1141, 0x1144, 0x1145, 0x1150, 0x1151, 0x1154, 0x1155,
+  0x1400, 0x1401, 0x1404, 0x1405, 0x1410, 0x1411, 0x1414, 0x1415,
+  0x1440, 0x1441, 0x1444, 0x1445, 0x1450, 0x1451, 0x1454, 0x1455,
+  0x1500, 0x1501, 0x1504, 0x1505, 0x1510, 0x1511, 0x1514, 0x1515,
+  0x1540, 0x1541, 0x1544, 0x1545, 0x1550, 0x1551, 0x1554, 0x1555,
+  0x4000, 0x4001, 0x4004, 0x4005, 0x4010, 0x4011, 0x4014, 0x4015,
+  0x4040, 0x4041, 0x4044, 0x4045, 0x4050, 0x4051, 0x4054, 0x4055,
+  0x4100, 0x4101, 0x4104, 0x4105, 0x4110, 0x4111, 0x4114, 0x4115,
+  0x4140, 0x4141, 0x4144, 0x4145, 0x4150, 0x4151, 0x4154, 0x4155,
+  0x4400, 0x4401, 0x4404, 0x4405, 0x4410, 0x4411, 0x4414, 0x4415,
+  0x4440, 0x4441, 0x4444, 0x4445, 0x4450, 0x4451, 0x4454, 0x4455,
+  0x4500, 0x4501, 0x4504, 0x4505, 0x4510, 0x4511, 0x4514, 0x4515,
+  0x4540, 0x4541, 0x4544, 0x4545, 0x4550, 0x4551, 0x4554, 0x4555,
+  0x5000, 0x5001, 0x5004, 0x5005, 0x5010, 0x5011, 0x5014, 0x5015,
+  0x5040, 0x5041, 0x5044, 0x5045, 0x5050, 0x5051, 0x5054, 0x5055,
+  0x5100, 0x5101, 0x5104, 0x5105, 0x5110, 0x5111, 0x5114, 0x5115,
+  0x5140, 0x5141, 0x5144, 0x5145, 0x5150, 0x5151, 0x5154, 0x5155,
+  0x5400, 0x5401, 0x5404, 0x5405, 0x5410, 0x5411, 0x5414, 0x5415,
+  0x5440, 0x5441, 0x5444, 0x5445, 0x5450, 0x5451, 0x5454, 0x5455,
+  0x5500, 0x5501, 0x5504, 0x5505, 0x5510, 0x5511, 0x5514, 0x5515,
+  0x5540, 0x5541, 0x5544, 0x5545, 0x5550, 0x5551, 0x5554, 0x5555,
+};
+
+// Morton table for interleaving bytes, shifted left 1 bit
+static const word_t morton_table1[256] =
+{
+  0x0000, 0x0002, 0x0008, 0x000A, 0x0020, 0x0022, 0x0028, 0x002A,
+  0x0080, 0x0082, 0x0088, 0x008A, 0x00A0, 0x00A2, 0x00A8, 0x00AA,
+  0x0200, 0x0202, 0x0208, 0x020A, 0x0220, 0x0222, 0x0228, 0x022A,
+  0x0280, 0x0282, 0x0288, 0x028A, 0x02A0, 0x02A2, 0x02A8, 0x02AA,
+  0x0800, 0x0802, 0x0808, 0x080A, 0x0820, 0x0822, 0x0828, 0x082A,
+  0x0880, 0x0882, 0x0888, 0x088A, 0x08A0, 0x08A2, 0x08A8, 0x08AA,
+  0x0A00, 0x0A02, 0x0A08, 0x0A0A, 0x0A20, 0x0A22, 0x0A28, 0x0A2A,
+  0x0A80, 0x0A82, 0x0A88, 0x0A8A, 0x0AA0, 0x0AA2, 0x0AA8, 0x0AAA,
+  0x2000, 0x2002, 0x2008, 0x200A, 0x2020, 0x2022, 0x2028, 0x202A,
+  0x2080, 0x2082, 0x2088, 0x208A, 0x20A0, 0x20A2, 0x20A8, 0x20AA,
+  0x2200, 0x2202, 0x2208, 0x220A, 0x2220, 0x2222, 0x2228, 0x222A,
+  0x2280, 0x2282, 0x2288, 0x228A, 0x22A0, 0x22A2, 0x22A8, 0x22AA,
+  0x2800, 0x2802, 0x2808, 0x280A, 0x2820, 0x2822, 0x2828, 0x282A,
+  0x2880, 0x2882, 0x2888, 0x288A, 0x28A0, 0x28A2, 0x28A8, 0x28AA,
+  0x2A00, 0x2A02, 0x2A08, 0x2A0A, 0x2A20, 0x2A22, 0x2A28, 0x2A2A,
+  0x2A80, 0x2A82, 0x2A88, 0x2A8A, 0x2AA0, 0x2AA2, 0x2AA8, 0x2AAA,
+  0x8000, 0x8002, 0x8008, 0x800A, 0x8020, 0x8022, 0x8028, 0x802A,
+  0x8080, 0x8082, 0x8088, 0x808A, 0x80A0, 0x80A2, 0x80A8, 0x80AA,
+  0x8200, 0x8202, 0x8208, 0x820A, 0x8220, 0x8222, 0x8228, 0x822A,
+  0x8280, 0x8282, 0x8288, 0x828A, 0x82A0, 0x82A2, 0x82A8, 0x82AA,
+  0x8800, 0x8802, 0x8808, 0x880A, 0x8820, 0x8822, 0x8828, 0x882A,
+  0x8880, 0x8882, 0x8888, 0x888A, 0x88A0, 0x88A2, 0x88A8, 0x88AA,
+  0x8A00, 0x8A02, 0x8A08, 0x8A0A, 0x8A20, 0x8A22, 0x8A28, 0x8A2A,
+  0x8A80, 0x8A82, 0x8A88, 0x8A8A, 0x8AA0, 0x8AA2, 0x8AA8, 0x8AAA,
+  0xA000, 0xA002, 0xA008, 0xA00A, 0xA020, 0xA022, 0xA028, 0xA02A,
+  0xA080, 0xA082, 0xA088, 0xA08A, 0xA0A0, 0xA0A2, 0xA0A8, 0xA0AA,
+  0xA200, 0xA202, 0xA208, 0xA20A, 0xA220, 0xA222, 0xA228, 0xA22A,
+  0xA280, 0xA282, 0xA288, 0xA28A, 0xA2A0, 0xA2A2, 0xA2A8, 0xA2AA,
+  0xA800, 0xA802, 0xA808, 0xA80A, 0xA820, 0xA822, 0xA828, 0xA82A,
+  0xA880, 0xA882, 0xA888, 0xA88A, 0xA8A0, 0xA8A2, 0xA8A8, 0xA8AA,
+  0xAA00, 0xAA02, 0xAA08, 0xAA0A, 0xAA20, 0xAA22, 0xAA28, 0xAA2A,
+  0xAA80, 0xAA82, 0xAA88, 0xAA8A, 0xAAA0, 0xAAA2, 0xAAA8, 0xAAAA,
+};
+
+//
+// Macros
+//
+
+// WORD_SIZE is the number of bits per word
+// sizeof gives size in bytes (8 bits per byte)
+#define WORD_SIZE 64
+// #define WORD_SIZE (sizeof(word_t) * 8)
+
+// POPCOUNT is number of bits set
+
+#if defined(_WIN32)
+// The compiler builtins used in the #else branch are replaced by hand-written code here
+// See http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+static word_t __inline windows_popcount(word_t w)
+{
+  w = w - ((w >> 1) & (word_t)~(word_t)0/3);                                // per-2-bit counts
+  w = (w & (word_t)~(word_t)0/15*3) + ((w >> 2) & (word_t)~(word_t)0/15*3); // per-4-bit counts
+  w = (w + (w >> 4)) & (word_t)~(word_t)0/255*15;                           // per-byte counts
+  return (word_t)(w * ((word_t)~(word_t)0/255)) >> (sizeof(word_t) - 1) * 8; // byte sum ends up in the top byte
+}
+// Returns 1 if an odd number of bits is set in w, otherwise 0
+static word_t __inline windows_parity(word_t w)
+{
+  w ^= w >> 1;
+  w ^= w >> 2;                                            // bit 0 of each nibble = parity of that nibble
+  w = (w & 0x1111111111111111UL) * 0x1111111111111111UL;  // add the nibble parities into the top nibble
+  return (w >> 60) & 1;
+}
+
+#define POPCOUNT(x) windows_popcount(x)
+#define PARITY(x) windows_parity(x)
+#else
+#define POPCOUNT(x) (unsigned)__builtin_popcountll(x)
+#define PARITY(x) (unsigned)__builtin_parityll(x)
+#endif
+
+#define MIN(a, b)  (((a) <= (b)) ? (a) : (b))
+#define MAX(a, b)  (((a) >= (b)) ? (a) : (b))
+
+// Make this a power of two
+#define INIT_CAPACITY_WORDS 2
+
+// word of all 1s
+#define WORD_MAX  (~(word_t)0)
+
+#define SET_REGION(arr,start,len)    _set_region((arr),(start),(len),FILL_REGION)
+#define CLEAR_REGION(arr,start,len)  _set_region((arr),(start),(len),ZERO_REGION)
+#define TOGGLE_REGION(arr,start,len) _set_region((arr),(start),(len),SWAP_REGION)
+
+// Have we initialised with srand() ?
+static char rand_initiated = 0;
+
+static void _seed_rand()
+{
+  // Seed only once; every later call returns immediately
+  if(rand_initiated) return;
+
+  // Mix the current seconds, the process id and the microseconds into the seed
+  struct timeval now;
+  gettimeofday(&now, NULL);
+  srand((((now.tv_sec ^ getpid()) * 1000001) + now.tv_usec));
+  rand_initiated = 1;
+}
+
+//
+// Common internal functions
+//
+
+#define bits_in_top_word(nbits) ((nbits) ? bitset64_idx((nbits) - 1) + 1 : 0)
+
+// Mostly used for debugging
+static inline void _print_word(word_t word, FILE* out)
+{
+  word_offset_t pos; // bit position; output starts with the least significant bit
+  for(pos = 0; pos < WORD_SIZE; pos++) {
+    char digit = ((word >> pos) & (word_t)0x1) ? '1' : '0';
+    fprintf(out, "%c", digit);
+  }
+}
+
+// prints right to left
+static inline char* _word_to_str(word_t word, char str[WORD_SIZE+1])
+  __attribute__((unused));
+
+static inline char* _word_to_str(word_t word, char str[WORD_SIZE+1])
+{
+  // Fill from the back so the most significant bit ends up at str[0]
+  char *out = str + WORD_SIZE;
+  *out = '\0';
+  word_offset_t bit;
+  for(bit = 0; bit < WORD_SIZE; bit++)
+    *--out = ((word >> bit) & (word_t)0x1) ? '1' : '0';
+  return str;
+}
+
+// Used in debugging
+#ifdef DEBUG
+  #define DEBUG_PRINT(msg,...) printf("[%s:%i] "msg, __FILE__, __LINE__, ##__VA_ARGS__);
+  #define DEBUG_VALIDATE(a) validate_bitarr((a), __FILE__, __LINE__)
+#else
+  #define DEBUG_PRINT(msg,...)
+  #define DEBUG_VALIDATE(a)
+#endif
+
+void validate_bitarr(BIT_ARRAY *arr, const char *file, int lineno)
+{
+  // Invariant 1: the top word has no bit set above the array length
+  word_addr_t top_idx = arr->num_of_words > 0 ? arr->num_of_words - 1 : 0;
+  bit_index_t nbits_top = bits_in_top_word(arr->num_of_bits);
+  int failed = 0;
+
+  if(arr->words[top_idx] > bitmask64(nbits_top))
+  {
+    _print_word(arr->words[top_idx], stderr);
+    fprintf(stderr, "\n[%s:%i] Expected %i bits in top word[%i]\n",
+            file, lineno, (int)nbits_top, (int)top_idx);
+    failed = 1;
+  }
+
+  // Invariant 2: the stored word count matches the bit length
+  word_addr_t expected_words = roundup_bits2words64(arr->num_of_bits);
+  if(arr->num_of_words != expected_words)
+  {
+    fprintf(stderr, "\n[%s:%i] num of words wrong "
+                    "[bits: %i, word: %i, actual words: %i]\n", file, lineno,
+            (int)arr->num_of_bits, (int)expected_words, (int)arr->num_of_words);
+    failed = 1;
+  }
+
+  if(failed) abort(); // both checks are reported before terminating
+}
+
+// Reverse a word
+static inline word_t _reverse_word(word_t word)
+{
+  // Reverse each byte through the lookup table while mirroring the byte
+  // order: the lowest input byte becomes the highest output byte, and so on
+  word_t reversed = 0;
+  int byte;
+
+  for(byte = 0; byte < 8; byte++)
+  {
+    reversed = (reversed << 8) | reverse_table[(word >> (8 * byte)) & 0xff];
+  }
+  return reversed;
+}
+
+static inline void _mask_top_word(BIT_ARRAY* bitarr)
+{
+  // Zero the unused high bits of the last word (word 0 when there are no words)
+  word_addr_t top = (bitarr->num_of_words > 1 ? bitarr->num_of_words : 1) - 1;
+  word_offset_t live_bits = bits_in_top_word(bitarr->num_of_bits);
+  bitarr->words[top] &= bitmask64(live_bits);
+}
+
+//
+// Get and set words (internal use only -- no bounds checking)
+//
+
+static inline word_t _get_word(const BIT_ARRAY* bitarr, bit_index_t start)
+{
+  word_addr_t widx = bitset64_wrd(start);
+  word_offset_t shift = bitset64_idx(start);
+
+  // Low part: the bits of the first word from `start` upwards
+  word_t value = bitarr->words[widx] >> shift;
+
+  // A word-aligned read is complete after one word
+  if(shift == 0) return value;
+
+  // High part: take the remaining `shift` bits from the next word, but only
+  // when the array still has bits beyond the first word
+  word_offset_t from_first = WORD_SIZE - shift;
+  if(start + from_first < bitarr->num_of_bits)
+    value |= bitarr->words[widx+1] << (WORD_SIZE - shift);
+  return value;
+}
+
+// Set 64 bits from a particular start position
+// Doesn't extend bit array
+static inline void _set_word(BIT_ARRAY* bitarr, bit_index_t start, word_t word)
+{
+  word_addr_t widx = bitset64_wrd(start);
+  word_offset_t shift = bitset64_idx(start);
+  word_t *lo = &bitarr->words[widx];
+
+  if(shift != 0)
+  {
+    // Unaligned: keep the `shift` low bits of the first word and place the
+    // low part of `word` above them
+    word_t kept_lo = *lo & bitmask64(shift);
+    *lo = (word << shift) | kept_lo;
+
+    // The rest goes into the low bits of the next word, if there is one
+    if(widx+1 < bitarr->num_of_words)
+    {
+      word_t kept_hi = lo[1] & (WORD_MAX << shift);
+      lo[1] = (word >> (WORD_SIZE - shift)) | kept_hi;
+    }
+  }
+  else
+    *lo = word; // aligned: overwrite the whole word
+
+  // A write near the end may have set bits past the array length
+  _mask_top_word(bitarr);
+  DEBUG_VALIDATE(bitarr);
+}
+
+static inline void _set_byte(BIT_ARRAY *bitarr, bit_index_t start, uint8_t byte)
+{
+  word_t merged = (_get_word(bitarr, start) & ~(word_t)0xff) | byte; // replace the low 8 bits
+  _set_word(bitarr, start, merged); // write the 64-bit window back at `start`
+}
+
+// 4 bits
+static inline void _set_nibble(BIT_ARRAY *bitarr, bit_index_t start,
+                               uint8_t nibble)
+{
+  word_t merged = (_get_word(bitarr, start) & ~(word_t)0xf) | nibble; // `nibble` itself is not masked
+  _set_word(bitarr, start, merged); // write the 64-bit window back at `start`
+}
+
+// Wrap around
+static inline word_t _get_word_cyclic(const BIT_ARRAY* bitarr, bit_index_t start)
+{
+  word_t result = _get_word(bitarr, start);
+
+  // Number of bits between `start` and the end of the array
+  bit_index_t tail_bits = bitarr->num_of_bits - start;
+
+  // A full word was available before the end: nothing wraps
+  if(tail_bits >= WORD_SIZE) return result;
+
+  // Fill the upper part from the beginning of the array
+  result |= (bitarr->words[0] << tail_bits);
+
+  // Arrays shorter than one word would otherwise repeat their own bits,
+  // so cut the result down to the array length
+  if(bitarr->num_of_bits < (bit_index_t)WORD_SIZE)
+    result &= bitmask64(bitarr->num_of_bits);
+  return result;
+}
+
+// Wrap around
+static inline void _set_word_cyclic(BIT_ARRAY* bitarr,
+                                    bit_index_t start, word_t word)
+{
+  _set_word(bitarr, start, word);
+
+  // Bits that fitted between `start` and the end of the array
+  bit_index_t tail_bits = bitarr->num_of_bits - start;
+
+  // Nothing wraps if the whole word fitted, or if writing began at bit 0
+  if(tail_bits >= WORD_SIZE || start == 0) return;
+
+  // Write the leftover high bits at the front, but never more than `start`
+  // of them, so the bits written above are not overwritten
+  word_t rest = word >> tail_bits;
+  word_offset_t wrap_bits = MIN(WORD_SIZE - tail_bits, start);
+  word_t mask = bitmask64(wrap_bits);
+
+  bitarr->words[0] = bitmask_merge(rest, bitarr->words[0], mask);
+}
+
+//
+// Fill a region (internal use only)
+//
+
+// FillAction is fill with 0 or 1 or toggle
+typedef enum {ZERO_REGION, FILL_REGION, SWAP_REGION} FillAction;
+
+static inline void _set_region(BIT_ARRAY* bitarr, bit_index_t start,
+                               bit_index_t length, FillAction action)
+{
+  // Applies `action` (clear, set or toggle) to the `length` bits beginning
+  // at `start`. Nothing is bounds-checked or resized here.
+  if(length == 0) return;
+
+  // Inclusive ends of the region as word index plus bit offset in that word
+  bit_index_t last_bit = start + length - 1;
+  word_addr_t first_word = bitset64_wrd(start);
+  word_addr_t last_word = bitset64_wrd(last_bit);
+  word_offset_t foffset = bitset64_idx(start);
+  word_offset_t loffset = bitset64_idx(last_bit);
+  word_t *words = bitarr->words;
+
+  if(first_word == last_word)
+  {
+    // The whole region lies inside one word
+    word_t mask = bitmask64(length) << foffset;
+
+    if(action == ZERO_REGION)      words[first_word] &= ~mask;
+    else if(action == FILL_REGION) words[first_word] |=  mask;
+    else if(action == SWAP_REGION) words[first_word] ^=  mask;
+    return;
+  }
+
+  // Several words: the first word is affected from `foffset` upwards, the
+  // words in between completely, the last word up to and including `loffset`
+  word_t head_mask = ~bitmask64(foffset);
+  word_t tail_mask = bitmask64(loffset+1);
+  word_addr_t i;
+
+  switch(action)
+  {
+    case ZERO_REGION:
+      // clear: keep only the bits outside the masks
+      words[first_word] &= ~head_mask;
+      for(i = first_word + 1; i < last_word; i++)
+        words[i] = (word_t)0;
+      words[last_word] &= ~tail_mask;
+      break;
+
+    case FILL_REGION:
+      // set: switch on every bit inside the masks
+      words[first_word] |= head_mask;
+      for(i = first_word + 1; i < last_word; i++)
+        words[i] = WORD_MAX;
+      words[last_word] |= tail_mask;
+      break;
+
+    case SWAP_REGION:
+      // toggle: invert every bit inside the masks
+      words[first_word] ^= head_mask;
+      for(i = first_word + 1; i < last_word; i++)
+        words[i] ^= WORD_MAX;
+      words[last_word] ^= tail_mask;
+      break;
+  }
+}
+
+
+
+//
+// Constructor
+//
+
+// If cannot allocate memory, set errno to ENOMEM, return NULL
+BIT_ARRAY* bit_array_alloc(BIT_ARRAY* bitarr, bit_index_t nbits)
+{
+  // Capacity: at least 8 words, otherwise roundup2pow() of the word count
+  word_addr_t nwords = roundup_bits2words64(nbits);
+  word_addr_t capacity = MAX(8, roundup2pow(nwords));
+  bitarr->num_of_bits = nbits;
+  bitarr->num_of_words = nwords;
+  bitarr->capacity_in_words = capacity;
+  bitarr->words = (word_t*)calloc(capacity, sizeof(word_t));
+  if(bitarr->words != NULL) return bitarr;
+  errno = ENOMEM;
+  return NULL;
+}
+
+void bit_array_dealloc(BIT_ARRAY* bitarr)
+{
+  free(bitarr->words); // releases the word buffer only, not the struct itself
+  memset(bitarr, 0, sizeof(*bitarr)); // zeroes every field of the struct
+}
+
+// If cannot allocate memory, set errno to ENOMEM, return NULL
+BIT_ARRAY* bit_array_create(bit_index_t nbits)
+{
+  BIT_ARRAY* created = (BIT_ARRAY*)malloc(sizeof(BIT_ARRAY));
+
+  // Two allocations can fail: the struct itself, or its word buffer
+  if(created != NULL && bit_array_alloc(created, nbits) == NULL)
+  {
+    free(created);
+    created = NULL;
+  }
+  if(created == NULL) { errno = ENOMEM; return NULL; }
+
+  DEBUG_PRINT("Creating BIT_ARRAY (bits: %lu; allocated words: %lu; "
+              "using words: %lu; WORD_SIZE: %i)\n",
+              (unsigned long)nbits, (unsigned long)created->capacity_in_words,
+              (unsigned long)roundup_bits2words64(nbits), (int)WORD_SIZE);
+
+  DEBUG_VALIDATE(created);
+
+  return created;
+}
+
+//
+// Destructor
+//
+void bit_array_free(BIT_ARRAY* bitarr)
+{
+  // Like free(), accept NULL so cleanup paths need no guard of their own
+  if(bitarr == NULL) return;
+  free(bitarr->words); // free(NULL) is a no-op, so no check is needed here
+  free(bitarr);        // the struct itself was heap-allocated by bit_array_create()
+}
+
+bit_index_t bit_array_length(const BIT_ARRAY* bit_arr)
+{
+  return bit_arr->num_of_bits; // length in bits, not in words
+}
+
+// Change the size of a bit array. Enlarging an array will add zeros
+// to the end of it. Returns 1 on success, 0 on failure (e.g. not enough memory)
+// On failure errno is ENOMEM and the array is left unchanged and still usable.
+char bit_array_resize(BIT_ARRAY* bitarr, bit_index_t new_num_of_bits)
+{
+  word_addr_t old_num_of_words = bitarr->num_of_words;
+  word_addr_t new_num_of_words = roundup_bits2words64(new_num_of_bits);
+
+  DEBUG_PRINT("Resize: old_num_of_words: %i; new_num_of_words: %i capacity: %i\n",
+              (int)old_num_of_words, (int)new_num_of_words,
+              (int)bitarr->capacity_in_words);
+
+  if(new_num_of_words > bitarr->capacity_in_words)
+  {
+    // Need to change the amount of memory used
+    word_addr_t old_capacity_in_words = bitarr->capacity_in_words;
+    size_t old_capacity_in_bytes = old_capacity_in_words * sizeof(word_t);
+    word_addr_t new_capacity_in_words = roundup2pow(new_num_of_words);
+    new_capacity_in_words = MAX(8, new_capacity_in_words);
+    size_t new_capacity_in_bytes = new_capacity_in_words * sizeof(word_t);
+    // realloc() leaves the old block untouched when it fails, so go through
+    // a temporary: overwriting bitarr->words directly would lose the buffer
+    word_t* new_words = (word_t*)realloc(bitarr->words, new_capacity_in_bytes);
+    if(new_words == NULL)
+    {
+      perror("resize realloc");
+      errno = ENOMEM;
+      return 0;
+    }
+    bitarr->words = new_words;
+    bitarr->capacity_in_words = new_capacity_in_words;
+
+    // Need to zero new memory
+    size_t num_bytes_to_zero = new_capacity_in_bytes - old_capacity_in_bytes;
+    memset(bitarr->words + old_capacity_in_words, 0, num_bytes_to_zero);
+    DEBUG_PRINT("zeroing from word %i for %i bytes\n", (int)old_capacity_in_words,
+                (int)num_bytes_to_zero);
+  }
+  else if(new_num_of_words < old_num_of_words)
+  {
+    // Shrunk -- need to zero old memory
+    size_t num_bytes_to_zero = (old_num_of_words - new_num_of_words)*sizeof(word_t);
+    memset(bitarr->words + new_num_of_words, 0, num_bytes_to_zero);
+  }
+
+  // Commit the new size only now, so a failed realloc changes nothing
+  bitarr->num_of_bits = new_num_of_bits;
+  bitarr->num_of_words = new_num_of_words;
+
+  // Mask top word
+  _mask_top_word(bitarr);
+  DEBUG_VALIDATE(bitarr);
+  return 1;
+}
+
+void bit_array_resize_critical(BIT_ARRAY* bitarr, bit_index_t num_of_bits)
+{
+  // Remember the length before resizing so a failure can report it
+  bit_index_t prev_bits = bitarr->num_of_bits;
+  if(bit_array_resize(bitarr, num_of_bits)) return;
+
+  // Resize failed: report and terminate instead of returning an error
+  fprintf(stderr, "Ran out of memory resizing [%lu -> %lu]",
+          (unsigned long)prev_bits, (unsigned long)num_of_bits);
+  abort();
+}
+
+// If bitarr length < num_bits, resizes to num_bits
+char bit_array_ensure_size(BIT_ARRAY* bitarr, bit_index_t ensure_num_of_bits)
+{
+  // Already long enough: nothing to do, report success
+  if(bitarr->num_of_bits >= ensure_num_of_bits)
+  {
+    return 1;
+  }
+  return bit_array_resize(bitarr, ensure_num_of_bits); // only ever grows the array
+}
+
+void bit_array_ensure_size_critical(BIT_ARRAY* bitarr, bit_index_t num_of_bits)
+{
+  // Only ever grows the array; a failed resize aborts inside the callee
+  if(bitarr->num_of_bits >= num_of_bits) return;
+
+  bit_array_resize_critical(bitarr, num_of_bits);
+}
+
+static inline
+void _bit_array_ensure_nwords(BIT_ARRAY* bitarr, word_addr_t nwords,
+                              const char *file, int lineno, const char *func)
+{
+  // Enough capacity already: leave the buffer alone
+  if(bitarr->capacity_in_words >= nwords) return;
+  // Grow the buffer to the capacity computed by roundup2pow(nwords).
+  // The added words are not zeroed here, and the bit length is not changed.
+  size_t bytes_before = bitarr->capacity_in_words * sizeof(word_t);
+  bitarr->capacity_in_words = roundup2pow(nwords);
+  size_t bytes_after = bitarr->capacity_in_words * sizeof(word_t);
+  bitarr->words = (word_t*)realloc(bitarr->words, bytes_after);
+  if(bitarr->words != NULL) {
+    DEBUG_PRINT("Ensure nwords realloc %zu -> %zu\n", bytes_before, bytes_after);
+    return;
+  }
+  fprintf(stderr, "[%s:%i:%s()] Ran out of memory resizing [%zu -> %zu]",
+          file, lineno, func, bytes_before, bytes_after);
+  abort();
+}
+
+
+//
+// Get, set, clear, assign and toggle individual bits
+//
+
+// Get the value of a bit (returns 0 or 1)
+char bit_array_get_bit(const BIT_ARRAY* bitarr, bit_index_t b)
+{
+  assert(b < bitarr->num_of_bits); // index must lie inside the array
+  return bit_array_get(bitarr, b); // bit_array_get is defined outside this file section
+}
+
+// set a bit (to 1) at position b
+void bit_array_set_bit(BIT_ARRAY* bitarr, bit_index_t b)
+{
+  assert(b < bitarr->num_of_bits); // index must lie inside the array; no resizing here
+  bit_array_set(bitarr,b);
+  DEBUG_VALIDATE(bitarr); // expands to nothing unless DEBUG is defined
+}
+
+// clear a bit (to 0) at position b
+void bit_array_clear_bit(BIT_ARRAY* bitarr, bit_index_t b)
+{
+  assert(b < bitarr->num_of_bits); // index must lie inside the array; no resizing here
+  bit_array_clear(bitarr, b);
+  DEBUG_VALIDATE(bitarr); // expands to nothing unless DEBUG is defined
+}
+
+// If bit is 0 -> 1, if bit is 1 -> 0.  AKA 'flip'
+void bit_array_toggle_bit(BIT_ARRAY* bitarr, bit_index_t b)
+{
+  assert(b < bitarr->num_of_bits); // index must lie inside the array; no resizing here
+  bit_array_toggle(bitarr, b);
+  DEBUG_VALIDATE(bitarr); // expands to nothing unless DEBUG is defined
+}
+
+// If char c != 0, set bit; otherwise clear bit
+void bit_array_assign_bit(BIT_ARRAY* bitarr, bit_index_t b, char c)
+{
+  assert(b < bitarr->num_of_bits); // index must lie inside the array; no resizing here
+  bit_array_assign(bitarr, b, c ? 1 : 0); // any non-zero c is normalised to 1
+  DEBUG_VALIDATE(bitarr); // expands to nothing unless DEBUG is defined
+}
+
+//
+// Get, set etc with resize
+//
+
+// Get the value of a bit (returns 0 or 1)
+char bit_array_rget(BIT_ARRAY* bitarr, bit_index_t b)
+{
+  bit_array_ensure_size_critical(bitarr, b+1); // grows the array to b+1 bits if shorter; aborts if that fails
+  return bit_array_get(bitarr, b); // note: even this read can resize, hence the non-const array
+}
+
+// set a bit (to 1) at position b
+void bit_array_rset(BIT_ARRAY* bitarr, bit_index_t b)
+{
+  bit_array_ensure_size_critical(bitarr, b+1); // grows the array to b+1 bits if shorter; aborts if that fails
+  bit_array_set(bitarr,b);
+  DEBUG_VALIDATE(bitarr); // expands to nothing unless DEBUG is defined
+}
+
+// clear a bit (to 0) at position b
+void bit_array_rclear(BIT_ARRAY* bitarr, bit_index_t b)
+{
+  bit_array_ensure_size_critical(bitarr, b+1); // grows the array to b+1 bits if shorter; aborts if that fails
+  bit_array_clear(bitarr, b);
+  DEBUG_VALIDATE(bitarr); // expands to nothing unless DEBUG is defined
+}
+
+// If bit is 0 -> 1, if bit is 1 -> 0.  AKA 'flip'
+void bit_array_rtoggle(BIT_ARRAY* bitarr, bit_index_t b)
+{
+  bit_array_ensure_size_critical(bitarr, b+1); // grows the array to b+1 bits if shorter; aborts if that fails
+  bit_array_toggle(bitarr, b);
+  DEBUG_VALIDATE(bitarr); // expands to nothing unless DEBUG is defined
+}
+
+// If char c != 0, set bit; otherwise clear bit
+void bit_array_rassign(BIT_ARRAY* bitarr, bit_index_t b, char c)
+{
+  bit_array_ensure_size_critical(bitarr, b+1); // grows the array to b+1 bits if shorter; aborts if that fails
+  bit_array_assign(bitarr, b, c ? 1 : 0); // any non-zero c is normalised to 1
+  DEBUG_VALIDATE(bitarr); // expands to nothing unless DEBUG is defined
+}
+
+//
+// Set, clear and toggle several bits at once
+//
+
+// Set multiple bits at once.
+// e.g. set bits 1, 20 & 31: bit_array_set_bits(bitarr, 3, 1,20,31);
+void bit_array_set_bits(BIT_ARRAY* bitarr, size_t n, ...)
+{
+  va_list indices;
+  va_start(indices, n);
+
+  // Each of the n variadic arguments is read as an unsigned int bit index
+  size_t remaining = n;
+  while(remaining-- > 0)
+  {
+    bit_array_set_bit(bitarr, va_arg(indices, unsigned int));
+  }
+
+  va_end(indices);
+  DEBUG_VALIDATE(bitarr);
+}
+
+// Clear multiple bits at once.
+// e.g. clear bits 1, 20 & 31: bit_array_clear_bits(bitarr, 3, 1,20,31);
+void bit_array_clear_bits(BIT_ARRAY* bitarr, size_t n, ...)
+{
+  va_list indices;
+  va_start(indices, n);
+
+  // Each of the n variadic arguments is read as an unsigned int bit index
+  size_t remaining = n;
+  while(remaining-- > 0)
+  {
+    bit_array_clear_bit(bitarr, va_arg(indices, unsigned int));
+  }
+
+  va_end(indices);
+  DEBUG_VALIDATE(bitarr);
+}
+
+// Toggle multiple bits at once
+// e.g. toggle bits 1, 20 & 31: bit_array_toggle_bits(bitarr, 3, 1,20,31);
+void bit_array_toggle_bits(BIT_ARRAY* bitarr, size_t n, ...)
+{
+  va_list indices;
+  va_start(indices, n);
+
+  // Each of the n variadic arguments is read as an unsigned int bit index
+  size_t remaining = n;
+  while(remaining-- > 0)
+  {
+    bit_array_toggle_bit(bitarr, va_arg(indices, unsigned int));
+  }
+
+  va_end(indices);
+  DEBUG_VALIDATE(bitarr);
+}
+
+
+//
+// Set, clear and toggle all bits in a region
+//
+
+// Set all the bits in a region
+void bit_array_set_region(BIT_ARRAY* bitarr, bit_index_t start, bit_index_t len)
+{
+  assert(start + len <= bitarr->num_of_bits); // region must end inside the array
+  SET_REGION(bitarr, start, len); // expands to _set_region(..., FILL_REGION)
+  DEBUG_VALIDATE(bitarr);
+}
+
+
+// Clear all the bits in a region
+void bit_array_clear_region(BIT_ARRAY* bitarr, bit_index_t start, bit_index_t len)
+{
+  assert(start + len <= bitarr->num_of_bits); // region must end inside the array
+  CLEAR_REGION(bitarr, start, len); // expands to _set_region(..., ZERO_REGION)
+  DEBUG_VALIDATE(bitarr);
+}
+
+// Toggle all the bits in a region
+void bit_array_toggle_region(BIT_ARRAY* bitarr, bit_index_t start, bit_index_t len)
+{
+  assert(start + len <= bitarr->num_of_bits); // region must end inside the array
+  TOGGLE_REGION(bitarr, start, len); // expands to _set_region(..., SWAP_REGION)
+  DEBUG_VALIDATE(bitarr);
+}
+
+
+//
+// Set, clear and toggle all bits at once
+//
+
+// set all elements of data to one
+void bit_array_set_all(BIT_ARRAY* bitarr)
+{
+  // Fill every used word with ones, then clear the bits past the array length
+  memset(bitarr->words, 0xFF, bitarr->num_of_words * sizeof(word_t));
+  _mask_top_word(bitarr);
+  DEBUG_VALIDATE(bitarr);
+}
+
+// set all elements of data to zero
+void bit_array_clear_all(BIT_ARRAY* bitarr)
+{
+  memset(bitarr->words, 0, bitarr->num_of_words * sizeof(word_t)); // zeroes the words in use; the length is unchanged
+  DEBUG_VALIDATE(bitarr);
+}
+
+// Set all 1 bits to 0, and all 0 bits to 1. AKA flip
+void bit_array_toggle_all(BIT_ARRAY* bitarr)
+{
+  // Invert every used word in place
+  word_t *cur = bitarr->words;
+  word_t *end = cur + bitarr->num_of_words;
+  for(; cur != end; cur++) *cur ^= WORD_MAX;
+
+  // Inverting turned the unused top bits into ones; clear them again
+  _mask_top_word(bitarr);
+  DEBUG_VALIDATE(bitarr);
+}
+
+//
+// Get a word at a time
+//
+
+uint64_t bit_array_get_word64(const BIT_ARRAY* bitarr, bit_index_t start)
+{
+  assert(start < bitarr->num_of_bits); // only the start index is checked, not start+64
+  return (uint64_t)_get_word(bitarr, start); // 64-bit window beginning at bit `start`
+}
+
+uint32_t bit_array_get_word32(const BIT_ARRAY* bitarr, bit_index_t start)
+{
+  assert(start < bitarr->num_of_bits); // only the start index is checked
+  return (uint32_t)_get_word(bitarr, start); // the cast keeps the low 32 bits of the 64-bit window
+}
+
+uint16_t bit_array_get_word16(const BIT_ARRAY* bitarr, bit_index_t start)
+{
+  assert(start < bitarr->num_of_bits); // only the start index is checked
+  return (uint16_t)_get_word(bitarr, start); // the cast keeps the low 16 bits of the 64-bit window
+}
+
+uint8_t bit_array_get_word8(const BIT_ARRAY* bitarr, bit_index_t start)
+{
+  assert(start < bitarr->num_of_bits); // only the start index is checked
+  return (uint8_t)_get_word(bitarr, start); // the cast keeps the low 8 bits of the 64-bit window
+}
+
+uint64_t bit_array_get_wordn(const BIT_ARRAY* bitarr, bit_index_t start, int n)
+{
+  assert(start < bitarr->num_of_bits); // only the start index is checked
+  assert(n <= 64); // upper bound only; NOTE(review): a negative n is not rejected here
+  return (uint64_t)(_get_word(bitarr, start) & bitmask64(n)); // keep the low n bits of the window
+}
+
+//
+// Set a word at a time
+//
+
+void bit_array_set_word64(BIT_ARRAY* bitarr, bit_index_t start, uint64_t word)
+{
+  assert(start < bitarr->num_of_bits); // only the start index is checked
+  _set_word(bitarr, start, (word_t)word); // _set_word never extends the array; bits past the end are masked off
+}
+
+void bit_array_set_word32(BIT_ARRAY* bitarr, bit_index_t start, uint32_t word)
+{
+  assert(start < bitarr->num_of_bits); // only the start index is checked
+  word_t w = _get_word(bitarr, start); // current 64-bit window at `start`
+  _set_word(bitarr, start, (w & ~(word_t)0xffffffff) | word); // replace its low 32 bits, keep the rest
+}
+
+void bit_array_set_word16(BIT_ARRAY* bitarr, bit_index_t start, uint16_t word)
+{
+  assert(start < bitarr->num_of_bits); // only the start index is checked
+  word_t w = _get_word(bitarr, start); // current 64-bit window at `start`
+  _set_word(bitarr, start, (w & ~(word_t)0xffff) | word); // replace its low 16 bits, keep the rest
+}
+
+void bit_array_set_word8(BIT_ARRAY* bitarr, bit_index_t start, uint8_t byte)
+{
+  assert(start < bitarr->num_of_bits); // only the start index is checked
+  _set_byte(bitarr, start, byte); // read-modify-write of the low 8 bits of the window at `start`
+}
+
+void bit_array_set_wordn(BIT_ARRAY* bitarr, bit_index_t start, uint64_t word, int n)
+{
+  assert(start < bitarr->num_of_bits); // only the start index is checked
+  assert(n <= 64); // upper bound only; NOTE(review): a negative n is not rejected here
+  word_t w = _get_word(bitarr, start), m = bitmask64(n); // current window, and a mask of the low n bits
+  _set_word(bitarr, start, bitmask_merge(word,w,m)); // presumably bits under m come from word, the rest from w -- confirm in bit_macros.h
+}
+
+//
+// Number of bits set
+//
+
+// Get the number of bits set (hamming weight)
+bit_index_t bit_array_num_bits_set(const BIT_ARRAY* bitarr)
+{
+  bit_index_t total = 0;
+  const word_t *cur = bitarr->words;
+  const word_t *end = cur + bitarr->num_of_words;
+
+  // Sum the population count of every used word
+  while(cur != end)
+  {
+    word_t w = *cur++;
+    // Zero words contribute nothing, so skip the popcount for them
+    if(w == 0) continue;
+    total += POPCOUNT(w);
+  }
+  return total;
+}
+
+// Get the number of bits not set (array length - hamming weight)
+bit_index_t bit_array_num_bits_cleared(const BIT_ARRAY* bitarr)
+{
+  return bitarr->num_of_bits - bit_array_num_bits_set(bitarr);
+}
+
+
+// Get the number of bits set in one array and not the other.  This is equivalent
+// to hamming weight of the XOR when the two arrays are the same length.
+// e.g. 10101 vs 00111 => hamming distance 2 (XOR is 10010)
+bit_index_t bit_array_hamming_distance(const BIT_ARRAY* arr1,
+                                       const BIT_ARRAY* arr2)
+{
+  // Order the operands by word count so the tail loop needs no branching
+  const BIT_ARRAY* longer = (arr1->num_of_words > arr2->num_of_words ? arr1 : arr2);
+  const BIT_ARRAY* shorter = (longer == arr1 ? arr2 : arr1);
+  word_addr_t shared = shorter->num_of_words;
+  word_addr_t all = longer->num_of_words;
+
+  bit_index_t distance = 0;
+  word_addr_t w = 0;
+
+  // Words present in both arrays: count the differing bits
+  while(w < shared)
+  {
+    distance += POPCOUNT(longer->words[w] ^ shorter->words[w]);
+    w++;
+  }
+
+  // Words only the longer array has: every set bit there counts
+  for(; w < all; w++)
+  {
+    distance += POPCOUNT(longer->words[w]);
+  }
+
+  return distance;
+}
+
+// Parity - returns 1 if odd number of bits set, 0 if even
+char bit_array_parity(const BIT_ARRAY* bitarr)
+{
+  // XOR together the parity of each used word (the order does not matter)
+  unsigned int odd = 0;
+  word_addr_t remaining = bitarr->num_of_words;
+  while(remaining > 0)
+  {
+    remaining--;
+    odd ^= PARITY(bitarr->words[remaining]);
+  }
+  return (char)odd;
+}
+
+//
+// Find indices of set/clear bits
+//
+
+// Find the index of the next bit that is set/clear, at or after `offset`
+// Returns 1 if such a bit is found, otherwise 0
+// Index is stored in the integer pointed to by `result`
+// If no such bit is found, value at `result` is not changed
+#define _next_bit_func_def(FUNC,GET) \
+char FUNC(const BIT_ARRAY* bitarr, bit_index_t offset, bit_index_t* result) \
+{ \
+  assert(offset < bitarr->num_of_bits); \
+  if(bitarr->num_of_bits == 0 || offset >= bitarr->num_of_bits) { return 0; } \
+  /* (the check above still guards the lookup when assert is compiled out) */ \
+  /* Start in the word holding `offset`, with the bits below `offset` masked off */ \
+  word_addr_t i = bitset64_wrd(offset); \
+  word_t w = GET(bitarr->words[i]) & ~bitmask64(bitset64_idx(offset)); \
+  /* GET is applied to each word: identity for set bits, inversion for clear bits */ \
+  while(1) { \
+    if(w > 0) { \
+      bit_index_t pos = i * WORD_SIZE + trailing_zeros(w); /* lowest candidate bit */ \
+      if(pos < bitarr->num_of_bits) { *result = pos; return 1; } \
+      else { return 0; } /* candidate lies past the array length, e.g. inverted padding */ \
+    } \
+    i++; \
+    if(i >= bitarr->num_of_words) break; \
+    w = GET(bitarr->words[i]); \
+  } \
+  /* ran out of words without a hit; *result is left untouched */ \
+  return 0; \
+}
+
+// Defines FUNC(bitarr, offset, result): scan downwards from `offset`
+// (exclusive) for the nearest lower bit that GET maps to 1.
+// Returns 1 and stores the index in `*result`; returns 0 (leaving `*result`
+// unchanged) if there is none, the array is empty, or offset is 0
+#define _prev_bit_func_def(FUNC,GET) \
+char FUNC(const BIT_ARRAY* bitarr, bit_index_t offset, bit_index_t* result) \
+{ \
+  assert(offset <= bitarr->num_of_bits); \
+  if(bitarr->num_of_bits == 0 || offset == 0) { return 0; } \
+ \
+  /* Start in the word holding bit offset-1, keeping only bits below offset */ \
+  word_addr_t i = bitset64_wrd(offset-1); \
+  word_t w = GET(bitarr->words[i]) & bitmask64(bitset64_idx(offset-1)+1); \
+ \
+  if(w > 0) { *result = (i+1) * WORD_SIZE - leading_zeros(w) - 1; return 1; } \
+ \
+  /* i is unsigned: stop once it wraps below 0, assumed == BIT_INDEX_MAX */ \
+  for(--i; i != BIT_INDEX_MAX; i--) { \
+    w = GET(bitarr->words[i]); \
+    if(w > 0) { \
+      *result = (i+1) * WORD_SIZE - leading_zeros(w) - 1; \
+      return 1; \
+    } \
+  } \
+ \
+  return 0; \
+}
+
+#define GET_WORD(x) (x) // identity: a 1 marks a set bit
+#define NEG_WORD(x) (~(x)) // inverted: a 1 marks a clear bit
+_next_bit_func_def(bit_array_find_next_set_bit,  GET_WORD);
+_next_bit_func_def(bit_array_find_next_clear_bit,NEG_WORD);
+_prev_bit_func_def(bit_array_find_prev_set_bit,  GET_WORD);
+_prev_bit_func_def(bit_array_find_prev_clear_bit,NEG_WORD);
+
+// Find the index of the first (lowest) set bit. Returns 1 and stores the
+// index in `*result` if any bit is set; otherwise returns 0 and leaves
+// `*result` unchanged. Safe to call on an empty (zero-length) array.
+char bit_array_find_first_set_bit(const BIT_ARRAY* bitarr, bit_index_t* result)
+{
+  if(bitarr->num_of_bits == 0) { return 0; } // offset 0 would trip callee assert
+  return bit_array_find_next_set_bit(bitarr, 0, result);
+}
+
+// First clear (0) bit: 1 if found (index in *result), else 0; empty array -> 0
+char bit_array_find_first_clear_bit(const BIT_ARRAY* bitarr, bit_index_t* result)
+{
+  return bitarr->num_of_bits > 0 && bit_array_find_next_clear_bit(bitarr, 0, result);
+}
+
+// Find the index of the last (highest) set bit by searching downwards from
+// one past the end of the array.
+// Returns 1 and stores the index in `*result` if a bit is set, otherwise 0
+// (in which case the value at `result` is not changed)
+char bit_array_find_last_set_bit(const BIT_ARRAY* bitarr, bit_index_t* result)
+{
+  return bit_array_find_prev_set_bit(bitarr, bitarr->num_of_bits, result);
+}
+
+// Find the last (highest) clear bit; 1 if found (index in *result), else 0
+char bit_array_find_last_clear_bit(const BIT_ARRAY* bitarr, bit_index_t* result)
+{
+  return bit_array_find_prev_clear_bit(bitarr, bitarr->num_of_bits, result);
+}
+
+//
+// "Sorting" bits
+//
+
+// Sort bits ascending: all 0s at the low indices, followed by all 1s
+void bit_array_sort_bits(BIT_ARRAY* bitarr)
+{
+  // Count the zeros first, then rebuild: fill with 1s and clear the low part
+  bit_index_t zeros = bitarr->num_of_bits - bit_array_num_bits_set(bitarr);
+  bit_array_set_all(bitarr);
+  CLEAR_REGION(bitarr, 0, zeros);
+  DEBUG_VALIDATE(bitarr);
+}
+
+// Sort bits descending: all 1s at the low indices, followed by all 0s
+void bit_array_sort_bits_rev(BIT_ARRAY* bitarr)
+{
+  const bit_index_t ones = bit_array_num_bits_set(bitarr);
+  bit_array_clear_all(bitarr);  // wipe, then re-set the first `ones` bits
+  SET_REGION(bitarr, 0, ones);
+  DEBUG_VALIDATE(bitarr);
+}
+
+
+//
+// Strings and printing
+//
+
+// Load `len` chars of `str` into bitarr at `offset`: chars in `on` become 1s,
+// the rest (asserted to be in `off`) 0s; stored reversed if left_to_right is 0
+void bit_array_from_substr(BIT_ARRAY* bitarr, bit_index_t offset,
+                           const char *str, size_t len,
+                           const char *on, const char *off,
+                           char left_to_right)
+{
+  bit_array_ensure_size(bitarr, offset + len);
+  bit_array_clear_region(bitarr, offset, len);
+
+  // Region is zeroed, so only the 'on' characters need writing
+  size_t pos;
+  for(pos = 0; pos < len; pos++)
+  {
+    if(strchr(on, str[pos]) == NULL)
+    {
+      assert(strchr(off, str[pos]) != NULL);
+      continue;
+    }
+    bit_index_t dst = offset + (left_to_right ? pos : len - pos - 1);
+    bit_array_set(bitarr, dst);
+  }
+
+  DEBUG_VALIDATE(bitarr);
+}
+
+// Load a string of '1'/'0' chars into bitarr from index 0 (str[i] -> bit i)
+void bit_array_from_str(BIT_ARRAY* bitarr, const char* str)
+{
+  bit_array_from_substr(bitarr, 0, str, strlen(str), "1", "0", 1); // on, off, left-to-right
+}
+
+// Write the array as '0'/'1' characters, index 0 first, into `str`, which
+// must hold bitarr->num_of_bits+1 chars. Terminates with '\0'; returns str
+char* bit_array_to_str(const BIT_ARRAY* bitarr, char* str)
+{
+  bit_index_t pos = 0;
+
+  while(pos < bitarr->num_of_bits)
+  {
+    str[pos] = bit_array_get(bitarr, pos) ? '1' : '0';
+    pos++;
+  }
+
+  str[bitarr->num_of_bits] = '\0';
+  return str;
+}
+
+// Like bit_array_to_str() but highest index first: str[0] is the top bit.
+// `str` must hold bitarr->num_of_bits+1 chars; it is terminated with '\0'.
+char* bit_array_to_str_rev(const BIT_ARRAY* bitarr, char* str)
+{
+  bit_index_t pos;
+  for(pos = 0; pos < bitarr->num_of_bits; pos++)
+  {
+    bit_index_t src = bitarr->num_of_bits - pos - 1;
+    str[pos] = bit_array_get(bitarr, src) ? '1' : '0';
+  }
+  str[bitarr->num_of_bits] = '\0';
+  return str;
+}
+
+
+// Write `length` bits starting at `start` into `str` using the characters
+// `on` / `off`. Bits are emitted low index first if left_to_right is non-zero,
+// otherwise high index first. Note: does not null-terminate
+void bit_array_to_substr(const BIT_ARRAY* bitarr,
+                         bit_index_t start, bit_index_t length,
+                         char* str, char on, char off,
+                         char left_to_right)
+{
+  assert(start + length <= bitarr->num_of_bits);
+
+  bit_index_t k;
+
+  for(k = 0; k < length; k++)
+  {
+    // k-th output char comes from the k-th bit counted from the chosen end
+    bit_index_t src = (left_to_right ? start + k : start + length - 1 - k);
+    str[k] = bit_array_get(bitarr, src) ? on : off;
+  }
+  // The caller terminates the string if required
+}
+
+// Print the whole array to `fout` as '0'/'1' characters, index 0 first.
+// No newline is written.
+void bit_array_print(const BIT_ARRAY* bitarr, FILE* fout)
+{
+  bit_index_t pos;
+  for(pos = 0; pos < bitarr->num_of_bits; pos++)
+  {
+    fputc(bit_array_get(bitarr, pos) ? '1' : '0', fout);
+  }
+}
+
+// Print `length` bits starting at `start` to `fout` using the characters
+// `on` / `off`; low index first if left_to_right, otherwise high index first
+void bit_array_print_substr(const BIT_ARRAY* bitarr,
+                            bit_index_t start, bit_index_t length,
+                            FILE* fout, char on, char off,
+                            char left_to_right)
+{
+  assert(start + length <= bitarr->num_of_bits);
+
+  bit_index_t k;
+
+  for(k = 0; k < length; k++)
+  {
+    bit_index_t src = (left_to_right ? start + k : start + length - 1 - k);
+    fputc(bit_array_get(bitarr, src) ? on : off, fout);
+  }
+}
+
+//
+// Decimal
+//
+
+// Write the array's value as a decimal string (e.g. 0b1101 -> "13").
+// `len` is the capacity of `str`: at most len-1 digits plus '\0' are written.
+// Returns the number of digits needed (== strlen(str) unless truncated, in
+// which case the low-order digits are kept); returns 0 if cloning fails.
+size_t bit_array_to_decimal(const BIT_ARRAY *bitarr, char *str, size_t len)
+{
+  size_t i = 0;
+  // Digits that fit; guards len == 0, where len-1 would wrap around
+  size_t max_digits = (len > 0 ? len - 1 : 0);
+
+  if(bit_array_cmp_uint64(bitarr, 0) == 0)
+  {
+    if(len >= 2) { str[0] = '0'; str[1] = '\0'; }
+    return 1;
+  }
+
+  // Repeatedly divide a scratch copy by 10 (bitarr itself is const)
+  BIT_ARRAY *tmp = bit_array_clone(bitarr);
+  if(tmp == NULL) { return 0; }
+  uint64_t rem;
+
+  if(len > 0) { str[len-1] = '\0'; }
+
+  while(bit_array_cmp_uint64(tmp, 0) != 0)
+  {
+    // Peel off the least significant decimal digit
+    bit_array_div_uint64(tmp, 10, &rem);
+
+    if(i < max_digits)
+    {
+      str[len-2-i] = (char)('0' + rem);
+    }
+
+    i++;
+  }
+
+  if(i < max_digits)
+  {
+    // Digits were written right-aligned: shift left (moves the '\0' too)
+    memmove(str, str+len-i-1, i+1);
+  }
+
+  bit_array_free(tmp);
+
+  return i;
+}
+
+// Set bitarr to the value of a decimal string (e.g. "13" -> 0b1101).
+// Parsing stops at the first character that is not '0'..'9'.
+// Returns the number of characters used; if the string does not start with
+// a digit, 0 is returned and bitarr is left cleared.
+size_t bit_array_from_decimal(BIT_ARRAY *bitarr, const char* decimal)
+{
+  size_t used = 0;
+
+  bit_array_clear_all(bitarr);
+
+  // '\0' is below '0', so the range test also stops at the end of the string.
+  // Horner's scheme: value = value * 10 + digit (no multiply for first digit)
+  while(decimal[used] >= '0' && decimal[used] <= '9')
+  {
+    if(used > 0)
+    {
+      bit_array_mul_uint64(bitarr, 10);
+    }
+    bit_array_add_uint64(bitarr, decimal[used] - '0');
+    used++;
+  }
+
+  return used;
+}
+
+//
+// Hexadecimal
+//
+
+// Convert hex digit `c` (0-9, a-f, A-F) to its value, stored in `*b`.
+// Returns 1 on success; 0 (leaving `*b` untouched) if `c` is not a hex digit.
+char bit_array_hex_to_nibble(char c, uint8_t *b)
+{
+  // <ctype.h> needs an unsigned char value: a negative plain char is undefined
+  int lc = tolower((unsigned char)c);
+
+  if(lc >= '0' && lc <= '9')
+  {
+    *b = (uint8_t)(lc - '0');
+    return 1;
+  }
+  if(lc >= 'a' && lc <= 'f')
+  {
+    *b = (uint8_t)(0xa + (lc - 'a'));
+    return 1;
+  }
+  return 0;
+}
+
+// Convert a value 0..15 to its hex digit; `uppercase` selects A-F over a-f
+char bit_array_nibble_to_hex(uint8_t b, char uppercase)
+{
+  if(b > 9)
+  {
+    const char ten = uppercase ? 'A' : 'a';
+    return ten + (b - 0xa);
+  }
+
+  return '0' + b;
+}
+
+// Load hex digits from `str` (at most `len` chars, optional "0x" prefix) into
+// bitarr starting at bit `offset`, 4 bits per digit; stops at the first
+// non-hex char. Returns the number of bits loaded (4 per digit; 0 on failure)
+bit_index_t bit_array_from_hex(BIT_ARRAY* bitarr, bit_index_t offset,
+                               const char* str, size_t len)
+{
+  // Only strip a prefix that lies within len, else len -= 2 would wrap around
+  if(len >= 2 && str[0] == '0' && tolower((unsigned char)str[1]) == 'x')
+  {
+    str += 2;
+    len -= 2;
+  }
+
+  size_t i;
+  for(i = 0; i < len; i++, offset += 4)
+  {
+    uint8_t b;
+    if(!bit_array_hex_to_nibble(str[i], &b))
+    {
+      break;
+    }
+
+    // Grow the array if needed, then store this digit's 4 bits
+    bit_array_ensure_size(bitarr, offset + 4);
+    _set_nibble(bitarr, offset, b);
+  }
+
+  return 4 * i;
+}
+
+// Write bits [start, start+length) to `str` as hex digits, lowest nibble first
+size_t bit_array_to_hex(const BIT_ARRAY* bitarr,
+                        bit_index_t start, bit_index_t length,
+                        char* str, char uppercase)
+{
+  assert(start + length <= bitarr->num_of_bits);
+
+  size_t k = 0;
+  bit_index_t offset, end = start + length;
+
+  for(offset = start; offset + WORD_SIZE <= end; offset += WORD_SIZE)
+  {
+    word_t w = _get_word(bitarr, offset);
+
+    word_offset_t j;
+    for(j = 0; j < 64; j += 4)
+    {
+      str[k++] = bit_array_nibble_to_hex((w>>j) & 0xf, uppercase);
+    }
+  }
+
+  if(offset < end)
+  {
+    // Less than a whole word left: emit its full nibbles (4 bits each)
+    word_t w = _get_word(bitarr, offset);
+
+    for(; offset + 4 <= end; offset += 4)
+    {
+      str[k++] = bit_array_nibble_to_hex(w & 0xf, uppercase);
+      w >>= 4;
+    }
+
+    if(offset < end)
+    {
+      // Final 1-3 bits form one last, partial hex digit
+      str[k++] = bit_array_nibble_to_hex(w & bitmask64(end - offset), uppercase);
+    }
+  }
+
+  str[k] = '\0'; // `str` must have room for every digit plus this terminator
+
+  // Number of hex digits written (the terminating '\0' is not counted)
+  return k;
+}
+
+// Print bits [start, start+length) to `fout` as hex digits, lowest nibble first
+size_t bit_array_print_hex(const BIT_ARRAY* bitarr,
+                           bit_index_t start, bit_index_t length,
+                           FILE* fout, char uppercase)
+{
+  assert(start + length <= bitarr->num_of_bits);
+
+  size_t k = 0; // number of hex digits printed so far
+  bit_index_t offset, end = start + length;
+
+  for(offset = start; offset + WORD_SIZE <= end; offset += WORD_SIZE)
+  {
+    word_t w = _get_word(bitarr, offset);
+
+    word_offset_t j;
+    for(j = 0; j < 64; j += 4)
+    {
+      fprintf(fout, "%c", bit_array_nibble_to_hex((w>>j) & 0xf, uppercase));
+      k++;
+    }
+  }
+
+  if(offset < end)
+  {
+    // Less than a whole word left: print its full nibbles (4 bits each)
+    word_t w = _get_word(bitarr, offset);
+
+    for(; offset + 4 <= end; offset += 4)
+    {
+      fprintf(fout, "%c", bit_array_nibble_to_hex(w & 0xf, uppercase));
+      w >>= 4;
+      k++;
+    }
+
+    if(offset < end)
+    {
+      // Final 1-3 bits form one last, partial hex digit
+      char hex = bit_array_nibble_to_hex(w & bitmask64(end - offset), uppercase);
+      fprintf(fout, "%c", hex);
+      k++;
+    }
+  }
+
+  return k; // count of characters printed
+}
+
+//
+// Clone and copy
+//
+
+// Allocate a new array with the same length and contents as `bitarr`.
+// Returns NULL if the allocation fails.
+BIT_ARRAY* bit_array_clone(const BIT_ARRAY* bitarr)
+{
+  BIT_ARRAY* dup = bit_array_create(bitarr->num_of_bits);
+
+  if(dup != NULL)
+  {
+    // Copy every word (assumes create() gave the copy the same word count)
+    const size_t nbytes = bitarr->num_of_words * sizeof(word_t);
+    memcpy(dup->words, bitarr->words, nbytes);
+    DEBUG_VALIDATE(dup);
+  }
+
+  return dup;
+}
+
+// Copy `length` bits from src[srcindx..] to dst[dstindx..]; no bounds checks.
+// dst and src may be the same array and the regions may overlap
+static void _array_copy(BIT_ARRAY* dst, bit_index_t dstindx,
+                        const BIT_ARRAY* src, bit_index_t srcindx,
+                        bit_index_t length)
+{
+  DEBUG_PRINT("bit_array_copy(dst: %zu, src: %zu, length: %zu)\n",
+              (size_t)dstindx, (size_t)srcindx, (size_t)length);
+
+  // Num of full words to copy
+  word_addr_t num_of_full_words = length / WORD_SIZE;
+  word_addr_t i;
+
+  word_offset_t bits_in_last_word = bits_in_top_word(length);
+
+  if(dst == src && srcindx > dstindx)
+  {
+    // Same array, data moving down: copy the lowest words first
+    DEBUG_PRINT("work left to right\n");
+
+    for(i = 0; i < num_of_full_words; i++)
+    {
+      word_t word = _get_word(src, srcindx+i*WORD_SIZE);
+      _set_word(dst, dstindx+i*WORD_SIZE, word);
+    }
+
+    if(bits_in_last_word > 0)
+    {
+      // Partial word at the top end (i == num_of_full_words after the loop)
+      word_t src_word = _get_word(src, srcindx+i*WORD_SIZE);
+      word_t dst_word = _get_word(dst, dstindx+i*WORD_SIZE);
+
+      word_t mask = bitmask64(bits_in_last_word);
+      word_t word = bitmask_merge(src_word, dst_word, mask);
+
+      _set_word(dst, dstindx+num_of_full_words*WORD_SIZE, word);
+    }
+  }
+  else
+  {
+    // Otherwise copy the highest words first, ending with the partial word
+    DEBUG_PRINT("work right to left\n");
+
+    for(i = 0; i < num_of_full_words; i++)
+    {
+      word_t word = _get_word(src, srcindx+length-(i+1)*WORD_SIZE);
+      _set_word(dst, dstindx+length-(i+1)*WORD_SIZE, word);
+    }
+
+    DEBUG_PRINT("Copy %i,%i to %i\n", (int)srcindx, (int)bits_in_last_word,
+                                      (int)dstindx);
+
+    if(bits_in_last_word > 0)
+    {
+      // Partial word sits at the bottom end of the region in this direction
+      word_t src_word = _get_word(src, srcindx);
+      word_t dst_word = _get_word(dst, dstindx);
+
+      word_t mask = bitmask64(bits_in_last_word);
+      word_t word = bitmask_merge(src_word, dst_word, mask);
+      _set_word(dst, dstindx, word);
+    }
+  }
+
+  _mask_top_word(dst); // presumably clears bits above num_of_bits -- TODO confirm
+}
+
+// Copy `length` bits from src at srcindx to dst at dstindx. dst and src may
+// be the same array and the regions may overlap (handled by _array_copy)
+void bit_array_copy(BIT_ARRAY* dst, bit_index_t dstindx,
+                    const BIT_ARRAY* src, bit_index_t srcindx,
+                    bit_index_t length)
+{
+  assert(srcindx + length <= src->num_of_bits);
+  assert(dstindx <= dst->num_of_bits); // NOTE(review): dstindx+length is not checked
+  _array_copy(dst, dstindx, src, srcindx, length);
+  DEBUG_VALIDATE(dst);
+}
+
+// Make `dst` a copy of `src`: resizes `dst` to src's length, then copies words
+void bit_array_copy_all(BIT_ARRAY* dst, const BIT_ARRAY* src)
+{
+  bit_array_resize_critical(dst, src->num_of_bits);
+  memmove(dst->words, src->words, src->num_of_words * sizeof(word_t)); // memmove tolerates dst == src
+  DEBUG_VALIDATE(dst);
+}
+
+
+//
+// Logic operators
+//
+
+// dst = src1 & src2. Words beyond the shorter source are zeroed.
+// Destination can be the same as one or both of the sources
+void bit_array_and(BIT_ARRAY* dst, const BIT_ARRAY* src1, const BIT_ARRAY* src2)
+{
+  // Make sure dst can hold the longer of the two inputs
+  word_addr_t max_bits = MAX(src1->num_of_bits, src2->num_of_bits);
+  bit_array_ensure_size_critical(dst, max_bits);
+
+  word_addr_t shared = MIN(src1->num_of_words, src2->num_of_words);
+  word_addr_t w = 0;
+
+  // AND the words that both sources have ...
+  while(w < shared)
+  {
+    dst->words[w] = src1->words[w] & src2->words[w];
+    w++;
+  }
+  // ... anything above that is ANDed with an implicit zero
+  while(w < dst->num_of_words)
+  {
+    dst->words[w++] = (word_t)0;
+  }
+  DEBUG_VALIDATE(dst);
+}
+
+// Shared implementation of bit_array_or() / bit_array_xor().
+// Words present in both sources are combined with ^ when use_xor is non-zero,
+// otherwise with |; words only the longer source has are copied unchanged and
+// any further words of dst are zeroed. dst may alias one or both sources.
+static void _logical_or_xor(BIT_ARRAY* dst,
+                            const BIT_ARRAY* src1,
+                            const BIT_ARRAY* src2,
+                            char use_xor)
+{
+  // Ensure dst array is big enough
+  bit_array_ensure_size_critical(dst, MAX(src1->num_of_bits, src2->num_of_bits));
+
+  // Word counts are read after the resize, in case dst aliases a source
+  word_addr_t shared = MIN(src1->num_of_words, src2->num_of_words);
+  word_addr_t longest = MAX(src1->num_of_words, src2->num_of_words);
+
+  // The source supplying the words above `shared` (unused if counts are equal)
+  const BIT_ARRAY* longer
+    = (src1->num_of_words > src2->num_of_words ? src1 : src2);
+
+  word_addr_t w;
+
+  // Combine the words that both sources have
+  for(w = 0; w < shared; w++)
+  {
+    word_t a = src1->words[w], b = src2->words[w];
+    dst->words[w] = use_xor ? (a ^ b) : (a | b);
+  }
+
+  // OR / XOR with an implicit zero leaves the longer source's words as they
+  // are, so just copy them (this loop does nothing when the counts are equal)
+  for(w = shared; w < longest; w++)
+  {
+    dst->words[w] = longer->words[w];
+  }
+
+  // Set remaining words of dst to zero
+  size_t tail_bytes = (dst->num_of_words - longest) * sizeof(word_t);
+  memset(dst->words + longest, 0, tail_bytes);
+
+  DEBUG_VALIDATE(dst);
+}
+
+void bit_array_or(BIT_ARRAY* dst, const BIT_ARRAY* src1, const BIT_ARRAY* src2)
+{
+  _logical_or_xor(dst, src1, src2, 0); // use_xor = 0 selects OR; dst may alias a source
+}
+
+// dst = src1 XOR src2. Destination can be the same as one or both sources
+void bit_array_xor(BIT_ARRAY* dst, const BIT_ARRAY* src1, const BIT_ARRAY* src2)
+{
+  _logical_or_xor(dst, src1, src2, 1); // use_xor = 1 selects XOR
+}
+
+// dst = ~src. dst is first ensured to be at least as long as src.
+// If dst is longer than src, top bits are set to 1
+void bit_array_not(BIT_ARRAY* dst, const BIT_ARRAY* src)
+{
+  bit_array_ensure_size_critical(dst, src->num_of_bits);
+
+  word_addr_t w = 0;
+
+  // Invert every word that src has
+  while(w < src->num_of_words)
+  {
+    dst->words[w] = ~(src->words[w]);
+    w++;
+  }
+  // Set remaining words to 1s
+  while(w < dst->num_of_words)
+  {
+    dst->words[w++] = WORD_MAX;
+  }
+  _mask_top_word(dst);
+  DEBUG_VALIDATE(dst);
+}
+
+//
+// Comparisons
+//
+
+// Compare two bit arrays by value stored, with index 0 being the Least
+// Significant Bit (LSB). Arrays do not have to be the same length.
+// Example: ..0101 (5) > ...0011 (3) [index 0 is LSB at right hand side]
+// Sorts on length if the values are equal: (0,0) < (0,0,0)
+// returns:
+//  >0 iff bitarr1 > bitarr2
+//   0 iff bitarr1 == bitarr2
+//  <0 iff bitarr1 < bitarr2
+int bit_array_cmp(const BIT_ARRAY* bitarr1, const BIT_ARRAY* bitarr2)
+{
+  word_addr_t i;
+  word_t word1, word2;
+  word_addr_t min_words = bitarr1->num_of_words;
+
+  // A non-zero word above the end of the shorter array decides the result
+  if(bitarr1->num_of_words > bitarr2->num_of_words) {
+    min_words = bitarr2->num_of_words;
+    for(i = bitarr1->num_of_words-1; ; i--) {
+      if(bitarr1->words[i]) return 1;
+      if(i == bitarr2->num_of_words) break;
+    }
+  }
+  else if(bitarr1->num_of_words < bitarr2->num_of_words) {
+    for(i = bitarr2->num_of_words-1; ; i--) {
+      if(bitarr2->words[i]) return -1; // bitarr2 holds the larger value
+      if(i == bitarr1->num_of_words) break;
+    }
+  }
+
+  // Compare the shared words from most to least significant. i is unsigned,
+  // so count down from min_words and index with i-1 (no-op if min_words == 0)
+  for(i = min_words; i > 0; i--)
+  {
+    word1 = bitarr1->words[i-1];
+    word2 = bitarr2->words[i-1];
+    if(word1 != word2) return (word1 > word2 ? 1 : -1);
+  }
+
+  // Equal values: fall back to ordering by length
+  if(bitarr1->num_of_bits == bitarr2->num_of_bits) return 0;
+  return bitarr1->num_of_bits > bitarr2->num_of_bits ? 1 : -1;
+}
+
+// Compare two bit arrays by value stored, with index 0 being the Most
+// Significant Bit (MSB). Arrays do not have to be the same length.
+// Example: 10.. > 01.. [index 0 is MSB at left hand side]
+// Sorts on length if all zeros: (0,0) < (0,0,0)
+// returns:
+//  >0 iff bitarr1 > bitarr2
+//   0 iff bitarr1 == bitarr2
+//  <0 iff bitarr1 < bitarr2
+int bit_array_cmp_big_endian(const BIT_ARRAY* bitarr1, const BIT_ARRAY* bitarr2)
+{
+  word_addr_t min_words = MIN(bitarr1->num_of_words, bitarr2->num_of_words);
+
+  word_addr_t i;
+  word_t word1, word2;
+
+  for(i = 0; i < min_words; i++) { // only words that exist in BOTH arrays
+    word1 = _reverse_word(bitarr1->words[i]);
+    word2 = _reverse_word(bitarr2->words[i]);
+    if(word1 != word2) return (word1 > word2 ? 1 : -1);
+  }
+
+  // Check remaining words. At most one of these loops will execute
+  for(; i < bitarr1->num_of_words; i++)
+    if(bitarr1->words[i]) return 1;
+  for(; i < bitarr2->num_of_words; i++)
+    if(bitarr2->words[i]) return -1;
+
+  if(bitarr1->num_of_bits == bitarr2->num_of_bits) return 0;
+  return bitarr1->num_of_bits > bitarr2->num_of_bits ? 1 : -1;
+}
+
+// Compare arr1 with (arr2 << pos), both read as numbers with index 0 as the
+// LSB; intended to match bit_array_cmp(arr1, arr2 << pos) without shifting.
+// returns:
+//  >0 iff arr1 > (arr2 << pos)
+//   0 iff arr1 == (arr2 << pos)
+//  <0 iff arr1 < (arr2 << pos)
+int bit_array_cmp_words(const BIT_ARRAY *arr1,
+                        bit_index_t pos, const BIT_ARRAY *arr2)
+{
+  if(arr1->num_of_bits == 0 && arr2->num_of_bits == 0)
+  {
+    return 0;
+  }
+
+  bit_index_t top_bit1 = 0, top_bit2 = 0;
+
+  char arr1_zero = !bit_array_find_last_set_bit(arr1, &top_bit1);
+  char arr2_zero = !bit_array_find_last_set_bit(arr2, &top_bit2);
+
+  if(arr1_zero && arr2_zero) return 0;
+  if(arr1_zero) return -1;
+  if(arr2_zero) return 1;
+
+  bit_index_t top_bit2_offset = top_bit2 + pos;
+
+  // Different highest set bit (after shifting arr2): that alone decides it
+  if(top_bit1 != top_bit2_offset) {
+    return top_bit1 > top_bit2_offset ? 1 : -1;
+  }
+
+  word_addr_t i;
+  word_t word1, word2;
+
+  // Compare arr2's words, top down, against the words of arr1 read at +pos
+  for(i = top_bit2 / WORD_SIZE; i > 0; i--)
+  {
+    word1 = _get_word(arr1, pos + i * WORD_SIZE);
+    word2 = arr2->words[i];
+
+    if(word1 > word2) return 1;
+    if(word1 < word2) return -1;
+  }
+
+  word1 = _get_word(arr1, pos);
+  word2 = arr2->words[0];
+
+  if(word1 > word2) return 1;
+  if(word1 < word2) return -1;
+
+  // return 1 if arr1[0..pos] != 0, 0 otherwise
+
+  // Whole words
+  word_addr_t num_words = pos / WORD_SIZE;
+
+  for(i = 0; i < num_words; i++)
+  {
+    if(arr1->words[i] > 0)
+    {
+      return 1;
+    }
+  }
+
+  word_offset_t bits_remaining = pos - num_words * WORD_SIZE;
+
+  if(arr1->words[num_words] & bitmask64(bits_remaining))
+  {
+    return 1;
+  }
+
+  return 0;
+}
+
+
+//
+// Reverse -- coords may wrap around
+//
+
+// Reverse the bits in [start, start+length), using cyclic word accessors so
+// the region may wrap. No bounds checking; length cannot be zero
+static void _reverse_region(BIT_ARRAY* bitarr,
+                            bit_index_t start,
+                            bit_index_t length)
+{
+  bit_index_t left = start;
+  bit_index_t right = (start + length - WORD_SIZE) % bitarr->num_of_bits;
+
+  while(length >= 2 * WORD_SIZE)
+  {
+    // Swap entire words
+    word_t left_word = _get_word_cyclic(bitarr, left);
+    word_t right_word = _get_word_cyclic(bitarr, right);
+
+    // reverse words individually
+    left_word = _reverse_word(left_word);
+    right_word = _reverse_word(right_word);
+
+    // Swap
+    _set_word_cyclic(bitarr, left, right_word);
+    _set_word_cyclic(bitarr, right, left_word);
+
+    // Move both ends one word towards the middle (right wraps below zero)
+    left = (left + WORD_SIZE) % bitarr->num_of_bits;
+    right = (right < WORD_SIZE ? right + bitarr->num_of_bits : right) - WORD_SIZE;
+    length -= 2 * WORD_SIZE;
+  }
+
+  word_t word, rev;
+
+  if(length == 0)
+  {
+    return;
+  }
+  else if(length > WORD_SIZE)
+  {
+    // Words overlap
+    word_t left_word = _get_word_cyclic(bitarr, left);
+    word_t right_word = _get_word_cyclic(bitarr, right);
+
+    rev = _reverse_word(left_word);
+    right_word = _reverse_word(right_word);
+
+    // fill left 64 bits with right word rev
+    _set_word_cyclic(bitarr, left, right_word);
+
+    // Now do remaining bits (length is between 1 and 64 bits)
+    left += WORD_SIZE;
+    length -= WORD_SIZE;
+
+    word = _get_word_cyclic(bitarr, left);
+  }
+  else
+  {
+    word = _get_word_cyclic(bitarr, left);
+    rev = _reverse_word(word);
+  }
+
+  // Keep the top `length` bits of rev and merge them into the low bits of word
+  rev >>= WORD_SIZE - length;
+  word_t mask = bitmask64(length);
+
+  word = bitmask_merge(rev, word, mask);
+
+  _set_word_cyclic(bitarr, left, word);
+}
+
+void bit_array_reverse_region(BIT_ARRAY* bitarr, bit_index_t start, bit_index_t len)
+{
+  assert(start + len <= bitarr->num_of_bits); // region must lie inside the array
+  if(len > 0) _reverse_region(bitarr, start, len); // helper requires len != 0
+  DEBUG_VALIDATE(bitarr);
+}
+
+void bit_array_reverse(BIT_ARRAY* bitarr)
+{
+  if(bitarr->num_of_bits > 0) _reverse_region(bitarr, 0, bitarr->num_of_bits); // no-op when empty
+  DEBUG_VALIDATE(bitarr);
+}
+
+//
+// Shift left / right
+//
+
+// Shift towards MSB / higher index by shift_dist. The vacated low bits are
+// set to 1 if `fill` is non-zero, otherwise cleared; the length is unchanged
+void bit_array_shift_left(BIT_ARRAY* bitarr, bit_index_t shift_dist, char fill)
+{
+  if(shift_dist >= bitarr->num_of_bits)
+  {
+    // Everything is shifted out: only the fill value remains
+    if(fill) { bit_array_set_all(bitarr); } else { bit_array_clear_all(bitarr); }
+    return;
+  }
+  if(shift_dist == 0) { return; }
+
+  FillAction action = fill ? FILL_REGION : ZERO_REGION;
+  bit_index_t kept = bitarr->num_of_bits - shift_dist;
+
+  // Move the surviving low bits up, then fill the region they vacated
+  _array_copy(bitarr, shift_dist, bitarr, 0, kept);
+  _set_region(bitarr, 0, shift_dist, action);
+}
+
+// Shift left with extension: instead of truncating the bits shifted UP, the
+// array is lengthened by shift_dist to make room for them. The vacated low
+// bits are set to 1 if `fill` is non-zero, otherwise cleared.
+void bit_array_shift_left_extend(BIT_ARRAY* bitarr, bit_index_t shift_dist,
+                                 char fill)
+{
+  if(shift_dist == 0) { return; }
+
+  // Remember the old length before the resize changes num_of_bits
+  bit_index_t old_len = bitarr->num_of_bits;
+  bit_index_t new_len = old_len + shift_dist;
+  bit_array_resize_critical(bitarr, new_len);
+
+  // Move all the original bits up by shift_dist, then fill the low region
+  // they came from
+  FillAction action = fill ? FILL_REGION : ZERO_REGION;
+  _array_copy(bitarr, shift_dist, bitarr, 0, old_len);
+  _set_region(bitarr, 0, shift_dist, action);
+}
+
+// Shift towards LSB / lower index by shift_dist. The vacated top bits are
+// set to 1 if `fill` is non-zero, otherwise cleared; the length is unchanged
+void bit_array_shift_right(BIT_ARRAY* bitarr, bit_index_t shift_dist, char fill)
+{
+  if(shift_dist >= bitarr->num_of_bits)
+  {
+    // Everything is shifted out: only the fill value remains
+    if(fill) { bit_array_set_all(bitarr); } else { bit_array_clear_all(bitarr); }
+    return;
+  }
+
+  if(shift_dist == 0) { return; }
+
+  FillAction action = fill ? FILL_REGION : ZERO_REGION;
+  bit_index_t kept = bitarr->num_of_bits - shift_dist;
+
+  // Move the surviving high bits down, then fill the region they vacated
+  bit_array_copy(bitarr, 0, bitarr, shift_dist, kept);
+  _set_region(bitarr, kept, shift_dist, action);
+}
+
+//
+// Cycle
+//
+
+// Cycle (rotate) bits towards index 0 by cycle_dist positions: the bit at
+// index cycle_dist ends up at index 0. Distances wrap modulo the length.
+void bit_array_cycle_right(BIT_ARRAY* bitarr, bit_index_t cycle_dist)
+{
+  if(bitarr->num_of_bits == 0)
+  {
+    return;
+  }
+
+  // Reduce first: cycling by a multiple of the length changes nothing
+  bit_index_t dist = cycle_dist % bitarr->num_of_bits;
+  if(dist == 0) { return; }
+
+  // Rotate with three reversals: reverse each of the two parts in place,
+  // then reverse the whole array
+  bit_index_t low_len = dist;
+  bit_index_t high_len = bitarr->num_of_bits - dist;
+
+  _reverse_region(bitarr, 0, low_len);
+  _reverse_region(bitarr, low_len, high_len);
+  bit_array_reverse(bitarr);
+}
+
+// Cycle (rotate) bits away from index 0 by cycle_dist positions: the bit at
+// index 0 ends up at index cycle_dist. Distances wrap modulo the length.
+void bit_array_cycle_left(BIT_ARRAY* bitarr, bit_index_t cycle_dist)
+{
+  if(bitarr->num_of_bits == 0)
+  {
+    return;
+  }
+
+  // Reduce first: cycling by a multiple of the length changes nothing
+  bit_index_t dist = cycle_dist % bitarr->num_of_bits;
+  if(dist == 0) { return; }
+
+  // Rotate with three reversals: reverse each of the two parts in place,
+  // then reverse the whole array
+  bit_index_t low_len = bitarr->num_of_bits - dist;
+  bit_index_t high_len = dist;
+
+  _reverse_region(bitarr, 0, low_len);
+  _reverse_region(bitarr, low_len, high_len);
+  bit_array_reverse(bitarr);
+}
+
+//
+// Next permutation
+//
+
+// From http://graphics.stanford.edu/~seander/bithacks.html#NextBitPermutation
+static word_t _next_permutation(word_t v)
+{
+  word_t t = v | (v - 1);  // t gets v's least significant 0 bits set to 1
+  word_t bumped = t + 1;   // sets to 1 the most significant bit to change
+  word_t refill = ((~t & bumped) - 1) >> (trailing_zeros(v) + 1); // low 1 bits
+  return bumped | refill;
+}
+
+// Get the next permutation of an array with a fixed size and given number of
+// bits set.  Also known as next lexicographic permutation.
+// Given a bit array find the next lexicographic organisation of the bits
+// Number of possible combinations given by (size choose bits_set) i.e. nCk
+// 00011 -> 00101 -> 00110 -> 01001 -> 01010 ->
+// 01100 -> 10001 -> 10010 -> 10100 -> 11000 -> 00011 (back to start)
+void bit_array_next_permutation(BIT_ARRAY* bitarr)
+{
+  if(bitarr->num_of_bits == 0)
+  {
+    return;
+  }
+
+  word_addr_t w;
+
+  char carry = 0;
+  word_offset_t top_bits = bitset64_idx(bitarr->num_of_bits);
+
+  for(w = 0; w < bitarr->num_of_words; w++)
+  {
+    // Bits in use in this word: all of them, except in a partial top word
+    word_t mask
+      = (w < bitarr->num_of_words - 1 || top_bits == 0) ? WORD_MAX
+                                                        : bitmask64(top_bits);
+
+    if(bitarr->words[w] > 0 &&
+       (bitarr->words[w] | (bitarr->words[w]-1)) == mask)
+    {
+      // Bits in this word cannot be moved forward
+      carry = 1;
+    }
+    else if(carry)
+    {
+      // 0111 -> 1000, 1000 -> 1001
+      word_t tmp = bitarr->words[w] + 1;
+
+      // Count bits previously set
+      bit_index_t bits_previously_set = POPCOUNT(bitarr->words[w]);
+
+      // set new word
+      bitarr->words[w] = tmp;
+
+      // note: w is unsigned
+      // Zero words while counting bits set
+      while(w > 0)
+      {
+        bits_previously_set += POPCOUNT(bitarr->words[w-1]);
+        bitarr->words[w-1] = 0;
+        w--;
+      }
+
+      // Re-set, from index 0, the bits not accounted for by the new word
+      SET_REGION(bitarr, 0, bits_previously_set - POPCOUNT(tmp));
+
+      carry = 0;
+      break;
+    }
+    else if(bitarr->words[w] > 0)
+    {
+      bitarr->words[w] = _next_permutation(bitarr->words[w]);
+      break;
+    }
+  }
+
+  if(carry)
+  {
+    // Loop around: last permutation reached, restart with all set bits lowest
+    bit_index_t num_bits_set = bit_array_num_bits_set(bitarr);
+    bit_array_clear_all(bitarr);
+    SET_REGION(bitarr, 0, num_bits_set);
+  }
+
+  DEBUG_VALIDATE(bitarr);
+}
+
+
+//
+// Interleave
+//
+
+// Interleave the bits of src1 and src2 into dst (dst is resized to fit).
+// dst cannot point to the same bit array as src1 or src2;
+// src1, src2 may point to the same bit array
+// abcd 1234 -> a1b2c3d4
+// 0011 0000 -> 00001010
+// 1111 0000 -> 10101010   0101 1010 -> 01100110
+void bit_array_interleave(BIT_ARRAY* dst,
+                          const BIT_ARRAY* src1,
+                          const BIT_ARRAY* src2)
+{
+  // dst cannot be either src1 or src2
+  assert(dst != src1 && dst != src2);
+  // Behaviour undefined when src1 length != src2 length
+  assert(src1->num_of_bits == src2->num_of_bits);
+
+  // Need at least src1->num_of_words + src2->num_of_words (and never < 2)
+  size_t nwords = MAX(src1->num_of_words + src2->num_of_words, 2);
+  _bit_array_ensure_nwords(dst, nwords, __FILE__, __LINE__, __func__);
+  dst->num_of_bits = src1->num_of_bits + src2->num_of_bits;
+  dst->num_of_words = roundup_bits2words64(dst->num_of_bits);
+
+  word_addr_t i, j;
+
+  for(i = 0, j = 0; i < src1->num_of_words; i++)
+  {
+    word_t a = src1->words[i];
+    word_t b = src2->words[i];
+
+    dst->words[j++] =  morton_table0[(a      ) & 0xff] |
+                       morton_table1[(b      ) & 0xff] |
+                      (morton_table0[(a >>  8) & 0xff] << 16) |
+                      (morton_table1[(b >>  8) & 0xff] << 16) |
+                      (morton_table0[(a >> 16) & 0xff] << 32) |
+                      (morton_table1[(b >> 16) & 0xff] << 32) |
+                      (morton_table0[(a >> 24) & 0xff] << 48) |
+                      (morton_table1[(b >> 24) & 0xff] << 48);
+
+    dst->words[j++] =  morton_table0[(a >> 32) & 0xff] |
+                       morton_table1[(b >> 32) & 0xff] |
+                      (morton_table0[(a >> 40) & 0xff] << 16) |
+                      (morton_table1[(b >> 40) & 0xff] << 16) |
+                      (morton_table0[(a >> 48) & 0xff] << 32) |
+                      (morton_table1[(b >> 48) & 0xff] << 32) |
+                      (morton_table0[(a >> 56)       ] << 48) |
+                      (morton_table1[(b >> 56)       ] << 48);
+  }
+
+  DEBUG_VALIDATE(dst);
+}
+
+//
+// Random
+//
+
+// Set each bit independently with probability `prob` (0 <= prob <= 1)
+void bit_array_random(BIT_ARRAY* bitarr, float prob)
+{
+  assert(prob >= 0 && prob <= 1);
+
+  if(bitarr->num_of_bits == 0)
+  {
+    return;
+  }
+  else if(prob == 1)
+  {
+    bit_array_set_all(bitarr);
+    return;
+  }
+
+  // Start from all zeros. prob == 0 must stay that way, but rand() <= 0 still
+  // succeeds whenever rand() returns 0, so stop here in that case
+  memset(bitarr->words, 0, bitarr->num_of_words * sizeof(word_t));
+  if(prob == 0) { DEBUG_VALIDATE(bitarr); return; }
+
+  // rand() generates 0..RAND_MAX inclusive, so set a bit when rand() <= p
+  long p = RAND_MAX * prob;
+  _seed_rand();
+
+  word_addr_t w;
+  word_offset_t o;
+
+  for(w = 0; w < bitarr->num_of_words - 1; w++)
+  {
+    for(o = 0; o < WORD_SIZE; o++)
+    {
+      if(rand() <= p)
+      {
+        bitarr->words[w] |= ((word_t)0x1 << o);
+      }
+    }
+  }
+
+  // Top word: only the bits that belong to the array
+  word_offset_t bits_in_last_word = bits_in_top_word(bitarr->num_of_bits);
+  w = bitarr->num_of_words - 1;
+
+  for(o = 0; o < bits_in_last_word; o++)
+  {
+    if(rand() <= p)
+    {
+      bitarr->words[w] |= ((word_t)0x1 << o);
+    }
+  }
+
+  DEBUG_VALIDATE(bitarr);
+}
+
+// Shuffle the bits in an array randomly (Fisher-Yates, driven by rand())
+void bit_array_shuffle(BIT_ARRAY* bitarr)
+{
+  if(bitarr->num_of_bits == 0)
+    return;
+
+  _seed_rand();
+
+  bit_index_t i, j;
+
+  for(i = bitarr->num_of_bits - 1; i > 0; i--)
+  {
+    // Pick j from 0..i inclusive: j == i must be possible or bit i always moves
+    j = (bit_index_t)rand() % (i + 1);
+    // Swap i and j
+    char x = (bitarr->words[bitset64_wrd(i)] >> bitset64_idx(i)) & 0x1;
+    char y = (bitarr->words[bitset64_wrd(j)] >> bitset64_idx(j)) & 0x1;
+
+    if(!y)
+      bitarr->words[bitset64_wrd(i)] &= ~((word_t)0x1 << bitset64_idx(i));
+    else
+      bitarr->words[bitset64_wrd(i)] |= (word_t)0x1 << bitset64_idx(i);
+
+    if(!x)
+      bitarr->words[bitset64_wrd(j)] &= ~((word_t)0x1 << bitset64_idx(j));
+    else
+      bitarr->words[bitset64_wrd(j)] |= (word_t)0x1 << bitset64_idx(j);
+  }
+
+  DEBUG_VALIDATE(bitarr);
+}
+
+//
+// Arithmetic
+//
+
+// Store the array's value in `*result` if it fits in a uint64_t.
+// Returns 1 on success, 0 (result untouched) if value in array is too big
+char bit_array_as_num(const BIT_ARRAY* bitarr, uint64_t* result)
+{
+  uint64_t value = 0;
+
+  if(bitarr->num_of_bits > 0)
+  {
+    // Any set bit above word 0 means the value needs more than one word
+    word_addr_t w = bitarr->num_of_words;
+
+    while(--w > 0)
+    {
+      if(bitarr->words[w] > 0) { return 0; }
+    }
+
+    value = bitarr->words[0];
+  }
+
+  *result = value;
+  return 1;
+}
+
+
+// Compare the number held in bitarr with value. Returns:
+//   1 iff bitarr > value
+//   0 iff bitarr == value
+//  -1 iff bitarr < value
+int bit_array_cmp_uint64(const BIT_ARRAY* bitarr, uint64_t value)
+{
+  uint64_t as_word = 0;
+
+  // An array too big for a uint64 is bigger than any uint64 value
+  if(!bit_array_as_num(bitarr, &as_word))
+    return 1;
+
+  // (a > b) - (a < b) evaluates to 1, 0 or -1
+  return (as_word > value) - (as_word < value);
+}
+
+// Add value to the number held in bitarr, growing the array if needed.
+// If value is zero, no change is made.
+// An empty array is resized to exactly the number of bits needed for value.
+void bit_array_add_uint64(BIT_ARRAY* bitarr, uint64_t value)
+{
+  if(value == 0)
+  {
+    return;
+  }
+  else if(bitarr->num_of_bits == 0)
+  {
+    bit_array_resize_critical(bitarr, WORD_SIZE - leading_zeros(value));
+    bitarr->words[0] = (word_t)value;
+    return;
+  }
+
+  char carry = 0;
+  word_addr_t i;
+
+  for(i = 0; i < bitarr->num_of_words; i++)
+  {
+    // Does adding to this word overflow 64 bits?
+    carry = (WORD_MAX - bitarr->words[i] < value);
+    bitarr->words[i] += value;
+
+    if(!carry)
+    {
+      // Carry is absorbed
+      break;
+    }
+
+    // Only a carry of one propagates into the next word, not the whole value
+    value = 1;
+  }
+
+  if(carry)
+  {
+    // Bit array full, need another bit after all words filled
+    bit_array_resize_critical(bitarr, bitarr->num_of_words * WORD_SIZE + 1);
+
+    // Set top word to 1
+    bitarr->words[bitarr->num_of_words-1] = 1;
+  }
+  else
+  {
+    // The addition may have set bits in the top word above num_of_bits
+    word_t final_word = bitarr->words[bitarr->num_of_words-1];
+    word_offset_t expected_bits = bits_in_top_word(bitarr->num_of_bits);
+    word_offset_t actual_bits = WORD_SIZE - leading_zeros(final_word);
+
+    if(actual_bits > expected_bits)
+    {
+      // num_of_bits has increased -- num_of_words has not
+      bitarr->num_of_bits += (actual_bits - expected_bits);
+    }
+  }
+}
+
+// Subtract value from the number held in bitarr.
+// If value is greater than bitarr, bitarr is not changed and 0 is returned
+// Returns 1 on success, 0 if value > bitarr
+// The length of the array (num_of_bits) is never changed.
+char bit_array_sub_uint64(BIT_ARRAY* bitarr, uint64_t value)
+{
+  if(value == 0)
+  {
+    return 1;
+  }
+  else if(bitarr->num_of_bits == 0)
+  {
+    // An empty array holds zero and value > 0
+    return 0;
+  }
+  else if(bitarr->words[0] >= value)
+  {
+    bitarr->words[0] -= value;
+    return 1;
+  }
+
+  // The lowest word is too small: we need to borrow from a higher word.
+  // `owed` is what remains to subtract once the lowest word is used up (>= 1)
+  uint64_t owed = value - bitarr->words[0];
+
+  word_addr_t i, j;
+
+  for(i = 1; i < bitarr->num_of_words; i++)
+  {
+    if(bitarr->words[i] > 0)
+    {
+      // Borrow one from the first non-zero word
+      bitarr->words[i]--;
+
+      // The zero words strictly between word 0 and word i become all ones
+      for(j = i-1; j > 0; j--)
+      {
+        bitarr->words[j] = WORD_MAX;
+      }
+
+      // Lowest word is 2^64 - owed, written so that it cannot overflow
+      bitarr->words[0] = WORD_MAX - (owed - 1);
+
+      return 1;
+    }
+  }
+
+  // subtract value is greater than array
+  return 0;
+}
+
+//
+// Arithmetic between bit arrays
+//
+
+// Shared word-by-word implementation behind bit_array_add() and
+// bit_array_subtract(): writes src1 + src2 into dst when `subtract` is zero,
+// or src1 + ~src2 + 1 when `subtract` is non-zero.
+// Both callers in this file grow dst (bit_array_ensure_size_critical) before
+// calling; this function only resizes dst for a carry out of the top word.
+// src1, src2 and dst can all be the same BIT_ARRAY
+static void _arithmetic(BIT_ARRAY* dst,
+                        const BIT_ARRAY* src1,
+                        const BIT_ARRAY* src2,
+                        char subtract)
+{
+  word_addr_t max_words = MAX(src1->num_of_words, src2->num_of_words);
+
+  // Adding: dst_words >= max(src1 words, src2 words)
+  // Subtracting: dst_words is >= src1->num_of_words
+
+  // For subtraction the initial carry supplies the "+ 1" of src1 + ~src2 + 1
+  char carry = subtract ? 1 : 0;
+
+  word_addr_t i;
+  word_t word1, word2;
+
+  for(i = 0; i < max_words; i++)
+  {
+    // A source shorter than max_words contributes zero words
+    word1 = (i < src1->num_of_words ? src1->words[i] : 0);
+    word2 = (i < src2->num_of_words ? src2->words[i] : 0);
+
+    if(subtract)
+      word2 = ~word2;
+
+    dst->words[i] = word1 + word2 + carry;
+    // Update carry
+    // Set when word1 + word2 overflows, or when adding the incoming carry
+    // to that sum overflows
+    carry = WORD_MAX - word1 < word2 || WORD_MAX - word1 - word2 < (word_t)carry;
+  }
+
+  if(subtract)
+  {
+    // The final carry is discarded; the zeroing loop below starts at max_words
+    carry = 0;
+  }
+  else
+  {
+    // Check last word
+    word_offset_t bits_on_last_word = bits_in_top_word(dst->num_of_bits);
+
+    if(bits_on_last_word < WORD_SIZE)
+    {
+      word_t mask = bitmask64(bits_on_last_word);
+
+      if(dst->words[max_words-1] > mask)
+      {
+        // Array has overflowed, increase size
+        dst->num_of_bits++;
+      }
+    }
+    else if(carry)
+    {
+      // Carry onto a new word
+      if(dst->num_of_words == max_words)
+      {
+        // Need to resize for the carry bit
+        bit_array_resize_critical(dst, dst->num_of_bits+1);
+      }
+
+      dst->words[max_words] = (word_t)1;
+    }
+  }
+
+  // Zero the rest of dst array
+  // (starting after the carry word, if one was written above)
+  for(i = max_words+carry; i < dst->num_of_words; i++)
+  {
+    dst->words[i] = (word_t)0;
+  }
+
+  DEBUG_VALIDATE(dst);
+}
+
+// dst = src1 + src2
+// Any of src1, src2 and dst may be the same BIT_ARRAY
+// dst is first grown to the length of the longer of src1 and src2 if it is
+// shorter than that
+void bit_array_add(BIT_ARRAY* dst, const BIT_ARRAY* src1, const BIT_ARRAY* src2)
+{
+  bit_index_t longest = src1->num_of_bits;
+
+  if(src2->num_of_bits > longest)
+    longest = src2->num_of_bits;
+
+  bit_array_ensure_size_critical(dst, longest);
+  _arithmetic(dst, src1, src2, 0);
+}
+
+// dst = src1 - src2, computed by the method of complements:
+//   a - b = a + ~b + 1
+// Any of src1, src2 and dst may be the same BIT_ARRAY
+// dst is extended to the length of src1 if it is shorter
+// Precondition (asserted): src1 >= src2
+void bit_array_subtract(BIT_ARRAY* dst,
+                          const BIT_ARRAY* src1, const BIT_ARRAY* src2)
+{
+  assert(bit_array_cmp(src1, src2) >= 0); // Require src1 >= src2
+
+  const bit_index_t needed_bits = src1->num_of_bits;
+
+  bit_array_ensure_size_critical(dst, needed_bits);
+  _arithmetic(dst, src1, src2, 1);
+}
+
+
+// Add `add` to `bitarr` at `pos`
+// i.e. treat `add` as a 64-bit number whose lowest bit lines up with bit `pos`
+// of the array, add it in and propagate any carry upwards. The array is
+// grown when the result needs more bits.
+// Bounds checking not needed as out of bounds is valid
+void bit_array_add_word(BIT_ARRAY *bitarr, bit_index_t pos, uint64_t add)
+{
+  DEBUG_VALIDATE(bitarr);
+
+  if(add == 0)
+  {
+    return;
+  }
+  else if(pos >= bitarr->num_of_bits)
+  {
+    // Resize and add!
+    // Nothing is stored at or above pos yet, so the word is written directly
+    bit_index_t num_bits_required = pos + (WORD_SIZE - leading_zeros(add));
+    bit_array_resize_critical(bitarr, num_bits_required);
+    _set_word(bitarr, pos, (word_t)add);
+    return;
+  }
+
+  /*
+  char str[1000];
+  printf(" add_word: %s\n", bit_array_to_str_rev(bitarr, str));
+  printf("     word: %s [pos: %i]\n", _word_to_str(add, str), (int)pos);
+  */
+
+  // NOTE(review): _get_word/_set_word presumably read/write the 64 bits
+  // starting at an arbitrary bit offset -- confirm against their definitions
+  word_t w = _get_word(bitarr, pos);
+  word_t sum = w + add;
+  // carry is set when w + add does not fit in 64 bits
+  char carry = WORD_MAX - w < add;
+
+  // Ensure array is big enough
+  bit_index_t num_bits_required = pos + (carry ? WORD_SIZE + 1
+                                               : (WORD_SIZE - leading_zeros(sum)));
+
+  bit_array_ensure_size(bitarr, num_bits_required);
+
+  _set_word(bitarr, pos, sum);
+  // pos now refers to the first bit above the 64 bits just written
+  pos += WORD_SIZE;
+
+  if(carry)
+  {
+    // Add the carry as a single bit at the new pos, directly into its word
+    word_offset_t offset = pos % WORD_SIZE;
+    word_addr_t addr = bitset64_wrd(pos);
+
+    add = (word_t)0x1 << offset;
+    carry = (WORD_MAX - bitarr->words[addr] < add);
+    sum = bitarr->words[addr] + add;
+
+    num_bits_required = addr * WORD_SIZE +
+                        (carry ? WORD_SIZE + 1 : (WORD_SIZE - leading_zeros(sum)));
+
+    bit_array_ensure_size(bitarr, num_bits_required);
+
+    bitarr->words[addr++] = sum;
+
+    if(carry)
+    {
+      // Words that are all ones roll over to zero and pass the carry on
+      while(addr < bitarr->num_of_words && bitarr->words[addr] == WORD_MAX)
+      {
+        bitarr->words[addr++] = 0;
+      }
+
+      if(addr == bitarr->num_of_words)
+      {
+        // Carried off the end of the array: add one bit in a new word
+        bit_array_resize_critical(bitarr, addr * WORD_SIZE + 1);
+      }
+      else if(addr == bitarr->num_of_words-1 &&
+              bitarr->words[addr] == bitmask64(bits_in_top_word(bitarr->num_of_bits)))
+      {
+        // Every in-use bit of the top word is set: the increment below needs
+        // one more bit
+        bit_array_resize_critical(bitarr, bitarr->num_of_bits + 1);
+      }
+
+      bitarr->words[addr]++;
+    }
+  }
+
+  DEBUG_VALIDATE(bitarr);
+}
+
+// Add `add` to `bitarr` at `pos`
+// i.e. add the bit array `add`, with its lowest bit lined up with bit `pos`,
+// into `bitarr`, growing `bitarr` when the result needs more bits.
+// `bitarr` and `add` must be different arrays (asserted).
+// Bounds checking not needed as out of bounds is valid
+void bit_array_add_words(BIT_ARRAY *bitarr, bit_index_t pos, const BIT_ARRAY *add)
+{
+  assert(bitarr != add); // bitarr and add cannot point to the same bit array
+
+  bit_index_t add_top_bit_set;
+
+  if(!bit_array_find_last_set_bit(add, &add_top_bit_set))
+  {
+    // No bits set in add
+    return;
+  }
+  else if(pos >= bitarr->num_of_bits)
+  {
+    // Just resize and copy!
+    bit_index_t num_bits_required = pos + add_top_bit_set + 1;
+    bit_array_resize_critical(bitarr, num_bits_required);
+    _array_copy(bitarr, pos, add, 0, add->num_of_bits);
+    return;
+  }
+  else if(pos == 0)
+  {
+    // No shift needed: plain in-place addition
+    bit_array_add(bitarr, bitarr, add);
+    return;
+  }
+
+  /*
+  char str[1000];
+  printf(" add_words1: %s\n", bit_array_to_str_rev(bitarr, str));
+  printf(" add_words2: %s\n", bit_array_to_str_rev(add, str));
+  printf(" [pos: %i]\n", (int)pos);
+  */
+
+  bit_index_t num_bits_required = pos + add_top_bit_set + 1;
+  bit_array_ensure_size(bitarr, num_bits_required);
+
+  word_addr_t first_word = bitset64_wrd(pos);
+  word_offset_t first_offset = bitset64_idx(pos);
+
+  // First (possibly partial) word: the low bits of add shifted up to pos
+  word_t w = add->words[0] << first_offset;
+  unsigned char carry = (WORD_MAX - bitarr->words[first_word] < w);
+
+  bitarr->words[first_word] += w;
+
+  // From here on work in whole words of bitarr; `offset` is the index of the
+  // bit in `add` that lines up with the start of word i
+  word_addr_t i = first_word + 1;
+  bit_index_t offset = WORD_SIZE - first_offset;
+
+  // Continue while there is a carry to propagate or bits of add left to add
+  for(; carry || offset <= add_top_bit_set; i++, offset += WORD_SIZE)
+  {
+    w = offset < add->num_of_bits ? _get_word(add, offset) : (word_t)0;
+
+    if(i >= bitarr->num_of_words)
+    {
+      // Extend by a word
+      bit_array_resize_critical(bitarr, (bit_index_t)(i+1)*WORD_SIZE+1);
+    }
+
+    word_t prev = bitarr->words[i];
+
+    bitarr->words[i] += w + carry;
+
+    // Carry out if prev + w overflows, or if prev + w is all ones and the
+    // incoming carry tips it over
+    carry = (WORD_MAX - prev < w || (carry && prev + w == WORD_MAX)) ? 1 : 0;
+  }
+
+  // Recompute the length from the highest set bit of the top word
+  word_offset_t top_bits
+    = WORD_SIZE - leading_zeros(bitarr->words[bitarr->num_of_words-1]);
+
+  bit_index_t min_bits = (bitarr->num_of_words-1)*WORD_SIZE + top_bits;
+
+  if(bitarr->num_of_bits < min_bits)
+  {
+    // Extend within the last word
+    bitarr->num_of_bits = min_bits;
+  }
+
+  DEBUG_VALIDATE(bitarr);
+}
+
+// Subtract the 64-bit value `minus` from `bitarr`, with the lowest bit of
+// `minus` lined up with bit `pos` of the array.
+// Returns 1 on success. Returns 0, leaving the array unmodified, when the word
+// at `pos` is smaller than `minus` and no higher word has anything to borrow.
+char bit_array_sub_word(BIT_ARRAY* bitarr, bit_index_t pos, word_t minus)
+{
+  DEBUG_VALIDATE(bitarr);
+
+  if(minus == 0)
+    return 1;
+
+  word_t cur = _get_word(bitarr, pos);
+
+  if(cur >= minus)
+  {
+    // No borrow needed
+    _set_word(bitarr, pos, cur - minus);
+    DEBUG_VALIDATE(bitarr);
+    return 1;
+  }
+
+  // What is still owed once the word at pos has been used up
+  minus -= cur;
+
+  // Look upwards, a word at a time, for something to borrow from
+  bit_index_t borrow_pos = pos + WORD_SIZE;
+
+  while(borrow_pos < bitarr->num_of_bits)
+  {
+    cur = _get_word(bitarr, borrow_pos);
+
+    if(cur == 0)
+    {
+      borrow_pos += WORD_SIZE;
+      continue;
+    }
+
+    // deduct one from the word we borrow from
+    _set_word(bitarr, borrow_pos, cur - 1);
+
+    // NOTE(review): SET_REGION presumably sets every bit in
+    // [pos, borrow_pos) -- confirm against the macro definition
+    SET_REGION(bitarr, pos, borrow_pos-pos);
+
+    // One has already been deducted above, hence (minus - 1)
+    _set_word(bitarr, pos, WORD_MAX - (minus - 1));
+
+    DEBUG_VALIDATE(bitarr);
+    return 1;
+  }
+
+  DEBUG_VALIDATE(bitarr);
+
+  return 0;
+}
+
+// Subtract the bit array `minus` from `bitarr` at bit position `pos`.
+// Returns 1 on success, 0 (without subtracting) when bit_array_cmp_words()
+// reports that bitarr is the smaller of the two.
+// `bitarr` and `minus` must be different arrays (asserted). `minus` is
+// inverted while the subtraction runs and inverted back before returning.
+// The length of `bitarr` is restored to its original value at the end.
+char bit_array_sub_words(BIT_ARRAY* bitarr, bit_index_t pos, BIT_ARRAY* minus)
+{
+  assert(bitarr != minus); // bitarr and minus cannot point to the same bit array
+
+  int cmp = bit_array_cmp_words(bitarr, pos, minus);
+
+  if(cmp == 0)
+  {
+    // Equal according to bit_array_cmp_words: the result is zero
+    bit_array_clear_all(bitarr);
+    return 1;
+  }
+  else if(cmp < 0)
+  {
+    return 0;
+  }
+
+  // Remember the length so it can be restored after the additions below
+  bit_index_t bitarr_length = bitarr->num_of_bits;
+
+  // NOTE(review): bitarr_top_bit_set is computed here but never read again
+  bit_index_t bitarr_top_bit_set;
+  bit_array_find_last_set_bit(bitarr, &bitarr_top_bit_set);
+
+  // subtraction by method of complements:
+  // a - b = a + ~b + 1 = src1 + ~src2 +1
+
+  bit_array_not(minus, minus);
+
+  bit_array_add_words(bitarr, pos, minus);
+  bit_array_add_word(bitarr, pos, (word_t)1);
+
+  // NOTE(review): presumably removes the 2^(pos + minus->num_of_bits) term
+  // left over by the complement addition -- confirm
+  bit_array_sub_word(bitarr, pos+minus->num_of_bits, 1);
+  bit_array_resize(bitarr, bitarr_length);
+
+  // Restore minus to its original value
+  bit_array_not(minus, minus);
+
+  DEBUG_VALIDATE(bitarr);
+
+  return 1;
+}
+
+// Multiply the number held in bitarr by multiplier, in place.
+// Works from the top bit down: each set bit is cleared and replaced by
+// multiplier added at that bit position. An empty array is left untouched.
+void bit_array_mul_uint64(BIT_ARRAY *bitarr, uint64_t multiplier)
+{
+  // Nothing to do for an empty array or a multiplier of one
+  if(bitarr->num_of_bits == 0 || multiplier == 1)
+    return;
+
+  if(multiplier == 0)
+  {
+    bit_array_clear_all(bitarr);
+    return;
+  }
+
+  // Start from the length on entry; the array may grow while we add
+  bit_index_t bit = bitarr->num_of_bits;
+
+  while(bit > 0)
+  {
+    bit--;
+
+    if(!bit_array_get(bitarr, bit))
+      continue;
+
+    bit_array_clear(bitarr, bit);
+    bit_array_add_word(bitarr, bit, multiplier);
+  }
+
+  DEBUG_VALIDATE(bitarr);
+}
+
+// dst = src1 * src2
+// dst may be the same array as src1 or as src2, but not as both (asserted).
+// If either source is empty, dst is cleared.
+// Works from the top bit of one source (`read_arr`) down, adding the other
+// source (`add_arr`) into dst at the position of every set bit.
+void bit_array_multiply(BIT_ARRAY *dst, BIT_ARRAY *src1, BIT_ARRAY *src2)
+{
+  if(src1->num_of_bits == 0 || src2->num_of_bits == 0)
+  {
+    bit_array_clear_all(dst);
+    return;
+  }
+
+  // Cannot pass the same array as dst, src1 AND src2
+  assert(dst != src1 || dst != src2);
+
+  // Dev: multiplier == 1?
+
+  BIT_ARRAY *read_arr, *add_arr;
+
+  if(src1 == dst)
+  {
+    read_arr = src1;
+    add_arr = src2;
+  }
+  else
+  {
+    read_arr = src2;
+    add_arr = src1;
+  }
+
+  // When dst is one of the sources it is always the array being read
+  const char in_place = (dst == read_arr);
+
+  if(!in_place)
+  {
+    bit_array_clear_all(dst);
+  }
+
+  bit_index_t i;
+
+  for(i = read_arr->num_of_bits; i > 0; i--)
+  {
+    if(bit_array_get(read_arr, i-1))
+    {
+      // In place, the bit just read must be removed before its multiple is
+      // added. A separate dst never has a bit set this low (everything added
+      // so far sits at higher positions) and may be shorter than i bits, so
+      // it must not be touched here.
+      if(in_place)
+      {
+        bit_array_clear(dst, i-1);
+      }
+
+      bit_array_add_words(dst, i-1, add_arr);
+    }
+  }
+
+  DEBUG_VALIDATE(dst);
+}
+
+// bitarr = round_down(bitarr / divisor)
+// rem = bitarr % divisor
+// Binary long division, done in place: quotient bits are written into bitarr
+// from the top down while the running remainder is kept in a 64-bit word.
+// divisor must not be zero (asserted).
+void bit_array_div_uint64(BIT_ARRAY *bitarr, uint64_t divisor, uint64_t *rem)
+{
+  assert(divisor != 0); // cannot divide by zero
+
+  bit_index_t div_top_bit = 63 - leading_zeros(divisor);
+  bit_index_t bitarr_top_bit;
+
+  if(!bit_array_find_last_set_bit(bitarr, &bitarr_top_bit))
+  {
+    // bitarr is zero
+    *rem = 0;
+    return;
+  }
+
+  if(bitarr_top_bit < div_top_bit)
+  {
+    // bitarr < divisor: quotient is zero, remainder is bitarr itself
+    *rem = bitarr->words[0];
+    bit_array_clear_all(bitarr);
+    return;
+  }
+
+  // When div is shifted by offset, their top set bits are aligned
+  bit_index_t offset = bitarr_top_bit - div_top_bit;
+
+  // Take the top bits of the dividend as the initial remainder
+  uint64_t tmp = _get_word(bitarr, offset);
+  _set_word(bitarr, offset, (word_t)0);
+
+  // Carry is 1 if the top bit was set before left shift
+  char carry = 0;
+
+  // offset unsigned so break when offset == 0
+  while(1)
+  {
+    if(carry)
+    {
+      // (carry:tmp) - divisor = (WORD_MAX+1+tmp)-divisor
+      tmp = WORD_MAX - divisor + tmp + 1;
+      bit_array_set(bitarr, offset);
+    }
+    else if(tmp >= divisor)
+    {
+      tmp -= divisor;
+      bit_array_set(bitarr, offset);
+    }
+    else
+    {
+      bit_array_clear(bitarr, offset);
+    }
+
+    if(offset == 0)
+      break;
+
+    offset--;
+
+    // Is the top bit set (that we're about to shift off)?
+    // Reduce it to 0/1 before storing: assigning the masked 64-bit value
+    // straight to a char would keep only the low byte, which is always zero
+    carry = (char)((tmp >> 63) & 0x1);
+
+    // Bring down the next bit of the dividend
+    tmp <<= 1;
+    tmp |= bit_array_get(bitarr, offset);
+  }
+
+  *rem = tmp;
+}
+
+// Long division of one bit array by another. On return:
+//   quotient = dividend / divisor
+//   dividend = dividend % divisor
+// (dividend is overwritten with the remainder)
+// divisor must not be zero (asserted).
+void bit_array_divide(BIT_ARRAY *dividend, BIT_ARRAY *quotient, BIT_ARRAY *divisor)
+{
+  assert(bit_array_cmp_uint64(divisor, 0) != 0); // Cannot divide by zero
+
+  bit_array_clear_all(quotient);
+
+  const int order = bit_array_cmp(dividend, divisor);
+
+  // dividend < divisor: quotient stays zero, dividend is already the remainder
+  if(order < 0)
+    return;
+
+  // dividend == divisor: quotient is one, remainder is zero
+  if(order == 0)
+  {
+    bit_array_ensure_size(quotient, 1);
+    bit_array_set(quotient, 0);
+    bit_array_clear_all(dividend);
+    return;
+  }
+
+  // From here on: dividend > divisor, so both have at least one bit set
+  bit_index_t top_of_dividend = 0, top_of_divisor = 0;
+
+  bit_array_find_last_set_bit(dividend, &top_of_dividend);
+  bit_array_find_last_set_bit(divisor, &top_of_divisor);
+
+  // Shifting divisor up by `shift` lines up the two top set bits
+  bit_index_t shift = top_of_dividend - top_of_divisor;
+
+  // Try every shift from the largest down to zero, inclusive
+  do
+  {
+    if(bit_array_cmp_words(dividend, shift, divisor) >= 0)
+    {
+      bit_array_sub_words(dividend, shift, divisor);
+      bit_array_ensure_size(quotient, shift+1);
+      bit_array_set(quotient, shift);
+    }
+  }
+  while(shift-- > 0);
+}
+
+//
+// Read/Write from files
+//
+// file format is [8 bytes: for number of elements in array][data]
+// data is written in little endian order (least sig byte first)
+//
+
+// Saves bit array to a file: 8 bytes holding the number of bits, followed by
+// the data, both least significant byte first whatever the host byte order.
+// Returns the number of bytes written, which should be
+// 8+(bitarr->num_of_bits+7)/8; a smaller value means a write fell short.
+bit_index_t bit_array_save(const BIT_ARRAY* bitarr, FILE* f)
+{
+  const bit_index_t payload_bytes = roundup_bits2bytes(bitarr->num_of_bits);
+  bit_index_t total = 0;
+
+  // Look at the first byte of an int holding 1 to detect the host byte order
+  const int probe = 1;
+  const int host_is_little_endian = (*(const uint8_t*)&probe == 1);
+
+  if(host_is_little_endian)
+  {
+    // In-memory layout already matches the file format: write it directly
+    total += fwrite(&bitarr->num_of_bits, 1, 8, f);
+    total += fwrite(bitarr->words, 1, payload_bytes, f);
+    return total;
+  }
+
+  // Big endian machine: byte-swap each 64-bit value before writing it
+  const uint64_t full_words = payload_bytes/sizeof(word_t);
+  const uint64_t tail_bytes = payload_bytes - full_words*sizeof(word_t);
+  uint64_t swapped = byteswap64(bitarr->num_of_bits);
+  uint64_t idx;
+
+  // 8 bytes for the number of bits in the array
+  total += fwrite(&swapped, 1, 8, f);
+
+  for(idx = 0; idx < full_words; idx++)
+  {
+    swapped = byteswap64(bitarr->words[idx]);
+    total += fwrite(&swapped, 1, 8, f);
+  }
+
+  // Partial last word: only its lowest tail_bytes bytes belong in the file
+  if(tail_bytes > 0)
+  {
+    swapped = byteswap64(bitarr->words[full_words]);
+    total += fwrite(&swapped, 1, tail_bytes, f);
+  }
+
+  return total;
+}
+
+// Decode a uint64 from 8 bytes stored least significant byte first.
+// Built purely from shifts, so the result is the same on big and little
+// endian architectures.
+static inline uint64_t le64_to_cpu(const uint8_t *x)
+{
+  uint64_t value = 0;
+  int byte;
+
+  // Start with the most significant byte (x[7]) and shift the rest in below
+  for(byte = 7; byte >= 0; byte--)
+  {
+    value = (value << 8) | (uint64_t)x[byte];
+  }
+
+  return value;
+}
+
+// Reads bit array from a file written by bit_array_save(): an 8 byte length
+// (in bits) followed by the data. bitarr is resized and filled.
+// Returns 1 on success, 0 if either read comes up short. When only the data
+// read fails, bitarr has already been resized.
+char bit_array_load(BIT_ARRAY* bitarr, FILE* f)
+{
+  // Length header: number of bits, least significant byte first
+  uint8_t header[8];
+
+  if(fread(header, 1, sizeof(header), f) != sizeof(header))
+    return 0;
+
+  const bit_index_t num_bits = le64_to_cpu(header);
+
+  bit_array_resize_critical(bitarr, num_bits);
+
+  // The file holds whole bytes, not whole words
+  // (Note: this may be different from num_of_words * sizeof(word_t))
+  const bit_index_t payload_bytes = roundup_bits2bytes(bitarr->num_of_bits);
+
+  if(fread(bitarr->words, 1, payload_bytes, f) != payload_bytes)
+    return 0;
+
+  // Convert every word from file (little endian) order to host order
+  word_addr_t idx;
+
+  for(idx = 0; idx < bitarr->num_of_words; idx++)
+  {
+    const uint8_t *raw = (const uint8_t*)&bitarr->words[idx];
+    bitarr->words[idx] = le64_to_cpu(raw);
+  }
+
+  // Mask top word
+  _mask_top_word(bitarr);
+  DEBUG_VALIDATE(bitarr);
+  return 1;
+}
+
+//
+// Hash function
+//
+
+/* From: lookup3.c, by Bob Jenkins, May 2006, Public Domain. */
+/* hashsize(n): 2^n as a uint32_t; hashmask(n): a mask of the low n bits. */
+/* rot(x,k): rotate x left by k bits, treating x as a 32-bit value. */
+/* k must be in 1..31, otherwise one of the two shifts is by 0 or 32 bits. */
+#define hashsize(n) ((uint32_t)1<<(n))
+#define hashmask(n) (hashsize(n)-1)
+#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
+
+/* From: lookup3.c, by Bob Jenkins, May 2006, Public Domain. */
+/* mix(a,b,c): mixes three 32-bit values in place. The arguments are */
+/* assigned to and evaluated several times, so they must be plain variables. */
+#define mix(a,b,c) \
+{ \
+  a -= c;  a ^= rot(c, 4);  c += b; \
+  b -= a;  b ^= rot(a, 6);  a += c; \
+  c -= b;  c ^= rot(b, 8);  b += a; \
+  a -= c;  a ^= rot(c,16);  c += b; \
+  b -= a;  b ^= rot(a,19);  a += c; \
+  c -= b;  c ^= rot(b, 4);  b += a; \
+}
+
+/* From: lookup3.c, by Bob Jenkins, May 2006, Public Domain. */
+/* final(a,b,c): final mixing of three 32-bit values in place; used by */
+/* hashword2() once the last words of the key have been added. As with */
+/* mix(), the arguments must be plain variables. */
+#define final(a,b,c) \
+{ \
+  c ^= b; c -= rot(b,14); \
+  a ^= c; a -= rot(c,11); \
+  b ^= a; b -= rot(a,25); \
+  c ^= b; c -= rot(b,16); \
+  a ^= c; a -= rot(c,4);  \
+  b ^= a; b -= rot(a,14); \
+  c ^= b; c -= rot(b,24); \
+}
+
+/*
+From: lookup3.c, by Bob Jenkins, May 2006, Public Domain.
+--------------------------------------------------------------------
+hashword2() -- hash an array of uint32_t values, taking two seeds and
+returning two 32-bit values.  pc and pb must both be nonnull, and *pc
+and *pb must both be initialized with seeds.  On return *pc holds the
+primary hash value and *pb the secondary one.  If you pass in (*pb)==0,
+the output (*pc) will be the same as the return value from hashword().
+--------------------------------------------------------------------
+*/
+static void hashword2 (
+const uint32_t *k,                   /* the key, an array of uint32_t values */
+size_t          length,               /* the length of the key, in uint32_ts */
+uint32_t       *pc,                      /* IN: seed OUT: primary hash value */
+uint32_t       *pb)               /* IN: more seed OUT: secondary hash value */
+{
+  /* All three lanes start from the same length- and seed-dependent value; */
+  /* the second seed is added to c only */
+  const uint32_t init = 0xdeadbeef + ((uint32_t)(length<<2)) + *pc;
+  uint32_t a = init;
+  uint32_t b = init;
+  uint32_t c = init + *pb;
+
+  /* Consume the key three words at a time, leaving 1..3 words for the end */
+  for (; length > 3; length -= 3, k += 3)
+  {
+    a += k[0];
+    b += k[1];
+    c += k[2];
+    mix(a,b,c);
+  }
+
+  /* Add the remaining 1..3 words; an empty key skips the final mixing */
+  if (length >= 1)
+  {
+    if (length == 3) c += k[2];
+    if (length >= 2) b += k[1];
+    a += k[0];
+    final(a,b,c);
+  }
+
+  /* Report the result */
+  *pc = c;
+  *pb = b;
+}
+
+// Hash the contents and length of a bit array into a 64-bit value.
+// Pass seed as 0 on first call, pass previous hash value if rehashing due
+// to a collision
+// Using bob jenkins hash lookup3
+uint64_t bit_array_hash(const BIT_ARRAY* bitarr, uint64_t seed)
+{
+  // Split the 64-bit seed into the two 32-bit seeds hashword2() expects
+  uint32_t seed32[2];
+  memcpy(seed32, &seed, sizeof(uint32_t)*2);
+
+  // Round up length to number 32bit words
+  hashword2((uint32_t*)bitarr->words, (bitarr->num_of_bits + 31) / 32,
+            &seed32[0], &seed32[1]);
+
+  // hashword2() returns its two 32-bit hash values through seed32: copy them
+  // back, otherwise the result would not depend on the array contents at all
+  memcpy(&seed, seed32, sizeof(uint32_t)*2);
+
+  // XOR with array length. This ensures arrays with different length but same
+  // contents have different hash values
+  seed ^= bitarr->num_of_bits;
+
+  return seed;
+}
+
+
+//
+// Generally useful functions
+//
+
+// Generalised 'binary to string' function
+// Writes num_of_bits characters ('0' or '1') followed by a '\0' into str, so
+// str must have room for num_of_bits+1 chars. Bits are read from the bytes at
+// ptr, lowest bit of the first byte first, and written in that order
+// (lsb to msb), e.g. 0b11010 (26 in decimal) would come out as "01011"
+// Returns str
+char* bit_array_word2str(const void *ptr, size_t num_of_bits, char *str)
+{
+  const uint8_t* bytes = (const uint8_t*)ptr;
+  char *out = str;
+  size_t pos;
+
+  for(pos = 0; pos < num_of_bits; pos++)
+  {
+    // Byte pos/8, bit pos%8 within that byte
+    const unsigned int shift = (unsigned int)(pos & 7);
+    *out++ = ((bytes[pos >> 3] >> shift) & 0x1) ? '1' : '0';
+  }
+
+  *out = '\0';
+  return str;
+}
+
+// As bit_array_word2str(), but the string is written in reverse order
+// (msb to lsb), e.g. 0b11010 (26 in decimal) would come out as "11010"
+// str must have room for num_of_bits+1 chars. Returns str
+char* bit_array_word2str_rev(const void *ptr, size_t num_of_bits, char *str)
+{
+  const uint8_t* bytes = (const uint8_t*)ptr;
+  size_t pos;
+
+  // Fill the string from its end backwards, terminator first
+  char *out = str + num_of_bits;
+  *out = '\0';
+
+  for(pos = 0; pos < num_of_bits; pos++)
+  {
+    const unsigned int shift = (unsigned int)(pos & 7);
+    *--out = ((bytes[pos >> 3] >> shift) & 0x1) ? '1' : '0';
+  }
+
+  return str;
+}
diff --git a/src/rank9b.cpp b/src/rank9b.cpp
new file mode 100644
index 0000000..58756a4
--- /dev/null
+++ b/src/rank9b.cpp
@@ -0,0 +1,67 @@
+/*		 
+ * Sux: Succinct data structures
+ *
+ * Copyright (C) 2007-2013 Sebastiano Vigna 
+ *
+ *  This library is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU Lesser General Public License as published by the Free
+ *  Software Foundation; either version 3 of the License, or (at your option)
+ *  any later version.
+ *
+ *  This library is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include <cassert>
+#include <cstring>
+#include "rank9b.h"
+
+// Default constructor: an empty structure with no bit vector attached.
+// The members are given defined values so that the destructor's
+// delete [] counts acts on a null pointer rather than an indeterminate one.
+rank9b::rank9b() {
+	bits = NULL;
+	counts = NULL;
+	num_words = 0;
+	num_counts = 0;
+}
+
+// Builds the rank structure over the num_bits bits stored in `bits`.
+// The pointer is kept, not copied. Words are grouped in blocks of 8; each
+// block gets two entries in counts:
+//   counts[ 2b ]     number of set bits before the block
+//   counts[ 2b + 1 ] seven 9-bit fields; field j (at bit 63 - 9 * j) holds
+//                    the number of set bits in the first j words of the block
+// counts[ num_counts ] holds the total number of set bits.
+rank9b::rank9b( const uint64_t * const bits, const uint64_t num_bits ) {
+	this->bits = bits;
+	num_words = ( num_bits + 63 ) / 64;
+	num_counts = ( ( num_bits + 64 * 8 - 1 ) / ( 64 * 8 ) ) * 2;
+
+	// Init rank structure; the trailing () zero-initialises every entry
+	counts = new uint64_t[ num_counts + 1 ]();
+
+	uint64_t running = 0;
+	for( uint64_t first = 0, slot = 0; first < num_words; first += 8, slot += 2 ) {
+		const uint64_t block_base = running;
+		counts[ slot ] = block_base;
+		running += __builtin_popcountll( bits[ first ] );
+
+		uint64_t packed = 0;
+		for( int j = 1; j < 8; j++ ) {
+			packed |= ( running - block_base ) << ( 63 - 9 * j );
+			const uint64_t w = first + j;
+			// Words past the end of the vector count as empty
+			if ( w < num_words ) running += __builtin_popcountll( bits[ w ] );
+		}
+		counts[ slot + 1 ] = packed;
+	}
+
+	counts[ num_counts ] = running;
+
+	assert( running <= num_bits );
+}
+
+// Releases the counts table that the two-argument constructor allocates with
+// new[]. The bit vector pointed to by `bits` is not released here.
+rank9b::~rank9b() {
+	delete [] counts;
+}
+
+
+// Returns the number of set bits in positions [0, k) of the bit vector:
+// the count before k's block of 8 words, plus the count for the words of the
+// block that precede k's word, plus the set bits below k inside its own word.
+uint64_t rank9b::rank( const uint64_t k ) {
+	const uint64_t word = k / 64;
+	// Two counts entries per block of 8 words
+	const uint64_t block = ( word / 8 ) * 2;
+	const int offset = word % 8;
+
+	const uint64_t before_block = counts[ block ];
+	// 9-bit field for this word; for offset 0 this reads bit 63, which the
+	// constructor never sets
+	const uint64_t before_word = ( counts[ block + 1 ] >> ( 63 - offset * 9 ) ) & 0x1FF;
+	const uint64_t below_k = bits[ word ] & ( ( 1ULL << ( k % 64 ) ) - 1 );
+
+	return before_block + before_word + __builtin_popcountll( below_k );
+}
+
+// Returns num_counts * 64.
+// NOTE(review): presumably the size in bits of the counts table (num_counts
+// 64-bit entries, not counting the extra final entry) -- confirm with callers
+uint64_t rank9b::bit_count() {
+	return num_counts * 64;
+}
+
+// No-op: this implementation prints nothing.
+void rank9b::print_counts() {}
diff --git a/src/stringpiece.cc b/src/stringpiece.cc
new file mode 100644
index 0000000..1b248ae
--- /dev/null
+++ b/src/stringpiece.cc
@@ -0,0 +1,90 @@
+// Copyright 2004 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <ostream>
+#include "stringpiece.h"
+//#include "util/util.h"
+
+// Writes the piece's characters to the stream as raw bytes: exactly
+// piece.size() chars, with no formatting applied. Returns the stream.
+std::ostream& operator<<(std::ostream& o, const StringPiece& piece) {
+  return o.write(piece.data(), piece.size());
+}
+
+// Returns true iff x and y have the same length and the same bytes.
+bool StringPiece::_equal(const StringPiece& x, const StringPiece& y) {
+  const int len = x.size();
+  if (len != y.size())
+    return false;
+
+  const char* lhs = x.data();
+  const char* rhs = y.data();
+
+  // Test last byte in case strings share large common prefix
+  if (len > 0 && lhs[len-1] != rhs[len-1])
+    return false;
+
+  for (int i = 0; i < len; ++i) {
+    if (lhs[i] != rhs[i])
+      return false;
+  }
+  return true;
+}
+
+// Replaces the contents of *target with a copy of this piece's length_ bytes
+// starting at ptr_.
+void StringPiece::CopyToString(std::string* target) const {
+  target->assign(ptr_, length_);
+}
+
+// Copies up to n bytes, starting at offset pos of this piece, into buf and
+// returns the number of bytes copied. No terminator is written.
+// pos is not checked against the length of the piece.
+int StringPiece::copy(char* buf, size_type n, size_type pos) const {
+  const size_type available = length_ - pos;
+  const int ncopy = (n < available) ? n : available;
+  memcpy(buf, ptr_ + pos, ncopy);
+  return ncopy;
+}
+
+// Returns true iff s occurs somewhere in this piece.
+bool StringPiece::contains(StringPiece s) const {
+  // find() returns an int; widen it before comparing with npos
+  const size_t where = (size_t)find(s, 0);
+  return where != npos;
+}
+
+// Returns the offset of the first occurrence of s at or after pos, or npos
+// if there is none or pos is past the end of this piece.
+int StringPiece::find(const StringPiece& s, size_type pos) const {
+  if (length_ < 0 || pos > static_cast<size_type>(length_))
+    return npos;
+
+  const char* const haystack_end = ptr_ + length_;
+  const char* hit = std::search(ptr_ + pos, haystack_end,
+                                s.ptr_, s.ptr_ + s.length_);
+  const size_type xpos = hit - ptr_;
+
+  // std::search returns the end iterator on failure, in which case the
+  // match would run past the end of the piece
+  if (xpos + s.length_ <= static_cast<size_type>(length_))
+    return xpos;
+  return npos;
+}
+
+// Returns the offset of the first occurrence of c at or after pos, or npos
+// if there is none, the piece is empty or pos is not inside the piece.
+int StringPiece::find(char c, size_type pos) const {
+  if (length_ <= 0 || pos >= static_cast<size_type>(length_))
+    return npos;
+
+  const char* const end = ptr_ + length_;
+  const char* hit = std::find(ptr_ + pos, end, c);
+
+  if (hit == end)
+    return npos;
+  return hit - ptr_;
+}
+
+// Returns the offset of the last occurrence of s that starts at or before
+// pos, or npos if there is none. An empty s matches at min(length, pos).
+int StringPiece::rfind(const StringPiece& s, size_type pos) const {
+  if (length_ < s.length_)
+    return npos;
+
+  const size_t ulen = length_;
+  if (s.length_ == 0)
+    return std::min(ulen, pos);
+
+  // Latest position at which a match may start, then one past where such a
+  // match would end
+  const size_t last_start = std::min(ulen - s.length_, pos);
+  const char* const search_end = ptr_ + last_start + s.length_;
+
+  const char* hit = std::find_end(ptr_, search_end, s.ptr_, s.ptr_ + s.length_);
+
+  if (hit == search_end)
+    return npos;
+  return hit - ptr_;
+}
+
+// Returns the offset of the last occurrence of c at or before pos, or npos
+// if there is none or the piece is empty.
+int StringPiece::rfind(char c, size_type pos) const {
+  if (length_ <= 0)
+    return npos;
+
+  // Start at pos, or at the last character if pos is beyond it
+  int i = std::min(pos, static_cast<size_type>(length_ - 1));
+
+  while (i >= 0) {
+    if (ptr_[i] == c)
+      return i;
+    --i;
+  }
+  return npos;
+}
+
+// Returns the piece covering up to n bytes starting at pos. Both are
+// clamped: pos to the length of this piece, n to what remains after pos.
+// The result points into the same underlying data.
+StringPiece StringPiece::substr(size_type pos, size_type n) const {
+  const size_type len = static_cast<size_type>(length_);
+
+  if (pos > len)
+    pos = len;
+
+  const size_type remaining = len - pos;
+  if (n > remaining)
+    n = remaining;
+
+  return StringPiece(ptr_ + pos, n);
+}
+
+// "Not found" marker returned by the find/rfind functions in this file:
+// size_type(-1).
+const StringPiece::size_type StringPiece::npos = size_type(-1);
diff --git a/src/xxhash.c b/src/xxhash.c
new file mode 100644
index 0000000..e6fb8f1
--- /dev/null
+++ b/src/xxhash.c
@@ -0,0 +1,915 @@
+/*
+xxHash - Fast Hash algorithm
+Copyright (C) 2012-2015, Yann Collet
+
+BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+You can contact the author at :
+- xxHash source repository : https://github.com/Cyan4973/xxHash
+*/
+
+
+/**************************************
+*  Tuning parameters
+**************************************/
+/* Unaligned memory access is automatically enabled for "common" CPUs, such as x86.
+ * For other CPUs, the compiler will be more cautious and insert extra code to ensure aligned access is respected.
+ * If you know your target CPU supports unaligned memory access, you may want to force this option manually to improve performance.
+ * You can also enable this parameter if you know your input data will always be aligned (boundaries of 4, for U32).
+ */
+#if defined(__ARM_FEATURE_UNALIGNED) || defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+#  define XXH_USE_UNALIGNED_ACCESS 1
+#endif
+
+/* XXH_ACCEPT_NULL_INPUT_POINTER :
+ * If the input pointer is a null pointer, xxHash's default behavior is to trigger a memory access error, since it is a bad pointer.
+ * When this option is enabled, xxHash's output for a null input pointer is the same as for a zero-length input.
+ * By default, this option is disabled. To enable it, uncomment the define below :
+ */
+/* #define XXH_ACCEPT_NULL_INPUT_POINTER 1 */
+
+/* XXH_FORCE_NATIVE_FORMAT :
+ * By default, the xxHash library provides endian-independent hash values, based on the little-endian convention.
+ * Results are therefore identical for little-endian and big-endian CPUs.
+ * This comes at a performance cost for big-endian CPUs, since some swapping is required to emulate the little-endian format.
+ * Should endian-independence be of no importance for your application, you may set the #define below to 1.
+ * It will improve speed for big-endian CPUs.
+ * This option has no impact on little-endian CPUs.
+ */
+#define XXH_FORCE_NATIVE_FORMAT 0
+
+
+/**************************************
+*  Compiler Specific Options
+***************************************/
+/* FORCE_INLINE : declaration prefix used for the internal helper functions of this file.
+ *   - Visual Studio        : static __forceinline
+ *   - C99 with GCC         : static inline + always_inline attribute
+ *   - other C99 compilers  : static inline
+ *   - pre-C99 compilers    : plain static (no inlining request)
+ */
+#ifdef _MSC_VER    /* Visual Studio */
+#  pragma warning(disable : 4127)      /* disable: C4127: conditional expression is constant */
+#  define FORCE_INLINE static __forceinline
+#else
+#  if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#    ifdef __GNUC__
+#      define FORCE_INLINE static inline __attribute__((always_inline))
+#    else
+#      define FORCE_INLINE static inline
+#    endif
+#  else
+#    define FORCE_INLINE static
+#  endif /* __STDC_VERSION__ */
+#endif
+
+
+/**************************************
+*  Includes & Memory related functions
+***************************************/
+#include "xxhash.h"
+/* Modify the local functions below should you wish to use some other memory routines */
+/* for malloc(), free() */
+#include <stdlib.h>
+/* Allocation hook : forwards to malloc() and returns its result unchanged */
+static void* XXH_malloc(size_t s)
+{
+    return malloc(s);
+}
+
+/* Release hook : forwards to free() */
+static void XXH_free(void* p)
+{
+    free(p);
+}
+/* for memcpy() */
+#include <string.h>
+/* Copy hook : forwards to memcpy() and returns its result (dest) */
+static void* XXH_memcpy(void* dest, const void* src, size_t size)
+{
+    return memcpy(dest, src, size);
+}
+
+
+/**************************************
+*  Basic Types
+***************************************/
+/* Short aliases for the integer types used throughout this file.
+ * Under C99 they come from <stdint.h>; otherwise they are mapped onto built-in types
+ * (the fallback relies on the platform giving those types the expected widths). */
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+# include <stdint.h>
+  typedef uint8_t  BYTE;
+  typedef uint16_t U16;
+  typedef uint32_t U32;
+  typedef  int32_t S32;
+  typedef uint64_t U64;
+#else
+  typedef unsigned char      BYTE;
+  typedef unsigned short     U16;
+  typedef unsigned int       U32;
+  typedef   signed int       S32;
+  typedef unsigned long long U64;
+#endif
+
+/* Loads 4 bytes from memPtr in host byte order.
+ * Copying through memcpy() keeps the load valid whatever the alignment of memPtr. */
+static U32 XXH_read32(const void* memPtr)
+{
+    U32 word;
+    memcpy(&word, memPtr, 4);
+    return word;
+}
+
+/* Loads 8 bytes from memPtr in host byte order.
+ * Copying through memcpy() keeps the load valid whatever the alignment of memPtr. */
+static U64 XXH_read64(const void* memPtr)
+{
+    U64 word;
+    memcpy(&word, memPtr, 8);
+    return word;
+}
+
+
+
+/******************************************
+*  Compiler-specific Functions and Macros
+******************************************/
+#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+
+/* XXH_rotl32 / XXH_rotl64 : rotate x left by r bits within a 32-bit / 64-bit word.
+ * In the generic version every macro parameter is parenthesized, so compound arguments
+ * (e.g. `a|b` or `n+1`) expand with the intended precedence. Both x and r are evaluated
+ * twice there : do not pass expressions with side effects. r is expected to be
+ * strictly between 0 and the word width.
+ * Note : although _rotl exists for minGW (GCC under windows), performance seems poor */
+#if defined(_MSC_VER)
+#  define XXH_rotl32(x,r) _rotl(x,r)
+#  define XXH_rotl64(x,r) _rotl64(x,r)
+#else
+#  define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
+#  define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r))))
+#endif
+
+/* XXH_swap32 / XXH_swap64 : reverse the byte order of a 32-bit / 64-bit value.
+ * Visual Studio and GCC >= 4.3 map them onto compiler intrinsics;
+ * any other compiler gets the portable shift-and-mask versions below. */
+#if defined(_MSC_VER)     /* Visual Studio */
+#  define XXH_swap32 _byteswap_ulong
+#  define XXH_swap64 _byteswap_uint64
+#elif GCC_VERSION >= 403
+#  define XXH_swap32 __builtin_bswap32
+#  define XXH_swap64 __builtin_bswap64
+#else
+static U32 XXH_swap32 (U32 x)
+{
+    U32 swapped;
+    swapped  = (x >> 24) & 0x000000ff;   /* top byte moves to the bottom */
+    swapped |= (x >>  8) & 0x0000ff00;
+    swapped |= (x <<  8) & 0x00ff0000;
+    swapped |= (x << 24) & 0xff000000;   /* bottom byte moves to the top */
+    return swapped;
+}
+static U64 XXH_swap64 (U64 x)
+{
+    U64 swapped;
+    swapped  = (x >> 56) & 0x00000000000000ffULL;   /* top byte moves to the bottom */
+    swapped |= (x >> 40) & 0x000000000000ff00ULL;
+    swapped |= (x >> 24) & 0x0000000000ff0000ULL;
+    swapped |= (x >> 8)  & 0x00000000ff000000ULL;
+    swapped |= (x << 8)  & 0x000000ff00000000ULL;
+    swapped |= (x << 24) & 0x0000ff0000000000ULL;
+    swapped |= (x << 40) & 0x00ff000000000000ULL;
+    swapped |= (x << 56) & 0xff00000000000000ULL;   /* bottom byte moves to the top */
+    return swapped;
+}
+#endif
+
+
+/***************************************
+*  Architecture Macros
+***************************************/
+/* Byte-order tag handed to the read helpers. The enumerator values (0 / 1) line up with
+ * the value of XXH_CPU_LITTLE_ENDIAN, which callers cast directly to this type. */
+typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;
+#ifndef XXH_CPU_LITTLE_ENDIAN   /* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example using a compiler switch */
+/* Default probe : read the first byte of the int value 1.
+ * It is 1 when the least significant byte is stored first (little-endian), 0 otherwise. */
+static const int one = 1;
+#   define XXH_CPU_LITTLE_ENDIAN   (*(const char*)(&one))
+#endif
+
+
+/*****************************
+*  Memory reads
+*****************************/
+/* Tells the read helpers whether the source pointer may be dereferenced directly
+ * (XXH_aligned) or must go through the memcpy-based loaders (XXH_unaligned). */
+typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment;
+
+/* Reads a 32-bit value stored in little-endian order at ptr.
+ * endian : byte order of the host; a big-endian host gets the loaded word byte-swapped.
+ * align  : XXH_unaligned loads through XXH_read32(), XXH_aligned dereferences ptr as a U32. */
+FORCE_INLINE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
+{
+    const U32 raw = (align==XXH_unaligned) ? XXH_read32(ptr) : *(const U32*)ptr;
+
+    if (endian==XXH_littleEndian)
+        return raw;
+    return XXH_swap32(raw);
+}
+
+/* Reads a 32-bit little-endian value from ptr without assuming any alignment :
+ * the word is loaded with XXH_read32() and byte-swapped when the host is big-endian. */
+FORCE_INLINE U32 XXH_readLE32(const void* ptr, XXH_endianess endian)
+{
+    const U32 raw = XXH_read32(ptr);
+    return (endian==XXH_littleEndian) ? raw : XXH_swap32(raw);
+}
+
+/* Reads a 64-bit value stored in little-endian order at ptr.
+ * endian : byte order of the host; a big-endian host gets the loaded word byte-swapped.
+ * align  : XXH_unaligned loads through XXH_read64(), XXH_aligned dereferences ptr as a U64. */
+FORCE_INLINE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
+{
+    const U64 raw = (align==XXH_unaligned) ? XXH_read64(ptr) : *(const U64*)ptr;
+
+    if (endian==XXH_littleEndian)
+        return raw;
+    return XXH_swap64(raw);
+}
+
+/* Reads a 64-bit little-endian value from ptr without assuming any alignment :
+ * the word is loaded with XXH_read64() and byte-swapped when the host is big-endian. */
+FORCE_INLINE U64 XXH_readLE64(const void* ptr, XXH_endianess endian)
+{
+    const U64 raw = XXH_read64(ptr);
+    return (endian==XXH_littleEndian) ? raw : XXH_swap64(raw);
+}
+
+
+/***************************************
+*  Macros
+***************************************/
+/* XXH_STATIC_ASSERT(c) : compile-time check of condition c.
+ * When c is false the enumerator initializer becomes 1/0, which is not a valid constant
+ * expression, so compilation fails. The macro expands to a braced block (a statement). */
+#define XXH_STATIC_ASSERT(c)   { enum { XXH_static_assert = 1/(!!(c)) }; }    /* use only *after* variable declarations */
+
+
+/***************************************
+*  Constants
+***************************************/
+/* 32-bit multipliers used by the XXH32 stripe rounds, tail mixing and final mixing */
+#define PRIME32_1   2654435761U
+#define PRIME32_2   2246822519U
+#define PRIME32_3   3266489917U
+#define PRIME32_4    668265263U
+#define PRIME32_5    374761393U
+
+/* 64-bit multipliers / additive constants playing the same roles in XXH64 */
+#define PRIME64_1 11400714785074694791ULL
+#define PRIME64_2 14029467366897019727ULL
+#define PRIME64_3  1609587929392839161ULL
+#define PRIME64_4  9650029242287828579ULL
+#define PRIME64_5  2870177450012600261ULL
+
+
+/*****************************
+*  Simple Hash Functions
+*****************************/
+/* XXH32_endian_align :
+ * One-shot 32-bit hash of the len bytes starting at input, parameterized by seed.
+ * endian and align are forwarded to XXH_readLE32_align() for every 32-bit load.
+ * Steps : (1) inputs of 16 bytes or more are consumed in 16-byte stripes by four accumulators,
+ *         (2) the total length is added,
+ *         (3) the remaining bytes are folded in 4 at a time, then 1 at a time,
+ *         (4) a final sequence of xor-shifts and multiplications mixes the result.
+ * Returns the 32-bit hash value.
+ */
+FORCE_INLINE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH_endianess endian, XXH_alignment align)
+{
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* bEnd = p + len;
+    U32 h32;
+/* Shorthand for a 32-bit load using this function's endian/align arguments.
+ * It is never #undef'd, so it remains visible to the code that follows in this file. */
+#define XXH_get32bits(p) XXH_readLE32_align(p, endian, align)
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (p==NULL)
+    {
+        /* Treat NULL as an empty input : with len==0 and p==bEnd none of the loops below performs a load */
+        len=0;
+        bEnd=p=(const BYTE*)(size_t)16;
+    }
+#endif
+
+    if (len>=16)
+    {
+        /* Last position from which a complete 16-byte stripe can still be read */
+        const BYTE* const limit = bEnd - 16;
+        U32 v1 = seed + PRIME32_1 + PRIME32_2;
+        U32 v2 = seed + PRIME32_2;
+        U32 v3 = seed + 0;
+        U32 v4 = seed - PRIME32_1;
+
+        /* Each iteration consumes 16 bytes : one 32-bit word per accumulator */
+        do
+        {
+            v1 += XXH_get32bits(p) * PRIME32_2;
+            v1 = XXH_rotl32(v1, 13);
+            v1 *= PRIME32_1;
+            p+=4;
+            v2 += XXH_get32bits(p) * PRIME32_2;
+            v2 = XXH_rotl32(v2, 13);
+            v2 *= PRIME32_1;
+            p+=4;
+            v3 += XXH_get32bits(p) * PRIME32_2;
+            v3 = XXH_rotl32(v3, 13);
+            v3 *= PRIME32_1;
+            p+=4;
+            v4 += XXH_get32bits(p) * PRIME32_2;
+            v4 = XXH_rotl32(v4, 13);
+            v4 *= PRIME32_1;
+            p+=4;
+        }
+        while (p<=limit);
+
+        /* Merge the four accumulators into a single value */
+        h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
+    }
+    else
+    {
+        /* Short input : the accumulators are not used at all */
+        h32  = seed + PRIME32_5;
+    }
+
+    /* The input length (truncated to 32 bits) takes part in the hash */
+    h32 += (U32) len;
+
+    /* Remaining complete 32-bit words */
+    while (p+4<=bEnd)
+    {
+        h32 += XXH_get32bits(p) * PRIME32_3;
+        h32  = XXH_rotl32(h32, 17) * PRIME32_4 ;
+        p+=4;
+    }
+
+    /* Remaining single bytes */
+    while (p<bEnd)
+    {
+        h32 += (*p) * PRIME32_5;
+        h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
+        p++;
+    }
+
+    /* Final mixing */
+    h32 ^= h32 >> 15;
+    h32 *= PRIME32_2;
+    h32 ^= h32 >> 13;
+    h32 *= PRIME32_3;
+    h32 ^= h32 >> 16;
+
+    return h32;
+}
+
+
+/* XXH32 :
+ * Public one-shot entry point : returns the 32-bit hash of the len bytes at input for the given seed.
+ * It only selects the variant of XXH32_endian_align() to run; every call passes compile-time
+ * constant endian/alignment arguments.
+ */
+unsigned XXH32 (const void* input, size_t len, unsigned seed)
+{
+#if 0
+    /* Equivalent version built on the streaming API : easier to maintain, but slow for small inputs */
+    XXH32_state_t state;
+    XXH32_reset(&state, seed);
+    XXH32_update(&state, input, len);
+    return XXH32_digest(&state);
+#else
+    /* Words are byte-swapped only on a big-endian host that has not requested native-format hashes */
+    const int swapBytes = ((XXH_endianess)XXH_CPU_LITTLE_ENDIAN != XXH_littleEndian) && !XXH_FORCE_NATIVE_FORMAT;
+
+#  if !defined(XXH_USE_UNALIGNED_ACCESS)
+    if (!(((size_t)input) & 3))   /* input is 4-byte aligned : direct loads are allowed */
+        return swapBytes ? XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned)
+                         : XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
+#  endif
+
+    return swapBytes ? XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned)
+                     : XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
+#endif
+}
+
+/* XXH64_endian_align :
+ * One-shot 64-bit hash of the len bytes starting at input, parameterized by seed.
+ * endian and align are forwarded to the XXH_readLE64_align() / XXH_readLE32_align() loads.
+ * Steps : (1) inputs of 32 bytes or more are consumed in 32-byte stripes by four accumulators,
+ *             which are then merged into h64,
+ *         (2) the total length is added,
+ *         (3) the remaining bytes are folded in 8 at a time, then at most one group of 4, then 1 at a time,
+ *         (4) a final sequence of xor-shifts and multiplications mixes the result.
+ * Returns the 64-bit hash value.
+ */
+FORCE_INLINE U64 XXH64_endian_align(const void* input, size_t len, U64 seed, XXH_endianess endian, XXH_alignment align)
+{
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* bEnd = p + len;
+    U64 h64;
+/* Shorthand for a 64-bit load using this function's endian/align arguments */
+#define XXH_get64bits(p) XXH_readLE64_align(p, endian, align)
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (p==NULL)
+    {
+        /* Treat NULL as an empty input : with len==0 and p==bEnd none of the loops below performs a load */
+        len=0;
+        bEnd=p=(const BYTE*)(size_t)32;
+    }
+#endif
+
+    if (len>=32)
+    {
+        /* Last position from which a complete 32-byte stripe can still be read */
+        const BYTE* const limit = bEnd - 32;
+        U64 v1 = seed + PRIME64_1 + PRIME64_2;
+        U64 v2 = seed + PRIME64_2;
+        U64 v3 = seed + 0;
+        U64 v4 = seed - PRIME64_1;
+
+        /* Each iteration consumes 32 bytes : one 64-bit word per accumulator */
+        do
+        {
+            v1 += XXH_get64bits(p) * PRIME64_2;
+            p+=8;
+            v1 = XXH_rotl64(v1, 31);
+            v1 *= PRIME64_1;
+            v2 += XXH_get64bits(p) * PRIME64_2;
+            p+=8;
+            v2 = XXH_rotl64(v2, 31);
+            v2 *= PRIME64_1;
+            v3 += XXH_get64bits(p) * PRIME64_2;
+            p+=8;
+            v3 = XXH_rotl64(v3, 31);
+            v3 *= PRIME64_1;
+            v4 += XXH_get64bits(p) * PRIME64_2;
+            p+=8;
+            v4 = XXH_rotl64(v4, 31);
+            v4 *= PRIME64_1;
+        }
+        while (p<=limit);
+
+        /* Combine the four accumulators ... */
+        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
+
+        /* ... then fold each one in again, after one more multiply/rotate/multiply step */
+        v1 *= PRIME64_2;
+        v1 = XXH_rotl64(v1, 31);
+        v1 *= PRIME64_1;
+        h64 ^= v1;
+        h64 = h64 * PRIME64_1 + PRIME64_4;
+
+        v2 *= PRIME64_2;
+        v2 = XXH_rotl64(v2, 31);
+        v2 *= PRIME64_1;
+        h64 ^= v2;
+        h64 = h64 * PRIME64_1 + PRIME64_4;
+
+        v3 *= PRIME64_2;
+        v3 = XXH_rotl64(v3, 31);
+        v3 *= PRIME64_1;
+        h64 ^= v3;
+        h64 = h64 * PRIME64_1 + PRIME64_4;
+
+        v4 *= PRIME64_2;
+        v4 = XXH_rotl64(v4, 31);
+        v4 *= PRIME64_1;
+        h64 ^= v4;
+        h64 = h64 * PRIME64_1 + PRIME64_4;
+    }
+    else
+    {
+        /* Short input : the accumulators are not used at all */
+        h64  = seed + PRIME64_5;
+    }
+
+    /* The input length takes part in the hash */
+    h64 += (U64) len;
+
+    /* Remaining complete 64-bit words */
+    while (p+8<=bEnd)
+    {
+        U64 k1 = XXH_get64bits(p);
+        k1 *= PRIME64_2;
+        k1 = XXH_rotl64(k1,31);
+        k1 *= PRIME64_1;
+        h64 ^= k1;
+        h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
+        p+=8;
+    }
+
+    /* At most one remaining 32-bit word.
+     * XXH_get32bits is the macro #define'd inside XXH32_endian_align earlier in this file. */
+    if (p+4<=bEnd)
+    {
+        h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1;
+        h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
+        p+=4;
+    }
+
+    /* Remaining single bytes */
+    while (p<bEnd)
+    {
+        h64 ^= (*p) * PRIME64_5;
+        h64 = XXH_rotl64(h64, 11) * PRIME64_1;
+        p++;
+    }
+
+    /* Final mixing */
+    h64 ^= h64 >> 33;
+    h64 *= PRIME64_2;
+    h64 ^= h64 >> 29;
+    h64 *= PRIME64_3;
+    h64 ^= h64 >> 32;
+
+    return h64;
+}
+
+
+/* XXH64 :
+ * Public one-shot entry point : returns the 64-bit hash of the len bytes at input for the given seed.
+ * It only selects the variant of XXH64_endian_align() to run; every call passes compile-time
+ * constant endian/alignment arguments.
+ */
+unsigned long long XXH64 (const void* input, size_t len, unsigned long long seed)
+{
+#if 0
+    /* Equivalent version built on the streaming API : easier to maintain, but slow for small inputs */
+    XXH64_state_t state;
+    XXH64_reset(&state, seed);
+    XXH64_update(&state, input, len);
+    return XXH64_digest(&state);
+#else
+    /* Words are byte-swapped only on a big-endian host that has not requested native-format hashes */
+    const int swapBytes = ((XXH_endianess)XXH_CPU_LITTLE_ENDIAN != XXH_littleEndian) && !XXH_FORCE_NATIVE_FORMAT;
+
+#  if !defined(XXH_USE_UNALIGNED_ACCESS)
+    if (!(((size_t)input) & 7))   /* input is 8-byte aligned : direct loads are allowed */
+        return swapBytes ? XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned)
+                         : XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
+#  endif
+
+    return swapBytes ? XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned)
+                     : XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
+#endif
+}
+
+/****************************************************
+*  Advanced Hash Functions
+****************************************************/
+
+/*** Allocation ***/
+/* Internal layout of the 32-bit streaming state.
+ * The functions below cast their XXH32_state_t pointer to this type. */
+typedef struct
+{
+    U64 total_len;  /* number of input bytes received by XXH32_update since the last reset */
+    U32 seed;       /* seed given to XXH32_reset */
+    U32 v1;         /* v1..v4 : the four accumulators, each updated once per 16-byte stripe */
+    U32 v2;
+    U32 v3;
+    U32 v4;
+    U32 mem32[4];   /* 16-byte buffer for input not yet consumed as a full stripe; defined as U32 for alignment */
+    U32 memsize;    /* number of bytes currently held in mem32 */
+} XXH_istate32_t;
+
+/* Internal layout of the 64-bit streaming state.
+ * The functions below cast their XXH64_state_t pointer to this type. */
+typedef struct
+{
+    U64 total_len;  /* number of input bytes received by XXH64_update since the last reset */
+    U64 seed;       /* seed given to XXH64_reset */
+    U64 v1;         /* v1..v4 : the four accumulators, each updated once per 32-byte stripe */
+    U64 v2;
+    U64 v3;
+    U64 v4;
+    U64 mem64[4];   /* 32-byte buffer for input not yet consumed as a full stripe; defined as U64 for alignment */
+    U32 memsize;    /* number of bytes currently held in mem64 */
+} XXH_istate64_t;
+
+
+/* Allocates a 32-bit streaming state with XXH_malloc().
+ * The block is returned as-is : it is neither checked for NULL nor initialized here. */
+XXH32_state_t* XXH32_createState(void)
+{
+    void* block;
+    XXH_STATIC_ASSERT(sizeof(XXH32_state_t) >= sizeof(XXH_istate32_t));   /* A compilation error here means XXH32_state_t is not large enough */
+    block = XXH_malloc(sizeof(XXH32_state_t));
+    return (XXH32_state_t*)block;
+}
+
+/* Releases a state through XXH_free(). Always returns XXH_OK. */
+XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)
+{
+    XXH_free(statePtr);
+    return XXH_OK;
+}
+
+/* Allocates a 64-bit streaming state with XXH_malloc().
+ * The block is returned as-is : it is neither checked for NULL nor initialized here. */
+XXH64_state_t* XXH64_createState(void)
+{
+    void* block;
+    XXH_STATIC_ASSERT(sizeof(XXH64_state_t) >= sizeof(XXH_istate64_t));   /* A compilation error here means XXH64_state_t is not large enough */
+    block = XXH_malloc(sizeof(XXH64_state_t));
+    return (XXH64_state_t*)block;
+}
+
+/* Releases a state through XXH_free(). Always returns XXH_OK. */
+XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
+{
+    XXH_free(statePtr);
+    return XXH_OK;
+}
+
+
+/*** Hash feed ***/
+
+/* XXH32_reset :
+ * Starts a new 32-bit hash on state_in : clears the byte counters, stores the seed and
+ * derives the four accumulators from it. The mem32 buffer contents are left untouched.
+ * Always returns XXH_OK.
+ */
+XXH_errorcode XXH32_reset(XXH32_state_t* state_in, U32 seed)
+{
+    XXH_istate32_t* const st = (XXH_istate32_t*) state_in;
+
+    /* nothing received, nothing buffered */
+    st->total_len = 0;
+    st->memsize = 0;
+
+    /* seed-dependent starting values */
+    st->seed = seed;
+    st->v1 = seed + PRIME32_1 + PRIME32_2;
+    st->v2 = seed + PRIME32_2;
+    st->v3 = seed + 0;
+    st->v4 = seed - PRIME32_1;
+
+    return XXH_OK;
+}
+
+/* XXH64_reset :
+ * Starts a new 64-bit hash on state_in : clears the byte counters, stores the seed and
+ * derives the four accumulators from it. The mem64 buffer contents are left untouched.
+ * Always returns XXH_OK.
+ */
+XXH_errorcode XXH64_reset(XXH64_state_t* state_in, unsigned long long seed)
+{
+    XXH_istate64_t* const st = (XXH_istate64_t*) state_in;
+
+    /* nothing received, nothing buffered */
+    st->total_len = 0;
+    st->memsize = 0;
+
+    /* seed-dependent starting values */
+    st->seed = seed;
+    st->v1 = seed + PRIME64_1 + PRIME64_2;
+    st->v2 = seed + PRIME64_2;
+    st->v3 = seed + 0;
+    st->v4 = seed - PRIME64_1;
+
+    return XXH_OK;
+}
+
+
+/* XXH32_update_endian :
+ * Feeds len bytes from input into the 32-bit streaming state.
+ *   state_in : state previously initialized with XXH32_reset()
+ *   input    : bytes to add (when XXH_ACCEPT_NULL_INPUT_POINTER is defined, NULL yields XXH_ERROR)
+ *   len      : number of bytes at input
+ *   endian   : byte order of the host, forwarded to XXH_readLE32()
+ * Input is consumed in 16-byte stripes; whatever does not fill a stripe is kept in
+ * state->mem32 (state->memsize bytes) until a later call or the digest.
+ * Returns XXH_OK otherwise.
+ */
+FORCE_INLINE XXH_errorcode XXH32_update_endian (XXH32_state_t* state_in, const void* input, size_t len, XXH_endianess endian)
+{
+    XXH_istate32_t* state = (XXH_istate32_t *) state_in;
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* const bEnd = p + len;
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (input==NULL) return XXH_ERROR;
+#endif
+
+    state->total_len += len;
+
+    if (state->memsize + len < 16)   /* fill in tmp buffer */
+    {
+        XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len);
+        state->memsize += (U32)len;
+        return XXH_OK;
+    }
+
+    if (state->memsize)   /* some data left from previous update */
+    {
+        /* Complete the buffered stripe with the first bytes of input, then consume it */
+        XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16-state->memsize);
+        {
+            const U32* p32 = state->mem32;
+            state->v1 += XXH_readLE32(p32, endian) * PRIME32_2;
+            state->v1 = XXH_rotl32(state->v1, 13);
+            state->v1 *= PRIME32_1;
+            p32++;
+            state->v2 += XXH_readLE32(p32, endian) * PRIME32_2;
+            state->v2 = XXH_rotl32(state->v2, 13);
+            state->v2 *= PRIME32_1;
+            p32++;
+            state->v3 += XXH_readLE32(p32, endian) * PRIME32_2;
+            state->v3 = XXH_rotl32(state->v3, 13);
+            state->v3 *= PRIME32_1;
+            p32++;
+            state->v4 += XXH_readLE32(p32, endian) * PRIME32_2;
+            state->v4 = XXH_rotl32(state->v4, 13);
+            state->v4 *= PRIME32_1;
+            p32++;
+        }
+        p += 16-state->memsize;
+        state->memsize = 0;
+    }
+
+    /* Compare the remaining byte count instead of computing bEnd-16 up front :
+     * with fewer than 16 bytes left, bEnd-16 would point before the start of the
+     * caller's buffer, which is undefined behavior. p never exceeds bEnd here. */
+    if ((size_t)(bEnd - p) >= 16)
+    {
+        const BYTE* const limit = bEnd - 16;
+        U32 v1 = state->v1;
+        U32 v2 = state->v2;
+        U32 v3 = state->v3;
+        U32 v4 = state->v4;
+
+        /* Each iteration consumes 16 bytes : one 32-bit word per accumulator */
+        do
+        {
+            v1 += XXH_readLE32(p, endian) * PRIME32_2;
+            v1 = XXH_rotl32(v1, 13);
+            v1 *= PRIME32_1;
+            p+=4;
+            v2 += XXH_readLE32(p, endian) * PRIME32_2;
+            v2 = XXH_rotl32(v2, 13);
+            v2 *= PRIME32_1;
+            p+=4;
+            v3 += XXH_readLE32(p, endian) * PRIME32_2;
+            v3 = XXH_rotl32(v3, 13);
+            v3 *= PRIME32_1;
+            p+=4;
+            v4 += XXH_readLE32(p, endian) * PRIME32_2;
+            v4 = XXH_rotl32(v4, 13);
+            v4 *= PRIME32_1;
+            p+=4;
+        }
+        while (p<=limit);
+
+        state->v1 = v1;
+        state->v2 = v2;
+        state->v3 = v3;
+        state->v4 = v4;
+    }
+
+    if (p < bEnd)
+    {
+        /* Keep the tail (fewer than 16 bytes) for the next update or the digest */
+        XXH_memcpy(state->mem32, p, (size_t)(bEnd-p));
+        state->memsize = (U32)(bEnd-p);   /* memsize is a U32 field */
+    }
+
+    return XXH_OK;
+}
+
+/* XXH32_update :
+ * Public streaming entry point : forwards to XXH32_update_endian() with the byte order to emulate
+ * and returns its result.
+ */
+XXH_errorcode XXH32_update (XXH32_state_t* state_in, const void* input, size_t len)
+{
+    /* Words are byte-swapped only on a big-endian host that has not requested native-format hashes */
+    const int swapBytes = ((XXH_endianess)XXH_CPU_LITTLE_ENDIAN != XXH_littleEndian) && !XXH_FORCE_NATIVE_FORMAT;
+
+    if (swapBytes)
+        return XXH32_update_endian(state_in, input, len, XXH_bigEndian);
+    return XXH32_update_endian(state_in, input, len, XXH_littleEndian);
+}
+
+
+
+/* XXH32_digest_endian :
+ * Computes the 32-bit hash of everything fed into the state so far.
+ * The state is only read (it is accessed through a const pointer), and the bytes still
+ * buffered in state->mem32 are folded in here.
+ * endian is forwarded to XXH_readLE32(). Returns the hash value.
+ */
+FORCE_INLINE U32 XXH32_digest_endian (const XXH32_state_t* state_in, XXH_endianess endian)
+{
+    const XXH_istate32_t* state = (const XXH_istate32_t*) state_in;
+    const BYTE * p = (const BYTE*)state->mem32;
+    const BYTE* bEnd = (const BYTE*)(state->mem32) + state->memsize;
+    U32 h32;
+
+    if (state->total_len >= 16)
+    {
+        /* At least one 16-byte stripe went through the accumulators : merge them */
+        h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18);
+    }
+    else
+    {
+        /* Short input : the accumulators were never used */
+        h32  = state->seed + PRIME32_5;
+    }
+
+    /* The total input length (truncated to 32 bits) takes part in the hash */
+    h32 += (U32) state->total_len;
+
+    /* Buffered complete 32-bit words */
+    while (p+4<=bEnd)
+    {
+        h32 += XXH_readLE32(p, endian) * PRIME32_3;
+        h32  = XXH_rotl32(h32, 17) * PRIME32_4;
+        p+=4;
+    }
+
+    /* Buffered single bytes */
+    while (p<bEnd)
+    {
+        h32 += (*p) * PRIME32_5;
+        h32 = XXH_rotl32(h32, 11) * PRIME32_1;
+        p++;
+    }
+
+    /* Final mixing */
+    h32 ^= h32 >> 15;
+    h32 *= PRIME32_2;
+    h32 ^= h32 >> 13;
+    h32 *= PRIME32_3;
+    h32 ^= h32 >> 16;
+
+    return h32;
+}
+
+
+/* XXH32_digest :
+ * Public streaming entry point : forwards to XXH32_digest_endian() with the byte order to emulate
+ * and returns the resulting hash.
+ */
+U32 XXH32_digest (const XXH32_state_t* state_in)
+{
+    /* Words are byte-swapped only on a big-endian host that has not requested native-format hashes */
+    const int swapBytes = ((XXH_endianess)XXH_CPU_LITTLE_ENDIAN != XXH_littleEndian) && !XXH_FORCE_NATIVE_FORMAT;
+
+    if (swapBytes)
+        return XXH32_digest_endian(state_in, XXH_bigEndian);
+    return XXH32_digest_endian(state_in, XXH_littleEndian);
+}
+
+
+/* XXH64_update_endian :
+ * Feeds len bytes from input into the 64-bit streaming state.
+ *   state_in : state previously initialized with XXH64_reset()
+ *   input    : bytes to add (when XXH_ACCEPT_NULL_INPUT_POINTER is defined, NULL yields XXH_ERROR)
+ *   len      : number of bytes at input
+ *   endian   : byte order of the host, forwarded to XXH_readLE64()
+ * Input is consumed in 32-byte stripes; whatever does not fill a stripe is kept in
+ * state->mem64 (state->memsize bytes) until a later call or the digest.
+ * Returns XXH_OK otherwise.
+ */
+FORCE_INLINE XXH_errorcode XXH64_update_endian (XXH64_state_t* state_in, const void* input, size_t len, XXH_endianess endian)
+{
+    XXH_istate64_t * state = (XXH_istate64_t *) state_in;
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* const bEnd = p + len;
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (input==NULL) return XXH_ERROR;
+#endif
+
+    state->total_len += len;
+
+    if (state->memsize + len < 32)   /* fill in tmp buffer */
+    {
+        XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len);
+        state->memsize += (U32)len;
+        return XXH_OK;
+    }
+
+    if (state->memsize)   /* some data left from previous update */
+    {
+        /* Complete the buffered stripe with the first bytes of input, then consume it */
+        XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32-state->memsize);
+        {
+            const U64* p64 = state->mem64;
+            state->v1 += XXH_readLE64(p64, endian) * PRIME64_2;
+            state->v1 = XXH_rotl64(state->v1, 31);
+            state->v1 *= PRIME64_1;
+            p64++;
+            state->v2 += XXH_readLE64(p64, endian) * PRIME64_2;
+            state->v2 = XXH_rotl64(state->v2, 31);
+            state->v2 *= PRIME64_1;
+            p64++;
+            state->v3 += XXH_readLE64(p64, endian) * PRIME64_2;
+            state->v3 = XXH_rotl64(state->v3, 31);
+            state->v3 *= PRIME64_1;
+            p64++;
+            state->v4 += XXH_readLE64(p64, endian) * PRIME64_2;
+            state->v4 = XXH_rotl64(state->v4, 31);
+            state->v4 *= PRIME64_1;
+            p64++;
+        }
+        p += 32-state->memsize;
+        state->memsize = 0;
+    }
+
+    /* Compare the remaining byte count instead of computing p+32 :
+     * with fewer than 32 bytes left, p+32 would point more than one past the end
+     * of the caller's buffer, which is undefined behavior. p never exceeds bEnd here. */
+    if ((size_t)(bEnd - p) >= 32)
+    {
+        const BYTE* const limit = bEnd - 32;
+        U64 v1 = state->v1;
+        U64 v2 = state->v2;
+        U64 v3 = state->v3;
+        U64 v4 = state->v4;
+
+        /* Each iteration consumes 32 bytes : one 64-bit word per accumulator */
+        do
+        {
+            v1 += XXH_readLE64(p, endian) * PRIME64_2;
+            v1 = XXH_rotl64(v1, 31);
+            v1 *= PRIME64_1;
+            p+=8;
+            v2 += XXH_readLE64(p, endian) * PRIME64_2;
+            v2 = XXH_rotl64(v2, 31);
+            v2 *= PRIME64_1;
+            p+=8;
+            v3 += XXH_readLE64(p, endian) * PRIME64_2;
+            v3 = XXH_rotl64(v3, 31);
+            v3 *= PRIME64_1;
+            p+=8;
+            v4 += XXH_readLE64(p, endian) * PRIME64_2;
+            v4 = XXH_rotl64(v4, 31);
+            v4 *= PRIME64_1;
+            p+=8;
+        }
+        while (p<=limit);
+
+        state->v1 = v1;
+        state->v2 = v2;
+        state->v3 = v3;
+        state->v4 = v4;
+    }
+
+    if (p < bEnd)
+    {
+        /* Keep the tail (fewer than 32 bytes) for the next update or the digest */
+        XXH_memcpy(state->mem64, p, (size_t)(bEnd-p));
+        state->memsize = (U32)(bEnd-p);   /* memsize is a U32 field */
+    }
+
+    return XXH_OK;
+}
+
+/* XXH64_update :
+ * Public streaming entry point : forwards to XXH64_update_endian() with the byte order to emulate
+ * and returns its result.
+ */
+XXH_errorcode XXH64_update (XXH64_state_t* state_in, const void* input, size_t len)
+{
+    /* Words are byte-swapped only on a big-endian host that has not requested native-format hashes */
+    const int swapBytes = ((XXH_endianess)XXH_CPU_LITTLE_ENDIAN != XXH_littleEndian) && !XXH_FORCE_NATIVE_FORMAT;
+
+    if (swapBytes)
+        return XXH64_update_endian(state_in, input, len, XXH_bigEndian);
+    return XXH64_update_endian(state_in, input, len, XXH_littleEndian);
+}
+
+
+
+/* XXH64_digest_endian :
+ * Computes the 64-bit hash of everything fed into the state so far.
+ * The state is only read (it is accessed through a const pointer; the accumulators are
+ * copied into locals before being modified), and the bytes still buffered in state->mem64
+ * are folded in here.
+ * endian is forwarded to XXH_readLE64() / XXH_readLE32(). Returns the hash value.
+ */
+FORCE_INLINE U64 XXH64_digest_endian (const XXH64_state_t* state_in, XXH_endianess endian)
+{
+    const XXH_istate64_t * state = (const XXH_istate64_t *) state_in;
+    const BYTE * p = (const BYTE*)state->mem64;
+    const BYTE* bEnd = (const BYTE*)state->mem64 + state->memsize;
+    U64 h64;
+
+    if (state->total_len >= 32)
+    {
+        /* At least one 32-byte stripe went through the accumulators : work on local copies */
+        U64 v1 = state->v1;
+        U64 v2 = state->v2;
+        U64 v3 = state->v3;
+        U64 v4 = state->v4;
+
+        /* Combine the four accumulators ... */
+        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
+
+        /* ... then fold each one in again, after one more multiply/rotate/multiply step */
+        v1 *= PRIME64_2;
+        v1 = XXH_rotl64(v1, 31);
+        v1 *= PRIME64_1;
+        h64 ^= v1;
+        h64 = h64*PRIME64_1 + PRIME64_4;
+
+        v2 *= PRIME64_2;
+        v2 = XXH_rotl64(v2, 31);
+        v2 *= PRIME64_1;
+        h64 ^= v2;
+        h64 = h64*PRIME64_1 + PRIME64_4;
+
+        v3 *= PRIME64_2;
+        v3 = XXH_rotl64(v3, 31);
+        v3 *= PRIME64_1;
+        h64 ^= v3;
+        h64 = h64*PRIME64_1 + PRIME64_4;
+
+        v4 *= PRIME64_2;
+        v4 = XXH_rotl64(v4, 31);
+        v4 *= PRIME64_1;
+        h64 ^= v4;
+        h64 = h64*PRIME64_1 + PRIME64_4;
+    }
+    else
+    {
+        /* Short input : the accumulators were never used */
+        h64  = state->seed + PRIME64_5;
+    }
+
+    /* The total input length takes part in the hash */
+    h64 += (U64) state->total_len;
+
+    /* Buffered complete 64-bit words */
+    while (p+8<=bEnd)
+    {
+        U64 k1 = XXH_readLE64(p, endian);
+        k1 *= PRIME64_2;
+        k1 = XXH_rotl64(k1,31);
+        k1 *= PRIME64_1;
+        h64 ^= k1;
+        h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
+        p+=8;
+    }
+
+    /* At most one buffered 32-bit word */
+    if (p+4<=bEnd)
+    {
+        h64 ^= (U64)(XXH_readLE32(p, endian)) * PRIME64_1;
+        h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
+        p+=4;
+    }
+
+    /* Buffered single bytes */
+    while (p<bEnd)
+    {
+        h64 ^= (*p) * PRIME64_5;
+        h64 = XXH_rotl64(h64, 11) * PRIME64_1;
+        p++;
+    }
+
+    /* Final mixing */
+    h64 ^= h64 >> 33;
+    h64 *= PRIME64_2;
+    h64 ^= h64 >> 29;
+    h64 *= PRIME64_3;
+    h64 ^= h64 >> 32;
+
+    return h64;
+}
+
+
+/* XXH64_digest :
+ * Public streaming entry point : forwards to XXH64_digest_endian() with the byte order to emulate
+ * and returns the resulting hash.
+ */
+unsigned long long XXH64_digest (const XXH64_state_t* state_in)
+{
+    /* Words are byte-swapped only on a big-endian host that has not requested native-format hashes */
+    const int swapBytes = ((XXH_endianess)XXH_CPU_LITTLE_ENDIAN != XXH_littleEndian) && !XXH_FORCE_NATIVE_FORMAT;
+
+    if (swapBytes)
+        return XXH64_digest_endian(state_in, XXH_bigEndian);
+    return XXH64_digest_endian(state_in, XXH_littleEndian);
+}
+
+

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/rapmap.git