[med-svn] [salmon] 02/02: fix building

Wed Sep 7 20:40:08 UTC 2016

This is an automated email from the git hooks/post-receive script.

satta pushed a commit to branch master
in repository salmon.

commit 4b16494e279cc81ab1525c9a6a1f5d6e18a2d126
Author: Sascha Steinbiss <satta at debian.org>
Date:   Wed Sep 7 20:39:57 2016 +0000

    fix building
---
 debian/control                                     |    3 +
 ...007-Remove-unnecessarily-linked-libraries.patch |    8 +-
 debian/patches/0008-Remove-salmon_core-lib.patch   |   19 +-
 ...emove-FIND_PACKAGE-for-liblzma-and-libbz2.patch |    2 +-
 debian/patches/cmake-typo-fixes                    |    2 +-
 debian/patches/dependency-fix                      |   51 +-
 debian/patches/use_debian_packaged_rapmap.patch    |   28 +
 debian/rapmap/BooMap.hpp                           |  193 ++
 debian/rapmap/BooPHF.hpp                           | 1221 ++++++++
 debian/rapmap/HitManager.cpp                       |  700 +++++
 debian/rapmap/HitManager.hpp                       |  109 +
 debian/rapmap/IndexHeader.hpp                      |   77 +
 debian/rapmap/JFRaw.hpp                            |   30 +
 debian/rapmap/RapMapConfig.hpp                     |   14 +
 debian/rapmap/RapMapFileSystem.cpp                 |   37 +
 debian/rapmap/RapMapFileSystem.hpp                 |   15 +
 debian/rapmap/RapMapIndex.hpp                      |   52 +
 debian/rapmap/RapMapSAIndex.cpp                    |  177 ++
 debian/rapmap/RapMapSAIndex.hpp                    |   63 +
 debian/rapmap/RapMapSAIndexer.cpp                  |  731 +++++
 debian/rapmap/RapMapUtils.hpp                      |  825 +++++
 debian/rapmap/SACollector.hpp                      |  580 ++++
 debian/rapmap/SASearcher.hpp                       |  631 ++++
 debian/rapmap/ScopedTimer.hpp                      |   22 +
 debian/rapmap/SpinLock.hpp                         |   25 +
 debian/rapmap/bit_array.c                          | 3160 ++++++++++++++++++++
 debian/rapmap/bit_array.h                          |  552 ++++
 debian/rapmap/bit_macros.h                         |  205 ++
 debian/rapmap/kseq.h                               |  235 ++
 debian/rapmap/macros.h                             |   59 +
 debian/rapmap/rank9b.cpp                           |   67 +
 debian/rapmap/rank9b.h                             |   42 +
 debian/rules                                       |   39 +
 33 files changed, 9957 insertions(+), 17 deletions(-)

diff --git a/debian/control b/debian/control
index de6d553..b7e48bd 100644
--- a/debian/control
+++ b/debian/control
@@ -8,6 +8,7 @@ Build-Depends: debhelper (>= 9),
                cmake,
                libboost-filesystem-dev,
                libboost-system-dev,
+               libboost-iostreams-dev,
                libboost-thread-dev,
                libboost-program-options-dev,
                libboost-timer-dev,
@@ -22,9 +23,11 @@ Build-Depends: debhelper (>= 9),
                libgff-dev,
                libstaden-read-dev,
                libspdlog-dev,
+               libtclap-dev,
                help2man,
                sphinx-doc,
                python-sphinx | python3-sphinx,
+               python-sphinx-rtd-theme | python3-sphinx-rtd-theme,
                zlib1g-dev,
                libeigen3-dev,
                rapmap
diff --git a/debian/patches/0007-Remove-unnecessarily-linked-libraries.patch b/debian/patches/0007-Remove-unnecessarily-linked-libraries.patch
index b07b979..1bf1677 100644
--- a/debian/patches/0007-Remove-unnecessarily-linked-libraries.patch
+++ b/debian/patches/0007-Remove-unnecessarily-linked-libraries.patch
@@ -9,9 +9,9 @@ Subject: Remove unnecessarily linked libraries
 
 --- a/src/CMakeLists.txt
 +++ b/src/CMakeLists.txt
-@@ -134,16 +134,12 @@
-     libjellyfish-2.0.so
-     /usr/lib/libbwa.a
+@@ -135,16 +135,12 @@
+     jellyfish-2.0
+     bwa
      m
 -    ${LIBLZMA_LIBRARIES}
 -    ${BZIP2_LIBRARIES}
@@ -34,7 +34,7 @@ Subject: Remove unnecessarily linked libraries
  ##
  set(Boost_ADDITIONAL_VERSIONS "1.53" "1.53.0" "1.54" "1.55" "1.56" "1.57.0" "1.58" "1.59" "1.60" "1.61")
 -find_package(Boost 1.61.0 COMPONENTS iostreams filesystem system thread timer chrono program_options serialization)
-+find_package(Boost 1.61.0 COMPONENTS filesystem system thread timer program_options)
++find_package(Boost 1.61.0 COMPONENTS iostreams filesystem system thread timer program_options)
  message("BOOST_INCLUDEDIR = ${BOOST_INCLUDEDIR}")
  message("BOOST_LIBRARYDIR = ${BOOST_LIBRARYDIR}")
  message("Boost_FOUND = ${Boost_FOUND}")
diff --git a/debian/patches/0008-Remove-salmon_core-lib.patch b/debian/patches/0008-Remove-salmon_core-lib.patch
index 508f7ec..1bd87fd 100644
--- a/debian/patches/0008-Remove-salmon_core-lib.patch
+++ b/debian/patches/0008-Remove-salmon_core-lib.patch
@@ -8,7 +8,7 @@ Subject: Remove salmon_core lib
 
 --- a/src/CMakeLists.txt
 +++ b/src/CMakeLists.txt
-@@ -102,11 +102,8 @@ else()
+@@ -103,13 +103,10 @@
    set(CMAKE_INSTALL_RPATH_USE_LINK_PATH FALSE)
  endif()
  
@@ -19,9 +19,12 @@ Subject: Remove salmon_core lib
 -add_executable(salmon ${SALMON_MAIN_SRCS} ${SALMON_ALIGN_SRCS})
 +add_executable(salmon ${SALMON_LIB_SRCS} ${SALMON_MAIN_SRCS} ${SALMON_ALIGN_SRCS})
  
- add_executable(unitTests ${UNIT_TESTS_SRCS})
+-add_executable(unitTests ${UNIT_TESTS_SRCS})
++add_executable(unitTests ${SALMON_LIB_SRCS} ${UNIT_TESTS_SRCS})
  
-@@ -123,7 +120,6 @@ set (SUFFARRAY64_LIB ${GAT_SOURCE_DIR}/e
+ #add_executable(salmon-read ${SALMON_READ_SRCS})
+ #set_target_properties(salmon-read PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_LIBPTHREAD -D_PBGZF_USE -fopenmp"
+@@ -124,7 +121,6 @@
  
  # Link the executable
  target_link_libraries(salmon
@@ -29,7 +32,15 @@ Subject: Remove salmon_core lib
      gff
      ${PTHREAD_LIB}
      ${Boost_LIBRARIES}
-@@ -217,7 +213,7 @@ install(DIRECTORY
+@@ -143,7 +139,6 @@
+ 
+ # Link the executable
+ target_link_libraries(unitTests
+-    salmon_core
+     gff
+     ${PTHREAD_LIB}
+     ${Boost_LIBRARIES}
+@@ -218,7 +213,7 @@
  # install(FILES ${Boost_LIBRARIES}
  # 	           DESTINATION ${INSTALL_LIB_DIR})
  
diff --git a/debian/patches/0009-Remove-FIND_PACKAGE-for-liblzma-and-libbz2.patch b/debian/patches/0009-Remove-FIND_PACKAGE-for-liblzma-and-libbz2.patch
index 09ae7fe..a7b815c 100644
--- a/debian/patches/0009-Remove-FIND_PACKAGE-for-liblzma-and-libbz2.patch
+++ b/debian/patches/0009-Remove-FIND_PACKAGE-for-liblzma-and-libbz2.patch
@@ -9,7 +9,7 @@ As these seem not to be required.
 
 --- a/CMakeLists.txt
 +++ b/CMakeLists.txt
-@@ -188,53 +188,53 @@ if (NOT ZLIB_FOUND)
+@@ -188,53 +188,53 @@
  	message (FATAL_ERROR "zlib must be installed before configuration & building can proceed")
  endif()
  
diff --git a/debian/patches/cmake-typo-fixes b/debian/patches/cmake-typo-fixes
index 4c9a96d..782a0df 100644
--- a/debian/patches/cmake-typo-fixes
+++ b/debian/patches/cmake-typo-fixes
@@ -2,7 +2,7 @@ Author: Michael R. Crusoe <crusoe at ucdavis.edu>
 Description: fix upstream's typos
 --- a/src/SalmonQuantify.cpp
 +++ b/src/SalmonQuantify.cpp
-@@ -1985,7 +1985,7 @@ int salmonQuantify(int argc, char* argv[
+@@ -1985,7 +1985,7 @@
      (
       "maxOcc,m", 
       po::value<int>(&(memOptions->max_occ))->default_value(200),
diff --git a/debian/patches/dependency-fix b/debian/patches/dependency-fix
index 68a8c5d..2f50a41 100644
--- a/debian/patches/dependency-fix
+++ b/debian/patches/dependency-fix
@@ -484,7 +484,15 @@ Description: Use Debian version of dependencies, don't download them
  ${ZLIB_INCLUDE_DIR}
  ${TBB_INCLUDE_DIRS}
  ${Boost_INCLUDE_DIRS}
-@@ -107,7 +103,7 @@
+@@ -71,6 +67,7 @@
+ set ( UNIT_TESTS_SRCS
+     ${GAT_SOURCE_DIR}/tests/UnitTests.cpp
+     FragmentLengthDistribution.cpp
++    xxhash.c
+ )
+ 
+ 
+@@ -107,7 +104,7 @@
  endif()
  
  # Build the Salmon library
@@ -493,19 +501,48 @@ Description: Use Debian version of dependencies, don't download them
  
  # Build the salmon executable
  add_executable(salmon ${SALMON_MAIN_SRCS} ${SALMON_ALIGN_SRCS})
-@@ -131,12 +127,12 @@
+@@ -131,12 +128,12 @@
+     gff
+     ${PTHREAD_LIB}
+     ${Boost_LIBRARIES}
+-    ${GAT_SOURCE_DIR}/external/install/lib/libstaden-read.a
++    staden-read
+     ${ZLIB_LIBRARY}
+-    ${SUFFARRAY_LIB}
+-    ${SUFFARRAY64_LIB}
+-    ${GAT_SOURCE_DIR}/external/install/lib/libjellyfish-2.0.a
+-    ${GAT_SOURCE_DIR}/external/install/lib/libbwa.a
++    divsufsort
++    divsufsort64
++    jellyfish-2.0
++    bwa
+     m
+     ${LIBLZMA_LIBRARIES}
+     ${BZIP2_LIBRARIES}
+@@ -154,12 +151,12 @@
      gff
      ${PTHREAD_LIB}
      ${Boost_LIBRARIES}
 -    ${GAT_SOURCE_DIR}/external/install/lib/libstaden-read.a
-+    libstaden-read.so
++    staden-read
      ${ZLIB_LIBRARY}
-     ${SUFFARRAY_LIB}
-     ${SUFFARRAY64_LIB}
+-    ${SUFFARRAY_LIB}
+-    ${SUFFARRAY64_LIB}
 -    ${GAT_SOURCE_DIR}/external/install/lib/libjellyfish-2.0.a
 -    ${GAT_SOURCE_DIR}/external/install/lib/libbwa.a
-+    libjellyfish-2.0.so
-+    /usr/lib/libbwa.a
++    divsufsort
++    divsufsort64
++    jellyfish-2.0
++    bwa
      m
      ${LIBLZMA_LIBRARIES}
      ${BZIP2_LIBRARIES}
+@@ -167,7 +164,7 @@
+     ${LIBSALMON_LINKER_FLAGS}
+     ${NON_APPLECLANG_LIBS}
+     ${FAST_MALLOC_LIB}
+-    )
++)
+ 
+ ### No need for this, I think
+ ##  This ensures that the salmon executable should work with or without `make install`
diff --git a/debian/patches/use_debian_packaged_rapmap.patch b/debian/patches/use_debian_packaged_rapmap.patch
index 8c12145..98303f2 100644
--- a/debian/patches/use_debian_packaged_rapmap.patch
+++ b/debian/patches/use_debian_packaged_rapmap.patch
@@ -31,6 +31,34 @@
  )
  
  set (SALMON_ALIGN_SRCS
+@@ -64,6 +54,15 @@
+ SGSmooth.cpp
+ )
+ 
++set (RAPMAP_EMBED_SRCS
++bit_array.c
++HitManager.cpp
++RapMapFileSystem.cpp
++RapMapSAIndex.cpp
++RapMapSAIndexer.cpp
++rank9b.cpp
++)
++
+ set ( UNIT_TESTS_SRCS
+     ${GAT_SOURCE_DIR}/tests/UnitTests.cpp
+     FragmentLengthDistribution.cpp
+@@ -104,9 +103,9 @@
+ endif()
+ 
+ # Build the salmon executable
+-add_executable(salmon ${SALMON_LIB_SRCS} ${SALMON_MAIN_SRCS} ${SALMON_ALIGN_SRCS})
++add_executable(salmon ${SALMON_LIB_SRCS} ${SALMON_MAIN_SRCS} ${SALMON_ALIGN_SRCS} ${RAPMAP_EMBED_SRCS})
+ 
+-add_executable(unitTests ${SALMON_LIB_SRCS} ${UNIT_TESTS_SRCS})
++add_executable(unitTests ${SALMON_LIB_SRCS} ${UNIT_TESTS_SRCS} ${RAPMAP_EMBED_SRCS})
+ 
+ #add_executable(salmon-read ${SALMON_READ_SRCS})
+ #set_target_properties(salmon-read PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_LIBPTHREAD -D_PBGZF_USE -fopenmp"
 --- a/CMakeLists.txt
 +++ b/CMakeLists.txt
 @@ -161,7 +161,7 @@
diff --git a/debian/rapmap/BooMap.hpp b/debian/rapmap/BooMap.hpp
new file mode 100644
index 0000000..e2056c6
--- /dev/null
+++ b/debian/rapmap/BooMap.hpp
@@ -0,0 +1,193 @@
+#ifndef __BOO_MAP__
+#define __BOO_MAP__
+
+#include "BooPHF.hpp"
+
+#include "cereal/types/vector.hpp"
+#include "cereal/types/utility.hpp"
+#include "cereal/archives/binary.hpp"
+
+#include <fstream>
+#include <vector>
+#include <iterator>
+#include <type_traits>
+
+#include <sys/stat.h>
+
+// Forward-iterator adapter that exposes only the `.first` (key) member of each pair reachable through `Iter`.
+// Adapted from: http://stackoverflow.com/questions/34875315/implementation-my-own-list-and-iterator-stl-c
+template <typename Iter>
+class KeyIterator {
+public:
+    typedef KeyIterator<Iter> self_type;
+    typedef typename std::iterator_traits<Iter>::value_type::first_type value_type;
+    typedef value_type& reference;
+    typedef value_type* pointer;
+    typedef std::forward_iterator_tag iterator_category;
+    typedef int64_t difference_type;
+
+    KeyIterator(Iter first) : curr_(first) {}
+    // Pre-increment yields the advanced iterator; post-increment yields the position held before the call.
+    self_type& operator++() { ++curr_; return *this; }
+    self_type operator++(int) { self_type prev = *this; ++curr_; return prev; }
+    reference operator*() const { return curr_->first; }
+    pointer operator->() const { return &(curr_->first); }
+    bool operator==(const self_type& rhs) const { return curr_ == rhs.curr_; }
+    bool operator!=(const self_type& rhs) const { return curr_ != rhs.curr_; }
+    bool operator<(const self_type& rhs) const { return curr_ < rhs.curr_; }
+    bool operator<=(const self_type& rhs) const { return curr_ <= rhs.curr_; }
+private:
+    Iter curr_;
+};
+
+// Key/value map over a fixed key set. Pairs are appended with add(); build() constructs a
+// boomphf perfect hash over the keys and reorders the pairs so that each pair sits at the
+// index its key hashes to. save()/load() persist the hash (".bph") and the pairs (".val").
+template <typename KeyT, typename ValueT>
+class BooMap {
+public:
+    using HasherT = boomphf::SingleHashFunctor<KeyT>;
+    using BooPHFT = boomphf::mphf<KeyT, HasherT>;
+    using IteratorT = typename std::vector<std::pair<KeyT, ValueT>>::iterator;
+    using ConstIteratorT = typename std::vector<std::pair<KeyT, ValueT>>::const_iterator;
+
+    BooMap() : built_(false) {}
+    // Appends a pair; the rvalue arguments are moved into storage rather than copied.
+    void add(KeyT&& k, ValueT&& v) {
+        data_.emplace_back(std::move(k), std::move(v));
+    }
+
+    // Hashes every key added so far (nthreads is forwarded to boomphf), then permutes data_ to match.
+    bool build(int nthreads=1) {
+        size_t numElem = data_.size();
+        KeyIterator<decltype(data_.begin())> kb(data_.begin());
+        KeyIterator<decltype(data_.begin())> ke(data_.end());
+        auto keyIt = boomphf::range(kb, ke);
+        BooPHFT* ph = new BooPHFT(numElem, keyIt, nthreads);
+        boophf_.reset(ph);
+        std::cerr << "reordering keys and values to coincide with phf ... ";
+        std::vector<size_t> inds; inds.reserve(data_.size());
+        for (size_t i = 0; i < data_.size(); ++i) {
+            inds.push_back(ph->lookup(data_[i].first));
+        }
+        reorder_destructive_(inds.begin(), inds.end(), data_.begin());
+        std::cerr << "done\n";
+        built_ = true;
+        return built_;
+    }
+
+    // Returns an iterator to the pair stored for k, or end() when k is not one of the stored keys.
+    inline IteratorT find(const KeyT& k) {
+        auto ind = boophf_->lookup(k);
+        return (ind < data_.size()) ? (data_[ind].first == k ? data_.begin() + ind : data_.end()) : data_.end();
+    }
+
+    /**
+     * NOTE: This function *assumes* that the key is in the hash.
+     * If it isn't, you'll get back a random element!
+     */
+    inline ValueT& operator[](const KeyT& k) {
+        auto ind = boophf_->lookup(k);
+        return (ind < data_.size() ? data_[ind].second : data_[0].second);
+    }
+
+    inline IteratorT begin() { return data_.begin(); }
+    inline IteratorT end() { return data_.end(); }
+    // Const accessors return const iterators: IteratorT cannot be constructed from cend()/cbegin().
+    inline ConstIteratorT cend() const { return data_.cend(); }
+    inline ConstIteratorT cbegin() const { return data_.cbegin(); }
+
+    // Writes ofileBase + ".bph" and ofileBase + ".val"; a no-op until the map is built or loaded.
+    void save(const std::string& ofileBase) {
+        if (built_) {
+            std::string hashFN = ofileBase + ".bph";
+            // save the perfect hash function
+            {
+                std::ofstream os(hashFN, std::ios::binary);
+                if (!os.is_open()) {
+                    std::cerr << "BooM: unable to open output file [" << hashFN << "]; exiting!\n";
+                    std::exit(1);
+                }
+                boophf_->save(os);
+            }
+            // and the values
+            std::string dataFN = ofileBase + ".val";
+            {
+                std::ofstream valStream(dataFN, std::ios::binary);
+                if (!valStream.is_open()) {
+                    std::cerr << "BooM: unable to open output file [" << dataFN << "]; exiting!\n";
+                    std::exit(1);
+                }
+                cereal::BinaryOutputArchive outArchive(valStream);
+                outArchive(data_);
+            }
+        }
+    }
+
+    // Reads back what save() wrote; exits the process when either file is missing or cannot be opened.
+    void load(const std::string& ofileBase) {
+        std::string hashFN = ofileBase + ".bph";
+        std::string dataFN = ofileBase + ".val";
+
+        if ( !FileExists_(hashFN.c_str()) ) {
+            std::cerr << "BooM: Looking for perfect hash function file [" << hashFN << "], which doesn't exist! exiting.\n";
+            std::exit(1);
+        }
+        if ( !FileExists_(dataFN.c_str()) ) {
+            std::cerr << "BooM: Looking for key-value file [" << dataFN << "], which doesn't exist! exiting.\n";
+            std::exit(1);
+        }
+
+        // load the perfect hash function
+        std::ifstream is(hashFN, std::ios::binary);
+        if (!is.is_open()) {
+            std::cerr << "BooM: unable to open input file [" << hashFN << "]; exiting!\n";
+            std::exit(1);
+        }
+        boophf_.reset(new BooPHFT);
+        boophf_->load(is);
+        // and the values
+        std::ifstream dataStream(dataFN, std::ios::binary);
+        if (!dataStream.is_open()) {
+            std::cerr << "BooM: unable to open input file [" << dataFN << "]; exiting!\n";
+            std::exit(1);
+        }
+        cereal::BinaryInputArchive inArchive(dataStream);
+        inArchive(data_);
+        built_ = true;
+    }
+
+private:
+    // Taken from http://stackoverflow.com/questions/12774207/fastest-way-to-check-if-a-file-exist-using-standard-c-c11-c
+    bool FileExists_(const char *path) {
+        struct stat fileStat;
+        return stat(path, &fileStat) == 0 && S_ISREG(fileStat.st_mode);
+    }
+
+    // From : http://stackoverflow.com/questions/838384/reorder-vector-using-a-vector-of-indices
+    template< typename order_iterator, typename value_iterator >
+    void reorder_destructive_( order_iterator order_begin, order_iterator order_end, value_iterator v )  {
+        using value_t = typename std::iterator_traits< value_iterator >::value_type;
+        using index_t = typename std::iterator_traits< order_iterator >::value_type;
+        using diff_t = typename std::iterator_traits< order_iterator >::difference_type;
+
+        diff_t remaining = order_end - 1 - order_begin;
+        for ( index_t s = index_t(); remaining > 0; ++ s ) {
+            index_t d = order_begin[s];
+            if ( d == (diff_t) -1 ) continue;
+            -- remaining;
+            value_t temp = v[s];
+            for ( index_t d2; d != s; d = d2 ) {
+                std::swap( temp, v[d] );
+                std::swap( order_begin[d], d2 = (diff_t) -1 );
+                -- remaining;
+            }
+            v[s] = temp;
+        }
+    }
+
+    bool built_;
+    std::vector<std::pair<KeyT, ValueT>> data_;
+    std::unique_ptr<BooPHFT> boophf_{nullptr};
+};
+#endif // __BOO_MAP__ 
diff --git a/debian/rapmap/BooPHF.hpp b/debian/rapmap/BooPHF.hpp
new file mode 100644
index 0000000..64b11c7
--- /dev/null
+++ b/debian/rapmap/BooPHF.hpp
@@ -0,0 +1,1221 @@
+// BooPHF library
+// intended to be a minimal perfect hash function with fast and low memory construction, at the cost of (slightly) higher bits/elem than other state of the art libraries once built.
+// should work with an arbitrarily large number of elements, based on a cascade of "collision-free" bit arrays
+
+#ifndef __BOO_PHF__
+#define __BOO_PHF__
+
+#include <stdio.h>
+#include <climits>
+#include <stdlib.h>
+#include <iostream>
+#include <math.h>
+
+#include <array>
+#include <unordered_map>
+#include <vector>
+#include <assert.h>
+#include <sys/time.h>
+#include <string.h>
+#include <memory> // for make_shared
+
+
+namespace boomphf {
+
+////////////////////////////////////////////////////////////////
+#pragma mark -
+#pragma mark utils
+////////////////////////////////////////////////////////////////
+
+	inline unsigned int popcount_32(unsigned int x)
+	{
+		// Parallel bit count: widen the counted fields step by step until every byte holds its own count.
+		const unsigned int pairMask   = 0x55555555;
+		const unsigned int nibbleMask = 0x33333333;
+		const unsigned int byteMask   = 0x0f0f0f0f;
+		x = x - ((x >> 1) & pairMask);                   /* each 2-bit field now holds the number of bits set in it */
+		x = ((x >> 2) & nibbleMask) + (x & nibbleMask);  /* each 4-bit field now holds the number of bits set in it */
+		x = (x + (x >> 4)) & byteMask;                   /* each byte now holds the number of bits set in it */
+		return (x * 0x01010101u) >> 24;                  /* the multiply sums the four byte counts into the top byte */
+	}
+
+
+	inline unsigned int popcount_64(uint64_t x)
+	{
+		// Count each 32-bit half with popcount_32 and add the two results.
+		const unsigned int upperHalf = (unsigned int)((x >> 32LL) & 0xffffffff);
+		const unsigned int lowerHalf = (unsigned int)(x & 0xffffffff);
+		return popcount_32(upperHalf) + popcount_32(lowerHalf);
+	}
+
+
+	// Progress reporter on stderr: a bar of '-' marks by default, or a single ETA line when timer_mode is non-zero.
+	class Progress
+	{
+	public:
+		int timer_mode;                        // 0 (constructor default): '-' bar; non-zero: elapsed/remaining line
+		struct timeval timestamp;              // scratch buffer for gettimeofday()
+		double heure_debut, heure_actuelle ;   // start time and latest sample, in seconds
+		std::string   message;                 // label printed in timer mode
+
+		uint64_t done;                         // tasks completed so far
+		uint64_t todo;                         // total number of tasks
+		int subdiv ; // progress printed every 1/subdiv of total to do
+		double partial;                        // work accumulated since the last printed mark
+		int _nthreads;
+		std::vector<double > partial_threaded; // per-thread counterpart of partial
+		std::vector<uint64_t > done_threaded;  // per-thread counterpart of done
+
+		double steps ; //steps = todo/subdiv
+		// Starts a report over ntasks tasks labelled msg; nthreads sizes the per-thread counters.
+		void init(uint64_t ntasks, const char * msg,int nthreads =1)
+		{
+			_nthreads = nthreads;
+			message = std::string(msg);
+			gettimeofday(&timestamp, NULL);
+			heure_debut = timestamp.tv_sec +(timestamp.tv_usec/1000000.0);
+
+			//fprintf(stderr,"| %-*s |\n",98,msg);
+
+			todo= ntasks;
+			done = 0;
+			partial =0;
+
+			partial_threaded.resize(_nthreads);
+			done_threaded.resize(_nthreads);
+
+			for (int ii=0; ii<_nthreads;ii++) partial_threaded[ii]=0;
+			for (int ii=0; ii<_nthreads;ii++) done_threaded[ii]=0;
+			subdiv= 1000;
+			steps = (double)todo / (double)subdiv;
+
+			if(!timer_mode)
+			{
+				fprintf(stderr,"[");fflush(stderr);
+			}
+		}
+		// Brings the count up to todo (printing any marks that triggers), ends the output line, zeroes the counters.
+		void finish()
+		{
+			set(todo);
+			if(timer_mode)
+				fprintf(stderr,"\n");
+			else
+				fprintf(stderr,"]\n");
+
+			fflush(stderr);
+			todo= 0;
+			done = 0;
+			partial =0;
+
+		}
+		void finish_threaded()// called by only one of the threads
+		{
+			done = 0;
+			// fold the per-thread tallies into the shared counters before the final print
+			for (int ii=0; ii<_nthreads;ii++) done += (done_threaded[ii] );
+			for (int ii=0; ii<_nthreads;ii++) partial += (partial_threaded[ii] );
+
+			finish();
+
+		}
+		void inc(uint64_t ntasks_done) // records ntasks_done more finished tasks and prints every mark now due
+		{
+			done += ntasks_done;
+			partial += ntasks_done;
+			// steps is 0 when todo is 0; without the steps > 0 guard the loop below
+			// could never terminate, because subtracting 0 never lowers partial
+			while(steps > 0 && partial >= steps)
+			{
+				if(timer_mode)
+				{
+					gettimeofday(&timestamp, NULL);
+					heure_actuelle = timestamp.tv_sec +(timestamp.tv_usec/1000000.0);
+					double elapsed = heure_actuelle - heure_debut;
+					double speed = done / elapsed;
+					double rem = (todo-done) / speed;
+					if(done>todo) rem=0;
+					int min_e  = (int)(elapsed / 60) ;
+					elapsed -= min_e*60;
+					int min_r  = (int)(rem / 60) ;
+					rem -= min_r*60;
+
+					fprintf(stderr,"%c[%s]  %-5.3g%%   elapsed: %3i min %-2.0f sec   remaining: %3i min %-2.0f sec",13,
+							message.c_str(),
+							100*(double)done/todo,
+							min_e,elapsed,min_r,rem);
+
+				}
+				else
+				{
+					fprintf(stderr,"-");fflush(stderr);
+				}
+				partial -= steps;
+			}
+
+
+		}
+
+		void inc(uint64_t ntasks_done, int tid) //threads collaborate to this same progress bar
+		{
+			partial_threaded[tid] += ntasks_done;
+			done_threaded[tid] += ntasks_done;
+			while(steps > 0 && partial_threaded[tid] >= steps) // same zero-step guard as the single-threaded inc
+			{
+				if(timer_mode)
+				{
+					struct timeval timet;
+					double now;
+					gettimeofday(&timet, NULL);
+					now = timet.tv_sec +(timet.tv_usec/1000000.0);
+					uint64_t total_done  = 0;
+					for (int ii=0; ii<_nthreads;ii++) total_done += (done_threaded[ii] );
+					double elapsed = now - heure_debut;
+					double speed = total_done / elapsed;
+					double rem = (todo-total_done) / speed;
+					if(total_done > todo) rem =0;
+					int min_e  =  (int)(elapsed / 60) ;
+					elapsed -= min_e*60;
+					int min_r  =  (int)(rem / 60) ;
+					rem -= min_r*60;
+
+					fprintf(stderr,"%c[%s]  %-5.3g%%   elapsed: %3i min %-2.0f sec   remaining: %3i min %-2.0f sec",13,
+							message.c_str(),
+							100*(double)total_done/todo,
+							min_e,elapsed,min_r,rem);
+				}
+				else
+				{
+					fprintf(stderr,"-");fflush(stderr);
+				}
+				partial_threaded[tid] -= steps;
+
+			}
+
+		}
+		// Advances the counters up to ntasks_done; a value at or below done is ignored.
+		void set(uint64_t ntasks_done)
+		{
+			if(ntasks_done > done)
+				inc(ntasks_done-done);
+		}
+		Progress () :     timer_mode(0) {}
+		//include timer, to print ETA ?
+	};
+
+
+
+////////////////////////////////////////////////////////////////
+#pragma mark -
+#pragma mark hasher
+////////////////////////////////////////////////////////////////
+
+	typedef std::array<uint64_t,10> hash_set_t;  // the 10 hash values produced for one key
+	typedef std::array<uint64_t,2> hash_pair_t;  // two-word state passed to XorshiftHashFunctors::h0/h1/next
+
+
+
+	template <typename Item> class HashFunctors
+	{
+	public:
+
+		/** Constructor.
+		 * Takes no arguments: the user seed is fixed at 0 and the seed table
+		 * is derived from it by generate_hash_seed(). */
+		HashFunctors () : _nbFct(7), _user_seed(0) // _nbFct is recorded but not read in this class
+		{
+			generate_hash_seed ();
+		}
+
+		// Hash of key under the idx-th table seed; idx is not range-checked (the table has MAXNBFUNC entries).
+		uint64_t operator ()  (const Item& key, size_t idx)  const {  return hashWithSeed (key, _seed_tab[idx]);  }
+
+		// Hash of key under a caller-supplied seed.
+		uint64_t hashWithSeed(const Item& key, uint64_t seed)  const {  return hash64 (key, seed);  }
+
+		// Returns the hash of key under each of the 10 table seeds.
+		// maybe use xorshift instead, for faster hash compute
+		hash_set_t operator ()  (const Item& key)
+		{
+			hash_set_t all;
+			size_t slot = 0;
+			while (slot < 10)
+			{
+				all[slot] = hash64 (key, _seed_tab[slot]);
+				++slot;
+			}
+			return all;
+		}
+
+	private:
+
+		// Mixes key into seed through a fixed sequence of shift, add and xor rounds.
+		inline static uint64_t hash64 (Item key, uint64_t seed)
+		{
+			uint64_t acc = seed;
+			acc ^= (acc <<  7) ^  key * (acc >> 3) ^ (~((acc << 11) + (key ^ (acc >> 5))));
+			acc = (~acc) + (acc << 21);
+			acc ^= (acc >> 24);
+			acc += (acc << 3) + (acc << 8);
+			acc ^= (acc >> 14);
+			acc += (acc << 2) + (acc << 4);
+			acc ^= (acc >> 28);
+			acc += (acc << 31);
+
+			return acc;
+		}
+
+		// Fills _seed_tab: copy the base constants, then combine each entry with the one 3 slots ahead (wrapping).
+		void generate_hash_seed ()
+		{
+			static const uint64_t rbase[MAXNBFUNC] =
+			{
+				0xAAAAAAAA55555555ULL,  0x33333333CCCCCCCCULL,  0x6666666699999999ULL,  0xB5B5B5B54B4B4B4BULL,
+				0xAA55AA5555335533ULL,  0x33CC33CCCC66CC66ULL,  0x6699669999B599B5ULL,  0xB54BB54B4BAA4BAAULL,
+				0xAA33AA3355CC55CCULL,  0x33663366CC99CC99ULL
+			};
+
+			for (size_t n=0; n<MAXNBFUNC; ++n)  {  _seed_tab[n] = rbase[n];  }
+			for (size_t n=0; n<MAXNBFUNC; ++n)  {  _seed_tab[n] = _seed_tab[n] * _seed_tab[(n+3) % MAXNBFUNC] + _user_seed ;  }
+		}
+
+		size_t _nbFct;
+
+		static const size_t MAXNBFUNC = 10;
+		uint64_t _seed_tab[MAXNBFUNC];
+		uint64_t _user_seed;
+	};
+
+/* alternative hash functor based on xorshift, taking a single hash functor as input.
+we need this 2-functors scheme because HashFunctors won't work with unordered_map.
+(rayan)
+*/
+
+    // Adapter over HashFunctors that yields a single seeded 64-bit hash per call.
+    template <typename Item> class SingleHashFunctor
+	{
+	public:
+		uint64_t operator ()  (const Item& key, uint64_t seed=0xAAAAAAAA55555555ULL) const
+		{  return hashFunctors.hashWithSeed(key, seed);  }
+	private:
+		HashFunctors<Item> hashFunctors;
+	};
+
+
+
+    template <typename Item, class SingleHasher_t> class XorshiftHashFunctors
+    {
+        /*  Xorshift128*
+            Written in 2014 by Sebastiano Vigna (vigna at acm.org)
+
+            To the extent possible under law, the author has dedicated all copyright
+            and related and neighboring rights to this software to the public domain
+            worldwide. This software is distributed without any warranty.
+
+            See <http://creativecommons.org/publicdomain/zero/1.0/>. */
+        /* This is the fastest generator passing BigCrush without
+           systematic failures, but due to the relatively short period it is
+           acceptable only for applications with a mild amount of parallelism;
+           otherwise, use a xorshift1024* generator.
+
+           The state must be seeded so that it is not everywhere zero. If you have
+           a nonzero 64-bit seed, we suggest to pass it twice through
+           MurmurHash3's avalanching function. */
+
+        // Raw-array twin of the public next(): advances the two-word state s in place
+        // and returns the next output. Used by operator() on its local state.
+        uint64_t next(uint64_t * s) {
+            const uint64_t older = s[ 1 ];
+            uint64_t mixed = s[ 0 ];
+            mixed ^= mixed << 23; // a
+            s[ 0 ] = older;
+            s[ 1 ] = mixed ^ older ^ ( mixed >> 17 ) ^ ( older >> 26 ); // b, c
+            return s[ 1 ] + older;
+        }
+
+        public:
+
+
+		// Stores the first base hash of key in s[0] and returns it.
+		uint64_t h0(hash_pair_t  & s, const Item& key )
+		{
+			return s[0] =  singleHasher (key, 0xAAAAAAAA55555555ULL);
+		}
+
+		// Stores the second base hash of key in s[1] and returns it.
+		uint64_t h1(hash_pair_t  & s, const Item& key )
+		{
+			return s[1] =  singleHasher (key, 0x33333333CCCCCCCCULL);
+		}
+
+
+		// Returns the next hash and updates the state s.
+		uint64_t next(hash_pair_t  & s ) {
+			const uint64_t older = s[ 1 ];
+			uint64_t mixed = s[ 0 ];
+			mixed ^= mixed << 23; // a
+			s[ 0 ] = older;
+			s[ 1 ] = mixed ^ older ^ ( mixed >> 17 ) ^ ( older >> 26 ); // b, c
+			return s[ 1 ] + older;
+		}
+
+        // Returns all ten hashes of key: the two base hashes from singleHasher,
+        // followed by eight successive outputs of next() seeded with those two.
+        hash_set_t operator ()  (const Item& key)
+        {
+            hash_set_t   hset;
+            hset[0] =  singleHasher (key, 0xAAAAAAAA55555555ULL);
+            hset[1] =  singleHasher (key, 0x33333333CCCCCCCCULL);
+
+            // local two-word state, seeded from the base hashes
+            uint64_t state[ 2 ] = { hset[0], hset[1] };
+
+            for(size_t ii=2;ii< 10 /* it's much better have a constant here, for inlining; this loop is super performance critical*/; ii++)
+            {
+                hset[ii] = next(state);
+            }
+
+            return hset;
+        }
+    private:
+        SingleHasher_t singleHasher;
+    };
+
+
+////////////////////////////////////////////////////////////////
+#pragma mark -
+#pragma mark iterators
+////////////////////////////////////////////////////////////////
+
+	// Lightweight pair of iterators exposing begin()/end(), so a (b, e) pair can be
+	// handed to code that expects a container-like range. The iterators are copied in.
+	template <typename Iterator>
+	struct iter_range
+	{
+		iter_range(Iterator b, Iterator e) : m_begin(b), m_end(e) {}
+
+		// start of the wrapped pair
+		Iterator begin() const { return m_begin; }
+
+		// end of the wrapped pair
+		Iterator end() const { return m_end; }
+
+		// stored by value and publicly accessible
+		Iterator m_begin, m_end;
+	};
+
+	// Convenience factory: deduces Iterator and wraps (begin, end) in an iter_range.
+	template <typename Iterator> iter_range<Iterator> range(Iterator begin, Iterator end)
+	{
+		return {begin, end};
+	}
+
+////////////////////////////////////////////////////////////////
+#pragma mark -
+#pragma mark BitVector
+////////////////////////////////////////////////////////////////
+
+	class bitVector {
+
+	public:
+
+		bitVector() : _size(0)
+		{
+			_bitArray = nullptr;
+		}
+
+		bitVector(uint64_t n) : _size(n)
+		{
+			_nchar  = (1ULL+n/64ULL);
+			_bitArray =  (uint64_t *) calloc (_nchar,sizeof(uint64_t));
+		}
+
+		~bitVector()
+		{
+			if(_bitArray != nullptr)
+				free(_bitArray);
+		}
+
+		 //copy constructor
+		 bitVector(bitVector const &r)
+		 {
+			 _size =  r._size;
+			 _nchar = r._nchar;
+			 _ranks = r._ranks;
+			 _bitArray = (uint64_t *) calloc (_nchar,sizeof(uint64_t));
+			 memcpy(_bitArray, r._bitArray, _nchar*sizeof(uint64_t) );
+		 }
+		
+		// Copy assignment operator
+		bitVector &operator=(bitVector const &r)
+		{
+			if (&r != this)
+			{
+				_size =  r._size;
+				_nchar = r._nchar;
+				_ranks = r._ranks;
+				if(_bitArray != nullptr)
+					free(_bitArray);
+				_bitArray = (uint64_t *) calloc (_nchar,sizeof(uint64_t));
+				memcpy(_bitArray, r._bitArray, _nchar*sizeof(uint64_t) );
+			}
+			return *this;
+		}
+	
+		// Move assignment operator
+		bitVector &operator=(bitVector &&r)
+		{
+			//printf("bitVector move assignment \n");
+			if (&r != this)
+			{
+				if(_bitArray != nullptr)
+					free(_bitArray);
+				
+				_size =  std::move (r._size);
+				_nchar = std::move (r._nchar);
+				_ranks = std::move (r._ranks);
+				_bitArray = r._bitArray;
+				r._bitArray = nullptr;
+			}
+			return *this;
+		}
+		// Move constructor
+		bitVector(bitVector &&r) : _bitArray ( nullptr),_size(0)
+		{
+			*this = std::move(r);
+		}
+		
+		
+		void resize(uint64_t newsize)
+		{
+			//printf("bitvector resize from  %llu bits to %llu \n",_size,newsize);
+			_nchar  = (1ULL+newsize/64ULL);
+			_bitArray = (uint64_t *) realloc(_bitArray,_nchar*sizeof(uint64_t));
+			_size = newsize;
+		}
+
+		size_t size() const
+		{
+			return _size;
+		}
+
+		uint64_t bitSize() const {return (_nchar*64ULL + _ranks.capacity()*64ULL );}
+
+		//clear whole array
+		void clear()
+		{
+			memset(_bitArray,0,_nchar*sizeof(uint64_t));
+		}
+
+		//clear collisions in interval, only works with start and size multiple of 64
+		void clearCollisions(uint64_t start, size_t size, bitVector * cc)
+		{
+			assert( (start & 63) ==0);
+			assert( (size & 63) ==0);
+			uint64_t ids = (start/64ULL);
+			for(uint64_t ii =0;  ii< (size/64ULL); ii++ )
+			{
+				_bitArray[ids+ii] =  _bitArray[ids+ii] & (~ (cc->get64(ii)) );
+			}
+
+			cc->clear();
+		}
+
+
+		// Zeroes the bits of [start, start+size); only works with start and size
+		// multiple of 64, since whole 64-bit words are wiped.
+		void clear(uint64_t start, size_t size)
+		{
+			assert( (start & 63) ==0);
+			assert( (size & 63) ==0);
+			uint64_t * const first_word = _bitArray + (start/64ULL);
+			const size_t nbytes = (size/64ULL)*sizeof(uint64_t);
+			memset(first_word,0,nbytes);
+		}
+
+		// For debug purposes: prints the size, every bit (with an index marker
+		// every 10 bits) and the rank samples to stdout.
+		void print() const
+		{
+			// uint64_t and size_t are not guaranteed to be unsigned long long /
+			// unsigned long, so every value is cast to match its %llu specifier
+			printf("bit array of size %llu: \n", (unsigned long long) _size);
+			for(uint64_t ii = 0; ii< _size; ii++)
+			{
+				if(ii%10==0)
+					printf(" (%llu) ",(unsigned long long) ii);
+				int val = (_bitArray[ii >> 6] >> (ii & 63 ) ) & 1;
+				printf("%i",val);
+			}
+			printf("\n");
+
+			printf("rank array : size %llu \n",(unsigned long long) _ranks.size());
+			for (size_t ii = 0; ii< _ranks.size(); ii++)
+			{
+				printf("%llu:  %llu,  ",(unsigned long long) ii,(unsigned long long) _ranks[ii]);
+			}
+			printf("\n");
+		}
+
+		// Returns the value (0 or 1) of bit pos: word pos/64, bit pos%64.
+		// No bounds check is performed.
+		uint64_t operator[](uint64_t pos) const
+		{
+			return (_bitArray[pos >> 6ULL] >> (pos & 63 ) ) & 1;
+		}
+
+		// Atomically sets bit pos to 1 and returns its previous value (0 or 1)
+		uint64_t atomic_test_and_set(uint64_t pos)
+		{
+			const uint64_t shift = pos & 63;
+			const uint64_t mask = (uint64_t) (1ULL << shift);
+			const uint64_t before = __sync_fetch_and_or (_bitArray + (pos >> 6), mask);
+
+			return  ( before >> shift ) & 1;
+		}
+
+
+		// Named equivalent of operator[]: returns the value (0 or 1) of bit pos.
+		uint64_t get(uint64_t pos) const
+		{
+			return (*this)[pos];
+		}
+
+		// Returns the whole 64-bit storage word at index cell64 (bits
+		// 64*cell64 .. 64*cell64+63). No bounds check is performed.
+		uint64_t get64(uint64_t cell64) const
+		{
+			return _bitArray[cell64];
+		}
+
+		// Sets bit pos to 1 with an atomic fetch-and-or on its storage word.
+		// pos must be smaller than _size (checked by assert only).
+		void set(uint64_t pos)
+		{
+			assert(pos<_size);
+			//_bitArray [pos >> 6] |=   (1ULL << (pos & 63) ) ;
+			__sync_fetch_and_or (_bitArray + (pos >> 6ULL), (1ULL << (pos & 63)) );
+		}
+
+		// Sets bit pos to 0 with an atomic fetch-and-and on its storage word.
+		// Unlike set(), pos is not checked against _size.
+		void reset(uint64_t pos)
+		{
+			//_bitArray [pos >> 6] &=   ~(1ULL << (pos & 63) ) ;
+			__sync_fetch_and_and (_bitArray + (pos >> 6ULL), ~(1ULL << (pos & 63) ));
+		}
+
+		// Samples the cumulative popcount of the array: one entry is appended to
+		// _ranks for every _nb_bits_per_rank_sample bits, holding offset plus the
+		// number of set bits located before that position.
+		// Returns offset plus the total number of set bits.
+		uint64_t build_ranks(uint64_t offset =0)
+		{
+			_ranks.reserve(2+ _size/_nb_bits_per_rank_sample);
+
+			uint64_t ones_so_far = offset;
+			for (size_t word = 0; word < _nchar; ++word) {
+				const bool at_sample_boundary = (((word*64)  % _nb_bits_per_rank_sample) == 0);
+				if (at_sample_boundary)
+					_ranks.push_back(ones_so_far);
+				ones_so_far +=  popcount_64(_bitArray[word]);
+			}
+
+			return ones_so_far;
+		}
+
+		// Returns the rank of position pos: the sample stored for pos's block
+		// (one sample per _nb_bits_per_rank_sample bits) plus the number of set
+		// bits between the start of that block and pos (pos itself excluded).
+		// Requires build_ranks() to have filled _ranks; nothing is bounds-checked.
+		uint64_t rank(uint64_t pos) const
+		{
+			uint64_t word_idx = pos / 64ULL;
+			uint64_t word_offset = pos % 64;
+			uint64_t block = pos / _nb_bits_per_rank_sample;
+			uint64_t r = _ranks[block];
+			// whole words between the start of the block and pos's word
+			for (uint64_t w = block * _nb_bits_per_rank_sample / 64; w < word_idx; ++w) {
+				r += popcount_64( _bitArray[w] );
+			}
+			// bits strictly below pos inside its own word
+			uint64_t mask = (uint64_t(1) << word_offset ) - 1;
+			r += popcount_64( _bitArray[word_idx] & mask);
+
+			return r;
+		}
+
+
+		// Writes the vector to os as raw bytes in host representation, in this
+		// order: _size, _nchar, the _nchar storage words, the number of rank
+		// samples (as size_t), then the rank samples themselves.
+		// Stream errors are not checked here.
+		void save(std::ostream& os) const
+		{
+			os.write(reinterpret_cast<char const*>(&_size), sizeof(_size));
+			os.write(reinterpret_cast<char const*>(&_nchar), sizeof(_nchar));
+			os.write(reinterpret_cast<char const*>(_bitArray), (std::streamsize)(sizeof(uint64_t) * _nchar));
+			size_t sizer = _ranks.size();
+			os.write(reinterpret_cast<char const*>(&sizer),  sizeof(size_t));
+			os.write(reinterpret_cast<char const*>(_ranks.data()), (std::streamsize)(sizeof(_ranks[0]) * _ranks.size()));
+		}
+
+		// Reads back a vector written by save(), in the same field order.
+		// resize(_size) reallocates the storage and recomputes _nchar as
+		// 1 + _size/64, replacing the _nchar value just read from the stream.
+		// NOTE(review): neither stream errors nor the sizes read from the stream
+		// are validated, so a truncated or corrupt input is not detected here.
+		void load(std::istream& is)
+		{
+			is.read(reinterpret_cast<char*>(&_size), sizeof(_size));
+			is.read(reinterpret_cast<char*>(&_nchar), sizeof(_nchar));
+			this->resize(_size);
+			is.read(reinterpret_cast<char *>(_bitArray), (std::streamsize)(sizeof(uint64_t) * _nchar));
+
+			size_t sizer;
+			is.read(reinterpret_cast<char *>(&sizer),  sizeof(size_t));
+			_ranks.resize(sizer);
+			is.read(reinterpret_cast<char*>(_ranks.data()), (std::streamsize)(sizeof(_ranks[0]) * _ranks.size()));
+		}
+
+
+	protected:
+		uint64_t*  _bitArray;
+		//uint64_t* _bitArray;
+		uint64_t _size;
+		uint64_t _nchar;
+
+		 // epsilon =  64 / _nb_bits_per_rank_sample   bits
+		// additional size for rank is epsilon * _size
+		static const uint64_t _nb_bits_per_rank_sample = 512; //512 seems ok
+		std::vector<uint64_t> _ranks;
+	};
+
+////////////////////////////////////////////////////////////////
+#pragma mark -
+#pragma mark level
+////////////////////////////////////////////////////////////////
+
+	// One level of the cascade: a bit vector addressed by a hash value reduced
+	// modulo hash_domain. The members are filled in by mphf.
+	class level{
+	public:
+		level(){ }
+
+		~level() {
+		}
+
+		// Returns the bit (0 or 1) stored at position hash_raw % hash_domain.
+		uint64_t get(uint64_t hash_raw)
+		{
+			return bitset.get(hash_raw %  hash_domain);
+		}
+		
+		uint64_t idx_begin;    // start index assigned to this level by mphf
+		uint64_t hash_domain;  // number of addressable positions of this level
+		bitVector  bitset;     // the bits of this level
+	};
+
+
+////////////////////////////////////////////////////////////////
+#pragma mark -
+#pragma mark mphf
+////////////////////////////////////////////////////////////////
+
+
+#define NBBUFF 10000
+
+	// Argument bundle handed to each worker thread through pthread_create.
+	// One instance is shared by all the threads working on a level.
+	template<typename Range,typename Iterator>
+	struct thread_args
+	{
+		void * boophf; /* type-erased pointer to the owning mphf object */
+		Range const * range; /* the input key range */
+		std::shared_ptr<void> it_p; /* used to be "Iterator it" but because of fastmode, iterator is polymorphic; TODO: think about whether it should be a unique_ptr actually */
+		std::shared_ptr<void> until_p; /* to cache the "until" variable */
+		int level; /* index of the level being built */
+	};
+
+	//forward declaration
+
+    template <typename elem_t, typename Hasher_t, typename Range, typename it_type>
+	void * thread_processLevel(void * args);
+
+
+    /* Hasher_t returns a single hash when operator()(elem_t key) is called.
+       if used with XorshiftHashFunctors, it must have the following operator: operator()(elem_t key, uint64_t seed) */
+    template <typename elem_t, typename Hasher_t>
+	class mphf {
+
+        /* this mechanisms gets P hashes out of Hasher_t */
+        typedef XorshiftHashFunctors<elem_t,Hasher_t> MultiHasher_t ;
+       // typedef HashFunctors<elem_t> MultiHasher_t; // original code (but only works for int64 keys)  (seems to be as fast as the current xorshift)
+		//typedef IndepHashFunctors<elem_t,Hasher_t> MultiHasher_t; //faster than xorshift
+
+	public:
+		// Creates an empty, not-yet-built function. Only _built is initialised;
+		// lookup() returns ULLONG_MAX until load() marks the object as built.
+		mphf() : _built(false)
+		{}
+
+
+		// Empty: performs no explicit cleanup of its own.
+		~mphf()
+		{
+
+		}
+
+		
+		// Builds the function over the n keys of input_range.
+		//   num_thread       : number of worker threads used for each level
+		//   gamma            : ratio between the size of the first bit array and n
+		//   progress         : when true, progress is reported through _progressBar
+		//   perc_elem_loaded : fraction of the keys allowed to be loaded in ram for
+		//                      faster construction (default 3%), set to 0 to deactivate
+		// With n == 0 nothing is built: the object stays "not built" and lookup()
+		// returns ULLONG_MAX for every key.
+		template <typename Range>
+		mphf( size_t n, Range const& input_range,int num_thread = 1,  double gamma = 2.0 , bool progress =true, float perc_elem_loaded = 0.03) :
+		_nb_levels(0), _tempBitset(nullptr), _gamma(gamma), _hash_domain(size_t(ceil(double(n) * gamma))), _nelem(n), _num_thread(num_thread), _lastbitsetrank(0),
+		_percent_elem_loaded_for_fastMode (perc_elem_loaded), _fastmode(false), _fastModeLevel(0), _withprogress(progress), _built(false)
+		{
+			// every flag read later (_built, _fastmode, _nb_levels, ...) is
+			// initialised above, so this early return leaves a well-defined object
+			if(n ==0) return;
+			
+			if(_percent_elem_loaded_for_fastMode > 0.0 )
+				_fastmode =true;
+
+			setup();
+
+			if(_withprogress)
+			{
+				_progressBar.timer_mode=1;
+
+				if(_fastmode)
+					_progressBar.init( _nelem * (_fastModeLevel+1) +  ( _nelem * pow(_proba_collision,_fastModeLevel)) * (_nb_levels-(_fastModeLevel+1))    ,"Building BooPHF",num_thread);
+				else
+					_progressBar.init( _nelem * _nb_levels ,"Building BooPHF");
+			}
+
+			uint64_t offset = 0;
+			for(int ii = 0; ii< _nb_levels; ii++)
+			{
+				_tempBitset =  new bitVector(_levels[ii].hash_domain); // temp collision bitarray for this level
+
+				processLevel(input_range,ii);
+
+				// positions hit more than once belong to no key at this level
+				_levels[ii].bitset.clearCollisions(0 , _levels[ii].hash_domain , _tempBitset);
+
+				// ranks continue from the last rank of the previous level
+				offset = _levels[ii].bitset.build_ranks(offset);
+
+				delete _tempBitset;
+				_tempBitset = nullptr;
+			}
+
+			if(_withprogress)
+				_progressBar.finish_threaded();
+
+
+			_lastbitsetrank = offset ;
+
+			//printf("used temp ram for construction : %lli MB \n",setLevelFastmode.capacity()* sizeof(elem_t) /1024ULL/1024ULL);
+
+			std::vector<elem_t>().swap(setLevelFastmode);   // clear setLevelFastmode reallocating
+
+
+			pthread_mutex_destroy(&_mutex);
+			
+			_built = true;
+		}
+
+
+		// Returns the index assigned to elem, or ULLONG_MAX when the function has
+		// not been built or when elem reaches the last level without being present
+		// in the exact map. A key resolved by one of the bit-array levels is not
+		// checked against the original key set.
+		uint64_t lookup(elem_t elem)
+		{
+			if(! _built) return ULLONG_MAX;
+
+			hash_pair_t bbhash;
+			int level;
+			const uint64_t level_hash = getLevel(bbhash,elem,&level);
+
+			if( level != (_nb_levels-1))
+			{
+				// resolved by a bit-array level: the rank of its position is the result
+				const uint64_t non_minimal_hp =  level_hash %  _levels[level].hash_domain; // in fact non minimal hp would be  + _levels[level]->idx_begin
+				return _levels[level].bitset.rank(non_minimal_hp );
+			}
+
+			// went through every bit-array level: resolve through the exact map
+			auto in_final_map  = _final_hash.find (elem);
+			if ( in_final_map == _final_hash.end() )
+				return ULLONG_MAX; // elem was not in the original set of keys
+
+			return in_final_map->second + _lastbitsetrank;
+		}
+
+		// Returns the number of keys recorded for this function (_nelem).
+		uint64_t nbKeys() const
+		{
+            return _nelem;
+        }
+
+		// Prints a breakdown of the structure's size to stdout and returns the
+		// total in bits: the bit arrays with their rank samples, plus an estimate
+		// of 42 bytes per entry of the exact map.
+		uint64_t totalBitSize()
+		{
+
+			uint64_t totalsizeBitset = 0;
+			for(int ii=0; ii<_nb_levels; ii++)
+			{
+				totalsizeBitset += _levels[ii].bitset.bitSize();
+			}
+
+			// unordered map takes approx 42B per elem [personal test] (42B with uint64_t key, would be larger for other type of elem)
+			const uint64_t finalHashBits = (uint64_t) _final_hash.size()*42*8 ;
+			const uint64_t totalsize =  totalsizeBitset +  finalHashBits ;
+
+			// avoid 0/0 when nothing has been built yet
+			const float bitsetPct = (totalsize > 0) ? 100*(float)totalsizeBitset/totalsize : 0.0f;
+			const float finalPct  = (totalsize > 0) ? 100*(float)finalHashBits/totalsize : 0.0f;
+
+			// explicit casts: uint64_t / size_t need not match %llu
+			printf("Bitarray    %12llu  bits (%.2f %%)   (array + ranks )\n",
+				   (unsigned long long) totalsizeBitset, bitsetPct);
+			printf("final hash  %12llu  bits (%.2f %%) (nb in final hash %llu)\n",
+				   (unsigned long long) finalHashBits, finalPct,
+				   (unsigned long long) _final_hash.size() );
+			return totalsize;
+		}
+
+		// Worker body run by every thread for level i. Repeatedly pulls up to
+		// NBBUFF keys from the shared iterator (under _mutex) into buffer, then
+		// processes them outside the lock: keys that are not stopped by a level
+		// below i are inserted into level i, or into the exact map _final_hash
+		// when i is the last level. Counters shared between threads are updated
+		// with atomic builtins.
+		//   buffer    : per-thread scratch storage, must hold NBBUFF elements
+		//   shared_it : iterator shared by all the threads of this level
+		//   until_p   : end iterator
+		//   i         : index of the level being built
+		template <typename Iterator>  //typename Range,
+        void pthread_processLevel( std::vector<elem_t>  & buffer , std::shared_ptr<Iterator> shared_it, std::shared_ptr<Iterator> until_p, int i)
+		{
+			uint64_t nb_done =0;
+			int tid =  __sync_fetch_and_add (&_nb_living, 1);
+			auto until = *until_p;
+			uint64_t inbuff =0;
+
+
+			
+			for (bool isRunning=true;  isRunning ; )
+			{
+
+				//safely copy n items into buffer
+				pthread_mutex_lock(&_mutex);
+                for(; inbuff<NBBUFF && (*shared_it)!=until;  ++(*shared_it))
+				{
+                    buffer[inbuff]= *(*shared_it); inbuff++;
+				}
+                if((*shared_it)==until) isRunning =false;
+				pthread_mutex_unlock(&_mutex);
+
+
+				//do work on the n elems of the buffer
+                for(uint64_t ii=0; ii<inbuff ; ii++)
+				{
+					elem_t val = buffer[ii];
+
+					//auto hashes = _hasher(val);
+					hash_pair_t bbhash;  int level;
+					uint64_t level_hash = getLevel(bbhash,val,&level, i);
+
+					if(level == i) //insert into lvl i
+					{
+							__sync_fetch_and_add(& _cptLevel,1);
+
+						if(i == _fastModeLevel && _fastmode)
+						{
+							uint64_t idxl2 = __sync_fetch_and_add(& _idxLevelsetLevelFastmode,1);
+							//if the expected size of setLevelFastmode is exceeded, fall back to slow mode; should not happen if the hash is good and the odds are with us
+							if(idxl2>= setLevelFastmode.size())
+								_fastmode = false;
+							else
+								setLevelFastmode[idxl2] = val; // create set for fast mode
+						}
+
+						//insert to level i+1 : either next level of the cascade or final hash if last level reached
+						if(i == _nb_levels-1) //stop cascade here, insert into exact hash
+						{
+							uint64_t hashidx =  __sync_fetch_and_add (& _hashidx, 1);
+
+							pthread_mutex_lock(&_mutex); //see later if possible to avoid this, but few items end up here
+							// idea: compute the final rank of the previous level somewhere, then init hashidx with that rank: directly minimal, no need to insert into bitset and rank
+							_final_hash[val] = hashidx;
+							pthread_mutex_unlock(&_mutex);
+						}
+						else
+						{
+							//computes next hash
+
+							if ( level == 0)
+								level_hash = _hasher.h0(bbhash,val);
+							else if ( level == 1)
+								level_hash = _hasher.h1(bbhash,val);
+							else
+							{
+								level_hash = _hasher.next(bbhash);
+							}
+							insertIntoLevel(level_hash,i); //should be safe
+						}
+					}
+
+					// report progress in batches of 1024 processed keys
+					nb_done++;
+					if((nb_done&1023) ==0  && _withprogress) {_progressBar.inc(nb_done,tid);nb_done=0; }
+
+				}
+
+				inbuff = 0;
+			}
+
+		}
+
+
+		// Writes the function to os as raw bytes in host representation, in this
+		// order: _gamma, _nb_levels, _lastbitsetrank, _nelem, the bit vector of
+		// each level, the number of entries of the exact map, then each entry as
+		// sizeof(elem_t) key bytes followed by a uint64_t value.
+		// NOTE(review): keys are dumped byte-for-byte, which presumably requires
+		// elem_t to be trivially copyable - confirm. Stream errors are not checked.
+		void save(std::ostream& os) const
+		{
+
+			os.write(reinterpret_cast<char const*>(&_gamma), sizeof(_gamma));
+			os.write(reinterpret_cast<char const*>(&_nb_levels), sizeof(_nb_levels));
+			os.write(reinterpret_cast<char const*>(&_lastbitsetrank), sizeof(_lastbitsetrank));
+			os.write(reinterpret_cast<char const*>(&_nelem), sizeof(_nelem));
+			 for(int ii=0; ii<_nb_levels; ii++)
+			 {
+			  	_levels[ii].bitset.save(os);
+			 }
+
+			//save final hash
+			size_t final_hash_size = _final_hash.size();
+
+			os.write(reinterpret_cast<char const*>(&final_hash_size), sizeof(size_t));
+
+
+			// typename std::unordered_map<elem_t,uint64_t,Hasher_t>::iterator
+			for (auto it = _final_hash.begin(); it != _final_hash.end(); ++it )
+			{
+				os.write(reinterpret_cast<char const*>(&(it->first)), sizeof(elem_t));
+				os.write(reinterpret_cast<char const*>(&(it->second)), sizeof(uint64_t));
+			}
+
+		}
+
+		// Restores a function written by save(): reads the header fields and the
+		// bit vector of each level, recomputes the collision probability and the
+		// hash domain of each level from _gamma and _nelem, then reloads the
+		// exact map and marks the object as built.
+		// NOTE(review): stream errors and the sizes read from the stream are not
+		// validated, so a truncated or corrupt input is not detected here.
+		void load(std::istream& is)
+		{
+
+			is.read(reinterpret_cast<char*>(&_gamma), sizeof(_gamma));
+			is.read(reinterpret_cast<char*>(&_nb_levels), sizeof(_nb_levels));
+			is.read(reinterpret_cast<char*>(&_lastbitsetrank), sizeof(_lastbitsetrank));
+			is.read(reinterpret_cast<char*>(&_nelem), sizeof(_nelem));
+			
+			_levels.resize(_nb_levels);
+			
+
+			for(int ii=0; ii<_nb_levels; ii++)
+			{
+				_levels[ii].bitset.load(is);
+			}
+
+
+
+			//mini setup, recompute size of each level
+			_proba_collision = 1.0 -  pow(((_gamma*(double)_nelem -1 ) / (_gamma*(double)_nelem)),_nelem-1);
+			uint64_t previous_idx =0;
+			_hash_domain = (size_t)  (ceil(double(_nelem) * _gamma)) ;
+			for(int ii=0; ii<_nb_levels; ii++)
+			{
+				_levels[ii].idx_begin = previous_idx;
+				// same rounding as setup(): next multiple of 64, never 0
+				_levels[ii].hash_domain =  (( (uint64_t) (_hash_domain * pow(_proba_collision,ii)) + 63) / 64 ) * 64;
+				if(_levels[ii].hash_domain == 0 )
+					_levels[ii].hash_domain  = 64 ;
+				previous_idx += _levels[ii].hash_domain;
+			}
+
+			//restore final hash
+
+			_final_hash.clear();
+			size_t final_hash_size ;
+
+			is.read(reinterpret_cast<char *>(&final_hash_size), sizeof(size_t));
+
+			// the index has the same type as the count: an unsigned int index
+			// would wrap around and never terminate for 2^32 entries or more
+			for(size_t ii=0; ii<final_hash_size; ii++)
+			{
+				elem_t key;
+				uint64_t value;
+
+				is.read(reinterpret_cast<char *>(&key), sizeof(elem_t));
+				is.read(reinterpret_cast<char *>(&value), sizeof(uint64_t));
+
+				_final_hash[key] = value;
+			}
+			_built = true;
+		}
+
+
+		private :
+
+		// Prepares the construction: initialises the mutex, sizes the fast-mode
+		// key buffer, computes the collision probability and the hash domain of
+		// each of the 25 levels, and picks the level at which fast mode starts.
+		void setup()
+		{
+			pthread_mutex_init(&_mutex, NULL);
+
+
+			if(_fastmode)
+				setLevelFastmode.resize(_percent_elem_loaded_for_fastMode * (double)_nelem );
+
+			_proba_collision = 1.0 -  pow(((_gamma*(double)_nelem -1 ) / (_gamma*(double)_nelem)),_nelem-1);
+
+			_nb_levels = 25;
+			_levels.resize(_nb_levels);
+
+			//build levels
+			uint64_t previous_idx =0;
+			for(int ii=0; ii<_nb_levels; ii++)
+			{
+
+				_levels[ii].idx_begin = previous_idx;
+
+				// round size to nearest superior multiple of 64, makes it easier to clear a level
+				_levels[ii].hash_domain =  (( (uint64_t) (_hash_domain * pow(_proba_collision,ii)) + 63) / 64 ) * 64;
+				if(_levels[ii].hash_domain == 0 ) _levels[ii].hash_domain  = 64 ;
+				previous_idx += _levels[ii].hash_domain;
+
+				//printf("build level %i bit array : start %12llu, size %12llu  ",ii,_levels[ii]->idx_begin,_levels[ii]->hash_domain );
+				//printf(" expected elems : %.2f %% total \n",100.0*pow(_proba_collision,ii));
+
+			}
+			
+			// fast mode starts at the first level expected to receive less than
+			// _percent_elem_loaded_for_fastMode of the keys
+			_fastModeLevel = 0;
+			bool fast_level_found = false;
+			for(int ii=0; ii<_nb_levels; ii++)
+			{
+				 if(pow(_proba_collision,ii) < _percent_elem_loaded_for_fastMode)
+				 {
+				 	_fastModeLevel = ii;
+				 	fast_level_found = true;
+				 	// printf("fast mode level :  %i \n",ii);
+				 	break;
+				 }
+			}
+
+			// no level is small enough for the buffer: _fastModeLevel keeps a
+			// defined value (it used to stay uninitialised) and the build runs
+			// in slow mode
+			if(!fast_level_found)
+				_fastmode = false;
+
+		}
+
+
+		// Walks the cascade for val: hashes it level after level and stops at the
+		// first level whose bit is set for that hash, or once _nb_levels-1 or
+		// maxlevel levels have been tried. The level reached is stored in
+		// *res_level; the return value is the last hash computed (0 if none).
+		uint64_t getLevel(hash_pair_t & bbhash, elem_t val,int * res_level, int maxlevel = 100)
+		{
+			uint64_t hash_raw=0;
+			int reached = 0;
+			const int last_level = _nb_levels-1;
+
+			while (reached < last_level && reached < maxlevel)
+			{
+				// compute the next hash of the sequence
+				switch (reached)
+				{
+					case 0:
+						hash_raw = _hasher.h0(bbhash,val);
+						break;
+					case 1:
+						hash_raw = _hasher.h1(bbhash,val);
+						break;
+					default:
+						hash_raw = _hasher.next(bbhash);
+						break;
+				}
+
+				if( _levels[reached].get(hash_raw) )
+					break;
+
+				++reached;
+			}
+
+			*res_level = reached;
+			return hash_raw;
+		}
+
+
+		// Sets the bit of level i addressed by level_hash; when that bit was
+		// already set, the collision is recorded in _tempBitset.
+		void insertIntoLevel(uint64_t level_hash, int i)
+		{
+			const uint64_t slot =  level_hash % _levels[i].hash_domain;
+			const bool already_taken = (_levels[i].bitset.atomic_test_and_set(slot) != 0);
+
+			if( already_taken )
+				_tempBitset->atomic_test_and_set(slot);
+		}
+
+
+		// Builds level i: allocates its bit vector, resets the per-level counters,
+		// then starts _num_thread worker threads that all consume one shared
+		// iterator, and waits for them. Once the fast-mode level has been passed
+		// (and fast mode is still on), the workers iterate over the in-memory key
+		// buffer setLevelFastmode instead of input_range.
+		// NOTE(review): the result of pthread_create is not checked, yet every
+		// entry of tab_threads is joined afterwards.
+		template <typename Range>
+		void processLevel(Range const& input_range,int i)
+		{
+			////alloc the bitset for this level
+			_levels[i].bitset =  bitVector(_levels[i].hash_domain); ;
+
+			_cptLevel = 0;
+			_hashidx = 0;
+			_idxLevelsetLevelFastmode =0;
+			_nb_living =0;
+			//create  threads
+			pthread_t *tab_threads= new pthread_t [_num_thread];
+			typedef decltype(input_range.begin()) it_type;
+			thread_args<Range, it_type> t_arg; // same argument for all threads
+			t_arg.boophf = this;
+			t_arg.range = &input_range;
+			t_arg.it_p =  std::static_pointer_cast<void>(std::make_shared<it_type>(input_range.begin()));
+			t_arg.until_p =  std::static_pointer_cast<void>(std::make_shared<it_type>(input_range.end()));
+
+			t_arg.level = i;
+			if(i >= (_fastModeLevel+1) && _fastmode)
+			{
+				auto data_iterator = boomphf::range(static_cast<const elem_t*>( &setLevelFastmode[0]), static_cast<const elem_t*>( (&setLevelFastmode[0]) +setLevelFastmode.size()));
+                typedef decltype(data_iterator.begin()) fastmode_it_type;
+				t_arg.it_p =  std::static_pointer_cast<void>(std::make_shared<fastmode_it_type>(data_iterator.begin()));
+				t_arg.until_p =  std::static_pointer_cast<void>(std::make_shared<fastmode_it_type>(data_iterator.end()));
+
+                /* we'd like to do t_arg.it = data_iterator.begin() but types are different;
+                    so, casting to (void*) because of that; and we remember the type in the template */
+
+                for(int ii=0;ii<_num_thread;ii++)
+                    pthread_create (&tab_threads[ii], NULL,  thread_processLevel<elem_t, Hasher_t, Range, fastmode_it_type>, &t_arg); //&t_arg[ii]
+			}
+			else
+			{
+			    for(int ii=0;ii<_num_thread;ii++)
+                    pthread_create (&tab_threads[ii], NULL,  thread_processLevel<elem_t, Hasher_t, Range, decltype(input_range.begin())>, &t_arg); //&t_arg[ii]
+			}
+			//joining
+			for(int ii=0;ii<_num_thread;ii++)
+			{
+				pthread_join(tab_threads[ii], NULL);
+			}
+		//	printf("\ngoing to level %i  : %llu elems  %.2f %%  expected : %.2f %% \n",i,_cptLevel,100.0* _cptLevel/(float)_nelem,100.0* pow(_proba_collision,i) );
+
+			if(i == _fastModeLevel) //shrink to actual number of elements in set
+			{
+				//printf("resize setLevelFastmode to %lli \n",_idxLevelsetLevelFastmode);
+				setLevelFastmode.resize(_idxLevelsetLevelFastmode);
+			}
+			delete [] tab_threads;
+		}
+
+	private:
+		//level ** _levels;
+		std::vector<level> _levels;
+		int _nb_levels;
+        MultiHasher_t _hasher;
+		bitVector * _tempBitset;
+
+		double _gamma;
+		uint64_t _hash_domain;
+		uint64_t _nelem;
+        std::unordered_map<elem_t,uint64_t,Hasher_t> _final_hash;
+		Progress _progressBar;
+		int _nb_living;
+		int _num_thread;
+		uint64_t _hashidx;
+		double _proba_collision;
+		uint64_t _lastbitsetrank;
+		uint64_t _idxLevelsetLevelFastmode;
+		uint64_t _cptLevel;
+
+		// fast build mode , requires  that _percent_elem_loaded_for_fastMode %   elems are loaded in ram
+		float _percent_elem_loaded_for_fastMode ;
+		bool _fastmode;
+		std::vector< elem_t > setLevelFastmode;
+		int _fastModeLevel;
+		bool _withprogress;
+		bool _built;
+	public:
+		pthread_mutex_t _mutex;
+	};
+
+////////////////////////////////////////////////////////////////
+#pragma mark -
+#pragma mark threading
+////////////////////////////////////////////////////////////////
+
+
+    // pthread entry point: recovers the typed arguments from args, takes its own
+    // references to the shared begin/end iterators while holding the mphf's mutex
+    // (the iterator must not be in use by another thread copying elements into
+    // its buffer), then runs the mphf's per-thread work for the requested level.
+    // Returns NULL in every case.
+    template <typename elem_t, typename Hasher_t, typename Range, typename it_type>
+	void * thread_processLevel(void * args)
+	{
+		if(args ==NULL) return NULL;
+
+		typedef thread_args<Range,it_type> args_t;
+		typedef mphf<elem_t, Hasher_t> mphf_t;
+
+		args_t * job = (args_t*) args;
+		mphf_t * owner = (mphf_t *) job->boophf;
+		const int level = job->level;
+
+		// per-thread scratch buffer of NBBUFF elements
+		std::vector<elem_t> buffer(NBBUFF);
+
+		std::shared_ptr<it_type> cursor;
+		std::shared_ptr<it_type> stop;
+
+		pthread_mutex_lock(& owner->_mutex);
+		cursor = std::static_pointer_cast<it_type>(job->it_p);
+		stop = std::static_pointer_cast<it_type>(job->until_p);
+		pthread_mutex_unlock(& owner->_mutex);
+
+		owner->pthread_processLevel(buffer, cursor, stop, level);
+
+		return NULL;
+	}
+}
+
+#endif //__BOO_PHF__
diff --git a/debian/rapmap/HitManager.cpp b/debian/rapmap/HitManager.cpp
new file mode 100644
index 0000000..b5fc9e7
--- /dev/null
+++ b/debian/rapmap/HitManager.cpp
@@ -0,0 +1,700 @@
+#include "HitManager.hpp"
+#include "BooMap.hpp"
+#include <type_traits>
+
+namespace rapmap {
+    namespace hit_manager {
+    	// Appends one QuasiAlignment to hits for every processed hit (one
+        // processed hit per transcript), anchored at the hit with the smallest
+        // transcript position. The tqvec of every processed hit is sorted in
+        // place by transcript position. maxDist is currently not used.
+        // Always returns true.
+        bool collectHitsSimple(std::vector<ProcessedHit>& processedHits,
+                uint32_t readLen,
+                uint32_t maxDist,
+                std::vector<QuasiAlignment>& hits,
+                MateStatus mateStatus){
+            // One processed hit per transcript
+            for (auto& ph : processedHits) {
+                // Nothing to anchor on; reading tqvec[0] would be out of bounds
+                if (ph.tqvec.empty()) { continue; }
+
+                auto tid = ph.tid;
+                std::sort(ph.tqvec.begin(), ph.tqvec.end(),
+                        [](const TxpQueryPos& x, const TxpQueryPos& y) -> bool {
+                        return x.txpPosInfo.pos() < y.txpPosInfo.pos();
+                        });
+                auto& firstHit = ph.tqvec[0];
+                bool hitRC = firstHit.queryRC;
+                bool txpRC = firstHit.txpPosInfo.isRC();
+                // forward when the query and transcript orientations agree
+                bool isFwd = (hitRC == txpRC);
+                int32_t hitPos = firstHit.txpPosInfo.pos() - firstHit.queryPos;
+
+                hits.emplace_back(tid, hitPos, isFwd, readLen);
+                hits.back().mateStatus = mateStatus;
+            }
+
+            return true;
+        }
+
+
+        // Appends one QuasiAlignment to hits for every *active* entry of
+        // processedHits, anchored at the entry's smallest transcript position.
+        // Hits are appended in the iteration order of processedHits; no sort by
+        // transcript id is done here (not needed if SAHitMap is sorted).
+        // maxDist is currently not used. Always returns true.
+        bool collectHitsSimpleSA(SAHitMap& processedHits,
+                        uint32_t readLen,
+                        uint32_t maxDist,
+                        std::vector<QuasiAlignment>& hits,
+                        MateStatus mateStatus){
+                // One processed hit per transcript
+                for (auto& ph : processedHits) {
+                        // Only *active* position lists produce a hit
+                        if (!ph.second.active) { continue; }
+
+                        auto& tqvec = ph.second.tqvec;
+                        auto minPosIt = std::min_element(tqvec.begin(),
+                                        tqvec.end(),
+                                        [](const SATxpQueryPos& a, const SATxpQueryPos& b) -> bool {
+                                            return a.pos < b.pos;
+                                        });
+                        // An empty list yields end(), which must not be dereferenced
+                        if (minPosIt == tqvec.end()) { continue; }
+
+                        auto tid = ph.first;
+                        bool hitRC = minPosIt->queryRC;
+                        int32_t hitPos = minPosIt->pos - minPosIt->queryPos;
+                        bool isFwd = !hitRC;
+                        hits.emplace_back(tid, hitPos, isFwd, readLen);
+                        hits.back().mateStatus = mateStatus;
+                }
+                return true;
+        }
+
+
+        // Appends one QuasiAlignment to hits for every *active* processed hit,
+        // anchored at the hit's smallest transcript position.
+        // maxDist is currently not used. Always returns true.
+        bool collectHitsSimpleSA2(std::vector<ProcessedSAHit>& processedHits,
+                        uint32_t readLen,
+                        uint32_t maxDist,
+                        std::vector<QuasiAlignment>& hits,
+                        MateStatus mateStatus){
+                // One processed hit per transcript
+                for (auto& ph : processedHits) {
+                        // Only *active* position lists produce a hit
+                        if (!ph.active) { continue; }
+
+                        auto minPosIt =
+                            std::min_element(ph.tqvec.begin(),
+                                             ph.tqvec.end(),
+                                             [](const SATxpQueryPos& a, const SATxpQueryPos& b) -> bool {
+                                                return a.pos < b.pos;
+                                                });
+                        // An empty list yields end(), which must not be dereferenced
+                        if (minPosIt == ph.tqvec.end()) { continue; }
+
+                        auto tid = ph.tid;
+                        bool hitRC = minPosIt->queryRC;
+                        int32_t hitPos = minPosIt->pos - minPosIt->queryPos;
+                        bool isFwd = !hitRC;
+                        hits.emplace_back(tid, hitPos, isFwd, readLen);
+                        hits.back().mateStatus = mateStatus;
+                }
+                return true;
+        }
+
+
+
+
+        // Intersects the hit h2 with outHits.
+        // This will modify outHits so that the tqvec field of the
+        // entries in outHits that are labeled by the transcripts in
+        // which h2 appears will have an iterator to the beginning of
+        // the position list for h2.
+        // Entries of outHits whose transcript does not appear in h2 are left
+        // unchanged; nothing is removed from outHits.
+        // NOTE(review): the two-pointer walk below presumably relies on both
+        // outHits and h2's transcript list being sorted by transcript id - confirm.
+        void intersectWithOutput(HitInfo& h2, RapMapIndex& rmi,
+                std::vector<ProcessedHit>& outHits) {
+
+            // Convenient bindings for variables we'll use
+            auto& eqClasses = rmi.eqClassList;
+            auto& eqClassLabels = rmi.eqLabelList;
+            auto& posList = rmi.posList;
+
+            // Iterator to the beginning and end of the output hits
+            auto outHitIt = outHits.begin();
+            auto outHitEnd = outHits.end();
+
+            // Equiv. class for h2
+            auto& eqClassRight = eqClasses[h2.kinfo->eqId];
+
+            // Iterator into, length of and end of the position list for h2
+            auto rightPosIt = posList.begin() + h2.kinfo->offset;
+            auto rightPosLen = h2.kinfo->count;
+            auto rightPosEnd = rightPosIt + rightPosLen;
+            // Iterator into, length of and end of the transcript list for h2
+            auto rightTxpIt = eqClassLabels.begin() + eqClassRight.txpListStart;
+            auto rightTxpListLen = eqClassRight.txpListLen;
+            auto rightTxpEnd = rightTxpIt + rightTxpListLen;
+
+            auto rightQueryPos = h2.queryPos;
+            auto rightQueryRC = h2.queryRC;
+            PositionListHelper rightPosHelper(rightPosIt, posList.end());
+
+            uint32_t leftTxp, rightTxp;
+            while (outHitIt != outHitEnd and rightTxpIt != rightTxpEnd) {
+                // Get the current transcript ID for the left and right eq class
+                leftTxp = outHitIt->tid;
+                rightTxp = *rightTxpIt;
+                // If we need to advance the left txp, do it
+                if (leftTxp < rightTxp) {
+                    // Advance to the next transcript in the
+                    // equivalence class label
+                    ++outHitIt;
+                } else {
+                    // If the transcripts are equal (i.e. leftTxp >= rightTxp and !(rightTxp < leftTxp))
+                    // Then see if there are any hits here.
+                    if (!(rightTxp < leftTxp)) {
+                        // Add the position list iterator and query pos for the
+                        // hit from h2 to the back of outHits' tqvec.
+                        outHitIt->tqvec.emplace_back(rightPosHelper, rightQueryPos, rightQueryRC);
+                        ++outHitIt;
+                    }
+                    // advance the hit we're intersecting to the next transcript
+                    rightPosHelper.advanceToNextTranscript();
+                    // Advance the right transcript id regardless of whether
+                    // we found a hit or not.
+                    ++rightTxpIt;
+                }
+            }
+
+        }
+
+        /** Lower-bound search, from http://en.cppreference.com/w/cpp/algorithm/lower_bound :
+         *  returns the first position of the sorted range [first, last) whose
+         *  element is not less than value, or last when there is none. **/
+        template <typename ForwardIt>
+        ForwardIt binarySearch(
+                ForwardIt first,
+                ForwardIt last,
+                uint32_t value) {
+            typedef typename std::iterator_traits<ForwardIt>::difference_type diff_t;
+            diff_t remaining = std::distance(first, last);
+
+            while (remaining > 0) {
+                const diff_t half = remaining / 2;
+                ForwardIt probe = first;
+                std::advance(probe, half);
+                if (*probe < value) {
+                    // the answer lies strictly to the right of probe
+                    first = ++probe;
+                    remaining -= half + 1;
+                } else {
+                    // the answer is probe or lies to its left
+                    remaining = half;
+                }
+            }
+            return first;
+        }
+
+        /** Linear scan, from http://en.cppreference.com/w/cpp/algorithm/find :
+         *  returns the first position of [first, last) whose element equals
+         *  value, or last when there is none. **/
+        template<class InputIt>
+        InputIt linearSearch(InputIt first, InputIt last, uint32_t value) {
+            while (first != last) {
+                if (*first == value) {
+                    break;
+                }
+                ++first;
+            }
+            return first;
+        }
+
+        /** adapted from https://schani.wordpress.com/2010/04/30/linear-vs-binary-search/
+         *  Searches the first n elements of the sorted array arr for key.
+         *  Returns the index of the first element equal to key, or
+         *  std::numeric_limits<uint32_t>::max() when key is not present. **/
+        uint32_t binarySearchFast(const std::vector<uint32_t>& arr, size_t n, uint32_t key) {
+            uint32_t min = 0, max = static_cast<uint32_t>(n);
+            while (min < max) {
+                // midpoint computed without overflowing min + max
+                uint32_t middle = min + ((max - min) >> 1);
+                // exactly one of the two bounds moves on every iteration
+                min = (key > arr[middle]) ? middle+1 : min;
+                max = (key <= arr[middle]) ? middle : max;
+            }
+            // min == n when key is larger than every element (or when n == 0);
+            // arr[min] must not be read in that case
+            return (min < n && arr[min] == key) ? min : std::numeric_limits<uint32_t>::max();
+        }
+
+        /** adapted from https://schani.wordpress.com/2010/04/30/linear-vs-binary-search/ **/
+        // Returns the index of the first element of arr that is >= key (a lower
+        // bound, not necessarily an exact match), scanning 16 elements per
+        // iteration of the outer loop.
+        // ASSUMES SENTINEL VALUE (value in array >= key *MUST* exist)
+        // n is not used: the scan is never bounds-checked, so without such a
+        // value the loop reads past the end of arr.
+        uint32_t linearSearchUnrolled16(const std::vector<uint32_t>& arr, size_t n, uint32_t key) {
+            uint32_t i{0};
+                for (;;) {
+                    if ( arr[i + 0] >= key) return  i + 0;
+                    if ( arr[i + 1] >= key) return  i + 1;
+                    if ( arr[i + 2] >= key) return  i + 2;
+                    if ( arr[i + 3] >= key) return  i + 3;
+                    if ( arr[i + 4] >= key) return  i + 4;
+                    if ( arr[i + 5] >= key) return  i + 5;
+                    if ( arr[i + 6] >= key) return  i + 6;
+                    if ( arr[i + 7] >= key) return  i + 7;
+                    if ( arr[i + 8] >= key) return  i + 8;
+                    if ( arr[i + 9] >= key) return  i + 9;
+                    if ( arr[i + 10] >= key) return i + 10;
+                    if ( arr[i + 11] >= key) return i + 11;
+                    if ( arr[i + 12] >= key) return i + 12;
+                    if ( arr[i + 13] >= key) return i + 13;
+                    if ( arr[i + 14] >= key) return i + 14;
+                    if ( arr[i + 15] >= key) return i + 15;
+                    i += 16;
+                }
+            }
+
+          // Intersects the suffix-array interval h with the hits collected so far:
+          // for every suffix of [h.begin, h.end) whose transcript is listed in
+          // processedHits.txps, the position within that transcript is appended
+          // to the tqvec of the matching entry of processedHits.hits.
+          // The last entry of txps is treated as a sentinel and never matched.
+          template <typename RapMapIndexT>
+        void intersectSAIntervalWithOutput2(SAIntervalHit<typename RapMapIndexT::IndexType>& h,
+                RapMapIndexT& rmi,
+                SAProcessedHitVec& processedHits) {
+            // Convenient bindings for variables we'll use
+            auto& SA = rmi.SA;
+            auto& txpIDs = rmi.positionIDs;
+            auto& txpStarts = rmi.txpOffsets;
+
+            auto& outStructs = processedHits.hits;
+            auto& outTxps = processedHits.txps;
+
+            uint32_t arraySize = processedHits.txps.size();
+            // Without any entry (not even the sentinel) there is nothing to
+            // intersect with; the searches below would read past the end of
+            // outTxps and arraySize - 1 would wrap around.
+            if (arraySize == 0) { return; }
+
+            uint32_t rightTxp;
+            uint32_t pos;
+            uint32_t searchInd{0};
+            for (auto i = h.begin; i < h.end; ++i) {
+                rightTxp = txpIDs[SA[i]];
+                if (arraySize > 64) {
+                    searchInd = binarySearchFast(outTxps, arraySize, rightTxp);
+                } else {
+                    searchInd = linearSearchUnrolled16(outTxps, arraySize, rightTxp);
+                }
+                // If we found this transcript (make sure it's not the sentinel) then
+                // add it to the list. The linear search returns the first entry
+                // that is >= rightTxp, so an exact match has to be checked here.
+                if ( searchInd < arraySize - 1 and outTxps[searchInd] == rightTxp ) {
+                    pos = static_cast<uint32_t>(SA[i]) - txpStarts[rightTxp];
+                    outStructs[searchInd].tqvec.emplace_back(pos, h.queryPos, h.queryRC);
+                }
+            }
+        }
+
+
+        /*
+        void intersectSAIntervalWithOutput3(SAIntervalHit& h,
+                RapMapSAIndex& rmi,
+                SAProcessedHitVec& outHits) {
+            // Convenient bindings for variables we'll use
+            auto& SA = rmi.SA;
+            auto& txpIDs = rmi.positionIDs;
+            auto& txpStarts = rmi.txpOffsets;
+
+            // Iterator to the beginning and end of the output hits
+            auto outHitIt = outHits.begin();
+            auto outHitEnd = outHits.end();
+
+            // Make a vector of iterators into the right interval
+            std::vector<int*> rightHitIterators;
+            rightHitIterators.reserve(h.span());
+            for (auto i = h.begin; i < h.end; ++i) {
+                rightHitIterators.emplace_back(&SA[i]);
+            }
+            // Sort the iterators by their transcript ID
+            std::sort(rightHitIterators.begin(), rightHitIterators.end(),
+                    [&txpIDs](const int* a, const int* b) -> bool {
+                    return txpIDs[*a] < txpIDs[*b];
+                    });
+            auto rightIntHit = rightHitIterators.begin();
+            auto rightIntHitEnd = rightHitIterators.end();
+
+            uint32_t leftTxp, rightTxp;
+            uint32_t pos;
+            while (outHitIt != outHitEnd and rightIntHit != rightIntHitEnd) {
+                // Get the current transcript ID for the left and right eq class
+                leftTxp = outHitIt->tid;
+                rightTxp = txpIDs[(*(*rightIntHit))];
+                // If we need to advance the left txp, do it
+                if (leftTxp < rightTxp) {
+                    // Advance to the next transcript in the
+                    // equivalence class label
+                    ++outHitIt;
+                } else {
+                    // If the transcripts are equal (i.e. leftTxp >= rightTxp and !(rightTxp < leftTxp))
+                    // Then see if there are any hits here.
+                    if (!(rightTxp < leftTxp)) {
+                        // Add the position list iterator and query pos for the
+                        // hit from h2 to the back of outHits' tqvec.
+                        pos = static_cast<uint32_t>(*(*rightIntHit)) - txpStarts[rightTxp];
+                        outHitIt->tqvec.emplace_back(pos, h.queryPos, h.queryRC);
+                        //++outHitIt;
+                    }
+                    ++rightIntHit;
+                }
+            }
+        }
+        */
+
+
+
+        /**
+         * Folds one more suffix-array interval into an existing hit map.
+         *
+         * Every suffix-array entry in [h.begin, h.end) is mapped to its transcript.
+         * Transcripts that are not already keys of `outHits` are skipped.  For the
+         * others, `numActive` is bumped from `intervalCounter - 1` to
+         * `intervalCounter` the first time this interval touches them, and the
+         * transcript-relative position is recorded whenever `numActive` equals
+         * `intervalCounter` (i.e. the transcript was seen in every interval so far).
+         *
+         * @param h                interval to fold in (supplies queryPos / queryRC)
+         * @param rmi              index providing SA, txpOffsets and
+         *                         transcriptAtPosition()
+         * @param intervalCounter  1-based ordinal of this interval in the intersection
+         * @param outHits          hit map updated in place; no keys are added
+         */
+        template <typename RapMapIndexT>
+        void intersectSAIntervalWithOutput(SAIntervalHit<typename RapMapIndexT::IndexType>& h,
+                                           RapMapIndexT& rmi,
+                                           uint32_t intervalCounter,
+                                           SAHitMap& outHits) {
+            using OffsetT = typename RapMapIndexT::IndexType;
+            auto& suffixArray = rmi.SA;
+            auto& txpOffsets = rmi.txpOffsets;
+            // The loop never inserts into the map, so its end iterator is stable.
+            const auto notTracked = outHits.end();
+
+            for (OffsetT saIdx = h.begin; saIdx != h.end; ++saIdx) {
+                auto txpID = rmi.transcriptAtPosition(suffixArray[saIdx]);
+                auto entry = outHits.find(txpID);
+                if (entry == notTracked) {
+                    continue;
+                }
+
+                auto& hit = entry->second;
+                // First touch by this interval: advance the streak by one.
+                if (hit.numActive == intervalCounter - 1) {
+                    hit.numActive += 1;
+                }
+                // A transcript that missed an earlier interval never catches up.
+                if (hit.numActive != intervalCounter) {
+                    continue;
+                }
+
+                auto globalPos = suffixArray[saIdx];
+                auto localPos = globalPos - txpOffsets[txpID];
+                hit.tqvec.emplace_back(localPos, h.queryPos, h.queryRC);
+            }
+        }
+
+
+
+        /**
+         * Intersects a set of k-mer hits and returns one ProcessedHit per
+         * transcript that survived the intersection.
+         *
+         * The hit with the smallest `kinfo->count` seeds the output with one entry
+         * per transcript of its equivalence class; every other hit is then folded
+         * in through intersectWithOutput().  Finally only the entries whose `tqvec`
+         * holds at least `inHits.size()` elements are kept (their relative order is
+         * preserved).
+         *
+         * @param inHits  k-mer hits to intersect; fewer than two is reported on
+         *                stderr and yields an empty result
+         * @param rmi     index providing eqClassList, eqLabelList and posList
+         * @return the surviving hits
+         */
+        std::vector<ProcessedHit> intersectHits(
+                std::vector<HitInfo>& inHits,
+                RapMapIndex& rmi
+                ) {
+            // Check this --- we should never call this function
+            // with less than 2 hits.
+            if (inHits.size() < 2) {
+                std::cerr << "intersectHits() called with < 2 k-mer "
+                    " hits; this shouldn't happen\n";
+                return {};
+            }
+
+            auto& eqClasses = rmi.eqClassList;
+            auto& eqClassLabels = rmi.eqLabelList;
+            auto& posList = rmi.posList;
+
+            // The HitInfo with the smallest count; starting from it keeps the
+            // candidate set as small as possible.
+            HitInfo* minHit = &inHits[0];
+            for (auto& h : inHits) {
+                if (h.kinfo->count < minHit->kinfo->count) {
+                    minHit = &h;
+                }
+            }
+
+            std::vector<ProcessedHit> outHits;
+            outHits.reserve(minHit->kinfo->count);
+            { // Add the info from minHit to outHits
+                // Equiv. class for minHit
+                auto& eqClass = eqClasses[minHit->kinfo->eqId];
+                // Start of minHit's slice of the position list
+                auto posIt = posList.begin() + minHit->kinfo->offset;
+                // Range of transcript ids labelling the equivalence class
+                auto txpIt = eqClassLabels.begin() + eqClass.txpListStart;
+                auto txpEnd = txpIt + eqClass.txpListLen;
+                PositionListHelper posHelper(posIt, posList.end());
+
+                for (; txpIt != txpEnd; ++txpIt) {
+                    auto tid = *txpIt;
+                    outHits.emplace_back(tid, posHelper, minHit->queryPos, minHit->queryRC);
+                    // Move the helper on to the positions of the next transcript.
+                    posHelper.advanceToNextTranscript();
+                }
+            }
+
+            // Now intersect everything in inHits (apart from minHit)
+            // to get the final set of mapping info.
+            for (auto& h : inHits) {
+                if (&h != minHit) { // don't intersect minHit with itself
+                    intersectWithOutput(h, rmi, outHits);
+                }
+            }
+
+            size_t requiredNumHits = inHits.size();
+            // Move the entries that were hit by every k-mer to the front.
+            auto newEnd = std::stable_partition(outHits.begin(), outHits.end(),
+                    [requiredNumHits] (const ProcessedHit& ph) -> bool {
+                    // should never really be greater.
+                    return (ph.tqvec.size() >= requiredNumHits);
+                    });
+            // Return only the valid hits.  erase() states the intent directly and,
+            // unlike resize(), does not require ProcessedHit to be
+            // default-constructible.
+            outHits.erase(newEnd, outHits.end());
+            return outHits;
+        }
+
+        /**
+         * Intersects a set of suffix-array intervals using a sorted transcript
+         * list instead of a map.
+         *
+         * Each element of `inHits` is an SA interval holding all hits for one query
+         * location on the read.  The smallest interval seeds one ProcessedSAHit per
+         * transcript; the records are sorted by transcript id and mirrored into a
+         * sentinel-terminated id list that intersectSAIntervalWithOutput2() searches
+         * while folding in the remaining intervals.  Records whose `tqvec` ends up
+         * with at least `inHits.size()` entries are flagged `active`.
+         *
+         * @param inHits  intervals to intersect; fewer than two is reported on
+         *                stderr and yields an empty result
+         * @param rmi     index providing SA, txpOffsets and positionIDs
+         * @return all seeded records (active or not), sorted by transcript id
+         */
+        template <typename RapMapIndexT>
+        std::vector<ProcessedSAHit> intersectSAHits2(
+                std::vector<SAIntervalHit<typename RapMapIndexT::IndexType>>& inHits,
+                RapMapIndexT& rmi
+                ) {
+            using OffsetT = typename RapMapIndexT::IndexType;
+
+            // Check this --- we should never call this function
+            // with less than 2 hits.
+            SAProcessedHitVec outHits;
+            if (inHits.size() < 2) {
+                std::cerr << "intersectHitsSA() called with < 2 k-mer "
+                    " hits; this shouldn't happen\n";
+                return outHits.hits;
+            }
+
+            auto& SA = rmi.SA;
+            auto& txpStarts = rmi.txpOffsets;
+            auto& txpIDs = rmi.positionIDs;
+
+            // Start with the smallest interval
+            // i.e. interval with the fewest hits.
+            SAIntervalHit<OffsetT>* minHit = &inHits[0];
+            for (auto& h : inHits) {
+                if (h.span() < minHit->span()) {
+                    minHit = &h;
+                }
+            }
+
+            auto& outStructs = outHits.hits;
+            auto& outTxps = outHits.txps;
+            outStructs.reserve(minHit->span());
+            // One extra slot for the sentinel appended below.
+            outTxps.reserve(minHit->span() + 1);
+
+            // Maps a transcript id to its slot in outStructs while seeding.
+            std::map<int, uint32_t> posMap;
+            // Add the info from minHit to outHits.  The index must have the
+            // suffix-array offset type: a plain `int` would truncate the bounds
+            // of a 64-bit index.
+            for (OffsetT i = minHit->begin; i < minHit->end; ++i) {
+                auto globalPos = SA[i];
+                auto tid = txpIDs[globalPos];
+                auto txpPos = globalPos - txpStarts[tid];
+                auto posIt = posMap.find(tid);
+                if (posIt == posMap.end()) {
+                    posMap[tid] = outStructs.size();
+                    outStructs.emplace_back(tid, txpPos, minHit->queryPos, minHit->queryRC);
+                } else {
+                    outStructs[posIt->second].tqvec.emplace_back(txpPos, minHit->queryPos, minHit->queryRC);
+                }
+            }
+            // posMap's slots are stale after this sort; it is not used again.
+            std::sort(outStructs.begin(), outStructs.end(),
+                      [] (const ProcessedSAHit& a, const ProcessedSAHit& b) -> bool {
+                        return a.tid < b.tid;
+                      });
+            // outTxps[i] is the transcript id of outStructs[i].
+            for (auto& hit : outStructs) {
+                outTxps.emplace_back(hit.tid);
+            }
+            // Sentinel value for search
+            outTxps.emplace_back(std::numeric_limits<uint32_t>::max());
+
+            // Now intersect everything in inHits (apart from minHit)
+            // to get the final set of mapping info.
+            for (auto& h : inHits) {
+                if (&h != minHit) { // don't intersect minHit with itself
+                    intersectSAIntervalWithOutput2(h, rmi, outHits);
+                }
+            }
+
+            size_t requiredNumHits = inHits.size();
+            // Mark as active any transcripts with the required number of hits.
+            for (auto& hit : outStructs) {
+                if (hit.tqvec.size() >= requiredNumHits) {
+                    hit.active = true;
+                }
+            }
+            return outStructs;
+        }
+
+        /**
+         * Map-based intersection of a set of suffix-array intervals.
+         *
+         * Each element of `inHits` is an SA interval holding every hit for one
+         * query location on the read.  The interval with the smallest span seeds
+         * the result map (one entry per transcript it touches); the remaining
+         * intervals are folded in one at a time through
+         * intersectSAIntervalWithOutput(), numbered from 2 upwards.  Finally each
+         * entry's `active` flag is set: it needs `numActive >= inHits.size()` and,
+         * when `strictFilter` is true, must additionally pass
+         * `checkConsistent(inHits.size())`.
+         *
+         * @param inHits        intervals to intersect; fewer than two is reported
+         *                      on stderr and yields an empty map
+         * @param rmi           index providing SA, txpOffsets and
+         *                      transcriptAtPosition()
+         * @param strictFilter  also require checkConsistent() for active entries
+         * @return map from transcript id to its accumulated hit record
+         */
+        template <typename RapMapIndexT>
+        SAHitMap intersectSAHits(
+                std::vector<SAIntervalHit<typename RapMapIndexT::IndexType>>& inHits,
+                RapMapIndexT& rmi,
+                bool strictFilter
+                ) {
+            using OffsetT = typename RapMapIndexT::IndexType;
+
+            // We should never be called with fewer than 2 intervals.
+            SAHitMap result;
+            if (inHits.size() < 2) {
+                std::cerr << "intersectHitsSA() called with < 2 hits "
+                    " hits; this shouldn't happen\n";
+                return result;
+            }
+
+            auto& suffixArray = rmi.SA;
+            auto& txpOffsets = rmi.txpOffsets;
+
+            // Seed from the narrowest interval (first one wins on ties).
+            SAIntervalHit<OffsetT>* seed = &inHits[0];
+            for (auto& candidate : inHits) {
+                if (candidate.span() < seed->span()) {
+                    seed = &candidate;
+                }
+            }
+
+            // One map entry per transcript touched by the seed interval.
+            for (OffsetT saIdx = seed->begin; saIdx < seed->end; ++saIdx) {
+                auto globalPos = suffixArray[saIdx];
+                auto tid = rmi.transcriptAtPosition(globalPos);
+                auto txpPos = globalPos - txpOffsets[tid];
+                result[tid].tqvec.emplace_back(txpPos, seed->queryPos, seed->queryRC);
+            }
+
+            // Fold in every other interval; the seed counts as interval #1.
+            size_t intervalCounter{2};
+            for (auto& interval : inHits) {
+                if (&interval == seed) {
+                    continue;
+                }
+                intersectSAIntervalWithOutput(interval, rmi, intervalCounter, result);
+                ++intervalCounter;
+            }
+
+            // Flag the transcripts that were present in every interval.
+            size_t requiredNumHits = inHits.size();
+            for (auto& entry : result) {
+                auto& hit = entry.second;
+                bool keep = (hit.numActive >= requiredNumHits);
+                if (strictFilter and keep) {
+                    keep = hit.checkConsistent(requiredNumHits);
+                }
+                hit.active = keep;
+            }
+            return result;
+        }
+
+
+        /**
+        * Need to explicitly instantiate the versions we use.
+        *
+        * intersectSAIntervalWithOutput and intersectSAHits are templates defined
+        * in this source file, so each index type they are used with has to be
+        * instantiated here.  Four index flavours are covered: 32- or 64-bit
+        * suffix-array offsets, combined with either a google::dense_hash_map or a
+        * BooMap as the k-mer lookup table.
+        *
+        * NOTE(review): intersectSAHits2 and intersectSAIntervalWithOutput2 are not
+        * instantiated in this list -- presumably they are not called from other
+        * translation units; confirm before using them elsewhere.
+        */
+      // Dense-hash-map backed indices (32- and 64-bit offsets)
+      using SAIndex32BitDense = RapMapSAIndex<int32_t,google::dense_hash_map<uint64_t, rapmap::utils::SAInterval<int32_t>,
+									     rapmap::utils::KmerKeyHasher>>;
+      using SAIndex64BitDense = RapMapSAIndex<int64_t,google::dense_hash_map<uint64_t, rapmap::utils::SAInterval<int64_t>,
+									     rapmap::utils::KmerKeyHasher>>;
+      // BooMap backed indices (32- and 64-bit offsets)
+      using SAIndex32BitPerfect = RapMapSAIndex<int32_t, BooMap<uint64_t, rapmap::utils::SAInterval<int32_t>>>;
+      using SAIndex64BitPerfect = RapMapSAIndex<int64_t, BooMap<uint64_t, rapmap::utils::SAInterval<int64_t>>>;
+
+        // ---- dense hash map, 32-bit offsets ----
+        template
+        void intersectSAIntervalWithOutput<SAIndex32BitDense>(SAIntervalHit<int32_t>& h,
+                                                              SAIndex32BitDense& rmi, 
+                                                              uint32_t intervalCounter, 
+                                                              SAHitMap& outHits);
+
+        // ---- dense hash map, 64-bit offsets ----
+        template
+        void intersectSAIntervalWithOutput<SAIndex64BitDense>(SAIntervalHit<int64_t>& h,
+                                                              SAIndex64BitDense& rmi, 
+                                                              uint32_t intervalCounter, 
+                                                              SAHitMap& outHits); 
+
+        template
+        SAHitMap intersectSAHits<SAIndex32BitDense>(std::vector<SAIntervalHit<int32_t>>& inHits,
+                                                    SAIndex32BitDense& rmi, bool strictFilter);
+
+        template
+        SAHitMap intersectSAHits<SAIndex64BitDense>(std::vector<SAIntervalHit<int64_t>>& inHits,
+          SAIndex64BitDense& rmi, bool strictFilter);
+
+        // ---- BooMap, 32-bit offsets ----
+        template
+        void intersectSAIntervalWithOutput<SAIndex32BitPerfect>(SAIntervalHit<int32_t>& h,
+                                                                SAIndex32BitPerfect& rmi, 
+                                                                uint32_t intervalCounter, 
+                                                                SAHitMap& outHits);
+
+        // ---- BooMap, 64-bit offsets ----
+        template
+        void intersectSAIntervalWithOutput<SAIndex64BitPerfect>(SAIntervalHit<int64_t>& h,
+                                                                SAIndex64BitPerfect& rmi, 
+                                                                uint32_t intervalCounter, 
+                                                                SAHitMap& outHits);
+
+        template
+        SAHitMap intersectSAHits<SAIndex32BitPerfect>(std::vector<SAIntervalHit<int32_t>>& inHits,
+                                                      SAIndex32BitPerfect& rmi, bool strictFilter);
+
+        template
+        SAHitMap intersectSAHits<SAIndex64BitPerfect>(std::vector<SAIntervalHit<int64_t>>& inHits,
+                                                      SAIndex64BitPerfect& rmi, bool strictFilter);
+    }
+}
diff --git a/debian/rapmap/HitManager.hpp b/debian/rapmap/HitManager.hpp
new file mode 100644
index 0000000..24a288e
--- /dev/null
+++ b/debian/rapmap/HitManager.hpp
@@ -0,0 +1,109 @@
+#ifndef __HIT_MANAGER_HPP__
+#define __HIT_MANAGER_HPP__
+
+#include "RapMapUtils.hpp"
+#include "RapMapIndex.hpp"
+#include "RapMapSAIndex.hpp"
+
+//#include "eytzinger_array.h"
+
+#include <tuple>
+#include <vector>
+#include <algorithm>
+#include <map>
+#include <unordered_map>
+
+namespace rapmap {
+    // Routines that intersect the hits found for the individual k-mers /
+    // suffix-array intervals of a read and turn them into candidate mappings.
+    namespace hit_manager {
+        // Short local names for the utility types used throughout this namespace.
+        using HitInfo = rapmap::utils::HitInfo;
+        using ProcessedHit = rapmap::utils::ProcessedHit;
+        using MateStatus = rapmap::utils::MateStatus;
+        using PositionListHelper = rapmap::utils::PositionListHelper;
+        using QuasiAlignment = rapmap::utils::QuasiAlignment;
+        using TxpQueryPos = rapmap::utils::TxpQueryPos;
+        using SATxpQueryPos = rapmap::utils::SATxpQueryPos;
+
+        template <typename T>
+        using SAIntervalHit = rapmap::utils::SAIntervalHit<T>;
+        // Hit records keyed by transcript id.
+        using SAHitMap = std::map<int, rapmap::utils::ProcessedSAHit>;
+        using ProcessedSAHit = rapmap::utils::ProcessedSAHit;
+
+        // Vector-based alternative to SAHitMap, filled by intersectSAHits2:
+        // `hits` is sorted by transcript id and `txps[i]` is the id of `hits[i]`,
+        // followed by one trailing sentinel entry
+        // (std::numeric_limits<uint32_t>::max()) that the search routines rely on.
+        class SAProcessedHitVec {
+            public:
+                std::vector<ProcessedSAHit> hits;
+                std::vector<uint32_t> txps;
+        };
+        /*
+        using SAProcessedHitVec = std::tuple<std::vector<ProcessedSAHit>, std::vector<uint32_t>>;
+        */
+
+        // Return hits from processedHits where position constraints
+        // match maxDist
+        bool collectHitsSimple(std::vector<ProcessedHit>& processedHits,
+                uint32_t readLen,
+                uint32_t maxDist,
+                std::vector<QuasiAlignment>& hits,
+                MateStatus mateStatus);
+
+        // Return hits from processedHits where position constraints
+        // match maxDist
+        // (variant taking the map produced by intersectSAHits)
+        bool collectHitsSimpleSA(SAHitMap& processedHits,
+                uint32_t readLen,
+                uint32_t maxDist,
+                std::vector<QuasiAlignment>& hits,
+                MateStatus mateStatus);
+
+        // Return hits from processedHits where position constraints
+        // match maxDist
+        // (variant taking the vector produced by intersectSAHits2)
+        bool collectHitsSimpleSA2(std::vector<ProcessedSAHit>& processedHits,
+                uint32_t readLen,
+                uint32_t maxDist,
+                std::vector<QuasiAlignment>& hits,
+                MateStatus mateStatus);
+
+
+        // Intersects the hit h2 with outHits.
+        // This will modify outHits so that the tqvec field of the
+        // entries in outHits that are labeled by the transcripts in
+        // which h2 appears will have an iterator to the beginning of
+        // the position list for h2.
+        void intersectWithOutput(HitInfo& h2, RapMapIndex& rmi,
+                std::vector<ProcessedHit>& outHits);
+
+        // Folds the suffix-array interval h into outHits.  Only transcripts that
+        // are already keys of outHits are updated; intervalCounter is the 1-based
+        // ordinal of h within the intersection and drives the numActive
+        // bookkeeping of each entry.
+        // Defined in HitManager.cpp, which explicitly instantiates it for the
+        // index types it supports.
+        template <typename RapMapIndexT>
+        void intersectSAIntervalWithOutput(SAIntervalHit<typename RapMapIndexT::IndexType>& h,
+                                           RapMapIndexT& rmi,
+                                           uint32_t intervalCounter,
+                                           SAHitMap& outHits);
+                                           
+
+        // Vector-based counterpart of intersectSAIntervalWithOutput: looks up the
+        // transcript of every entry of h in outStructs.txps and appends the
+        // position to the entry of outStructs.hits at the index that was found.
+        template <typename RapMapIndexT>
+        void intersectSAIntervalWithOutput2(SAIntervalHit<typename RapMapIndexT::IndexType>& h,
+                RapMapIndexT& rmi,
+                SAProcessedHitVec& outStructs);
+
+        /*
+        void intersectSAIntervalWithOutput3(SAIntervalHit& h,
+                RapMapSAIndex& rmi,
+                SAProcessedHitVec& outHits);
+                */
+
+        // Intersects all k-mer hits in inHits and returns the entries whose tqvec
+        // holds at least inHits.size() elements.  Calling it with fewer than two
+        // hits prints a diagnostic to stderr and returns an empty vector.
+        std::vector<ProcessedHit> intersectHits(
+                std::vector<HitInfo>& inHits,
+                RapMapIndex& rmi);
+
+        // Intersects all suffix-array intervals in inHits and returns one entry
+        // per transcript of the smallest interval; `active` is set on the entries
+        // present in every interval (and, with strictFilter, passing
+        // checkConsistent()).  Fewer than two intervals yields an empty map.
+        template <typename RapMapIndexT>
+        SAHitMap intersectSAHits(
+                                 std::vector<SAIntervalHit<typename RapMapIndexT::IndexType>>& inHits,
+                                 RapMapIndexT& rmi, 
+                                 bool strictFilter=false);
+
+        // Vector-based variant of intersectSAHits; returns the records sorted by
+        // transcript id, with `active` set on those that collected at least
+        // inHits.size() positions.
+        template <typename RapMapIndexT>
+        std::vector<ProcessedSAHit> intersectSAHits2(
+                std::vector<SAIntervalHit<typename RapMapIndexT::IndexType>>& inHits,
+                RapMapIndexT& rmi);
+    }
+}
+
+
+#endif // __HIT_MANAGER_HPP__
diff --git a/debian/rapmap/IndexHeader.hpp b/debian/rapmap/IndexHeader.hpp
new file mode 100644
index 0000000..87eba2d
--- /dev/null
+++ b/debian/rapmap/IndexHeader.hpp
@@ -0,0 +1,77 @@
+#ifndef __INDEX_HEADER_HPP__
+#define __INDEX_HEADER_HPP__
+
+#include "spdlog/spdlog.h"
+#include <cereal/types/string.hpp>
+
+// The different types of indices supported.
+// The enumerators are stored in a uint8_t.  INVALID is the value carried by a
+// default-constructed IndexHeader.
+enum class IndexType : uint8_t {
+    PSEUDO = 0,
+    QUASI,
+    INVALID
+};
+
+// Metadata describing an index: its type, version string, k-mer usage and
+// length, suffix-array width and hash flavour.  The header is (de)serialized
+// through cereal's save()/load() member-function protocol.
+class IndexHeader {
+    public:
+        // Builds a placeholder header: type INVALID, version "invalid" and every
+        // other field zero/false.  All members are initialized (bigSA_ included)
+        // so that the accessors and save() never read an indeterminate value.
+        IndexHeader () : type_(IndexType::INVALID), versionString_("invalid"), usesKmers_(false), kmerLen_(0), bigSA_(false), perfectHash_(false) {}
+
+        // Builds a header from explicit values; bigSA and perfectHash default to
+        // false.
+        IndexHeader(IndexType typeIn, const std::string& versionStringIn,
+                    bool usesKmersIn, uint32_t kmerLenIn, bool bigSA = false, bool perfectHash = false):
+                    type_(typeIn), versionString_(versionStringIn),
+                    usesKmers_(usesKmersIn), kmerLen_(kmerLenIn), bigSA_(bigSA),
+                    perfectHash_(perfectHash) {}
+
+        // Writes every field to the archive as a name/value pair.  The field
+        // names and their order must stay in sync with load().
+        template <typename Archive>
+            void save(Archive& ar) const {
+                ar( cereal::make_nvp("IndexType", type_) );
+                ar( cereal::make_nvp("IndexVersion", versionString_) );
+                ar( cereal::make_nvp("UsesKmers", usesKmers_) );
+                ar( cereal::make_nvp("KmerLen", kmerLen_) );
+                ar( cereal::make_nvp("BigSA", bigSA_) );
+                ar( cereal::make_nvp("PerfectHash", perfectHash_) );
+            }
+
+        // Reads every field back from the archive.  Any cereal::Exception is
+        // logged through the "stderrLog" spdlog logger and terminates the
+        // process with exit status 1.
+        template <typename Archive>
+        void load(Archive& ar) {
+            try {
+                ar( cereal::make_nvp("IndexType", type_) );
+                ar( cereal::make_nvp("IndexVersion", versionString_) );
+                ar( cereal::make_nvp("UsesKmers", usesKmers_) );
+                ar( cereal::make_nvp("KmerLen", kmerLen_) );
+                ar( cereal::make_nvp("BigSA", bigSA_) );
+                ar( cereal::make_nvp("PerfectHash", perfectHash_) );
+            } catch (const cereal::Exception& e) {
+                // NOTE(review): assumes a logger named "stderrLog" has already
+                // been registered; if it has not, spdlog::get() returns an empty
+                // pointer and the calls below dereference it -- confirm.
+                auto cerrLog = spdlog::get("stderrLog");
+                cerrLog->error("Encountered exception [{}] when loading index.", e.what());
+                cerrLog->error("The index was likely build with an older (and incompatible) "
+                               "version of RapMap.  Please re-build the index with a compatible version.");
+                cerrLog->flush(); 
+                std::exit(1);
+            }
+        }
+
+        // Read-only accessors, one per field.
+        IndexType indexType() const { return type_; }
+        std::string version() const { return versionString_; }
+        bool usesKmers() const { return usesKmers_; }
+        uint32_t kmerLen() const { return kmerLen_; }
+        bool bigSA() const { return bigSA_; }
+        bool perfectHash() const { return perfectHash_; }
+
+    private:
+        // NOTE: the constructors' initializer lists follow this declaration
+        // order; keep both in sync when adding a field.
+        // The type of index we have
+        IndexType type_;
+        // The version string for the index
+        std::string versionString_;
+        // True if this index makes use of k-mers false otherwise
+        // (currently, all supported indices use k-mers in some form)
+        bool usesKmers_;
+        // The length of k-mer used by the index
+        uint32_t kmerLen_;
+        // Do we have a 64-bit suffix array or not
+        bool bigSA_;
+        // Are we using a perfect hash in the index or not?
+        bool perfectHash_;
+};
+
+
+#endif // __INDEX_HEADER_HPP__
diff --git a/debian/rapmap/JFRaw.hpp b/debian/rapmap/JFRaw.hpp
new file mode 100644
index 0000000..4efa052
--- /dev/null
+++ b/debian/rapmap/JFRaw.hpp
@@ -0,0 +1,30 @@
+#ifndef __JF_RAW_H__
+#define __JF_RAW_H__
+
+#include "jellyfish/file_header.hpp"
+// Type for values
+/*
+struct value_type {
+  char foo;
+  int  bar;
+  bool baz;
+};
+*/
+
+// A jellyfish file header that additionally records the size, in bytes, of
+// the hash array it describes (stored under the "size_bytes" key).
+class SpecialHeader : public jellyfish::file_header {
+public:
+  SpecialHeader() = default;
+
+  // Parses an existing header from the given stream.
+  SpecialHeader(std::istream& in) : jellyfish::file_header(in) { }
+
+  // Lets the base class record its usual information about `ary`, then adds
+  // the array's byte size on top.
+  template<typename ArrayT>
+  void update_from_ary(const ArrayT& ary) {
+    jellyfish::file_header::update_from_ary(ary);
+    const Json::UInt64 numBytes = static_cast<Json::UInt64>(ary.size_bytes());
+    root_["size_bytes"] = numBytes;
+  }
+
+  // Byte size previously stored by update_from_ary() or read from a stream.
+  size_t size_bytes() const {
+    return root_["size_bytes"].asLargestUInt();
+  }
+};
+
+#endif /* __JF_RAW_H__ */
diff --git a/debian/rapmap/RapMapConfig.hpp b/debian/rapmap/RapMapConfig.hpp
new file mode 100644
index 0000000..df7d935
--- /dev/null
+++ b/debian/rapmap/RapMapConfig.hpp
@@ -0,0 +1,14 @@
+#ifndef __RAPMAP_CONFIG_HPP__
+#define __RAPMAP_CONFIG_HPP__
+
+#include <cstdint>
+#include <string>
+
+// Compile-time version constants for RapMap.
+namespace rapmap {
+    // Components of the release version.  `version` is spelled out by hand and
+    // has to be kept equal to "<major>.<minor>.<patch>".
+    constexpr char majorVersion[] = "0";
+    constexpr char minorVersion[] = "3";
+    constexpr char patchVersion[] = "0";
+    constexpr char version [] = "0.3.0";
+    // NOTE(review): presumably the version of the on-disk index format, bumped
+    // whenever that format changes incompatibly -- confirm against the indexer.
+    // uint32_t comes from <cstdint>, which must be included explicitly rather
+    // than relying on <string> to pull it in.
+    constexpr uint32_t indexVersion = 2;
+}
+
+#endif //__RAPMAP_CONFIG_HPP__
diff --git a/debian/rapmap/RapMapFileSystem.cpp b/debian/rapmap/RapMapFileSystem.cpp
new file mode 100644
index 0000000..66e246b
--- /dev/null
+++ b/debian/rapmap/RapMapFileSystem.cpp
@@ -0,0 +1,37 @@
+#include "RapMapFileSystem.hpp"
+#include <sys/stat.h>
+
+
+namespace rapmap {
+    // Minimal stat()/mkdir() based filesystem helpers.
+    namespace fs {
+
+        // Taken from http://stackoverflow.com/questions/12774207/fastest-way-to-check-if-a-file-exist-using-standard-c-c11-c
+        // Returns true when `path` can be stat()'d and refers to a regular file;
+        // false for anything else, including any stat() failure.
+        bool FileExists(const char *path) {
+            struct stat fileStat;
+            return (stat(path, &fileStat) == 0) and S_ISREG(fileStat.st_mode);
+        }
+
+        // Taken from http://stackoverflow.com/questions/12774207/fastest-way-to-check-if-a-file-exist-using-standard-c-c11-c
+        // Returns true when `path` can be stat()'d and refers to a directory;
+        // false for anything else, including any stat() failure.
+        bool DirExists(const char *path) {
+            struct stat fileStat;
+            return (stat(path, &fileStat) == 0) and S_ISDIR(fileStat.st_mode);
+        }
+
+        // Creates the single directory `path` (parents are not created).  The
+        // result of mkdir() is deliberately ignored, as before; callers that
+        // care can follow up with DirExists().
+        void MakeDir(const char* path) {
+            // rwx for user, group and others (0777, still subject to the umask).
+            // Spelled with the POSIX mode bits because ACCESSPERMS is a
+            // non-standard macro that not every libc defines.
+            mkdir(path, S_IRWXU | S_IRWXG | S_IRWXO);
+        }
+
+    }
+}
diff --git a/debian/rapmap/RapMapFileSystem.hpp b/debian/rapmap/RapMapFileSystem.hpp
new file mode 100644
index 0000000..0292128
--- /dev/null
+++ b/debian/rapmap/RapMapFileSystem.hpp
@@ -0,0 +1,15 @@
+#ifndef __RAPMAP_FILESYSTEM_HPP__
+#define __RAPMAP_FILESYSTEM_HPP__
+
+namespace rapmap {
+    // Small filesystem helpers; implemented in RapMapFileSystem.cpp on top of
+    // stat() and mkdir().
+    namespace fs {
+        // Taken from http://stackoverflow.com/questions/12774207/fastest-way-to-check-if-a-file-exist-using-standard-c-c11-c
+        // True when `path` exists and is a regular file.
+        bool FileExists(const char *path);
+        // Taken from http://stackoverflow.com/questions/12774207/fastest-way-to-check-if-a-file-exist-using-standard-c-c11-c
+        // True when `path` exists and is a directory.
+        bool DirExists(const char *path);
+        // Creates the directory `path`.  Nothing is returned, so a failure is
+        // not reported to the caller.
+        void MakeDir(const char* path);
+    }
+}
+
+
+#endif //__RAPMAP_FILESYSTEM_HPP__
diff --git a/debian/rapmap/RapMapIndex.hpp b/debian/rapmap/RapMapIndex.hpp
new file mode 100644
index 0000000..3994d8d
--- /dev/null
+++ b/debian/rapmap/RapMapIndex.hpp
@@ -0,0 +1,52 @@
+#ifndef __RAP_MAP_INDEX_HPP__
+#define __RAP_MAP_INDEX_HPP__
+
+#include <fstream>
+#include <memory>
+
+//#include "jellyfish/jellyfish.hpp"
+#include "jellyfish/file_header.hpp"
+#include "jellyfish/binary_dumper.hpp"
+#include "jellyfish/hash_counter.hpp"
+#include "jellyfish/mapped_file.hpp"
+#include "JFRaw.hpp"
+
+#include "spdlog/spdlog.h"
+
+#include <cereal/types/unordered_map.hpp>
+#include <cereal/types/vector.hpp>
+#include <cereal/types/string.hpp>
+#include <cereal/archives/binary.hpp>
+
+#include "RapMapUtils.hpp"
+#include "ScopedTimer.hpp"
+
+class RapMapIndex { // k-mer hash based index; all data members are public
+    using PositionList = std::vector<uint32_t>;
+    using KmerInfoList = std::vector<rapmap::utils::KmerInfo>;
+    using EqClassList = std::vector<rapmap::utils::EqClass>;
+    //using MerMapT = jellyfish::cooperative::hash_counter<rapmap::utils::my_mer>;
+    using FileMerArray = jellyfish::large_hash::array_raw<rapmap::utils::my_mer>;
+    using EqClassLabelVec = std::vector<uint32_t>;
+
+    //using KmerIndex = std::unordered_map<uint64_t, TranscriptList, rapmap::utils::KmerKeyHasher>;
+    //using IntervalIndex = std::unordered_map<uint64_t, rapmap::utils::KmerInterval, rapmap::utils::KmerKeyHasher>;
+
+    public:
+    RapMapIndex();
+
+    bool load(std::string& indexPrefix); // defined elsewhere; presumably fills the members below from files named after indexPrefix — TODO confirm
+
+    KmerInfoList kmerInfos;
+    std::unique_ptr<char> rawHashMem{nullptr}; // NOTE(review): unique_ptr<char> releases with delete, not delete[] — confirm how this buffer is allocated
+    std::unique_ptr<FileMerArray> merHash{nullptr}; // starts out empty
+    EqClassList eqClassList;
+    EqClassLabelVec eqLabelList;
+    PositionList posList;
+    std::vector<std::string> txpNames; // presumably one name per transcript — verify against load()
+    std::vector<uint32_t> txpLens; // presumably one length per transcript — verify against load()
+    std::vector<uint8_t> fwdJumpTable;
+    std::vector<uint8_t> revJumpTable;
+};
+
+#endif //__RAP_MAP_INDEX_HPP__
diff --git a/debian/rapmap/RapMapSAIndex.cpp b/debian/rapmap/RapMapSAIndex.cpp
new file mode 100644
index 0000000..2e97122
--- /dev/null
+++ b/debian/rapmap/RapMapSAIndex.cpp
@@ -0,0 +1,177 @@
+#include "BooMap.hpp"
+#include "RapMapSAIndex.hpp"
+#include "IndexHeader.hpp"
+#include <cereal/types/unordered_map.hpp>
+#include <cereal/types/vector.hpp>
+#include <cereal/types/string.hpp>
+#include <cereal/archives/binary.hpp>
+#include <cereal/archives/json.hpp>
+
+
+#include <future>
+#include <thread>
+
+// These are **free** functions that are used for loading the
+// appropriate type of hash.
+// Load the dense-hash k-mer table from "<indexDir>hash.bin"; returns false if the file cannot be opened or parsed.
+template <typename IndexT>
+bool loadHashFromIndex(const std::string& indexDir,
+                       google::dense_hash_map<uint64_t,
+                       rapmap::utils::SAInterval<IndexT>,
+                       rapmap::utils::KmerKeyHasher>& khash) {
+      khash.set_empty_key(std::numeric_limits<uint64_t>::max());
+      // The indexer writes hash.bin with std::ios::binary, so read it back the same way.
+      std::ifstream hashStream(indexDir + "hash.bin", std::ios::binary);
+      return hashStream.is_open() and khash.unserialize(typename google::dense_hash_map<uint64_t,
+                      rapmap::utils::SAInterval<IndexT>, rapmap::utils::KmerKeyHasher>::NopointerSerializer(), &hashStream);
+}
+
+template <typename IndexT>
+bool loadHashFromIndex(const std::string& indexDir, BooMap<uint64_t, rapmap::utils::SAInterval<IndexT>> & h) {
+    // BooMap overload: the perfect hash is stored under the "<indexDir>hash_info" prefix.
+    std::string booPrefix = indexDir;
+    booPrefix.append("hash_info");
+    h.load(booPrefix); return true; // the outcome of load() is not inspected, so success is always reported
+}
+
+template <typename IndexT, typename HashT>
+RapMapSAIndex<IndexT, HashT>::RapMapSAIndex() {} // intentionally empty: no setup happens at construction time
+
+// Given a position, p, in the concatenated text,
+// return the corresponding transcript
+template <typename IndexT, typename HashT>
+IndexT RapMapSAIndex<IndexT, HashT>::transcriptAtPosition(IndexT p) {
+    return (*rankDict).rank(p); // dereferences rankDict, so it must already have been set
+}
+
+// Load all index components stored under the path prefix `indDir`; file names
+// are appended verbatim, so `indDir` is expected to end with a path separator.
+// Returns true on success. An unreadable rank-select file or a failed hash
+// load is fatal: the error is logged and the process exits with status 1.
+template <typename IndexT, typename HashT>
+bool RapMapSAIndex<IndexT, HashT>::load(const std::string& indDir) {
+
+    auto logger = spdlog::get("stderrLog");
+
+    // Only the k-mer length is taken from the JSON header here.
+    IndexHeader h;
+    std::ifstream indexStream(indDir + "header.json");
+    {
+      cereal::JSONInputArchive ar(indexStream);
+      ar(h);
+    }
+    indexStream.close();
+    uint32_t idxK = h.kmerLen();
+
+    // This part takes the longest, so do it in its own asynchronous task.
+    // Overload resolution on the type of khash picks the matching loader.
+    std::future<bool> loadingHash = std::async(std::launch::async, [this, logger, indDir]() -> bool {
+        if (loadHashFromIndex(indDir, khash)) {
+            logger->info("Successfully loaded position hash");
+            return true;
+        } else {
+            logger->error("Failed to load position hash!");
+            return false;
+        }
+    });
+
+    /*
+    std::ifstream intervalStream(indDir + "kintervals.bin");
+    {
+        logger->info("Loading k-mer intervals");
+        cereal::BinaryInputArchive intervalArchive(intervalStream);
+        intervalArchive(kintervals);
+    }
+    intervalStream.close();
+    */
+
+    // The remaining files are read on this thread while the hash loads in the
+    // background; they are opened in binary mode, matching how they are written.
+    std::ifstream saStream(indDir + "sa.bin", std::ios::binary);
+    {
+        logger->info("Loading Suffix Array ");
+        cereal::BinaryInputArchive saArchive(saStream);
+        saArchive(SA);
+        //saArchive(LCP);
+    }
+    saStream.close();
+
+    std::ifstream seqStream(indDir + "txpInfo.bin", std::ios::binary);
+    {
+        logger->info("Loading Transcript Info ");
+        cereal::BinaryInputArchive seqArchive(seqStream);
+        seqArchive(txpNames);
+        seqArchive(txpOffsets);
+        //seqArchive(positionIDs);
+        seqArchive(seq);
+    }
+    seqStream.close();
+
+    /*
+       std::ifstream rsStream(indDir + "rsdSafe.bin", std::ios::binary);
+       {
+       logger->info("Loading Rank-Select Data");
+       rankDictSafe.Load(rsStream);
+       }
+       rsStream.close();
+       */
+    std::string rsFileName = indDir + "rsd.bin";
+    FILE* rsFile = fopen(rsFileName.c_str(), "rb");
+    // fopen() yields NULL for a missing or unreadable file; stop before that
+    // handle reaches bit_array_load() or fclose().
+    if (rsFile == nullptr) {
+        logger->error("Couldn't open bit array file {}!", rsFileName);
+        std::exit(1);
+    }
+    {
+        logger->info("Loading Rank-Select Bit Array");
+        bitArray.reset(bit_array_create(0));
+        if (!bit_array_load(bitArray.get(), rsFile)) {
+            logger->error("Couldn't load bit array from {}!", rsFileName);
+            std::exit(1);
+        }
+        logger->info("There were {} set bits in the bit array", bit_array_num_bits_set(bitArray.get()));
+        rankDict.reset(new rank9b(bitArray->words, bitArray->num_of_bits));
+    }
+    fclose(rsFile);
+
+    {
+        logger->info("Computing transcript lengths");
+        txpLens.resize(txpOffsets.size());
+        if (txpOffsets.size() > 1) {
+            for(size_t i = 0; i < txpOffsets.size() - 1; ++i) {
+                auto nextOffset = txpOffsets[i+1];
+                auto currentOffset = txpOffsets[i];
+                txpLens[i] = (nextOffset - 1) - currentOffset;
+            }
+        }
+        // The last length is just the length of the suffix array - the last offset.
+        // With no transcripts there is no last entry to fill in, and size() - 1
+        // would wrap around to an out-of-range index.
+        if (!txpOffsets.empty()) {
+            txpLens[txpOffsets.size()-1] = (SA.size() - 1) - txpOffsets[txpOffsets.size() - 1];
+        }
+    }
+
+    logger->info("Waiting to finish loading hash");
+    loadingHash.wait();
+    auto hashLoadRes = loadingHash.get();
+    if (!hashLoadRes) {
+        logger->error("Failed to load hash!");
+        std::exit(1);
+    }
+    // Hand the k-mer length read from the header to my_mer.
+    rapmap::utils::my_mer::k(idxK);
+
+    logger->info("Done loading index");
+    return true;
+}
+
+template class RapMapSAIndex<int32_t,  google::dense_hash_map<uint64_t,
+                      rapmap::utils::SAInterval<int32_t>,
+                      rapmap::utils::KmerKeyHasher>>;
+template class RapMapSAIndex<int64_t,  google::dense_hash_map<uint64_t,
+                      rapmap::utils::SAInterval<int64_t>,
+                      rapmap::utils::KmerKeyHasher>>;
+template class RapMapSAIndex<int32_t, BooMap<uint64_t, rapmap::utils::SAInterval<int32_t>>>;
+template class RapMapSAIndex<int64_t, BooMap<uint64_t, rapmap::utils::SAInterval<int64_t>>>;
diff --git a/debian/rapmap/RapMapSAIndex.hpp b/debian/rapmap/RapMapSAIndex.hpp
new file mode 100644
index 0000000..075846e
--- /dev/null
+++ b/debian/rapmap/RapMapSAIndex.hpp
@@ -0,0 +1,63 @@
+#ifndef __RAPMAP_SA_INDEX_HPP__
+#define __RAPMAP_SA_INDEX_HPP__
+
+#include <cereal/types/unordered_map.hpp>
+#include <cereal/types/vector.hpp>
+#include <cereal/types/string.hpp>
+#include <cereal/archives/binary.hpp>
+
+#include "spdlog/spdlog.h"
+#include "spdlog/fmt/bundled/format.h"
+
+#include "google/dense_hash_map"
+#include "bit_array.h"
+//#include "bitmap.h"
+//#include "shared.h"
+#include "rank9b.h"
+
+#include <cstdio>
+#include <vector>
+#include <memory>
+
+#include <fstream>
+#include "RapMapUtils.hpp"
+
+template <typename IndexT, typename HashT>
+class RapMapSAIndex { // suffix-array index; IndexT is the position type and HashT the k-mer lookup table type
+    public:
+    using IndexType = IndexT;
+    using HashType = HashT;
+
+      struct BitArrayDeleter { // deleter that lets unique_ptr release a BIT_ARRAY through bit_array_free()
+        void operator()(BIT_ARRAY* b) {
+          if(b != nullptr) {
+            bit_array_free(b);
+          }
+        }
+      };
+
+	  using BitArrayPointer = std::unique_ptr<BIT_ARRAY, BitArrayDeleter>;
+
+    RapMapSAIndex();
+
+  	// Given a position, p, in the concatenated text,
+  	// return the corresponding transcript
+  	IndexT transcriptAtPosition(IndexT p);
+
+    bool load(const std::string& indDir); // reads the index files whose names start with indDir
+
+    std::vector<IndexT> SA; // the suffix array
+
+    BitArrayPointer bitArray{nullptr}; // empty until a bit array is assigned
+    std::unique_ptr<rank9b> rankDict{nullptr}; // empty until assigned; transcriptAtPosition() relies on it
+
+    std::string seq; // presumably the concatenated transcript text — TODO confirm
+    std::vector<std::string> txpNames;
+    std::vector<IndexT> txpOffsets; // presumably the start of each transcript within seq — TODO confirm
+    std::vector<IndexT> txpLens;
+    std::vector<IndexT> positionIDs;
+    std::vector<rapmap::utils::SAIntervalWithKey<IndexT>> kintervals;
+    HashT khash; // the k-mer lookup table
+};
+
+#endif //__RAPMAP_SA_INDEX_HPP__
diff --git a/debian/rapmap/RapMapSAIndexer.cpp b/debian/rapmap/RapMapSAIndexer.cpp
new file mode 100644
index 0000000..83a1491
--- /dev/null
+++ b/debian/rapmap/RapMapSAIndexer.cpp
@@ -0,0 +1,731 @@
+#include <algorithm>
+#include <cctype>
+#include <cstdio>
+#include <fstream>
+#include <iostream>
+#include <iterator>
+#include <memory>
+#include <mutex>
+#include <random>
+#include <type_traits>
+#include <unordered_map>
+#include <vector>
+
+#include "tclap/CmdLine.h"
+
+#include <cereal/archives/binary.hpp>
+#include <cereal/archives/json.hpp>
+#include <cereal/types/string.hpp>
+#include <cereal/types/unordered_map.hpp>
+#include <cereal/types/utility.hpp>
+#include <cereal/types/vector.hpp>
+
+#include "BooMap.hpp"
+#include "xxhash.h"
+
+#include "spdlog/spdlog.h"
+
+// Jellyfish 2 include
+#include "jellyfish/mer_dna.hpp"
+#include "jellyfish/stream_manager.hpp"
+#include "jellyfish/whole_sequence_parser.hpp"
+
+#include "divsufsort.h"
+#include "divsufsort64.h"
+
+#include "RapMapFileSystem.hpp"
+#include "RapMapUtils.hpp"
+#include "ScopedTimer.hpp"
+#include "bit_array.h"
+
+#include "JFRaw.hpp"
+#include "jellyfish/binary_dumper.hpp"
+#include "jellyfish/file_header.hpp"
+#include "jellyfish/hash_counter.hpp"
+#include "jellyfish/mer_iterator.hpp"
+#include "jellyfish/mer_overlap_sequence_parser.hpp"
+#include "jellyfish/thread_exec.hpp"
+#include "rank9b.h"
+
+#include "sparsehash/dense_hash_map"
+
+#include "IndexHeader.hpp"
+
+#include <chrono>
+
+using stream_manager =
+    jellyfish::stream_manager<std::vector<std::string>::const_iterator>;
+using single_parser = jellyfish::whole_sequence_parser<stream_manager>;
+using TranscriptID = uint32_t;
+using TranscriptIDVector = std::vector<TranscriptID>;
+using KmerIDMap = std::vector<TranscriptIDVector>;
+using MerMapT = jellyfish::cooperative::hash_counter<rapmap::utils::my_mer>;
+
+// Build the 64-bit suffix array of the first `tlen` characters of `concatText`
+// with divsufsort64() and serialize it to "<outputDir>sa.bin". Returns true
+// once the array is built and the file is written and closed cleanly; a
+// non-zero return code from divsufsort64() exits the process with status 1.
+bool buildSA(const std::string& outputDir, std::string& concatText, size_t tlen,
+             std::vector<int64_t>& SA) {
+  bool success{false};
+
+  std::ofstream saStream(outputDir + "sa.bin", std::ios::binary);
+  {
+    ScopedTimer timer;
+    SA.resize(tlen, 0);
+    std::cerr << "Building suffix array . . . ";
+    auto ret = divsufsort64(
+        reinterpret_cast<unsigned char*>(const_cast<char*>(concatText.data())),
+        SA.data(), tlen);
+
+    success = (ret == 0);
+    if (success) {
+      std::cerr << "success\n";
+      {
+        ScopedTimer timer2;
+        std::cerr << "saving to disk . . . ";
+        cereal::BinaryOutputArchive saArchive(saStream);
+        saArchive(SA);
+        std::cerr << "done\n";
+      }
+    } else {
+      std::cerr << "FAILURE: return code from libdivsufsort64() was " << ret
+                << "\n";
+      saStream.close();
+      std::exit(1);
+    }
+    std::cerr << "done\n";
+  }
+  saStream.close();
+  // A stream that failed while writing or closing means sa.bin is unusable.
+  return success and !saStream.fail();
+}
+
+// IndexT is the index type.
+// int32_t for "small" suffix arrays
+// int64_t for "large" ones
+// Walk the suffix array once, group consecutive suffixes that share the same
+// valid k-mer prefix (k characters, no '$') into half-open SA intervals
+// [start, stop), and add them to a BooMap keyed by the k-mer's 2*k-bit encoding.
+// The map is built via build(numHashThreads) and saved under "<outputDir>hash_info".
+template <typename IndexT>
+bool buildPerfectHash(const std::string& outputDir, std::string& concatText,
+                      size_t tlen, uint32_t k, std::vector<IndexT>& SA,
+                      uint32_t numHashThreads) {
+  BooMap<uint64_t, rapmap::utils::SAInterval<IndexT>> intervals;
+
+  // The start and stop of the current interval
+  IndexT start = 0, stop = 0;
+  // The current k-mer as a string
+  rapmap::utils::my_mer mer;
+  std::string currentKmer;
+  std::string nextKmer;
+  while (stop < tlen) {
+    // Check if the string starting at the
+    // current position is valid (i.e. doesn't contain $)
+    // and is <= k bases from the end of the string
+    nextKmer = concatText.substr(SA[stop], k);
+    if (nextKmer.length() == k and
+        nextKmer.find_first_of('$') == std::string::npos) {
+      // If this is a new k-mer, then hash the current k-mer
+      if (nextKmer != currentKmer) {
+        if (currentKmer.length() == k and
+            currentKmer.find_first_of('$') == std::string::npos) {
+          mer = rapmap::utils::my_mer(currentKmer);
+          auto bits = mer.get_bits(0, 2 * k);
+          intervals.add(std::move(bits), {start, stop});
+          // push_back(std::make_pair<uint64_t,
+          // rapmap::utils::SAInterval<IndexT>>(std::move(bits), {start,
+          // stop}));
+        }
+        currentKmer = nextKmer;
+        start = stop;
+      }
+    } else {
+      // If this isn't a valid suffix (contains a $)
+      // If the previous interval was valid, put it
+      // in the hash.
+      if (currentKmer.length() == k and
+          currentKmer.find_first_of('$') == std::string::npos) {
+        mer = rapmap::utils::my_mer(currentKmer);
+        auto bits = mer.get_bits(0, 2 * k);
+        // intervals.push_back(std::make_pair<uint64_t,
+        // rapmap::utils::SAInterval<IndexT>>(std::move(bits), {start, stop}));
+        intervals.add(std::move(bits), {start, stop});
+      }
+      // The current interval is invalid and empty
+      currentKmer = nextKmer;
+      start = stop;
+    }
+    if (stop % 1000000 == 0) {
+      std::cerr << "\r\rprocessed " << stop << " positions";
+    }
+    // We always update the end position
+    ++stop;
+  }
+  if (start < tlen) {
+    if (currentKmer.length() == k and
+        currentKmer.find_first_of('$') == std::string::npos) {
+      mer = rapmap::utils::my_mer(currentKmer);
+      auto bits = mer.get_bits(0, 2 * k);
+      // The loop only stores an interval once the next k-mer shows up, so the
+      // final valid k-mer (one without '$') has to be stored here.
+      intervals.add(std::move(bits), {start, stop});
+    }
+  }
+
+  // std::cerr << "\nthere are " << intervals.size() << " intervals of the
+  // selected depth\n";
+
+  std::cout << "building perfect hash function\n";
+  intervals.build(numHashThreads);
+  std::cout << "\ndone.\n";
+  std::string outputPrefix = outputDir + "hash_info";
+  std::cout << "saving the perfect hash and SA intervals to disk ... ";
+  intervals.save(outputPrefix);
+  std::cout << "done.\n";
+
+  return true;
+}
+
+// Build the 32-bit suffix array of the first `tlen` characters of `concatText`
+// with divsufsort() and serialize it to "<outputDir>sa.bin". Returns true
+// once the array is built and the file is written and closed cleanly; a
+// non-zero return code from divsufsort() exits the process with status 1.
+bool buildSA(const std::string& outputDir, std::string& concatText, size_t tlen,
+             std::vector<int32_t>& SA) {
+  bool success{false};
+
+  std::ofstream saStream(outputDir + "sa.bin", std::ios::binary);
+  {
+    ScopedTimer timer;
+    SA.resize(tlen, 0);
+    std::cerr << "Building suffix array . . . ";
+    auto ret = divsufsort(
+        reinterpret_cast<unsigned char*>(const_cast<char*>(concatText.data())),
+        SA.data(), tlen);
+
+    success = (ret == 0);
+    if (success) {
+      std::cerr << "success\n";
+      {
+        ScopedTimer timer2;
+        std::cerr << "saving to disk . . . ";
+        cereal::BinaryOutputArchive saArchive(saStream);
+        saArchive(SA);
+        std::cerr << "done\n";
+      }
+    } else {
+      std::cerr << "FAILURE: return code from libdivsufsort() was " << ret
+                << "\n";
+      saStream.close();
+      std::exit(1);
+    }
+    std::cerr << "done\n";
+  }
+  saStream.close();
+  // A stream that failed while writing or closing means sa.bin is unusable.
+  return success and !saStream.fail();
+}
+
+// IndexT is the index type.
+// int32_t for "small" suffix arrays
+// int64_t for "large" ones
+// Walk the suffix array once and record, for every valid k-mer (k characters,
+// no '$'), the half-open SA interval [start, stop) of suffixes that begin with
+// it. The table is a google::dense_hash_map keyed by the k-mer's 2*k-bit
+// encoding and is serialized to "<outputDir>hash.bin". Always returns true.
+template <typename IndexT>
+bool buildHash(const std::string& outputDir, std::string& concatText,
+               size_t tlen, uint32_t k, std::vector<IndexT>& SA) {
+  // Now, build the k-mer lookup table
+  google::dense_hash_map<uint64_t, rapmap::utils::SAInterval<IndexT>,
+                         rapmap::utils::KmerKeyHasher>
+      khash;
+  khash.set_empty_key(std::numeric_limits<uint64_t>::max());
+
+  // The start and stop of the current interval
+  IndexT start = 0, stop = 0;
+  // The current k-mer as a string
+  rapmap::utils::my_mer mer;
+  std::string currentKmer;
+  std::string nextKmer;
+  while (stop < tlen) {
+    // Check if the string starting at the
+    // current position is valid (i.e. doesn't contain $)
+    // and is <= k bases from the end of the string
+    nextKmer = concatText.substr(SA[stop], k);
+    if (nextKmer.length() == k and
+        nextKmer.find_first_of('$') == std::string::npos) {
+      // If this is a new k-mer, then hash the current k-mer
+      if (nextKmer != currentKmer) {
+        if (currentKmer.length() == k and
+            currentKmer.find_first_of('$') == std::string::npos) {
+          mer = rapmap::utils::my_mer(currentKmer);
+          auto bits = mer.get_bits(0, 2 * k);
+          auto hashIt = khash.find(bits);
+          if (hashIt == khash.end()) {
+            if (start > 1) {
+              if (concatText.substr(SA[start - 1], k) ==
+                  concatText.substr(SA[start], k)) {
+                std::cerr << "T[SA[" << start - 1 << "]:" << k
+                          << "] = " << concatText.substr(SA[start - 1], k)
+                          << " = T[SA[" << start << "]:" << k << "]\n";
+                std::cerr << "start = " << start << ", stop = " << stop << "\n";
+                std::cerr << "[fatal (1)] THIS SHOULD NOT HAPPEN\n";
+                std::exit(1);
+              }
+            }
+            if (start == stop) {
+              std::cerr << "[fatal (2)] Interval is empty! (start = " << start
+                        << ") = (stop =  " << stop << ")\n";
+            }
+            if (start == stop) {
+              std::cerr << "[fatal (3)] Interval is empty! (start = " << start
+                        << ") = (stop =  " << stop << ")\n";
+            }
+
+            khash[bits] = {start, stop};
+          } else {
+            std::cerr << "\nERROR (1): trying to add same suffix "
+                      << currentKmer << " (len = " << currentKmer.length()
+                      << ") multiple times!\n";
+            auto prevInt = hashIt->second;
+            std::cerr << "existing interval is [" << prevInt.begin << ", "
+                      << prevInt.end << ")\n";
+            for (auto x = prevInt.begin; x < prevInt.end; ++x) {
+              auto suff = concatText.substr(SA[x], k);
+              for (auto c : suff) {
+                std::cerr << "*" << c << "*";
+              }
+              std::cerr << " (len = " << suff.length() << ")\n";
+            }
+            std::cerr << "new interval is [" << start << ", " << stop << ")\n";
+            for (auto x = start; x < stop; ++x) {
+              auto suff = concatText.substr(SA[x], k);
+              for (auto c : suff) {
+                std::cerr << "*" << c << "*";
+              }
+              std::cerr << "\n";
+            }
+          }
+        }
+        currentKmer = nextKmer;
+        start = stop;
+      }
+    } else {
+      // If this isn't a valid suffix (contains a $)
+      // If the previous interval was valid, put it
+      // in the hash.
+      if (currentKmer.length() == k and
+          currentKmer.find_first_of('$') == std::string::npos) {
+        mer = rapmap::utils::my_mer(currentKmer);
+        auto bits = mer.get_bits(0, 2 * k);
+        auto hashIt = khash.find(bits);
+        if (hashIt == khash.end()) {
+          if (start > 2) {
+            if (concatText.substr(SA[start - 1], k) ==
+                concatText.substr(SA[start], k)) {
+              std::cerr << "T[SA[" << start - 1 << "]:" << k
+                        << "] = " << concatText.substr(SA[start - 1], k)
+                        << " = T[SA[" << start << "]:" << k << "]\n";
+              std::cerr << "start = " << start << ", stop = " << stop << "\n";
+              std::cerr << "[fatal (4)] THIS SHOULD NOT HAPPEN\n";
+              std::exit(1);
+            }
+          }
+          khash[bits] = {start, stop};
+        } else {
+          std::cerr << "\nERROR (2): trying to add same suffix " << currentKmer
+                    << "multiple times!\n";
+          auto prevInt = hashIt->second;
+          std::cerr << "existing interval is [" << prevInt.begin << ", "
+                    << prevInt.end << ")\n";
+          for (auto x = prevInt.begin; x < prevInt.end; ++x) {
+            std::cerr << concatText.substr(SA[x], k) << "\n";
+          }
+          std::cerr << "new interval is [" << start << ", " << stop << ")\n";
+          for (auto x = start; x < stop; ++x) {
+            std::cerr << concatText.substr(SA[x], k) << "\n";
+          }
+        }
+      }
+      // The current interval is invalid and empty
+      currentKmer = nextKmer;
+      start = stop;
+    }
+    if (stop % 1000000 == 0) {
+      std::cerr << "\r\rprocessed " << stop << " positions";
+    }
+    // We always update the end position
+    ++stop;
+  }
+  // The loop stores an interval only once the next k-mer shows up, so the final valid k-mer (one without '$') is stored here.
+  if (start < tlen) {
+    if (currentKmer.length() == k and
+        currentKmer.find_first_of('$') == std::string::npos) {
+      mer = rapmap::utils::my_mer(currentKmer);
+      khash[mer.get_bits(0, 2 * k)] = {start, stop};
+    }
+  }
+  std::cerr << "\nkhash had " << khash.size() << " keys\n";
+  std::ofstream hashStream(outputDir + "hash.bin", std::ios::binary);
+  {
+    ScopedTimer timer;
+    std::cerr << "saving hash to disk . . . ";
+    cereal::BinaryOutputArchive hashArchive(hashStream);
+    // hashArchive(k);
+    khash.serialize(typename google::dense_hash_map<
+                        uint64_t, rapmap::utils::SAInterval<IndexT>,
+                        rapmap::utils::KmerKeyHasher>::NopointerSerializer(),
+                    &hashStream);
+    // hashArchive(khash);
+    std::cerr << "done\n";
+  }
+  hashStream.close();
+  return true;
+}
+
+// To use the parser in the following, we get "jobs" until none is
+// available. A job behaves like a pointer to the type
+// jellyfish::sequence_list (see whole_sequence_parser.hpp).
+// Read every sequence from `parser`, normalize it (strip non-printable
+// characters, upper-case, replace non-ACGT bases at random, optionally clip
+// poly-A tails), concatenate the kept sequences with '$' terminators and write
+// rsd.bin, txpInfo.bin, sa.bin, the k-mer hash and header.json under `outputDir`.
+template <typename ParserT> //, typename CoverageCalculator>
+void indexTranscriptsSA(ParserT* parser, std::string& outputDir,
+                        bool noClipPolyA, bool usePerfectHash,
+                        uint32_t numHashThreads, std::mutex& iomutex,
+                        std::shared_ptr<spdlog::logger> log) {
+  // Seed with a real random value, if available
+  std::random_device rd;
+
+  // Create a random uniform distribution
+  std::default_random_engine eng(rd());
+
+  std::uniform_int_distribution<> dis(0, 3);
+
+  uint32_t n{0};
+  uint32_t k = rapmap::utils::my_mer::k();
+  std::vector<std::string> transcriptNames;
+  std::vector<int64_t> transcriptStarts;
+  constexpr char bases[] = {'A', 'C', 'G', 'T'};
+  uint32_t polyAClipLength{10};
+  uint32_t numPolyAsClipped{0};
+  uint32_t numNucleotidesReplaced{0};
+  std::string polyA(polyAClipLength, 'A');
+
+  bool clipPolyA = !noClipPolyA;
+
+  // http://biology.stackexchange.com/questions/21329/whats-the-longest-transcript-known
+  // longest human transcript is Titin (108861), so this gives us a *lot* of
+  // leeway before
+  // we issue any warning.
+  size_t tooLong = 200000;
+  size_t currIndex{0};
+  std::cerr << "\n[Step 1 of 4] : counting k-mers\n";
+
+  std::vector<uint64_t>
+      onePos; // Positions in the bit array where we should write a '1'
+  fmt::MemoryWriter txpSeqStream;
+  {
+    ScopedTimer timer;
+    while (true) {
+      typename ParserT::job j(*parser);
+      if (j.is_empty())
+        break;
+      for (size_t i = 0; i < j->nb_filled; ++i) { // For each sequence
+        std::string& readStr = j->data[i].seq;
+        readStr.erase(
+            std::remove_if(readStr.begin(), readStr.end(),
+                           [](const char a) -> bool { return !(isprint(static_cast<unsigned char>(a))); }), // <cctype> calls need an unsigned char value
+            readStr.end());
+
+        uint32_t readLen = readStr.size();
+        // First, replace non ATCG nucleotides
+        for (size_t b = 0; b < readLen; ++b) {
+          readStr[b] = ::toupper(static_cast<unsigned char>(readStr[b]));
+          int c = jellyfish::mer_dna::code(readStr[b]);
+          // Replace non-ACGT bases with pseudo-random bases
+          if (jellyfish::mer_dna::not_dna(c)) {
+            char rbase = bases[dis(eng)];
+            c = jellyfish::mer_dna::code(rbase);
+            readStr[b] = rbase;
+            ++numNucleotidesReplaced;
+          }
+        }
+
+        // Now, do Kallisto-esque clipping of polyA tails
+        if (clipPolyA) {
+          if (readStr.size() > polyAClipLength and
+              readStr.substr(readStr.length() - polyAClipLength) == polyA) {
+
+            auto newEndPos = readStr.find_last_not_of("Aa");
+            // If it was all As
+            if (newEndPos == std::string::npos) {
+              log->warn("Entry with header [{}] appeared to be all A's; it "
+                        "will be removed from the index!",
+                        j->data[i].header);
+              readStr.resize(0);
+            } else {
+              readStr.resize(newEndPos + 1);
+            }
+            ++numPolyAsClipped;
+          }
+        }
+
+        readLen = readStr.size();
+        // If the transcript was completely removed during clipping, don't
+        // include it in the index.
+        if (readStr.size() >= k) {
+          // If we're suspicious the user has fed in a *genome* rather
+          // than a transcriptome, say so here.
+          if (readStr.size() >= tooLong) {
+            log->warn("Entry with header [{}] was longer than {} nucleotides.  "
+                      "Are you certain that "
+                      "we are indexing a transcriptome and not a genome?",
+                      j->data[i].header, tooLong);
+          }
+
+          ++n;
+
+          // The name of the current transcript
+          auto& recHeader = j->data[i].header;
+          transcriptNames.emplace_back(
+              recHeader.substr(0, recHeader.find_first_of(" \t")));
+
+          // The position at which this transcript starts
+          transcriptStarts.push_back(currIndex);
+
+          txpSeqStream << readStr;
+          txpSeqStream << '$';
+          currIndex += readLen + 1;
+          onePos.push_back(currIndex - 1);
+        } else {
+            log->warn("Discarding entry with header [{}], since it was shorter than "
+                      "the k-mer length of {} (perhaps after poly-A clipping)", 
+                      j->data[i].header, k);
+        }
+      }
+      if (n % 10000 == 0) {
+        std::cerr << "\r\rcounted k-mers for " << n << " transcripts";
+      }
+    }
+  }
+  std::cerr << "\n";
+
+  std::cerr << "Replaced " << numNucleotidesReplaced
+            << " non-ATCG nucleotides\n";
+  std::cerr << "Clipped poly-A tails from " << numPolyAsClipped
+            << " transcripts\n";
+
+  // Put the concatenated text in a string
+  std::string concatText = txpSeqStream.str();
+  // And clear the stream
+  txpSeqStream.clear();
+
+  // Build the suffix array
+  size_t tlen = concatText.length();
+  size_t maxInt = std::numeric_limits<int32_t>::max();
+  bool largeIndex = (tlen + 1 > maxInt);
+
+  // Make our dense bit arrray
+  BIT_ARRAY* bitArray = bit_array_create(concatText.length());
+  for (auto p : onePos) {
+    bit_array_set_bit(bitArray, p);
+  }
+
+  onePos.clear();
+  onePos.shrink_to_fit();
+
+  std::string rsFileName = outputDir + "rsd.bin";
+  FILE* rsFile = fopen(rsFileName.c_str(), "w");
+  if (rsFile == nullptr) { // e.g. the output directory is missing or not writable
+    log->error("Couldn't open {} for writing!", rsFileName);
+    std::exit(1);
+  }
+  {
+    ScopedTimer timer;
+    std::cerr << "Building rank-select dictionary and saving to disk ";
+    bit_array_save(bitArray, rsFile);
+    std::cerr << "done\n";
+  }
+  fclose(rsFile);
+  bit_array_free(bitArray);
+
+  std::ofstream seqStream(outputDir + "txpInfo.bin", std::ios::binary);
+  {
+    ScopedTimer timer;
+    std::cerr << "Writing sequence data to file . . . ";
+    cereal::BinaryOutputArchive seqArchive(seqStream);
+    seqArchive(transcriptNames);
+    if (largeIndex) {
+      seqArchive(transcriptStarts);
+    } else {
+      std::vector<int32_t> txpStarts(transcriptStarts.size(), 0);
+      size_t numTranscriptStarts = transcriptStarts.size();
+      for (size_t i = 0; i < numTranscriptStarts; ++i) {
+        txpStarts[i] = static_cast<int32_t>(transcriptStarts[i]);
+      }
+      transcriptStarts.clear();
+      transcriptStarts.shrink_to_fit();
+      { seqArchive(txpStarts); }
+    }
+    // seqArchive(positionIDs);
+    seqArchive(concatText);
+    std::cerr << "done\n";
+  }
+  seqStream.close();
+
+  // clear stuff we no longer need
+  // positionIDs.clear();
+  // positionIDs.shrink_to_fit();
+  transcriptStarts.clear();
+  transcriptStarts.shrink_to_fit();
+  transcriptNames.clear();
+  transcriptNames.shrink_to_fit();
+  // done clearing
+
+  if (largeIndex) {
+    std::cerr << "[info] Building 64-bit suffix array "
+                 "(length of generalized text is "
+              << tlen << " )\n";
+    using IndexT = int64_t;
+    std::vector<IndexT> SA;
+    bool success = buildSA(outputDir, concatText, tlen, SA);
+    if (!success) {
+      std::cerr << "[fatal] Could not build the suffix array!\n";
+      std::exit(1);
+    }
+
+    if (usePerfectHash) {
+      success = buildPerfectHash<IndexT>(outputDir, concatText, tlen, k, SA,
+                                         numHashThreads);
+    } else {
+      success = buildHash<IndexT>(outputDir, concatText, tlen, k, SA);
+    }
+    if (!success) {
+      std::cerr << "[fatal] Could not build the suffix interval hash!\n";
+      std::exit(1);
+    }
+  } else {
+    std::cerr << "[info] Building 32-bit suffix array "
+                 "(length of generalized text is "
+              << tlen << ")\n";
+    using IndexT = int32_t;
+    std::vector<IndexT> SA;
+    bool success = buildSA(outputDir, concatText, tlen, SA);
+    if (!success) {
+      std::cerr << "[fatal] Could not build the suffix array!\n";
+      std::exit(1);
+    }
+
+    if (usePerfectHash) {
+      success = buildPerfectHash<IndexT>(outputDir, concatText, tlen, k, SA,
+                                         numHashThreads);
+    } else {
+      success = buildHash<IndexT>(outputDir, concatText, tlen, k, SA);
+    }
+    if (!success) {
+      std::cerr << "[fatal] Could not build the suffix interval hash!\n";
+      std::exit(1);
+    }
+  }
+
+  std::string indexVersion = "q3";
+  IndexHeader header(IndexType::QUASI, indexVersion, true, k, largeIndex,
+                     usePerfectHash);
+  // Finally (since everything presumably succeeded) write the header
+  std::ofstream headerStream(outputDir + "header.json");
+  {
+    cereal::JSONOutputArchive archive(headerStream);
+    archive(header);
+  }
+  headerStream.close();
+}
+
+int rapMapSAIndex(int argc, char* argv[]) { // Indexer entry point: parses options, prepares the index directory, calls indexTranscriptsSA; returns 0 on the normal path.
+  std::cerr << "RapMap Indexer\n";
+
+  TCLAP::CmdLine cmd("RapMap Indexer");
+  TCLAP::ValueArg<std::string> transcripts("t", "transcripts",
+                                           "The transcript file to be indexed",
+                                           true, "", "path"); // required argument
+  TCLAP::ValueArg<std::string> index(
+      "i", "index", "The location where the index should be written", true, "",
+      "path"); // required argument
+  TCLAP::ValueArg<uint32_t> kval("k", "klen", "The length of k-mer to index",
+                                 false, 31, "positive integer less than 32"); // optional, defaults to 31
+  TCLAP::SwitchArg noClip(
+      "n", "noClip",
+      "Don't clip poly-A tails from the ends of target sequences", false);
+  TCLAP::SwitchArg perfectHash(
+      "p", "perfectHash", "Use a perfect hash instead of dense hash --- "
+                          "somewhat slows construction, but uses less memory",
+      false);
+  TCLAP::ValueArg<uint32_t> numHashThreads(
+      "x", "numThreads",
+      "Use this many threads to build the perfect hash function", false, 4,
+      "positive integer <= # cores"); // optional, defaults to 4
+  cmd.add(transcripts);
+  cmd.add(index);
+  cmd.add(kval);
+  cmd.add(noClip);
+  cmd.add(perfectHash);
+  cmd.add(numHashThreads);
+  cmd.parse(argc, argv);
+
+  // Only one transcript file is accepted for now; it is wrapped in a vector for the stream manager below.
+  std::string transcriptFile(transcripts.getValue());
+  std::vector<std::string> transcriptFiles({transcriptFile});
+
+  uint32_t k = kval.getValue();
+  if (k % 2 == 0) { // rejects every even k, which includes k == 0
+    std::cerr << "Error: k must be an odd value, you chose " << k << '\n';
+    std::exit(1);
+  } else if (k > 31) {
+    std::cerr << "Error: k must not be larger than 31, you chose " << k << '\n';
+    std::exit(1);
+  }
+  rapmap::utils::my_mer::k(k); // sets the k-mer length on the static my_mer type
+
+  std::string indexDir = index.getValue();
+  if (indexDir.back() != '/') { // NOTE(review): back() on an empty string is undefined; an empty -i value is not rejected first
+    indexDir += '/'; // later code builds file names as indexDir + "name"
+  }
+  bool dirExists = rapmap::fs::DirExists(indexDir.c_str());
+  bool dirIsFile = rapmap::fs::FileExists(indexDir.c_str());
+  if (dirIsFile) {
+    std::cerr << "The requested index directory already exists as a file.";
+    std::exit(1);
+  }
+  if (!dirExists) {
+    rapmap::fs::MakeDir(indexDir.c_str()); // NOTE(review): the outcome of MakeDir is not checked here
+  }
+
+  std::string logPath = indexDir + "quasi_index.log";
+  auto fileSink = std::make_shared<spdlog::sinks::simple_file_sink_st>(logPath);
+  auto consoleSink = std::make_shared<spdlog::sinks::stderr_sink_st>();
+  auto consoleLog = spdlog::create("stderrLog", {consoleSink}); // not referenced again in this function
+  auto fileLog = spdlog::create("fileLog", {fileSink}); // not referenced again in this function
+  auto jointLog = spdlog::create("jointLog", {fileSink, consoleSink}); // the only logger handed to indexTranscriptsSA
+
+  size_t maxReadGroup{1000}; // Number of reads in each "job"
+  size_t concurrentFile{2};  // Number of files to read simultaneously
+  size_t numThreads{2}; // fixed value; only used to size the parser (4 * numThreads)
+  stream_manager streams(transcriptFiles.begin(), transcriptFiles.end(),
+                         concurrentFile);
+  std::unique_ptr<single_parser> transcriptParserPtr{nullptr};
+  transcriptParserPtr.reset(
+      new single_parser(4 * numThreads, maxReadGroup, concurrentFile, streams));
+
+  bool noClipPolyA = noClip.getValue();
+  bool usePerfectHash = perfectHash.getValue();
+  uint32_t numPerfectHashThreads = numHashThreads.getValue();
+  std::mutex iomutex;
+  indexTranscriptsSA(transcriptParserPtr.get(), indexDir, noClipPolyA,
+                     usePerfectHash, numPerfectHashThreads, iomutex, jointLog); // the parser stays owned by transcriptParserPtr
+  return 0;
+}
diff --git a/debian/rapmap/RapMapUtils.hpp b/debian/rapmap/RapMapUtils.hpp
new file mode 100644
index 0000000..cbae039
--- /dev/null
+++ b/debian/rapmap/RapMapUtils.hpp
@@ -0,0 +1,825 @@
+#ifndef __RAP_MAP_UTILS_HPP__
+#define __RAP_MAP_UTILS_HPP__
+
+#include <atomic>
+#include <cmath>
+#include <memory>
+#include "xxhash.h"
+#include <cereal/archives/binary.hpp>
+#include "jellyfish/mer_dna.hpp"
+#include "spdlog/spdlog.h"
+#include "spdlog/fmt/bundled/format.h"
+#include "PairSequenceParser.hpp"
+
+#ifdef RAPMAP_SALMON_SUPPORT
+#include "LibraryFormat.hpp"
+#endif
+
+#ifdef __GNUC__
+#define LIKELY(x) __builtin_expect((x),1)
+#define UNLIKELY(x) __builtin_expect((x),0)
+#else
+#define LIKELY(x) (x)
+#define UNLIKELY(x) (x)
+#endif
+
+// Must be forward-declared
+template <typename IndexT>
+class PairAlignmentFormatter;
+template <typename IndexT>
+class SingleAlignmentFormatter;
+
+// Forward-declare because the C++ compiler is dumb
+class RapMapIndex;
+
+namespace rapmap {
+    namespace utils {
+
+    using my_mer = jellyfish::mer_dna_ns::mer_base_static<uint64_t, 1>;
+
+    constexpr uint32_t newTxpSetMask = 0x80000000;
+    constexpr uint32_t rcSetMask = 0x40000000;
+
+    // Positions are stored in a packed format, where the highest
+    // 2-bits encode if this position refers to a new transcript
+    // and whether or not the k-mer from the hash matches this txp
+    // in the forward or RC direction.
+    void decodePosition(uint32_t p, uint32_t& pout, bool& newTxp, bool& isRC);
+
+    template <typename IndexT> // Builds the SAM header (@HD plus one @SQ per reference in rmi) and emits it through the logger `out`.
+        void writeSAMHeader(IndexT& rmi, std::shared_ptr<spdlog::logger> out) {
+            fmt::MemoryWriter hd;
+            hd.write("@HD\tVN:0.1\tSO:unknown\n");
+
+            auto& txpNames = rmi.txpNames; // reference names, indexed in parallel with txpLens
+            auto& txpLens = rmi.txpLens;
+
+            auto numRef = txpNames.size();
+            for (size_t i = 0; i < numRef; ++i) {
+                hd.write("@SQ\tSN:{}\tLN:{:d}\n", txpNames[i], txpLens[i]);
+            }
+            // Eventually output a @PG line
+            //hd.format("@PG\t");
+            std::string headerStr(hd.str());
+            // Don't include the last '\n', since the logger will do it for us.
+            headerStr.pop_back(); // safe: the @HD line above guarantees a non-empty string ending in '\n'
+            out->info("{}", headerStr); // fmt-style placeholder; a printf-style "%s" would be logged literally and headerStr dropped
+        }
+
+    template <typename IndexT> // Stream overload: writes the same @HD/@SQ header text directly to outStream.
+        void writeSAMHeader(IndexT& rmi, std::ostream& outStream) {
+            fmt::MemoryWriter hd;
+            hd.write("@HD\tVN:0.1\tSO:unknown\n");
+
+            auto& txpNames = rmi.txpNames; // reference names, indexed in parallel with txpLens
+            auto& txpLens = rmi.txpLens;
+
+            auto numRef = txpNames.size();
+            for (size_t i = 0; i < numRef; ++i) {
+                hd.write("@SQ\tSN:{}\tLN:{:d}\n", txpNames[i], txpLens[i]);
+            }
+            // Eventually output a @PG line
+            //hd.format("@PG\t");
+            outStream << hd.str(); // the trailing '\n' of the last header line is kept in this overload
+        }
+
+    // from http://stackoverflow.com/questions/9435385/split-a-string-using-c11
+    std::vector<std::string> tokenize(const std::string &s, char delim);
+
+    // A fmt buffer over caller-provided storage that refuses to grow; from https://github.com/cppformat/cppformat/issues/105
+    class FixedBuffer : public fmt::Buffer<char> {
+        public:
+            FixedBuffer(char *array, std::size_t size) // array/size describe the caller-owned storage handed to the base class
+                : fmt::Buffer<char>(array, size) {}
+
+        protected:
+            void grow(std::size_t size) { // the requested size is ignored: growth is always an error
+                throw std::runtime_error("buffer overflow");
+            }
+    };
+
+    class FixedWriter : public fmt::Writer { // A fmt writer whose output goes into a FixedBuffer over caller-provided storage.
+        private:
+            FixedBuffer buffer_;
+        public:
+            FixedWriter(char *array, std::size_t size)
+                : fmt::Writer(buffer_), buffer_(array, size) {} // the base is handed buffer_ before buffer_ is constructed; presumably it only stores the reference -- TODO confirm
+    };
+
+    /**
+     * Stores both the key (k-mer)
+     * and the interval to which it corresponds.
+     * This is useful if the hash itself doesn't validate
+     * the key (e.g. a minimum perfect hash).
+     **/
+    template <typename IndexT> // IndexT is the integer type used for the interval bounds
+    struct SAIntervalWithKey {
+        uint64_t kmer; // the key stored next to its interval (see the comment above the struct)
+      //  SAInterval<IndexT> second;
+        IndexT begin;
+        IndexT end;
+        template <typename Archive>
+            void load(Archive& ar) { ar(kmer, begin, end); } // reads the fields in the same order save() writes them
+
+        template <typename Archive>
+            void save(Archive& ar) const { ar(kmer, begin, end); } // presumably the cereal save hook -- TODO confirm
+    };
+
+    template <typename IndexT> // IndexT is the integer type used for the interval bounds
+    struct SAInterval { // A [begin, end) or [begin, end] pair; which one is not visible here -- TODO confirm
+      /*
+        SAInterval(IndexT beginIn, IndexT endIn) : begin(beginIn), end(endIn) {}
+	SAInterval(std::initializer_list<IndexT> il) {
+	  auto it = il.begin();
+	  begin = *(it);
+	  ++it;
+	  end = *(il.begin());
+	}
+	*/
+
+        IndexT begin;
+        IndexT end;
+        template <typename Archive>
+            void load(Archive& ar) { ar(begin, end); } // reads the fields in the same order save() writes them
+
+        template <typename Archive>
+            void save(Archive& ar) const { ar(begin, end); } // presumably the cereal save hook -- TODO confirm
+    };
+
+
+    struct HitCounters { // Atomic 64-bit counters, all starting at 0.
+        std::atomic<uint64_t> peHits{0}; // incremented by jointHits.size() at the end of mergeLeftRightHitsFuzzy
+        std::atomic<uint64_t> seHits{0}; // incremented when only one mate's hits are taken over
+        std::atomic<uint64_t> trueHits{0};
+        std::atomic<uint64_t> totHits{0};
+        std::atomic<uint64_t> numReads{0};
+        std::atomic<uint64_t> tooManyHits{0}; // incremented when a merge exceeds maxNumHits
+        std::atomic<uint64_t> lastPrint{0};
+    };
+
+    class JFMerKeyHasher{ // Hash functor for my_mer keys, built on XXH64 with seed 0.
+        public:
+            size_t operator()(const my_mer& m) const {
+                auto k = rapmap::utils::my_mer::k(); // current static k-mer length
+                auto v = m.get_bits(0, 2*k); // the first 2*k bits of the mer (two bits per base)
+                return XXH64(static_cast<void*>(&v), 8, 0); // hashes 8 bytes of v -- assumes get_bits returns a 64-bit value, TODO confirm
+            }
+    };
+
+    class KmerKeyHasher { // Hash functor for raw 64-bit keys, built on XXH64 with seed 0.
+        public:
+            size_t operator()(const uint64_t& m) const {
+                //auto k = rapmap::utils::my_mer::k();
+                //auto v = m.get_bits(0, 2*k);
+                auto v = m; // local copy so a non-const address can be passed to XXH64
+                return XXH64(static_cast<void*>(&v), 8, 0); // hashes all 8 bytes of the key
+            }
+    };
+
+    struct KmerInterval { // An (offset, length) pair with archive save/load hooks.
+        uint64_t offset;
+        uint32_t length;
+
+        template <typename Archive>
+            void save(Archive& arch) const { // writes offset, then length
+                arch(offset, length);
+            }
+
+        template <typename Archive>
+            void load(Archive& arch) { // reads the fields in the order save() wrote them
+                arch(offset, length);
+            }
+    };
+
+    struct KmerInfo { // Three 32-bit fields (eqId, offset, count) with archive save/load hooks.
+        KmerInfo () : eqId(0), offset(0), count(0) {} // same values as the default member initializers below
+
+
+        KmerInfo(uint32_t eqIdIn, uint32_t offsetIn, uint32_t countIn) :
+            eqId(eqIdIn), offset(offsetIn), count(countIn) {}
+
+        template <typename Archive>
+        void load(Archive& ar) { // reads the fields in the order save() wrote them
+            ar(eqId, offset, count);
+        }
+
+        template <typename Archive>
+        void save(Archive& ar) const { // writes eqId, offset, count
+            ar(eqId, offset, count);
+        }
+        uint32_t eqId = 0;
+        uint32_t offset = 0;
+        uint32_t count = 0;
+    };
+
+
+    template <class T> // T must have a std::hash<T> specialization
+    inline void hashCombine(std::size_t& seed, const T& v) // Mixes the hash of v into seed in place.
+    {
+            std::hash<T> hasher;
+            seed ^= hasher(v) + 0x9e3779b9 + (seed<<6) + (seed>>2); // result depends on the previous seed, so the order of calls matters
+    }
+
+    constexpr uint32_t uint32Invalid = std::numeric_limits<uint32_t>::max();
+    using PositionList = std::vector<uint32_t>;
+    using KmerInfoList = std::vector<KmerInfo>;
+
+    enum class MateStatus : uint8_t { // Which read(s) of a fragment a quasi-alignment covers.
+        SINGLE_END = 0,
+        PAIRED_END_LEFT = 1, // treated in this file as "only the left read mapped"
+        PAIRED_END_RIGHT = 2, // treated in this file as "only the right read mapped"
+        PAIRED_END_PAIRED = 3 }; // set on joint hits by mergeLeftRightHitsFuzzy
+
+    // Wraps the standard iterator of the Position list to provide
+    // some convenient functionality.  In the future, maybe this
+    // should be a proper iterator adaptor.
+    struct PositionListHelper{ // Cursor over a packed position list: bit 31 = new transcript, bit 30 = reverse complement, low 30 bits = position.
+        using PLIt = PositionList::iterator;
+
+        PositionListHelper(PLIt itIn, PLIt endIn) :
+            it_(itIn), end_(endIn) {}
+        // The underlying iterator shouldn't be advanced further
+        inline bool done() { return it_ == end_; }
+
+        // The actual position on the transcript (low 30 bits); requires !done()
+        int32_t pos() const { return static_cast<int32_t>((*it_) & 0x3FFFFFFF); }
+
+        // True if the position encoded was on the reverse complement strand
+        // of the reference transcript, false otherwise.
+        bool isRC() const { return (*it_) & 0x40000000; }
+
+        // True if we hit the position list for a new transcript, false otherwise
+        bool isNewTxp() const { return (*it_) & 0x80000000; }
+
+        void advanceToNextTranscript() { // Moves to the next entry flagged as a new transcript, or to end_ if there is none.
+            if (it_ < end_) {
+                do {
+                    ++it_;
+                } while (it_ != end_ and !isNewTxp()); // test for end_ first: isNewTxp() dereferences it_
+
+            }
+        }
+
+        PLIt it_; // The underlying iterator
+        PLIt end_; // The end of the container
+    };
+
+
+    struct QuasiAlignment { // One hit of a read (or read pair) on a single transcript.
+  	QuasiAlignment() : // NOTE(review): matePos, mateIsFwd, mateLen and mateStatus are not initialized by this constructor
+		tid(std::numeric_limits<uint32_t>::max()),
+		pos(std::numeric_limits<int32_t>::max()),
+		fwd(true),
+		readLen(std::numeric_limits<uint32_t>::max()),
+		fragLen(std::numeric_limits<uint32_t>::max()),
+		isPaired(false)
+#ifdef RAPMAP_SALMON_SUPPORT
+        ,format(LibraryFormat::formatFromID(0))
+#endif // RAPMAP_SALMON_SUPPORT
+        {}
+
+        QuasiAlignment(uint32_t tidIn, int32_t posIn, // NOTE(review): the mate fields and mateStatus are left for the caller to fill in
+                bool fwdIn, uint32_t readLenIn,
+                uint32_t fragLenIn = 0,
+                bool isPairedIn = false) :
+            tid(tidIn), pos(posIn), fwd(fwdIn),
+            readLen(readLenIn), fragLen(fragLenIn),
+            isPaired(isPairedIn)
+#ifdef RAPMAP_SALMON_SUPPORT
+        ,format(LibraryFormat::formatFromID(0))
+#endif // RAPMAP_SALMON_SUPPORT
+        {}
+        QuasiAlignment(QuasiAlignment&& other) = default;
+        QuasiAlignment& operator=(QuasiAlignment&) = default;
+        QuasiAlignment& operator=(QuasiAlignment&& o) = default;
+        QuasiAlignment(const QuasiAlignment& o) = default;
+        QuasiAlignment(QuasiAlignment& o) = default;
+
+        // Some convenience functions to allow salmon interop
+#ifdef RAPMAP_SALMON_SUPPORT
+        inline uint32_t transcriptID() const { return tid; }
+        inline double score() { return 1.0; } // constant score for every alignment
+        inline uint32_t fragLength() { return fragLen; }
+        inline uint32_t fragLengthPedantic(uint32_t txpLen) const { // Fragment length with both ends clamped to [0, txpLen].
+            if (mateStatus != rapmap::utils::MateStatus::PAIRED_END_PAIRED
+                or fwd == mateIsFwd) { // not a pair, or both mates on the same strand: report 0
+                return 0;
+            }
+            int32_t p1 = fwd ? pos : matePos; // start of whichever mate is on the forward strand
+            p1 = (p1 < 0) ? 0 : p1;
+            p1 = (p1 > txpLen) ? txpLen : p1; // p1 is non-negative here, so the unsigned comparison is safe
+            int32_t p2 = fwd ? matePos + mateLen : pos + readLen; // end of the other mate
+            p2 = (p2 < 0) ? 0 : p2;
+            p2 = (p2 > txpLen) ? txpLen : p2;
+
+            return (p1 > p2) ? p1 - p2 : p2 - p1; // absolute distance between the clamped ends
+        }
+        inline int32_t hitPos() { return std::min(pos, matePos); }
+        double logProb{HUGE_VAL};
+        double logBias{HUGE_VAL};
+        inline LibraryFormat libFormat() { return format; }
+        LibraryFormat format;
+#endif // RAPMAP_SALMON_SUPPORT
+
+        // Only 1 since the mate must have the same tid
+        // we won't call *chimeric* alignments here.
+        uint32_t tid;
+        // Left-most position of the hit
+        int32_t pos;
+        // left-most position of the mate
+        int32_t matePos;
+        // Is the read from the forward strand
+        bool fwd;
+        // Is the mate from the forward strand
+        bool mateIsFwd;
+        // The fragment length (template length)
+        // This is 0 for single-end or orphaned reads.
+        uint32_t fragLen;
+        // The read's length
+        uint32_t readLen;
+        // The mate's length
+        uint32_t mateLen;
+        // Is this a paired *alignment* or not
+        bool isPaired;
+        MateStatus mateStatus; // which read(s) of the fragment this alignment covers
+    };
+
+    struct HitInfo { // Pairs an iterator into a KmerInfoList with the query position and strand of the k-mer.
+        HitInfo(KmerInfoList::iterator kit, uint32_t merIDIn,
+                int32_t queryPosIn, bool queryRCIn) :
+                kinfo(kit), merID(merIDIn), queryPos(queryPosIn),
+                queryRC(queryRCIn) {}
+
+        KmerInfoList::iterator kinfo; // stored iterator; stays valid only as long as the underlying list is not reallocated
+        uint32_t merID;
+        int32_t queryPos;
+        bool queryRC;
+    };
+
+    template <typename OffsetT> // OffsetT is the integer type of the interval bounds
+    struct SAIntervalHit { // An interval [begin, end) together with a match length and the query position/strand.
+        SAIntervalHit(OffsetT beginIn, OffsetT endIn, uint32_t lenIn, uint32_t queryPosIn, bool queryRCIn) :
+            begin(beginIn), end(endIn), len(lenIn), queryPos(queryPosIn), queryRC(queryRCIn) {}
+
+	      OffsetT span() { return end - begin; } // number of entries if end is exclusive -- TODO confirm
+        OffsetT begin, end;
+        uint32_t len, queryPos;
+        bool queryRC;
+    };
+
+    struct SATxpQueryPos { // One (transcript position, query position, strand) triple plus an `active` flag.
+	SATxpQueryPos(uint32_t posIn, uint32_t qposIn, bool queryRCIn, bool activeIn = false) :
+		pos(posIn), queryPos(qposIn), queryRC(queryRCIn), active(activeIn) {}
+	uint32_t pos, queryPos; // pos is the position on the transcript, queryPos the position on the read
+	bool queryRC, active; // active defaults to false
+    };
+
+    struct ProcessedSAHit { // All hits of one query on one transcript (tid), stored in tqvec.
+	    ProcessedSAHit() : tid(std::numeric_limits<uint32_t>::max()), active(false), numActive(1) {}
+
+	    ProcessedSAHit(uint32_t txpIDIn, uint32_t txpPosIn, uint32_t queryPosIn, bool queryRCIn) :
+		    tid(txpIDIn), active(false), numActive(1)
+	    {
+		tqvec.emplace_back(txpPosIn, queryPosIn, queryRCIn);
+	    }
+
+        /**
+         * This enforces a more stringent consistency check on
+         * the hits for this transcript.  The hits must be co-linear
+         * with respect to the query and target.
+         *
+         * input: numToCheck --- the number of hits to check in sorted order
+         *                       hits after the last of these need not be consistent.
+         * return: numToCheck if the hits are consistent; otherwise -1 (two-hit case)
+         *         or the index of the first inconsistent hit (three or more hits)
+         **/
+        int32_t checkConsistent(int32_t numToCheck) {
+            auto numHits = tqvec.size();
+
+            // special case for only 1 or two hits (common)
+            if (numHits == 1) {
+                return numToCheck;
+            } else if (numHits == 2) {
+                auto& h1 = (tqvec[0].queryPos < tqvec[1].queryPos) ? tqvec[0] : tqvec[1];
+                auto& h2 = (tqvec[0].queryPos < tqvec[1].queryPos) ? tqvec[1] : tqvec[0]; // the other hit; index 2 would be out of bounds
+                return (h2.pos > h1.pos) ? (numToCheck) : -1;
+            } else {
+                // first, sort by query position
+                std::sort(tqvec.begin(), tqvec.end(),
+                          [](const SATxpQueryPos& q1, const SATxpQueryPos& q2) -> bool {
+                              return q1.queryPos < q2.queryPos;
+                          });
+
+                int32_t lastRefPos{std::numeric_limits<int32_t>::min()};
+                for (size_t i = 0; i < numToCheck; ++i) { // NOTE(review): assumes 0 <= numToCheck <= tqvec.size(); nothing here enforces it
+                    int32_t refPos = static_cast<int32_t>(tqvec[i].pos);
+                    if (refPos > lastRefPos) { // reference positions must strictly increase with query position
+                        lastRefPos = refPos;
+                    } else {
+                        return i;
+                    }
+                }
+                return numToCheck;
+            }
+        }
+
+	    uint32_t tid;
+	    std::vector<SATxpQueryPos> tqvec;
+        bool active;
+	    uint32_t numActive;
+    };
+
+    struct SAHitInfo { // A single hit: transcript id and position, plus the query position and strand.
+	    SAHitInfo(uint32_t txpIDIn, uint32_t txpPosIn, uint32_t queryPosIn, bool queryRCIn) :
+		    tid(txpIDIn), pos(txpPosIn), queryPos(queryPosIn), queryRC(queryRCIn) {}
+	    uint32_t tid; // transcript id
+	    uint32_t pos; // position on the transcript
+	    uint32_t queryPos; // position on the query
+	    bool queryRC; // strand flag of the query
+    };
+
+    struct TxpQueryPos { // A position-list cursor together with the query position/strand of the k-mer.
+        TxpQueryPos(PositionListHelper& ph, int32_t queryPosIn, bool queryRCIn) :
+                txpPosInfo(ph), queryPos(queryPosIn), queryRC(queryRCIn) {} // ph is copied, not referenced
+        // Iterator for the beginning of the position list
+        // of a given k-mer into a given transcript.
+        PositionListHelper txpPosInfo;
+        // The position of the k-mer on the query.
+        int32_t queryPos;
+        bool queryRC;
+    };
+
+    struct ProcessedHit { // All k-mer hits of one query on one transcript.
+        ProcessedHit() : tid(std::numeric_limits<uint32_t>::max()) {} // max() marks "no transcript"; tqvec starts empty
+        ProcessedHit(uint32_t tidIn,
+                     PositionListHelper ph, int32_t queryPos, bool queryRC) :
+                     tid(tidIn) {
+                         tqvec.emplace_back(ph, queryPos, queryRC); // starts the list with this first hit
+                     }
+
+
+        uint32_t tid; // transcript id
+        // A vector of iterators into the position list
+        // for the k-mers hitting this transcript
+        std::vector<TxpQueryPos> tqvec;
+    };
+
+
+    struct EqClass { // A (start, length) pair with archive save/load hooks; presumably a slice of a shared transcript list -- TODO confirm
+        EqClass() :
+            txpListStart(uint32Invalid), txpListLen(uint32Invalid) {} // both fields start at the uint32Invalid sentinel
+        EqClass(uint32_t txpListStartIn, uint32_t txpListLenIn) :
+            txpListStart(txpListStartIn), txpListLen(txpListLenIn) {}
+
+        template <typename Archive>
+        void load (Archive& ar) { // reads the fields in the order save() wrote them
+            ar(txpListStart, txpListLen);
+        }
+
+        template <typename Archive>
+        void save (Archive& ar) const { // writes txpListStart, then txpListLen
+            ar(txpListStart, txpListLen);
+        }
+
+        uint32_t txpListStart;
+        uint32_t txpListLen;
+    };
+
+    inline void printMateStatus(rapmap::utils::MateStatus ms) { // Writes a human-readable name for ms to std::cerr, without a trailing newline.
+        switch(ms) { // no default case: a value outside the four enumerators prints nothing
+            case rapmap::utils::MateStatus::SINGLE_END:
+                std::cerr << "SINGLE END";
+                break;
+            case rapmap::utils::MateStatus::PAIRED_END_LEFT:
+                std::cerr << "PAIRED END (LEFT)";
+                break;
+            case rapmap::utils::MateStatus::PAIRED_END_RIGHT:
+                std::cerr << "PAIRED END (RIGHT)";
+                break;
+            case rapmap::utils::MateStatus::PAIRED_END_PAIRED:
+                std::cerr << "PAIRED END (PAIRED)";
+                break;
+        }
+    }
+
+
+    // Declarations for functions dealing with SAM formatting and output
+    //
+    inline void adjustOverhang(int32_t& pos, uint32_t readLen, // Writes the CIGAR for a read of length readLen placed at pos on a transcript of length txpLen.
+		    uint32_t txpLen, FixedWriter& cigarStr) { // pos is reset to 0 when the read starts before the transcript
+	    cigarStr.clear();
+	    if (pos + static_cast<int32_t>(readLen) < 0) { // signed sum: int32_t + uint32_t would be unsigned and never negative
+            cigarStr.write("{}S", readLen); // the read lies entirely before the transcript: soft-clip all of it
+            pos = 0;
+        } else if (pos < 0) {
+		    int32_t matchLen = readLen + pos; // bases that overlap the transcript
+            int32_t clipLen = readLen - matchLen; // bases hanging off the left end
+		    cigarStr.write("{}S{}M", clipLen, matchLen);
+		    // Now adjust the mapping position
+		    pos = 0;
+	    } else if (pos > txpLen) { // pos is non-negative here, so the unsigned comparison is safe
+            cigarStr.write("{}S", readLen); // the read starts beyond the transcript end: soft-clip all of it
+        } else if (pos + readLen > txpLen) {
+		    int32_t matchLen = txpLen - pos; // bases that overlap the transcript
+		    int32_t clipLen = readLen - matchLen; // bases hanging off the right end
+		    cigarStr.write("{}M{}S", matchLen, clipLen);
+	    } else {
+		    cigarStr.write("{}M", readLen); // the read lies fully inside the transcript
+	    }
+    }
+
+    inline void adjustOverhang(QuasiAlignment& qa, uint32_t txpLen, // Fills cigarStr1 (left read) and cigarStr2 (right read) for the alignment qa.
+		    FixedWriter& cigarStr1, FixedWriter& cigarStr2) { // qa.pos / qa.matePos may be reset to 0 by the single-read overload
+	    if (qa.isPaired) { // both mapped
+		    adjustOverhang(qa.pos, qa.readLen, txpLen, cigarStr1);
+		    adjustOverhang(qa.matePos, qa.mateLen, txpLen, cigarStr2);
+	    } else if (qa.mateStatus == MateStatus::PAIRED_END_LEFT ) {
+		    // left read mapped
+		    adjustOverhang(qa.pos, qa.readLen, txpLen, cigarStr1);
+		    // right read un-mapped will just be read length * S
+		    cigarStr2.clear();
+		    cigarStr2.write("{}S", qa.mateLen);
+	    } else if (qa.mateStatus == MateStatus::PAIRED_END_RIGHT) {
+		    // right read mapped
+		    adjustOverhang(qa.pos, qa.readLen, txpLen, cigarStr2);
+		    // left read un-mapped will just be read length * S
+		    cigarStr1.clear();
+		    cigarStr1.write("{}S", qa.readLen); // NOTE(review): uses qa.readLen (the mapped read's length) for the unmapped left read -- confirm intended
+	    } // any other mateStatus (e.g. SINGLE_END) leaves both writers untouched
+    }
+
+
+
+        // Get the SAM flags for the single-end quasi-alignment qaln.
+        // This overload has no peInput parameter; the result is written into flags.
+        // Only the reverse-complement bit (0x10) is ever set here; every other bit stays 0.
+        inline void getSamFlags(const QuasiAlignment& qaln,
+                uint16_t& flags) {
+            constexpr uint16_t pairedInSeq = 0x1; // unused in this overload
+            constexpr uint16_t mappedInProperPair = 0x2; // unused in this overload
+            constexpr uint16_t unmapped = 0x4; // unused in this overload
+            constexpr uint16_t mateUnmapped = 0x8; // unused in this overload
+            constexpr uint16_t isRC = 0x10;
+            constexpr uint16_t mateIsRC = 0x20; // unused in this overload
+            constexpr uint16_t isRead1 = 0x40; // unused in this overload
+            constexpr uint16_t isRead2 = 0x80; // unused in this overload
+            constexpr uint16_t isSecondaryAlignment = 0x100; // unused in this overload
+            constexpr uint16_t failedQC = 0x200; // unused in this overload
+            constexpr uint16_t isPCRDup = 0x400; // unused in this overload
+            constexpr uint16_t supplementaryAln = 0x800; // unused in this overload
+
+            flags = 0;
+            // Not paired in sequencing
+            // flags1 = (peInput) ? pairedInSeq : 0;
+            // flags |= properlyAligned;
+            // we don't output unmapped yet
+            // flags |= unmapped
+            // flags |= mateUnmapped
+            flags |= (qaln.fwd) ? 0 : isRC;
+            // Mate flag meaningless
+            // flags1 |= (qaln.mateIsFwd) ? 0 : mateIsRC;
+            // flags |= isRead1;
+            //flags2 |= isRead2;
+        }
+
+        // get the sam flags for the quasialignment qaln.
+        // peinput is true if the read is paired in *sequencing*; false otherwise
+        // the sam flags for mate 1 are written into flags1 and for mate2 into flags2
+        inline void getSamFlags(const QuasiAlignment& qaln, // Paired overload: fills flags1 (read 1) and flags2 (read 2).
+                bool peInput,
+                uint16_t& flags1,
+                uint16_t& flags2) {
+            constexpr uint16_t pairedInSeq = 0x1;
+            constexpr uint16_t properlyAligned = 0x2;
+            constexpr uint16_t unmapped = 0x4;
+            constexpr uint16_t mateUnmapped = 0x8;
+            constexpr uint16_t isRC = 0x10;
+            constexpr uint16_t mateIsRC = 0x20;
+            constexpr uint16_t isRead1 = 0x40;
+            constexpr uint16_t isRead2 = 0x80;
+            constexpr uint16_t isSecondaryAlignment = 0x100; // unused in this overload
+            constexpr uint16_t failedQC = 0x200; // unused in this overload
+            constexpr uint16_t isPCRDup = 0x400; // unused in this overload
+            constexpr uint16_t supplementaryAln = 0x800; // unused in this overload
+
+            flags1 = flags2 = 0;
+            flags1 = (peInput) ? pairedInSeq : 0;
+            flags1 |= (qaln.isPaired) ? properlyAligned : 0;
+            flags2 = flags1; // both reads share the paired / properly-aligned bits
+            // we don't output unmapped yet
+            bool read1Unaligned = qaln.mateStatus == MateStatus::PAIRED_END_RIGHT; // only the right read mapped
+            bool read2Unaligned = qaln.mateStatus == MateStatus::PAIRED_END_LEFT; // only the left read mapped
+            // If read 1 is unaligned, flags1 gets "unmapped" and flags2 gets "mate unmapped"
+            flags1 |= (read1Unaligned) ? unmapped : 0;
+            flags2 |= (read1Unaligned) ? mateUnmapped : 0;
+            // If read 2 is unaligned, flags2 gets "unmapped" and flags1 gets "mate unmapped"
+            flags2 |= (read2Unaligned) ? unmapped : 0;
+            flags1 |= (read2Unaligned) ? mateUnmapped : 0;
+
+            flags1 |= (qaln.fwd) ? 0 : isRC; // strand bits are mirrored between the two reads
+            flags1 |= (qaln.mateIsFwd) ? 0 : mateIsRC;
+            flags2 |= (qaln.mateIsFwd) ? 0 : isRC;
+            flags2 |= (qaln.fwd) ? 0 : mateIsRC;
+            flags1 |= isRead1;
+            flags2 |= isRead2;
+        }
+
+	// Adapted from
+        // https://github.com/mengyao/Complete-Striped-Smith-Waterman-Library/blob/8c9933a1685e0ab50c7d8b7926c9068bc0c9d7d2/src/main.c#L36
+        void reverseRead(std::string& seq,
+                std::string& qual,
+                std::string& readWork,
+                std::string& qualWork);
+
+        template <typename ReadPairT, typename IndexT>
+        uint32_t writeAlignmentsToStream(
+                ReadPairT& r,
+                PairAlignmentFormatter<IndexT>& formatter,
+                HitCounters& hctr,
+                std::vector<QuasiAlignment>& jointHits,
+                fmt::MemoryWriter& sstream);
+
+        template <typename ReadT, typename IndexT>
+        uint32_t writeAlignmentsToStream(
+                ReadT& r,
+                SingleAlignmentFormatter<IndexT>& formatter,
+                HitCounters& hctr,
+                std::vector<QuasiAlignment>& jointHits,
+                fmt::MemoryWriter& sstream);
+
+        inline void mergeLeftRightHitsFuzzy( // Merges per-mate hit lists into jointHits: orphans when one side is empty, pairs on shared transcripts otherwise.
+                bool leftMatches,
+                bool rightMatches,
+                std::vector<QuasiAlignment>& leftHits,
+                std::vector<QuasiAlignment>& rightHits,
+                std::vector<QuasiAlignment>& jointHits,
+                uint32_t readLen, // not used in the body
+                uint32_t maxNumHits,
+                bool& tooManyHits,
+                HitCounters& hctr) {
+
+            if (leftHits.empty()) {
+                if (!leftMatches) { // right hits are taken as orphans only when leftMatches is false
+                    if (!rightHits.empty()) {
+                        jointHits.insert(jointHits.end(),
+                                std::make_move_iterator(rightHits.begin()),
+                                std::make_move_iterator(rightHits.end()));
+                        hctr.seHits += rightHits.size(); // elements were moved from, but the vector size is unchanged
+                    }
+                }
+            } else if (rightHits.empty()) {
+                if (!rightMatches) { // left hits are taken as orphans only when rightMatches is false
+                    if (!leftHits.empty()) {
+                        jointHits.insert(jointHits.end(),
+                                std::make_move_iterator(leftHits.begin()),
+                                std::make_move_iterator(leftHits.end()));
+                        hctr.seHits += leftHits.size();
+                    }
+                }
+            } else {
+                constexpr const int32_t signedZero{0};
+                auto leftIt = leftHits.begin();
+                auto leftEnd = leftHits.end();
+                auto leftLen = std::distance(leftIt, leftEnd);
+                if (rightHits.size() > 0) { // always true in this branch: both lists are non-empty
+                    auto rightIt = rightHits.begin();
+                    auto rightEnd = rightHits.end();
+                    auto rightLen = std::distance(rightIt, rightEnd);
+                    size_t numHits{0};
+                    jointHits.reserve(std::min(leftLen, rightLen));
+                    uint32_t leftTxp, rightTxp;
+                    while (leftIt != leftEnd && rightIt != rightEnd) { // merge-style walk; assumes both lists are ordered by tid -- TODO confirm
+                        leftTxp = leftIt->tid;
+                        rightTxp = rightIt->tid;
+                        if (leftTxp < rightTxp) {
+                            ++leftIt;
+                        } else {
+                            if (!(rightTxp < leftTxp)) { // same transcript on both sides: emit a paired hit
+                                int32_t startRead1 = std::max(leftIt->pos, signedZero);
+                                int32_t startRead2 = std::max(rightIt->pos, signedZero);
+                                bool read1First{(startRead1 < startRead2)};
+                                int32_t fragStartPos = read1First ? startRead1 : startRead2;
+                                int32_t fragEndPos = read1First ? // end of whichever read starts later
+                                    (startRead2 + rightIt->readLen) : (startRead1 + leftIt->readLen);
+                                uint32_t fragLen = fragEndPos - fragStartPos;
+                                jointHits.emplace_back(leftTxp,
+                                        leftIt->pos,
+                                        leftIt->fwd,
+                                        leftIt->readLen,
+                                        fragLen, true);
+                                // Fill in the mate info
+                                auto& qaln = jointHits.back();
+                                qaln.mateLen = rightIt->readLen;
+                                qaln.matePos = rightIt->pos;
+                                qaln.mateIsFwd = rightIt->fwd;
+                                jointHits.back().mateStatus = MateStatus::PAIRED_END_PAIRED;
+
+                                ++numHits;
+                                if (numHits > maxNumHits) { tooManyHits = true; break; }
+                                ++leftIt;
+                            }
+                            ++rightIt; // advanced both when the tids matched and when rightTxp < leftTxp
+                        }
+                    }
+                }
+                if (tooManyHits) { jointHits.clear(); ++hctr.tooManyHits; } // also fires if the caller passed tooManyHits in as true
+            }
+
+            // Counts every entry of jointHits, including orphans inserted by the first two branches
+            if (jointHits.size() > 0) {
+                hctr.peHits += jointHits.size();
+                //orphanStatus = 0;
+            }
+        }
+
+        /**
+         * Combine the hits of the two mates of a read pair into `jointHits`.
+         *
+         * When both input lists are non-empty they are walked in lock-step by
+         * transcript id (the walk only advances forward, so it presumes each
+         * list is ordered by `tid` -- TODO confirm against callers).  Every
+         * transcript present in both lists yields one paired alignment whose
+         * positions are clamped at 0 and whose fragment length spans from the
+         * leftmost start to the end of the other mate.
+         *
+         * If more than `maxNumHits` pairs are produced, `tooManyHits` is set;
+         * whenever `leftHits` is non-empty and `tooManyHits` is set (including
+         * on entry), `jointHits` is cleared and `hctr.tooManyHits` is bumped.
+         *
+         * If no paired hit remains and `tooManyHits` is not set, all left and
+         * right hits are moved into `jointHits` as single-end hits.
+         *
+         * `hctr.peHits` / `hctr.seHits` are increased by the number of paired /
+         * single-end hits reported.  `readLen` is not used by this function.
+         */
+        inline void mergeLeftRightHits(
+                std::vector<QuasiAlignment>& leftHits,
+                std::vector<QuasiAlignment>& rightHits,
+                std::vector<QuasiAlignment>& jointHits,
+                uint32_t readLen,
+                uint32_t maxNumHits,
+                bool& tooManyHits,
+                HitCounters& hctr) {
+            const bool haveLeft = !leftHits.empty();
+            const bool haveRight = !rightHits.empty();
+
+            if (haveLeft and haveRight) {
+                constexpr const int32_t zeroPos{0};
+                auto lCur = leftHits.begin();
+                auto lStop = leftHits.end();
+                auto rCur = rightHits.begin();
+                auto rStop = rightHits.end();
+                jointHits.reserve(std::min(std::distance(lCur, lStop),
+                                           std::distance(rCur, rStop)));
+
+                size_t pairedCount{0};
+                while (lCur != lStop && rCur != rStop) {
+                    uint32_t lTid = lCur->tid;
+                    uint32_t rTid = rCur->tid;
+
+                    // Advance whichever side is behind until the ids agree.
+                    if (lTid < rTid) { ++lCur; continue; }
+                    if (rTid < lTid) { ++rCur; continue; }
+
+                    // Same transcript on both sides: build the paired hit.
+                    int32_t lStart = std::max(lCur->pos, zeroPos);
+                    int32_t rStart = std::max(rCur->pos, zeroPos);
+                    bool leftFirst{(lStart < rStart)};
+                    int32_t fragBegin = leftFirst ? lStart : rStart;
+                    int32_t fragFinish = leftFirst ?
+                        (rStart + rCur->readLen) : (lStart + lCur->readLen);
+                    uint32_t fragLen = fragFinish - fragBegin;
+
+                    jointHits.emplace_back(lTid, lStart, lCur->fwd,
+                                           lCur->readLen, fragLen, true);
+
+                    // Record the mate's side of the pair.
+                    auto& paired = jointHits.back();
+                    paired.mateLen = rCur->readLen;
+                    paired.matePos = rStart;
+                    paired.mateIsFwd = rCur->fwd;
+                    paired.mateStatus = MateStatus::PAIRED_END_PAIRED;
+
+                    ++pairedCount;
+                    if (pairedCount > maxNumHits) { tooManyHits = true; break; }
+                    ++lCur;
+                    ++rCur;
+                }
+            }
+
+            // Discard everything for over-ambiguous reads (only checked when
+            // there were left hits, exactly like the nesting it replaces).
+            if (haveLeft and tooManyHits) {
+                jointHits.clear();
+                ++hctr.tooManyHits;
+            }
+
+            // Proper paired hits take precedence.
+            if (!jointHits.empty()) {
+                hctr.peHits += jointHits.size();
+                return;
+            }
+
+            // Otherwise fall back to the single-end hits, unless the read was
+            // discarded for having too many hits or there is nothing to report.
+            auto orphanCount = leftHits.size() + rightHits.size();
+            if (tooManyHits or orphanCount == 0) { return; }
+
+            hctr.seHits += orphanCount;
+            jointHits.insert(jointHits.end(),
+                    std::make_move_iterator(leftHits.begin()),
+                    std::make_move_iterator(leftHits.end()));
+            jointHits.insert(jointHits.end(),
+                    std::make_move_iterator(rightHits.begin()),
+                    std::make_move_iterator(rightHits.end()));
+        }
+
+    /*
+    template <typename Archive>
+    void save(Archive& archive, const my_mer& mer);
+
+    template <typename Archive>
+    void load(Archive& archive, my_mer& mer);
+    */
+    }
+}
+
+
+#endif // __RAP_MAP_UTILS_HPP__
diff --git a/debian/rapmap/SACollector.hpp b/debian/rapmap/SACollector.hpp
new file mode 100644
index 0000000..261b2ce
--- /dev/null
+++ b/debian/rapmap/SACollector.hpp
@@ -0,0 +1,580 @@
+#ifndef SA_COLLECTOR_HPP
+#define SA_COLLECTOR_HPP
+
+#include "RapMapUtils.hpp"
+#include "RapMapSAIndex.hpp"
+#include "SASearcher.hpp"
+
+#include <iostream>
+#include <algorithm>
+#include <iterator>
+
+/**
+ * Collects quasi-alignment hits for a single read: k-mers of the read are
+ * looked up in the index' k-mer hash, the resulting suffix-array intervals
+ * are extended along the read, and the intervals are finally converted into
+ * per-transcript hits for the forward and reverse-complement strand.
+ *
+ * RapMapIndexT must provide `IndexType`, `SA`, `txpOffsets`, `khash` and
+ * `transcriptAtPosition()`; those are the members used below.
+ */
+template <typename RapMapIndexT>
+class SACollector {
+    public:
+    using OffsetT = typename RapMapIndexT::IndexType;
+
+    /** Remember the index to query; the pointer is stored as-is and never freed here. */
+    SACollector(RapMapIndexT* rmi) : rmi_(rmi) {}
+
+    /**
+     * Map `read` against the index and append the resulting hits to `hits`.
+     *
+     * @param read           read sequence; k-mers containing 'n'/'N' are skipped
+     * @param hits           output vector, new hits are appended to existing content
+     * @param saSearcher     used to extend SA intervals and to compute LCEs
+     * @param mateStatus     copied into every hit built from a single SA interval
+     *                       and forwarded to hit_manager::collectHitsSimpleSA
+     * @param strictCheck    when true, k-mers are scored per strand and the SA
+     *                       intervals of the lower-scoring strand are dropped
+     * @param consistentHits forwarded unchanged to hit_manager::intersectSAHits
+     * @return true when the initial scan found a k-mer of the read in the hash
+     *         (note that `hits` may still be unchanged in that case), false
+     *         when no k-mer was found or the read is shorter than k
+     */
+    bool operator()(std::string& read,
+                    std::vector<rapmap::utils::QuasiAlignment>& hits,
+                    SASearcher<RapMapIndexT>& saSearcher,
+                    rapmap::utils::MateStatus mateStatus,
+                    bool strictCheck=false,
+                    bool consistentHits=false) {
+
+        using QuasiAlignment = rapmap::utils::QuasiAlignment;
+        using MateStatus = rapmap::utils::MateStatus;
+
+        auto& txpStarts = rmi_->txpOffsets;
+        auto& SA = rmi_->SA;
+        auto& khash = rmi_->khash;
+        uint32_t sampFactor{1};
+
+        auto readLen = read.length();
+        auto maxDist = 1.5 * readLen;
+        auto k = rapmap::utils::my_mer::k();
+
+        // A read shorter than k holds no k-mer at all.  Bail out before any
+        // `begin() + k` iterator or `length() - k` offset is formed; the
+        // outcome (return false, `hits` untouched) is the same as letting the
+        // scan loop below find nothing.
+        if (readLen < k) { return false; }
+
+        auto readStartIt = read.begin();
+        auto readEndIt = read.end();
+
+        auto rb = read.begin();
+        auto re = rb + k;
+        OffsetT lbLeftFwd = 0, ubLeftFwd = 0;
+        OffsetT lbLeftRC = 0, ubLeftRC = 0;
+        OffsetT lbRightFwd = 0, ubRightFwd = 0;
+        OffsetT lbRightRC = 0, ubRightRC = 0;
+        OffsetT matchedLen;
+
+        uint32_t fwdHit{0};
+        uint32_t rcHit{0};
+
+        bool foundHit = false;
+        rapmap::utils::my_mer mer;
+        rapmap::utils::my_mer rcMer;
+
+        // The numeric values matter: they are summed up as per-strand scores
+        // in the strictCheck evaluation further down.
+        enum HitStatus { ABSENT = -1, UNTESTED = 0, PRESENT = 1 };
+        // Record if k-mers are hits in the
+        // fwd direction, rc direction or both
+        struct KmerDirScore {
+            KmerDirScore(rapmap::utils::my_mer kmerIn, int32_t kposIn, HitStatus fwdScoreIn, HitStatus rcScoreIn) :
+                kmer(kmerIn), kpos(kposIn), fwdScore(fwdScoreIn), rcScore(rcScoreIn) {}
+            KmerDirScore() : kpos(0), fwdScore(UNTESTED), rcScore(UNTESTED) {}
+            // Equality and ordering look at the read position only, so that
+            // sort + unique keeps one entry per position.
+            bool operator==(const KmerDirScore& other) const { return kpos == other.kpos; }
+            bool operator<(const KmerDirScore& other) const { return kpos < other.kpos; }
+            // Human readable name of a tri-state status.
+            static const char* statusName(HitStatus s) {
+                return (s == PRESENT) ? "PRESENT" : ((s == ABSENT) ? "ABSENT" : "UNTESTED");
+            }
+            // Debug helper.  The status is compared explicitly: treating it
+            // as a boolean would report ABSENT (-1) as "PRESENT".
+            void print() {
+                std::cerr << "{ " << kmer.to_str() << ", " <<  kpos << ", " << statusName(fwdScore) << ", " << statusName(rcScore) << "}\t";
+            }
+            rapmap::utils::my_mer kmer;
+            int32_t kpos;
+            HitStatus fwdScore;
+            HitStatus rcScore;
+        };
+
+        // This allows implementing our heuristic for comparing
+        // forward and reverse-complement strand matches
+        std::vector<KmerDirScore> kmerScores;
+
+        using SAIntervalHit = rapmap::utils::SAIntervalHit<OffsetT>;
+
+        std::vector<SAIntervalHit> fwdSAInts;
+        std::vector<SAIntervalHit> rcSAInts;
+
+        // SA intervals at least this wide are not recorded.
+        OffsetT maxInterval{1000};
+
+        // The number of bases that a new query position (to which
+        // we skipped) should overlap the previous extension. A
+        // value of 0 means no overlap (the new search begins at the next
+        // base) while a value of (k - 1) means that k-1 bases (one less than
+        // the k-mer size) must overlap.
+        OffsetT skipOverlap = k-1;
+        // Number of nucleotides to skip when encountering a homopolymer k-mer.
+        OffsetT homoPolymerSkip = k/2;
+
+        // Find a hit within the read
+        // While we haven't fallen off the end
+        // NOTE(review): `<` stops before the k-mer that ends exactly at the end
+        // of the read, whereas the loops below use `<=` -- confirm intent.
+        while (re < read.end()) {
+
+            // Get the k-mer at the current start position.
+            // And make sure that it's valid (contains no Ns).
+            auto pos = std::distance(readStartIt, rb);
+            auto invalidPos = read.find_first_of("nN", pos);
+            // NOTE(review): `<=` also rejects a k-mer that is directly followed
+            // by an N; the equivalent test in the forward loop below uses `<`.
+            // Left unchanged because it affects which k-mers seed the mapping.
+            if (invalidPos <= pos + k) {
+                rb = read.begin() + invalidPos + 1;
+                re = rb + k;
+                continue;
+            }
+
+            // If the next k-bases are valid, get the k-mer and
+            // reverse complement k-mer
+            mer = rapmap::utils::my_mer(read.c_str() + pos);
+            if (mer.is_homopolymer()) { rb += homoPolymerSkip; re += homoPolymerSkip; continue; }
+            rcMer = mer.get_reverse_complement();
+
+            // See if we can find this k-mer in the hash
+            auto merIt = khash.find(mer.get_bits(0, 2*k));
+            auto rcMerIt = khash.find(rcMer.get_bits(0, 2*k));
+
+            // If we can find the k-mer in the hash, get its SA interval
+            if (merIt != khash.end()) {
+                OffsetT lb = merIt->second.begin;
+                OffsetT ub = merIt->second.end;
+
+                // lb must be 1 *less* than the current lb
+                auto lbRestart = std::max(static_cast<OffsetT>(0), lb-1);
+                // Extend the SA interval using the read sequence as far as
+                // possible
+                std::tie(lbLeftFwd, ubLeftFwd, matchedLen) =
+                    saSearcher.extendSearchNaive(lbRestart, ub, k, rb, readEndIt);
+
+                // If the SA interval is valid, and not too wide, then record
+                // the hit.
+                OffsetT diff = ubLeftFwd - lbLeftFwd;
+                if (ubLeftFwd > lbLeftFwd and diff < maxInterval) {
+                    auto queryStart = std::distance(read.begin(), rb);
+                    fwdSAInts.emplace_back(lbLeftFwd, ubLeftFwd, matchedLen, queryStart, false);
+                    if (strictCheck) {
+                        ++fwdHit;
+                        // If we also match this k-mer in the rc direction
+                        if (rcMerIt != khash.end()) {
+                            ++rcHit;
+                            kmerScores.emplace_back(mer, pos, PRESENT, PRESENT);
+                        } else { // Otherwise it doesn't match in the rc direction
+                            kmerScores.emplace_back(mer, pos, PRESENT, ABSENT);
+                        }
+
+                        // If we didn't end the match b/c we exhausted the query
+                        // test the mismatching k-mer in the other strand
+                        // TODO: check for 'N'?
+                        if (rb + matchedLen < readEndIt){
+                            auto kmerPos = std::distance(readStartIt, rb + matchedLen - skipOverlap);
+                            mer = rapmap::utils::my_mer(read.c_str() + kmerPos);
+                            kmerScores.emplace_back(mer, kmerPos, ABSENT, UNTESTED);
+                        }
+                    } else { // no strict check
+                        ++fwdHit;
+                        if (rcMerIt != khash.end()) { ++rcHit; }
+                    }
+                }
+            }
+
+            // See if the reverse complement k-mer is in the hash
+            if (rcMerIt != khash.end()) {
+                lbLeftRC = rcMerIt->second.begin;
+                ubLeftRC = rcMerIt->second.end;
+                if (ubLeftRC > lbLeftRC) {
+                    // The original k-mer didn't match in the forward direction
+                    if (!fwdHit) {
+                        ++rcHit;
+                        if (strictCheck) {
+                            kmerScores.emplace_back(mer, pos, ABSENT, PRESENT);
+                        }
+                    }
+                }
+            }
+
+            // If we had a hit with either k-mer then we can
+            // break out of this loop to look for the next informative position
+            if (fwdHit + rcHit > 0) {
+                foundHit = true;
+                break;
+            }
+            ++rb; ++re;
+        }
+
+        // If we went the entire length of the read without finding a hit
+        // then we can bail.
+        if (!foundHit) { return false; }
+
+        bool lastSearch{false};
+        // If we had a hit on the forward strand
+        if (fwdHit) {
+
+            // The length of this match
+            auto matchLen = fwdSAInts.front().len;
+            // The iterator to where this match began
+            rb = read.begin() + fwdSAInts.front().queryPos;
+
+            // [lb, ub) is the suffix array interval for the MMP (maximum mappable prefix)
+            // of the k-mer we found.  The NIP (next informative position) in the sequence
+            // is the position after the LCE (longest common extension) of
+            // T[SA[lb]:] and T[SA[ub-1]:]
+            auto remainingLength = std::distance(rb + matchLen, readEndIt);
+            auto lce = saSearcher.lce(lbLeftFwd, ubLeftFwd-1, matchLen, remainingLength);
+            auto fwdSkip = std::max(static_cast<OffsetT>(matchLen) - skipOverlap,
+                                    static_cast<OffsetT>(lce) - skipOverlap);
+
+            // Never start beyond the last position at which a k-mer still fits.
+            size_t nextInformativePosition = std::min(
+                    std::max(static_cast<OffsetT>(0),
+                    static_cast<OffsetT>(readLen)- static_cast<OffsetT>(k)),
+                    static_cast<OffsetT>(std::distance(readStartIt, rb) + fwdSkip)
+                    );
+
+            rb = read.begin() + nextInformativePosition;
+            re = rb + k;
+
+            size_t invalidPos{0};
+            while (re <= readEndIt) {
+                // The offset into the string
+                auto pos = std::distance(readStartIt, rb);
+
+                // The position of the first N in the k-mer (if there is one)
+                // If we have already verified there are no Ns in the remainder
+                // of the string (invalidPos is std::string::npos) then we can
+                // skip this test.
+                if (invalidPos != std::string::npos) {
+                    invalidPos = read.find_first_of("nN", pos);
+                }
+
+                // If the first N is within k bases, then this k-mer is invalid
+                if (invalidPos < pos + k) {
+                    // A valid k-mer can't start until after the 'N'
+                    nextInformativePosition = invalidPos + 1;
+                    rb = read.begin() + nextInformativePosition;
+                    re = rb + k;
+                    // Go to the next iteration of the while loop
+                    continue;
+                }
+
+                // If the current end position is valid
+                if (re <= readEndIt) {
+
+                    mer = rapmap::utils::my_mer(read.c_str() + pos);
+                    if (mer.is_homopolymer()) { rb += homoPolymerSkip; re = rb + k; continue; }
+                    auto merIt = khash.find(mer.get_bits(0, 2*k));
+
+                    if (merIt != khash.end()) {
+                        if (strictCheck) {
+                            ++fwdHit;
+                            kmerScores.emplace_back(mer, pos, PRESENT, UNTESTED);
+                            auto rcMer = mer.get_reverse_complement();
+                            auto rcMerIt = khash.find(rcMer.get_bits(0, 2*k));
+                            if (rcMerIt != khash.end()) {
+                                ++rcHit;
+                                kmerScores.back().rcScore = PRESENT;
+                            }
+                        }
+
+                        lbRightFwd = merIt->second.begin;
+                        ubRightFwd = merIt->second.end;
+
+                        // lb must be 1 *less* than the current lb
+                        lbRightFwd = std::max(static_cast<OffsetT>(0), lbRightFwd - 1);
+                        std::tie(lbRightFwd, ubRightFwd, matchedLen) =
+                            saSearcher.extendSearchNaive(lbRightFwd, ubRightFwd,
+                                    k, rb, readEndIt);
+
+                        OffsetT diff = ubRightFwd - lbRightFwd;
+                        if (ubRightFwd > lbRightFwd and diff < maxInterval) {
+                            auto queryStart = std::distance(read.begin(), rb);
+                            fwdSAInts.emplace_back(lbRightFwd, ubRightFwd, matchedLen, queryStart, false);
+                            // If we didn't end the match b/c we exhausted the query
+                            // test the mismatching k-mer in the other strand
+                            // TODO: check for 'N'?
+                            if (strictCheck and rb + matchedLen < readEndIt){
+                                auto kmerPos = std::distance(readStartIt, rb + matchedLen - skipOverlap);
+                                mer = rapmap::utils::my_mer(read.c_str() + kmerPos);
+                                // TODO: 04/11/16
+                                kmerScores.emplace_back(mer, kmerPos, UNTESTED, UNTESTED);
+                            }
+
+                        }
+
+                        if (lastSearch) { break; }
+                        auto mismatchIt = rb + matchedLen;
+                        if (mismatchIt < readEndIt) {
+                            auto remainingDistance = std::distance(mismatchIt, readEndIt);
+                            auto lce = saSearcher.lce(lbRightFwd, ubRightFwd-1, matchedLen, remainingDistance);
+
+                            // Where we would jump if we just used the MMP
+                            auto skipMatch = mismatchIt - skipOverlap;
+                            // Where we would jump if we used the LCE
+                            auto skipLCE = rb + lce - skipOverlap;
+                            // Pick the larger of the two
+                            rb = std::max(skipLCE, skipMatch);
+                            if (rb > (readEndIt - k)) {
+                                rb = readEndIt - k;
+                                lastSearch = true;
+                            }
+                            re = rb + k;
+                        } else {
+                            lastSearch = true;
+                            rb = readEndIt - k;
+                            re = rb + k;
+                        }
+
+                    } else {
+                        rb += sampFactor;
+                        re = rb + k;
+                    }
+                }
+            }
+        }
+
+        lastSearch = false;
+        if (rcHit >= fwdHit) {
+            size_t pos{read.length() - k};
+
+            auto revReadEndIt = read.rend();
+
+            auto revRB = read.rbegin();
+            auto revRE = revRB + k;
+
+            auto invalidPosIt = revRB;
+            while (revRE <= revReadEndIt){
+
+                revRE = revRB + k;
+                if (revRE > revReadEndIt) { break; }
+
+                // See if this k-mer would contain an N
+                // only check if we don't yet know that there are no remaining
+                // Ns
+                if (invalidPosIt != revReadEndIt) {
+                    invalidPosIt = std::find_if(revRB, revRE,
+                                                 [](const char c) -> bool {
+                                                     return c == 'n' or c == 'N';
+                                                 });
+                }
+
+                // If we found an N before the end of the k-mer
+                if (invalidPosIt < revRE) {
+                    // Skip to the k-mer starting at the next position
+                    // (i.e. right past the N)
+                    revRB = invalidPosIt + 1;
+                    continue;
+                }
+
+                // The distance from the beginning of the read to the
+                // start of the k-mer
+                pos = std::distance(revRE, revReadEndIt);
+
+                // Get the k-mer and query it in the hash
+                mer = rapmap::utils::my_mer(read.c_str() + pos);
+                if (mer.is_homopolymer()) { revRB += homoPolymerSkip; revRE += homoPolymerSkip; continue; }
+                rcMer = mer.get_reverse_complement();
+                auto rcMerIt = khash.find(rcMer.get_bits(0, 2*k));
+
+                // If we found the k-mer
+                if (rcMerIt != khash.end()) {
+                    if (strictCheck) {
+                        ++rcHit;
+                        kmerScores.emplace_back(mer, pos, UNTESTED, PRESENT);
+                        auto merIt = khash.find(mer.get_bits(0, 2*k));
+                        if (merIt != khash.end()) {
+                            ++fwdHit;
+                            kmerScores.back().fwdScore = PRESENT;
+                        }
+                    }
+
+
+                    lbRightRC = rcMerIt->second.begin;
+                    ubRightRC = rcMerIt->second.end;
+
+                    // lb must be 1 *less* than the current lb
+                    // We can't move any further in the reverse complement direction
+                    lbRightRC = std::max(static_cast<OffsetT>(0), lbRightRC - 1);
+                    std::tie(lbRightRC, ubRightRC, matchedLen) =
+                        saSearcher.extendSearchNaive(lbRightRC, ubRightRC, k,
+                                revRB, revReadEndIt, true);
+
+                    OffsetT diff = ubRightRC - lbRightRC;
+                    if (ubRightRC > lbRightRC and diff < maxInterval) {
+                        auto queryStart = std::distance(read.rbegin(), revRB);
+                        rcSAInts.emplace_back(lbRightRC, ubRightRC, matchedLen, queryStart, true);
+                        // If we didn't end the match b/c we exhausted the query
+                        // test the mismatching k-mer in the other strand
+                        // TODO: check for 'N'?
+                        if (strictCheck and revRB + matchedLen < revReadEndIt){
+                            auto kmerPos = std::distance(revRB + matchedLen, revReadEndIt);
+                            mer = rapmap::utils::my_mer(read.c_str() + kmerPos);
+                            // TODO: 04/11/16
+                            kmerScores.emplace_back(mer, kmerPos, UNTESTED, UNTESTED);
+                        }
+                    }
+
+                    if (lastSearch) { break; }
+                    auto mismatchIt = revRB + matchedLen;
+                    if (mismatchIt < revReadEndIt) {
+                        auto remainingDistance = std::distance(mismatchIt, revReadEndIt);
+                        auto lce = saSearcher.lce(lbRightRC, ubRightRC-1, matchedLen, remainingDistance);
+
+                        // Where we would jump if we just used the MMP
+                        auto skipMatch = mismatchIt - skipOverlap;
+                        // Where we would jump if we used the lce
+                        auto skipLCE = revRB + lce - skipOverlap;
+                        // Choose the larger of the two
+                        revRB = std::max(skipLCE, skipMatch);
+                        if (revRB > (revReadEndIt - k)) {
+                            revRB = revReadEndIt - k;
+                            lastSearch = true;
+                        }
+                        revRE = revRB + k;
+                    } else {
+                        lastSearch = true;
+                        revRB = revReadEndIt - k;
+                        revRE = revRB + k;
+                    }
+
+                } else {
+                    revRB += sampFactor;
+                    revRE = revRB + k;
+                }
+            }
+        }
+
+        if (strictCheck) {
+            // The first two conditions shouldn't happen
+            // but I'm just being paranoid here
+            if (fwdHit > 0 and rcHit == 0) {
+                rcSAInts.clear();
+            } else if (rcHit > 0 and fwdHit == 0) {
+                fwdSAInts.clear();
+            } else {
+                // Keep a single score entry per read position.
+                std::sort( kmerScores.begin(), kmerScores.end() );
+                auto e = std::unique(kmerScores.begin(), kmerScores.end());
+                // Compute the score for the k-mers we need to
+                // test in both the forward and rc directions.
+                int32_t fwdScore{0};
+                int32_t rcScore{0};
+                // For every kmer score structure
+                for (auto kmsIt = kmerScores.begin(); kmsIt != e; ++kmsIt) {
+                    auto& kms = *kmsIt;
+                    // If the forward k-mer is untested, then test it
+                    if (kms.fwdScore == UNTESTED) {
+                        auto merIt = khash.find(kms.kmer.get_bits(0, 2*k));
+                        kms.fwdScore = (merIt != khash.end()) ? PRESENT : ABSENT;
+                    }
+                    // accumulate the score
+                    fwdScore += kms.fwdScore;
+
+                    // If the rc k-mer is untested, then test it
+                    if (kms.rcScore == UNTESTED) {
+                        rcMer = kms.kmer.get_reverse_complement();
+                        auto rcMerIt = khash.find(rcMer.get_bits(0, 2*k));
+                        kms.rcScore = (rcMerIt != khash.end()) ? PRESENT : ABSENT;
+                    }
+                    // accumulate the score
+                    rcScore += kms.rcScore;
+                }
+                // If the forward score is strictly greater
+                // then get rid of the rc hits.
+                if (fwdScore > rcScore) {
+                    rcSAInts.clear();
+                } else if (rcScore > fwdScore) {
+                    // If the rc score is strictly greater
+                    // get rid of the forward hits
+                    fwdSAInts.clear();
+                }
+            }
+        }
+
+        auto fwdHitsStart = hits.size();
+        // If we had > 1 forward hit
+        if (fwdSAInts.size() > 1) {
+            auto processedHits = rapmap::hit_manager::intersectSAHits(fwdSAInts, *rmi_, consistentHits);
+            rapmap::hit_manager::collectHitsSimpleSA(processedHits, readLen, maxDist, hits, mateStatus);
+        } else if (fwdSAInts.size() == 1) { // only 1 hit!
+            auto& saIntervalHit = fwdSAInts.front();
+            auto initialSize = hits.size();
+            for (OffsetT i = saIntervalHit.begin; i != saIntervalHit.end; ++i) {
+                auto globalPos = SA[i];
+                auto txpID = rmi_->transcriptAtPosition(globalPos);
+                // the offset into this transcript
+                auto pos = globalPos - txpStarts[txpID];
+                int32_t hitPos = pos - saIntervalHit.queryPos;
+                hits.emplace_back(txpID, hitPos, true, readLen);
+                hits.back().mateStatus = mateStatus;
+            }
+            // Now sort by transcript ID (then position) and eliminate
+            // duplicates; only the first (lowest position) hit of each
+            // transcript is kept.
+            auto sortStartIt = hits.begin() + initialSize;
+            auto sortEndIt = hits.end();
+            std::sort(sortStartIt, sortEndIt,
+                    [](const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
+                        if (a.tid == b.tid) {
+                            return a.pos < b.pos;
+                        } else {
+                            return a.tid < b.tid;
+                        }
+                    });
+            auto newEnd = std::unique(hits.begin() + initialSize, hits.end(),
+                    [] (const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
+                        return a.tid == b.tid;
+                    });
+            hits.resize(std::distance(hits.begin(), newEnd));
+        }
+        auto fwdHitsEnd = hits.size();
+
+        auto rcHitsStart = fwdHitsEnd;
+        // If we had > 1 rc hit
+        if (rcSAInts.size() > 1) {
+            auto processedHits = rapmap::hit_manager::intersectSAHits(rcSAInts, *rmi_, consistentHits);
+            rapmap::hit_manager::collectHitsSimpleSA(processedHits, readLen, maxDist, hits, mateStatus);
+        } else if (rcSAInts.size() == 1) { // only 1 hit!
+            auto& saIntervalHit = rcSAInts.front();
+            auto initialSize = hits.size();
+            for (OffsetT i = saIntervalHit.begin; i != saIntervalHit.end; ++i) {
+                auto globalPos = SA[i];
+                auto txpID = rmi_->transcriptAtPosition(globalPos);
+                // the offset into this transcript
+                auto pos = globalPos - txpStarts[txpID];
+                int32_t hitPos = pos - saIntervalHit.queryPos;
+                hits.emplace_back(txpID, hitPos, false, readLen);
+                hits.back().mateStatus = mateStatus;
+            }
+            // Now sort by transcript ID (then position) and eliminate
+            // duplicates; only the first (lowest position) hit of each
+            // transcript is kept.
+            auto sortStartIt = hits.begin() + rcHitsStart;
+            auto sortEndIt = hits.end();
+            std::sort(sortStartIt, sortEndIt,
+                    [](const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
+                        if (a.tid == b.tid) {
+                            return a.pos < b.pos;
+                        } else {
+                            return a.tid < b.tid;
+                        }
+                    });
+            auto newEnd = std::unique(sortStartIt, sortEndIt,
+                    [] (const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
+                        return a.tid == b.tid;
+                    });
+            hits.resize(std::distance(hits.begin(), newEnd));
+        }
+        auto rcHitsEnd = hits.size();
+
+        // If we had both forward and RC hits, then merge them
+        if ((fwdHitsEnd > fwdHitsStart) and (rcHitsEnd > rcHitsStart)) {
+            // Merge the forward and reverse hits
+            std::inplace_merge(hits.begin() + fwdHitsStart, hits.begin() + fwdHitsEnd, hits.begin() + rcHitsEnd,
+                    [](const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
+                        return a.tid < b.tid;
+                    });
+            // And get rid of duplicate transcript IDs
+            auto newEnd = std::unique(hits.begin() + fwdHitsStart, hits.begin() + rcHitsEnd,
+                    [] (const QuasiAlignment& a, const QuasiAlignment& b) -> bool {
+                        return a.tid == b.tid;
+                    });
+            hits.resize(std::distance(hits.begin(), newEnd));
+        }
+        // Reports whether the initial scan found a k-mer; this stays true even
+        // if the strand filtering above left no hit to append.
+        return foundHit;
+    }
+
+    private:
+        RapMapIndexT* rmi_;
+};
+
+#endif // SA_COLLECTOR_HPP
diff --git a/debian/rapmap/SASearcher.hpp b/debian/rapmap/SASearcher.hpp
new file mode 100644
index 0000000..b36e476
--- /dev/null
+++ b/debian/rapmap/SASearcher.hpp
@@ -0,0 +1,631 @@
+#ifndef SA_SEARCHER_HPP
+#define SA_SEARCHER_HPP
+
+#include <vector>
+#include <algorithm>
+#include <iterator>
+#include "jellyfish/mer_dna.hpp"
+
+#include "RapMapUtils.hpp"
+#include "RapMapSAIndex.hpp"
+
+template <typename RapMapIndexT>
+class SASearcher {
+    public:
+        using OffsetT = typename RapMapIndexT::IndexType;
+
+        // Borrows rmi's text (rmi->seq) and suffix array (rmi->SA); rmi must outlive this object.  textLen_ bounds the scan in lce().
+        SASearcher(RapMapIndexT* rmi) : rmi_(rmi), seq_(&rmi->seq), sa_(&rmi->SA), textLen_(static_cast<OffsetT>(rmi->seq.length())) {}
+
+        // Three-way comparison of [abeg, aend) against [bbeg, bend).
+        // Returns -1 or 1 at the first differing character.  If no character
+        // differs within the overlap, returns 1 only when b ran out while a
+        // still has characters; every other case (equal ranges, or a being a
+        // proper prefix of b) returns 0.
+        int cmp(std::string::iterator abeg,
+                std::string::iterator aend,
+                std::string::iterator bbeg,
+                std::string::iterator bend) {
+            std::string::iterator lhs = abeg;
+            std::string::iterator rhs = bbeg;
+            for (; lhs < aend and rhs < bend; ++lhs, ++rhs) {
+                if (*lhs == *rhs) {
+                    continue;
+                }
+                return (*lhs < *rhs) ? -1 : 1;
+            }
+            // Overlap matched completely; decide on leftover length alone.
+            const bool bExhausted = (rhs == bend);
+            const bool aHasMore = (lhs < aend);
+            return (bExhausted and aHasMore) ? 1 : 0;
+        }
+
+        // Direction tag stored in BoundSearchResult::dir; extendSearchNaive
+        // never reads it.
+        enum class SearchDirection : uint8_t { UP = 0, DOWN = 1 };
+    
+        template <typename IndexT>
+        struct BoundSearchResult { // result slot for one binary search over the suffix array
+            IndexT maxLen; // longest match length seen; extendSearchNaive sets it only on its first result
+            IndexT bound; // suffix-array index recorded as an interval boundary
+            SearchDirection dir; // never written by extendSearchNaive (left uninitialized there)
+        };
+
+
+
+	/**
+	 * OK!  It should be (is) possible to figure out what we need with only two binary
+	 * searches.  However, that seems to have some tricky corner cases and has been
+	 * somewhat elusive so far.  This "naive" version performs *3* binary searches.
+	 * The first determines the length of the maximum mappable prefix (MMP).  The second
+	 * finds the lower bound for the query interval and the third finds the upper bound.
+	 * The final binary search *is* optimized (it has a lower bound given by the value
+	 * returned by the second search).  However, this method is likely a bit slower than the
+	 * two-search `extendSearch` kept commented out below (when it can be made to work correctly at all times).
+	 */
+        template <typename IteratorT>
+        std::tuple<OffsetT, OffsetT, OffsetT> extendSearchNaive(
+                OffsetT lbIn, // Lower SA bracket for the search (exclusive: never probed when ubIn - lbIn >= 2)
+                OffsetT ubIn, // Upper SA bracket for the search (exclusive in the same sense)
+                OffsetT startAt, // Query offset at which comparison starts; earlier characters are taken as matching
+                IteratorT qb, // Iterator to the beginning of the query (must support qb + i)
+                IteratorT qe, // Iterator to the end of the query
+                bool complementBases=false // True if bases should be complemented
+                                           // before comparison
+                ) {
+            // Returns (lb, ub, len); [lb, ub) appears to be the half-open SA interval sharing the first len query chars.
+            std::vector<OffsetT>& SA = *sa_;
+            std::string& seq = *seq_;
+
+            int64_t m = std::distance(qb, qe); // query length; reassigned to (MMP length + 1) after the first search
+            size_t n = seq.length(); // text length
+
+            auto sb = seq.begin();
+            auto se = seq.end(); // unused in this function
+
+            // If the bounds are already trivial (exactly one candidate, at lbIn + 1),
+            // just figure how long of a prefix we share and return the interval.
+            if (ubIn - lbIn == 2) {
+                lbIn += 1;
+                auto i = startAt;
+                while (i < m and SA[lbIn] + i < n) {
+                    char queryChar = ::toupper(*(qb + i));
+                    // Complement the query base if requested
+                    if (complementBases) {
+                        queryChar = rapmap::utils::my_mer::complement(queryChar);
+                    }
+                    if ( queryChar < *(sb + SA[lbIn] + i) ) { // either kind of mismatch just ends the scan
+                        break;
+                    } else if ( queryChar > *(sb + SA[lbIn] + i)) {
+                        break;
+                    }
+                    ++i;
+                }
+                return std::make_tuple(lbIn, ubIn, static_cast<OffsetT>(i));
+            }
+
+            BoundSearchResult<OffsetT> res1, res2; // res1: maxLen + lower bound; res2: only .bound is used
+
+            char smallest = '#'; // only used to seed sentinel
+            char largest = '}'; // unused (the upper-bound search below uses '{')
+            char sentinel = smallest; // reassigned before each sentinel search
+
+            // 64-bit on purpose: the midpoint below is computed as (l + r), i.e. the *sum* of the boundaries.
+            int64_t l = lbIn, r = ubIn;
+            int64_t lcpLP = startAt, lcpRP = startAt; // match lengths recorded at the current left / right bracket
+            int64_t c{0};
+            int64_t i{0};
+
+            int64_t maxI{startAt}; // assigned below but never read afterwards
+            int64_t prevI = startAt; // unused
+            int64_t prevILow = startAt; // longest match seen at a probe where query > suffix
+            int64_t prevIHigh = startAt; // longest match seen at a probe where query <= suffix
+            int64_t validBoundLow = ubIn; // validBound* are updated below but never feed the result
+            int64_t validBoundHigh = lbIn;
+            int64_t validBound = 0; // unused
+            bool plt{true}; // stays true unless a query char compares greater than the text at probe c
+            // Search 1: reduce the search interval until we hit a border,
+            // i.e. until c == r - 1 or c == l + 1; records the longest match length seen.
+            while (true) {
+                c = (l + r) / 2;
+                plt = true;
+                i = std::min(lcpLP, lcpRP); // skip characters already matched at both brackets
+                while (i < m and SA[c] + i < n) {
+                    char queryChar = ::toupper(*(qb + i));
+                    // Complement the query base if requested
+                    if (complementBases) {
+                        queryChar = rapmap::utils::my_mer::complement(queryChar);
+                    }
+
+                    if ( queryChar < *(sb + SA[c] + i) ) {
+                        if (i > prevIHigh) {
+                            prevIHigh = i;
+                            validBoundHigh = c;
+                        } else if (i == prevIHigh) {
+                            validBoundHigh = c < validBoundHigh ? c : validBoundHigh;
+                        }
+
+                        break;
+                    } else if ( queryChar > *(sb + SA[c] + i)) {
+                        if (i > prevILow) {
+                            prevILow = i;
+                            validBoundLow = c;
+                        } else if (i == prevILow) {
+                            validBoundLow = c > validBoundLow ? c : validBoundLow;
+                        }
+                        plt = false;
+                        break;
+                    }
+
+                    ++i;
+                }
+                if (i == m or SA[c] + i == n) { // ran off the end of the query or the text without a mismatch
+                    if (i > prevIHigh) {
+                        prevIHigh = i;
+                        validBoundHigh = c;
+                    } else if (i == prevIHigh) {
+                        validBoundHigh = c < validBoundHigh ? c : validBoundHigh;
+                    }
+                }
+
+                if (plt) {
+                    if (c == l + 1) {
+                        auto maxI = std::max(std::max(i, prevILow), prevIHigh); // local; shadows the outer maxI
+                        res1.maxLen = maxI;
+                        break;
+                    }
+                    r = c;
+                    lcpRP = i;
+                } else {
+                    if (c == r - 1) {
+                        maxI = std::max(std::max(i, prevILow), prevIHigh);
+                        res1.maxLen = maxI;
+                        break;
+                    }
+                    l = c;
+                    lcpLP = i;
+                }
+            }
+            // Searches 2 and 3 compare the first maxLen query characters followed by one sentinel character.
+            bool knownValid{true}; // unused
+            m = res1.maxLen + 1;
+
+            // Search 2: first search for the lower bound ('#' is ASCII 0x23, below '$' and the letters)
+            sentinel = '#';
+            l = lbIn;
+            r = ubIn;
+
+            lcpLP = startAt;
+            lcpRP = startAt;
+            c = 0;
+            plt = true;
+            i = startAt;
+            while (true) {
+                c = (l + r) / 2;
+                plt = true;
+                i = std::min(lcpLP, lcpRP);
+                while (i < m and SA[c] + i < n) {
+                    char queryChar = (i < m - 1) ? ::toupper(*(qb + i)) : sentinel; // last position holds the sentinel
+                    // Complement real query bases if requested (the sentinel is left as is)
+                    if (queryChar != sentinel and complementBases) {
+                        queryChar = rapmap::utils::my_mer::complement(queryChar);
+                    }
+
+                    if ( queryChar < *(sb + SA[c] + i) ) {
+                     	break;
+                    } else if ( queryChar > *(sb + SA[c] + i)) {
+                        plt = false;
+                        break;
+                    }
+                    ++i;
+                }
+                if (plt) {
+                    if (c == l + 1) {
+                        res1.bound = c;
+                        break;
+                    }
+                    r = c;
+                    lcpRP = i;
+                } else {
+                    if (c == r - 1) {
+                        res1.bound = r;
+                        break;
+                    }
+                    l = c;
+                    lcpLP = i;
+                }
+            }
+
+            // Search 3: then search for the upper bound ('{' is ASCII 0x7B, above the letters)
+            sentinel = '{';
+            l = res1.bound - 1; // start from the lower bound just found instead of lbIn
+            r = ubIn;
+
+            lcpLP = startAt;
+            lcpRP = startAt;
+            c = 0;
+            plt = true;
+            i = startAt;
+            while (true) {
+                c = (l + r) / 2;
+                plt = true;
+                i = std::min(lcpLP, lcpRP);
+                while (i < m and SA[c] + i < n) {
+                    char queryChar = (i < m - 1) ? ::toupper(*(qb + i)) : sentinel; // last position holds the sentinel
+                    // Complement real query bases if requested (the sentinel is left as is)
+                    if (queryChar != sentinel and complementBases) {
+                        queryChar = rapmap::utils::my_mer::complement(queryChar);
+                    }
+
+                    if ( queryChar < *(sb + SA[c] + i) ) {
+                     	break;
+                    } else if ( queryChar > *(sb + SA[c] + i)) {
+                        plt = false;
+                        break;
+                    }
+                    ++i;
+                }
+                if (plt) {
+                    if (c == l + 1) {
+                        res2.bound = c;
+                        break;
+                    }
+                    r = c;
+                    lcpRP = i;
+                } else {
+                    if (c == r - 1) {
+                        res2.bound = r;
+                        break;
+                    }
+                    l = c;
+                    lcpLP = i;
+                }
+            }
+
+            // Must occur at least once!  Widen an empty interval so it holds one entry.
+            if (res1.bound == res2.bound) { res2.bound += 1; }
+            return std::make_tuple(static_cast<OffsetT>(res1.bound), static_cast<OffsetT>(res2.bound), static_cast<OffsetT>(res1.maxLen));
+        }
+
+
+        /**
+         * Longest common extension of the suffixes at T[SA[p1]] and T[SA[p2]].
+         * The count starts at `startAt` and grows while the compared characters
+         * agree; it stops at a shared '$', once it reaches `stopAt`, or at the
+         * text bound `textLen_`.  Returns the final count.  `verbose` is unused.
+         */
+        OffsetT lce(OffsetT p1, OffsetT p2,
+                    OffsetT startAt=0,
+                    OffsetT stopAt=std::numeric_limits<OffsetT>::max(),
+                    bool verbose=false) {
+            std::string& seq = *seq_;
+            std::vector<OffsetT>& SA = *sa_;
+            OffsetT len = static_cast<OffsetT>(startAt); // the returned length begins at startAt
+            auto o1 = SA[p1] + startAt; // NOTE(review): o1/o2 already include startAt and len also starts at
+            auto o2 = SA[p2] + startAt; // startAt, so seq[o1+len] begins 2*startAt into the suffix - confirm intent
+            auto maxIndex = std::max(o1, o2);
+            while (maxIndex + len < textLen_ and seq[o1+len] == seq[o2+len]) { // bound check uses the larger offset
+                if (seq[o1+len] == '$') { break; } // a shared '$' is not counted (presumably the separator - confirm)
+                if (len >= stopAt) { break; }
+                ++len;
+            }
+            return len;
+        }
+
+    private:
+        RapMapIndexT* rmi_; // index handed to the constructor (raw pointer, stored as is)
+        std::string* seq_; // points at rmi->seq, the text the searches read
+        std::vector<OffsetT>* sa_; // points at rmi->SA, the suffix array indexed by the searches
+        OffsetT textLen_; // text bound used by the loop condition in lce()
+};
+
+
+        /*
+        // http://www.cs.jhu.edu/~langmea/resources/lecture_notes/suffix_arrays.pdf
+        std::tuple<int, int> querySimpleAccel(std::string::iterator qb,
+                                              std::string::iterator qe) {
+            std::vector<int>& SA = *sa_;
+            std::string& seq = *seq_;
+            //ForwardIt it;
+            auto sb = seq.begin();
+            auto se = seq.end();
+
+            size_t n = seq.length();
+            size_t m = std::distance(qb, qe);
+            size_t l = 0, r = n;
+            size_t lcpLP = 0, lcpRP = 0;
+            size_t c{0};
+            size_t i{0};
+            bool plt{true};
+            size_t lower{0};
+            while (true) {
+                c = (l + r) / 2;
+                plt = true;
+                i = std::min(lcpLP, lcpRP);
+                while (i < m and SA[c] + i < n) {
+                    if ( *(qb + i) < *(sb + SA[c] + i) ) {
+                        break;
+                    } else if ( *(qb + i) > *(sb + SA[c] + i)) {
+                        plt = false;
+                        break;
+                    }
+                    ++i;
+                }
+                if (plt) {
+                    if (c == l + 1) { lower = c; break; }
+                    r = c;
+                    lcpRP = i;
+                } else {
+                    if (c == r - 1) { lower = r; break; }
+                    l = c;
+                    lcpLP = i;
+                }
+            }
+
+            i = 0;
+            l = 0;
+            r = n;
+            lcpLP = 0;
+            lcpRP = 0;
+            size_t upper{0};
+            while (true) {
+                c = (l + r) / 2;
+                plt = true;
+                i = std::min(lcpLP, lcpRP);
+                while (i < m and SA[c] + i < n) {
+                    if ( *(qb + i) < *(sb + SA[c] + i) ) {
+                        break;
+                    } else if ( *(qb + i) > *(sb + SA[c] + i)) {
+                        plt = false;
+                        break;
+                    }
+                    ++i;
+                }
+                if (plt) {
+                    if (c == l + 1) { upper = c; break; }
+                    r = c;
+                    lcpRP = i;
+                } else {
+                    if (c == r - 1) { upper = r; break; }
+                    l = c;
+                    lcpLP = i;
+                }
+            }
+            return std::make_tuple(lower, upper);
+        }
+
+
+        // http://www.cs.jhu.edu/~langmea/resources/lecture_notes/suffix_arrays.pdf
+        // templated on the iterator type so we can use a forward or reverse iterator
+        template <typename IteratorT>
+        std::tuple<int, int, int> extendSearch(
+                int lbIn, // The lower bound for the search
+                int ubIn, // The upper bound for the search
+                int startAt, // The offset at which to start looking
+                IteratorT qb, // Iterator to the beginning of the query
+                IteratorT qe, // Iterator to the end of the query
+                bool complementBases=false // True if bases should be complemented
+                                           // before comparison
+                ) {
+
+            std::vector<int>& SA = *sa_;
+            std::string& seq = *seq_;
+
+            int m = std::distance(qb, qe);
+            size_t n = seq.length();
+
+            auto sb = seq.begin();
+            auto se = seq.end();
+
+            // If the bounds are already trivial, just figure how long
+            // of a prefix we share and return the interval.
+            if (ubIn - lbIn == 2) {
+                lbIn += 1;
+                auto i = startAt;
+                while (i < m and SA[lbIn] + i < n) {
+                    char queryChar = ::toupper(*(qb + i));
+                    // If we're reverse complementing
+                    if (complementBases) {
+                        queryChar = rapmap::utils::my_mer::complement(queryChar);
+                    }
+                    if ( queryChar < *(sb + SA[lbIn] + i) ) {
+                        break;
+                    } else if ( queryChar > *(sb + SA[lbIn] + i)) {
+                        break;
+                    }
+                    ++i;
+                }
+                return std::make_tuple(lbIn, ubIn, i);
+            }
+
+            BoundSearchResult res1, res2;
+
+            char smallest = '#';
+            char largest = '}';
+            char sentinel = smallest;
+
+            int l = lbIn, r = ubIn;
+            int lcpLP = startAt, lcpRP = startAt;
+            int c{0};
+            int i{0};
+            int maxI{startAt};
+            int prevI = startAt;
+            int prevILow = startAt;
+            int prevIHigh = startAt;
+            int validBoundLow = ubIn;
+            int validBoundHigh = lbIn;
+            int validBound = 0;
+            bool plt{true};
+            bool prevPLT{true};
+            //std::cerr << "lbIn = " << lbIn << ", ubIn = " << ubIn << "\n";
+            // Reduce the search interval until we hit a border
+            // i.e. until c == r - 1 or c == l + 1
+            while (true) {
+                c = (l + r) / 2;
+                //std::cerr << "l = " << l << ", r = " << r << ", c = " << c << '\n';
+                plt = true;
+                i = std::min(lcpLP, lcpRP);
+                while (i < m and SA[c] + i < n) {
+                    char queryChar = ::toupper(*(qb + i));
+                    // If we're reverse complementing
+                    if (complementBases) {
+                        queryChar = rapmap::utils::my_mer::complement(queryChar);
+                    }
+
+                    if ( queryChar < *(sb + SA[c] + i) ) {
+                        if (i > prevIHigh) {
+                            prevIHigh = i;
+                            validBoundHigh = c;
+                        } else if (i == prevIHigh) {
+                            validBoundHigh = c < validBoundHigh ? c : validBoundHigh;
+                        }
+                        //std::cerr << "(l = " << l << ", r = " << r << ") pattern < SA[" << c << "]\n";
+                        //std::cerr << "(i = " << i << ", m = " << m << ") " << queryChar << " < " <<  *(sb + SA[c] + i) << "\n";
+
+                        break;
+                    } else if ( queryChar > *(sb + SA[c] + i)) {
+                        if (i > prevILow) {
+                            prevILow = i;
+                            validBoundLow = c;
+                        } else if (i == prevILow) {
+                            validBoundLow = c > validBoundLow ? c : validBoundLow;
+                        }
+                        //std::cerr << "(l = " << l << ", r = " << r << ") pattern > SA[" << c << "]\n";
+                        //std::cerr << "(i = " << i << ", m = " << m << ") " << queryChar << " > " <<  *(sb + SA[c] + i) << "\n";
+                        plt = false;
+                        break;
+                    }
+
+                    ++i;
+		}
+		if (i == m or SA[c] + i == n) {
+			if (i > prevIHigh) {
+				prevIHigh = i;
+				validBoundHigh = c;
+			} else if (i == prevIHigh) {
+				validBoundHigh = c < validBoundHigh ? c : validBoundHigh;
+			}
+		}
+
+                if (plt) {
+                    if (c == l + 1) {
+                        std::cerr << "path 1\n";
+                        auto maxI = std::max(std::max(i, prevILow), prevIHigh);
+                        res1.maxLen = maxI;
+                        if (maxI == m) {
+                            res1.dir = SearchDirection::DOWN;
+                            res1.bound = c;
+                        } else {
+                            validBound = (prevILow >= prevIHigh) ? validBoundLow : validBoundHigh;
+                            res1.bound = validBound;
+                            res1.dir = (res1.bound == validBoundLow) ? SearchDirection::DOWN : SearchDirection::UP;
+                        }
+                        break;
+                    }
+                    r = c;
+                    lcpRP = i;
+                } else {
+                    if (c == r - 1) {
+                        std::cerr << "path 2\n";
+                        maxI = std::max(std::max(i, prevILow), prevIHigh);
+                        res1.maxLen = maxI;
+                        validBound = (prevILow >= prevIHigh) ? validBoundLow : validBoundHigh;
+                        if (maxI == m) {
+                            res1.bound = r;
+                        } else {
+                            res1.bound = validBound;
+                        }
+                        res1.dir = (res1.bound == validBoundLow) ? SearchDirection::DOWN : SearchDirection::UP;
+                        break;
+                    }
+                    l = c;
+                    lcpLP = i;
+                }
+            }
+
+
+            bool knownValid{true};
+            m = res1.maxLen + 1;
+
+            switch (res1.dir) {
+                case SearchDirection::UP:
+                    sentinel = '#';
+                    r = res1.bound;
+                    l = lbIn;
+                    std::cerr << "direction was UP; lb = " << l << ", ub = " << r << "\n";
+                    std::cerr << "direction was UP; origLb = " << lbIn << ", origUb = " << ubIn << "\n";
+                    break;
+                case SearchDirection::DOWN:
+                    sentinel = '{';
+                    r = ubIn;
+                    l = res1.bound;
+                    std::cerr << "direction was DOWN; lb = " << l << ", ub = " << r << "\n";
+                    std::cerr << "direction was UP; origLb = " << lbIn << ", origUb = " << ubIn << "\n";
+                    break;
+            }
+
+            if (r - l < 2) {
+                if (r == l) { r += 1; }
+                //std::cerr << "early exit!\n";
+                return std::make_tuple(l, r, res1.maxLen);
+            }
+
+
+            lcpLP = startAt;
+            lcpRP = startAt;
+            c = 0;
+            plt = true;
+            prevPLT = true;
+            prevI = 0;
+            prevILow = 0;
+            prevIHigh = 0;
+            i = startAt;
+            validBound = 0;
+            validBoundLow = ubIn;
+            validBoundHigh = lbIn;
+            while (true) {
+                c = (l + r) / 2;
+                plt = true;
+                i = std::min(lcpLP, lcpRP);
+                while (i < m and SA[c] + i < n) {
+                    char queryChar = (i < m - 1) ? ::toupper(*(qb + i)) : sentinel;
+                    // If we're reverse complementing
+                    if (queryChar != sentinel and complementBases) {
+                        queryChar = rapmap::utils::my_mer::complement(queryChar);
+                    }
+
+                    if ( queryChar < *(sb + SA[c] + i) ) {
+                     	break;
+                    } else if ( queryChar > *(sb + SA[c] + i)) {
+                        plt = false;
+                        break;
+                    }
+                    ++i;
+                }
+                if (plt) {
+                    if (c == l + 1) {
+                        res2.dir = SearchDirection::DOWN;
+                        res2.bound = c;
+                        break;
+                    }
+                    r = c;
+                    lcpRP = i;
+                } else {
+                    if (c == r - 1) {
+                        res2.bound = r;
+                        break;
+                    }
+                    l = c;
+                    lcpLP = i;
+                }
+            }
+
+            auto bound1 = std::min(res1.bound, res2.bound);
+            auto bound2 = std::max(res1.bound, res2.bound);
+            // Must occur at least once!
+            if (bound1 == bound2) { bound2 += 1; }
+            return std::make_tuple(bound1, bound2, res1.maxLen);
+        }
+        */
+
+#endif //SA_SEARCHER_HPP
diff --git a/debian/rapmap/ScopedTimer.hpp b/debian/rapmap/ScopedTimer.hpp
new file mode 100644
index 0000000..de1121c
--- /dev/null
+++ b/debian/rapmap/ScopedTimer.hpp
@@ -0,0 +1,22 @@
+#ifndef __SCOPED_TIMER_HPP__
+#define __SCOPED_TIMER_HPP__
+// from https://gist.github.com/justgord/4482447
+#include <chrono>
+#include <iostream>
+
+// Scope timer: captures the clock at construction and, when destroyed,
+// prints the elapsed time in seconds to std::cerr.
+struct ScopedTimer {
+    std::chrono::high_resolution_clock::time_point t0; // time of construction
+
+    ScopedTimer() : t0(std::chrono::high_resolution_clock::now()) {}
+
+    ~ScopedTimer() {
+        using Clock = std::chrono::high_resolution_clock;
+        using Seconds = std::chrono::duration<double>;
+        const Seconds elapsed = Clock::now() - t0; // converted to fractional seconds
+        std::cerr << "Elapsed time: " << elapsed.count() << "s\n";
+    }
+};
+
+#endif //__SCOPED_TIMER_HPP__
diff --git a/debian/rapmap/SpinLock.hpp b/debian/rapmap/SpinLock.hpp
new file mode 100644
index 0000000..56647fa
--- /dev/null
+++ b/debian/rapmap/SpinLock.hpp
@@ -0,0 +1,25 @@
+#ifndef __SPIN_LOCK_HPP__
+#define __SPIN_LOCK_HPP__
+
+#include <atomic>
+
+// Taken from http://stackoverflow.com/questions/26583433/c11-implementation-of-spinlock-using-atomic
+class SpinLock {
+    std::atomic_flag held_ = ATOMIC_FLAG_INIT; // set while some thread owns the lock
+public:
+    // Busy-wait (no yielding or sleeping) until the flag is acquired.
+    void lock() {
+        while (!try_lock()) { /* spin */ }
+    }
+    // from http://stackoverflow.com/questions/19742993/implementing-a-spinlock-in-boost-example-needed
+    // Single acquire attempt: test_and_set returns the previous value, so this
+    // call took the lock exactly when that previous value was false.
+    bool try_lock() {
+        const bool wasHeld = held_.test_and_set(std::memory_order_acquire);
+        return !wasHeld;
+    }
+    // Release the lock; the release store pairs with the acquire in lock()/try_lock().
+    void unlock() { held_.clear(std::memory_order_release); }
+};
+
+#endif //__SPIN_LOCK_HPP__
diff --git a/debian/rapmap/bit_array.c b/debian/rapmap/bit_array.c
new file mode 100644
index 0000000..af0bc1a
--- /dev/null
+++ b/debian/rapmap/bit_array.c
@@ -0,0 +1,3160 @@
+/*
+ bit_array.c
+ project: bit array C library
+ url: https://github.com/noporpoise/BitArray/
+ maintainer: Isaac Turner <turner.isaac at gmail.com>
+ license: Public Domain, no warranty
+ date: Aug 2014
+*/
+
+// 64 bit words
+// Array length can be zero
+// Unused top bits must be zero
+
+#include <stdlib.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <limits.h> // ULONG_MAX
+#include <errno.h>
+#include <signal.h> // needed for abort()
+#include <string.h> // memset()
+#include <assert.h>
+#include <time.h> // needed for seeding rand()
+#include <unistd.h>  // need for getpid() for seeding rand number
+#include <ctype.h>  // need for tolower()
+#include <errno.h>  // perror()
+#include <sys/time.h> // for seeding random
+
+// Windows includes
+#if defined(_WIN32)
+#include <intrin.h>
+#endif
+
+#include "bit_array.h"
+#include "bit_macros.h"
+
+//
+// Tables of constants
+//
+
+// Byte reverse lookup table: reverse_table[b] is byte b with its 8 bits in reverse order
+static const word_t reverse_table[256] =
+{
+  0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0,
+  0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0,
+  0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8,
+  0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8,
+  0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4,
+  0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4,
+  0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC,
+  0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC,
+  0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2,
+  0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2,
+  0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA,
+  0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA,
+  0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6,
+  0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6,
+  0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE,
+  0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE,
+  0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1,
+  0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1,
+  0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9,
+  0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9,
+  0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5,
+  0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5,
+  0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED,
+  0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD,
+  0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3,
+  0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3,
+  0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB,
+  0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB,
+  0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7,
+  0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7,
+  0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF,
+  0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF,
+};
+
+// Morton table for interleaving bytes: bit i of the index is moved to bit 2*i (even positions)
+static const word_t morton_table0[256] =
+{
+  0x0000, 0x0001, 0x0004, 0x0005, 0x0010, 0x0011, 0x0014, 0x0015,
+  0x0040, 0x0041, 0x0044, 0x0045, 0x0050, 0x0051, 0x0054, 0x0055,
+  0x0100, 0x0101, 0x0104, 0x0105, 0x0110, 0x0111, 0x0114, 0x0115,
+  0x0140, 0x0141, 0x0144, 0x0145, 0x0150, 0x0151, 0x0154, 0x0155,
+  0x0400, 0x0401, 0x0404, 0x0405, 0x0410, 0x0411, 0x0414, 0x0415,
+  0x0440, 0x0441, 0x0444, 0x0445, 0x0450, 0x0451, 0x0454, 0x0455,
+  0x0500, 0x0501, 0x0504, 0x0505, 0x0510, 0x0511, 0x0514, 0x0515,
+  0x0540, 0x0541, 0x0544, 0x0545, 0x0550, 0x0551, 0x0554, 0x0555,
+  0x1000, 0x1001, 0x1004, 0x1005, 0x1010, 0x1011, 0x1014, 0x1015,
+  0x1040, 0x1041, 0x1044, 0x1045, 0x1050, 0x1051, 0x1054, 0x1055,
+  0x1100, 0x1101, 0x1104, 0x1105, 0x1110, 0x1111, 0x1114, 0x1115,
+  0x1140, 0x1141, 0x1144, 0x1145, 0x1150, 0x1151, 0x1154, 0x1155,
+  0x1400, 0x1401, 0x1404, 0x1405, 0x1410, 0x1411, 0x1414, 0x1415,
+  0x1440, 0x1441, 0x1444, 0x1445, 0x1450, 0x1451, 0x1454, 0x1455,
+  0x1500, 0x1501, 0x1504, 0x1505, 0x1510, 0x1511, 0x1514, 0x1515,
+  0x1540, 0x1541, 0x1544, 0x1545, 0x1550, 0x1551, 0x1554, 0x1555,
+  0x4000, 0x4001, 0x4004, 0x4005, 0x4010, 0x4011, 0x4014, 0x4015,
+  0x4040, 0x4041, 0x4044, 0x4045, 0x4050, 0x4051, 0x4054, 0x4055,
+  0x4100, 0x4101, 0x4104, 0x4105, 0x4110, 0x4111, 0x4114, 0x4115,
+  0x4140, 0x4141, 0x4144, 0x4145, 0x4150, 0x4151, 0x4154, 0x4155,
+  0x4400, 0x4401, 0x4404, 0x4405, 0x4410, 0x4411, 0x4414, 0x4415,
+  0x4440, 0x4441, 0x4444, 0x4445, 0x4450, 0x4451, 0x4454, 0x4455,
+  0x4500, 0x4501, 0x4504, 0x4505, 0x4510, 0x4511, 0x4514, 0x4515,
+  0x4540, 0x4541, 0x4544, 0x4545, 0x4550, 0x4551, 0x4554, 0x4555,
+  0x5000, 0x5001, 0x5004, 0x5005, 0x5010, 0x5011, 0x5014, 0x5015,
+  0x5040, 0x5041, 0x5044, 0x5045, 0x5050, 0x5051, 0x5054, 0x5055,
+  0x5100, 0x5101, 0x5104, 0x5105, 0x5110, 0x5111, 0x5114, 0x5115,
+  0x5140, 0x5141, 0x5144, 0x5145, 0x5150, 0x5151, 0x5154, 0x5155,
+  0x5400, 0x5401, 0x5404, 0x5405, 0x5410, 0x5411, 0x5414, 0x5415,
+  0x5440, 0x5441, 0x5444, 0x5445, 0x5450, 0x5451, 0x5454, 0x5455,
+  0x5500, 0x5501, 0x5504, 0x5505, 0x5510, 0x5511, 0x5514, 0x5515,
+  0x5540, 0x5541, 0x5544, 0x5545, 0x5550, 0x5551, 0x5554, 0x5555,
+};
+
+// Morton table for interleaving bytes, shifted left 1 bit: bit i of the index is moved to bit 2*i+1
+static const word_t morton_table1[256] =
+{
+  0x0000, 0x0002, 0x0008, 0x000A, 0x0020, 0x0022, 0x0028, 0x002A,
+  0x0080, 0x0082, 0x0088, 0x008A, 0x00A0, 0x00A2, 0x00A8, 0x00AA,
+  0x0200, 0x0202, 0x0208, 0x020A, 0x0220, 0x0222, 0x0228, 0x022A,
+  0x0280, 0x0282, 0x0288, 0x028A, 0x02A0, 0x02A2, 0x02A8, 0x02AA,
+  0x0800, 0x0802, 0x0808, 0x080A, 0x0820, 0x0822, 0x0828, 0x082A,
+  0x0880, 0x0882, 0x0888, 0x088A, 0x08A0, 0x08A2, 0x08A8, 0x08AA,
+  0x0A00, 0x0A02, 0x0A08, 0x0A0A, 0x0A20, 0x0A22, 0x0A28, 0x0A2A,
+  0x0A80, 0x0A82, 0x0A88, 0x0A8A, 0x0AA0, 0x0AA2, 0x0AA8, 0x0AAA,
+  0x2000, 0x2002, 0x2008, 0x200A, 0x2020, 0x2022, 0x2028, 0x202A,
+  0x2080, 0x2082, 0x2088, 0x208A, 0x20A0, 0x20A2, 0x20A8, 0x20AA,
+  0x2200, 0x2202, 0x2208, 0x220A, 0x2220, 0x2222, 0x2228, 0x222A,
+  0x2280, 0x2282, 0x2288, 0x228A, 0x22A0, 0x22A2, 0x22A8, 0x22AA,
+  0x2800, 0x2802, 0x2808, 0x280A, 0x2820, 0x2822, 0x2828, 0x282A,
+  0x2880, 0x2882, 0x2888, 0x288A, 0x28A0, 0x28A2, 0x28A8, 0x28AA,
+  0x2A00, 0x2A02, 0x2A08, 0x2A0A, 0x2A20, 0x2A22, 0x2A28, 0x2A2A,
+  0x2A80, 0x2A82, 0x2A88, 0x2A8A, 0x2AA0, 0x2AA2, 0x2AA8, 0x2AAA,
+  0x8000, 0x8002, 0x8008, 0x800A, 0x8020, 0x8022, 0x8028, 0x802A,
+  0x8080, 0x8082, 0x8088, 0x808A, 0x80A0, 0x80A2, 0x80A8, 0x80AA,
+  0x8200, 0x8202, 0x8208, 0x820A, 0x8220, 0x8222, 0x8228, 0x822A,
+  0x8280, 0x8282, 0x8288, 0x828A, 0x82A0, 0x82A2, 0x82A8, 0x82AA,
+  0x8800, 0x8802, 0x8808, 0x880A, 0x8820, 0x8822, 0x8828, 0x882A,
+  0x8880, 0x8882, 0x8888, 0x888A, 0x88A0, 0x88A2, 0x88A8, 0x88AA,
+  0x8A00, 0x8A02, 0x8A08, 0x8A0A, 0x8A20, 0x8A22, 0x8A28, 0x8A2A,
+  0x8A80, 0x8A82, 0x8A88, 0x8A8A, 0x8AA0, 0x8AA2, 0x8AA8, 0x8AAA,
+  0xA000, 0xA002, 0xA008, 0xA00A, 0xA020, 0xA022, 0xA028, 0xA02A,
+  0xA080, 0xA082, 0xA088, 0xA08A, 0xA0A0, 0xA0A2, 0xA0A8, 0xA0AA,
+  0xA200, 0xA202, 0xA208, 0xA20A, 0xA220, 0xA222, 0xA228, 0xA22A,
+  0xA280, 0xA282, 0xA288, 0xA28A, 0xA2A0, 0xA2A2, 0xA2A8, 0xA2AA,
+  0xA800, 0xA802, 0xA808, 0xA80A, 0xA820, 0xA822, 0xA828, 0xA82A,
+  0xA880, 0xA882, 0xA888, 0xA88A, 0xA8A0, 0xA8A2, 0xA8A8, 0xA8AA,
+  0xAA00, 0xAA02, 0xAA08, 0xAA0A, 0xAA20, 0xAA22, 0xAA28, 0xAA2A,
+  0xAA80, 0xAA82, 0xAA88, 0xAA8A, 0xAAA0, 0xAAA2, 0xAAA8, 0xAAAA,
+};
+
+//
+// Macros
+//
+
+// WORD_SIZE is the number of bits per word, hard-coded to 64.
+// The sizeof-based form below (bytes * 8 bits per byte) is kept only as a commented-out alternative.
+#define WORD_SIZE 64
+// #define WORD_SIZE (sizeof(word_t) * 8)
+
+// POPCOUNT(x) is the number of bits set in x; PARITY(x) is 1 when that number is odd
+
+#if defined(_WIN32)
+// See http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+static word_t __inline windows_popcount(word_t w)
+{
+  w = w - ((w >> 1) & (word_t)~(word_t)0/3);
+  w = (w & (word_t)~(word_t)0/15*3) + ((w >> 2) & (word_t)~(word_t)0/15*3);
+  w = (w + (w >> 4)) & (word_t)~(word_t)0/255*15;
+  // The multiply gathers the per-byte counts in the top byte; shift it down and return it
+  return (word_t)(w * ((word_t)~(word_t)0/255)) >> (sizeof(word_t) - 1) * 8;
+}
+
+static word_t __inline windows_parity(word_t w)
+{
+  w ^= w >> 1;
+  w ^= w >> 2;
+  w = (w & 0x1111111111111111ULL) * 0x1111111111111111ULL; // ULL: unsigned long may be 32 bits
+  return (w >> 60) & 1;
+}
+
+#define POPCOUNT(x) windows_popcount(x)
+#define PARITY(x) windows_parity(x)
+#else
+#define POPCOUNT(x) (unsigned)__builtin_popcountll(x)
+#define PARITY(x) (unsigned)__builtin_parityll(x)
+#endif
+
+#define MIN(a, b)  (((a) <= (b)) ? (a) : (b)) // NB: one argument is evaluated twice
+#define MAX(a, b)  (((a) >= (b)) ? (a) : (b)) // NB: one argument is evaluated twice
+
+// Make this a power of two
+#define INIT_CAPACITY_WORDS 2
+
+// word of all 1s
+#define WORD_MAX  (~(word_t)0)
+// Shorthands that call _set_region() with the fill action fixed
+#define SET_REGION(arr,start,len)    _set_region((arr),(start),(len),FILL_REGION)
+#define CLEAR_REGION(arr,start,len)  _set_region((arr),(start),(len),ZERO_REGION)
+#define TOGGLE_REGION(arr,start,len) _set_region((arr),(start),(len),SWAP_REGION)
+
+// Set to 1 once _seed_rand() has called srand()
+static char rand_initiated = 0;
+
+// Seed rand() on the first call only, mixing the time of day with the process id
+static void _seed_rand()
+{
+  struct timeval now;
+
+  if(rand_initiated) return; // already seeded
+
+  gettimeofday(&now, NULL);
+  srand((((now.tv_sec ^ getpid()) * 1000001) + now.tv_usec));
+  rand_initiated = 1;
+}
+
+//
+// Common internal functions
+//
+// Bits in use in the last word: 0 when nbits is 0, otherwise bitset64_idx(nbits-1) + 1
+#define bits_in_top_word(nbits) ((nbits) ? bitset64_idx((nbits) - 1) + 1 : 0)
+
+// Debug helper: write the WORD_SIZE bits of `word` to `out` as '0'/'1', lowest bit first
+static inline void _print_word(word_t word, FILE* out)
+{
+  word_offset_t bit;
+  for(bit = 0; bit < WORD_SIZE; bit++)
+  {
+    fputc(((word >> bit) & (word_t)0x1) ? '1' : '0', out);
+  }
+}
+
+// Render `word` as WORD_SIZE '0'/'1' characters, highest bit first, NUL-terminated
+static inline char* _word_to_str(word_t word, char str[WORD_SIZE+1])
+  __attribute__((unused));
+
+static inline char* _word_to_str(word_t word, char str[WORD_SIZE+1])
+{
+  char *cursor = str + WORD_SIZE;
+  *cursor = '\0';
+  for(; cursor != str; word >>= 1)
+  {
+    *--cursor = (word & (word_t)0x1) ? '1' : '0'; // fill backwards from the low bit
+  }
+  return str;
+}
+
+// Debug-only tracing and validation; both macros expand to nothing unless DEBUG is defined
+#ifdef DEBUG // msg must be a string literal: it is concatenated onto the "[file:line] " prefix
+  #define DEBUG_PRINT(msg,...) printf("[%s:%i] "msg, __FILE__, __LINE__, ##__VA_ARGS__);
+  #define DEBUG_VALIDATE(a) validate_bitarr((a), __FILE__, __LINE__)
+#else
+  #define DEBUG_PRINT(msg,...)
+  #define DEBUG_VALIDATE(a)
+#endif
+
+void validate_bitarr(BIT_ARRAY *arr, const char *file, int lineno)
+{
+  // Consistency check: report to stderr and abort() if the array breaks an invariant
+  int failed = 0;
+  bit_index_t live_bits = bits_in_top_word(arr->num_of_bits);
+  word_addr_t expected_words = roundup_bits2words64(arr->num_of_bits);
+  word_addr_t top = arr->num_of_words == 0 ? 0 : arr->num_of_words - 1;
+  // Bits of the top word beyond the array length must be zero
+  if(arr->words[top] > bitmask64(live_bits))
+  {
+    _print_word(arr->words[top], stderr);
+    fprintf(stderr, "\n[%s:%i] Expected %i bits in top word[%i]\n",
+            file, lineno, (int)live_bits, (int)top);
+    failed = 1;
+  }
+
+  // The stored word count must match the count derived from the bit length
+  if(expected_words != arr->num_of_words)
+  {
+    fprintf(stderr, "\n[%s:%i] num of words wrong "
+                    "[bits: %i, word: %i, actual words: %i]\n", file, lineno,
+            (int)arr->num_of_bits, (int)expected_words, (int)arr->num_of_words);
+    failed = 1;
+  }
+
+  if(failed) abort();
+}
+
+// Reverse the bit order of a 64-bit word, one byte at a time via reverse_table
+static inline word_t _reverse_word(word_t word)
+{
+  word_t flipped = 0;
+  unsigned int shift;
+
+  // The byte found at bit offset `shift` lands at the mirrored offset 56 - shift
+  for(shift = 0; shift < WORD_SIZE; shift += 8)
+  {
+    flipped |= reverse_table[(word >> shift) & 0xff] << (56 - shift);
+  }
+
+  return flipped;
+}
+
+static inline void _mask_top_word(BIT_ARRAY* bitarr)
+{
+  // Zero every bit of the last word that lies beyond num_of_bits
+  word_offset_t live_bits = bits_in_top_word(bitarr->num_of_bits);
+  word_addr_t top = (bitarr->num_of_words > 1) ? bitarr->num_of_words - 1 : 0;
+  bitarr->words[top] &= bitmask64(live_bits);
+}
+
+//
+// Get and set words (internal use only -- no bounds checking)
+//
+
+static inline word_t _get_word(const BIT_ARRAY* bitarr, bit_index_t start)
+{
+  // Read 64 bits beginning at bit `start`; no bounds checking is done here
+  const word_addr_t wrd = bitset64_wrd(start);
+  const word_offset_t off = bitset64_idx(start);
+  const word_offset_t from_first = WORD_SIZE - off; // bits supplied by words[wrd]
+  word_t value = bitarr->words[wrd] >> off;
+  // Word-aligned reads are already complete
+  if(off == 0) return value;
+
+  // Top up from the next word, but only if the array extends into it
+  if(start + from_first < bitarr->num_of_bits)
+  {
+    value |= bitarr->words[wrd+1] << from_first;
+  }
+
+  return value;
+}
+
+// Write the 64 bits of `word` into the array starting at bit `start`.
+// Never grows the array: the part that would go past the last word is not stored.
+static inline void _set_word(BIT_ARRAY* bitarr, bit_index_t start, word_t word)
+{
+  const word_addr_t wrd = bitset64_wrd(start);
+  const word_offset_t off = bitset64_idx(start);
+  word_t *lo = &bitarr->words[wrd];
+  if(off != 0)
+  {
+    // Unaligned: keep the `off` low bits of the first word, overwrite the rest
+    const word_t keep_lo = *lo & bitmask64(off);
+    *lo = (word << off) | keep_lo;
+
+    if(wrd+1 < bitarr->num_of_words)
+    {
+      // Spill the remaining high bits of `word` into the following word
+      const word_t keep_hi = lo[1] & (WORD_MAX << off);
+      lo[1] = (word >> (WORD_SIZE - off)) | keep_hi;
+    }
+  }
+  else
+  {
+    *lo = word; // aligned: a plain store
+  }
+
+  // Re-establish the rule that unused top bits are zero
+  _mask_top_word(bitarr);
+  DEBUG_VALIDATE(bitarr);
+}
+
+static inline void _set_byte(BIT_ARRAY *bitarr, bit_index_t start, uint8_t byte)
+{
+  const word_t upper = _get_word(bitarr, start) & ~(word_t)0xff; // bits 8..63 are kept
+  _set_word(bitarr, start, upper | byte);
+}
+
+// Overwrite the 4 bits at `start`; `nibble` is OR-ed in unmasked, so pass a value <= 0xf
+static inline void _set_nibble(BIT_ARRAY *bitarr, bit_index_t start,
+                               uint8_t nibble)
+{
+  const word_t upper = _get_word(bitarr, start) & ~(word_t)0xf; // bits 4..63 are kept
+  _set_word(bitarr, start, upper | nibble);
+}
+
+// Read 64 bits from `start`, continuing from bit 0 when the end of the array is reached
+static inline word_t _get_word_cyclic(const BIT_ARRAY* bitarr, bit_index_t start)
+{
+  const bit_index_t len = bitarr->num_of_bits;
+  const bit_index_t before_wrap = len - start; // bits available up to the end
+  word_t out = _get_word(bitarr, start);
+
+  // A whole word fitted before the end: nothing to wrap
+  if(before_wrap >= WORD_SIZE) return out;
+
+  // Fill the upper part from the beginning of the array
+  out |= (bitarr->words[0] << before_wrap);
+
+  // Arrays shorter than one word would otherwise repeat their bits in the result
+  if(len < (bit_index_t)WORD_SIZE)
+  {
+    out &= bitmask64(len);
+  }
+  return out;
+}
+
+// Write 64 bits at `start`, continuing at bit 0 when the end of the array is reached
+static inline void _set_word_cyclic(BIT_ARRAY* bitarr,
+                                    bit_index_t start, word_t word)
+{
+  const bit_index_t before_wrap = bitarr->num_of_bits - start;
+  word_offset_t wrapped_bits;
+  word_t mask;
+
+  _set_word(bitarr, start, word);
+
+  // Done if the whole word fitted, or if there is no room in front of `start`
+  if(before_wrap >= WORD_SIZE || start == 0) return;
+
+  // Cap the wrapped part at `start` bits so the bits just written
+  // are not overwritten
+  wrapped_bits = MIN(WORD_SIZE - before_wrap, start);
+  mask = bitmask64(wrapped_bits);
+  word >>= before_wrap;
+  bitarr->words[0] = bitmask_merge(word, bitarr->words[0], mask);
+}
+
+//
+// Fill a region (internal use only)
+//
+
+// FillAction tells _set_region() what to do with each bit: clear (ZERO_REGION), set (FILL_REGION) or invert (SWAP_REGION)
+typedef enum {ZERO_REGION, FILL_REGION, SWAP_REGION} FillAction;
+
+static inline void _set_region(BIT_ARRAY* bitarr, bit_index_t start,
+                               bit_index_t length, FillAction action)
+{
+  // Apply `action` (clear, set or invert) to the `length` bits starting at
+  // bit `start`. No bounds checking is done and the array is never resized.
+  if(length == 0) return;
+
+  const bit_index_t last_bit = start + length - 1;
+  const word_addr_t head = bitset64_wrd(start);
+  const word_addr_t tail = bitset64_wrd(last_bit);
+  const word_offset_t head_off = bitset64_idx(start);
+  const word_offset_t tail_off = bitset64_idx(last_bit);
+  word_t *words = bitarr->words;
+  word_addr_t w;
+
+  if(head == tail)
+  {
+    // The region lies inside one word: a single shifted mask covers it
+    const word_t mask = bitmask64(length) << head_off;
+
+    if(action == ZERO_REGION)      words[head] &= ~mask;
+    else if(action == FILL_REGION) words[head] |=  mask;
+    else if(action == SWAP_REGION) words[head] ^=  mask;
+
+    return;
+  }
+
+  // The region spans several words: a partial first word, zero or more
+  // whole words, then a partial last word
+  const word_t below_head = bitmask64(head_off);    // bits before the region
+  const word_t upto_tail = bitmask64(tail_off + 1); // bits up to the region end
+
+  switch(action)
+  {
+    case ZERO_REGION:
+      // keep only the bits outside the region
+      words[head] &= below_head;
+      for(w = head + 1; w < tail; w++)
+        words[w] = (word_t)0;
+      words[tail] &= ~upto_tail;
+      break;
+
+    case FILL_REGION:
+      // switch on the bits inside the region
+      words[head] |= ~below_head;
+      for(w = head + 1; w < tail; w++)
+        words[w] = WORD_MAX;
+      words[tail] |= upto_tail;
+      break;
+
+    case SWAP_REGION:
+      // invert the bits inside the region
+      words[head] ^= ~below_head;
+      for(w = head + 1; w < tail; w++)
+        words[w] ^= WORD_MAX;
+      words[tail] ^= upto_tail;
+      break;
+  }
+}
+
+
+
+//
+// Constructor
+//
+
+// Initialise caller-owned `bitarr` with `nbits` zero bits; NULL and errno=ENOMEM on failure
+BIT_ARRAY* bit_array_alloc(BIT_ARRAY* bitarr, bit_index_t nbits)
+{
+  const word_addr_t nwords = roundup_bits2words64(nbits);
+  const word_addr_t pow2_words = roundup2pow(nwords);
+  bitarr->num_of_bits = nbits;
+  bitarr->num_of_words = nwords;
+  bitarr->capacity_in_words = (pow2_words >= 8) ? pow2_words : 8; // never below 8 words
+  bitarr->words = (word_t*)calloc(bitarr->capacity_in_words, sizeof(word_t));
+
+  if(bitarr->words != NULL) return bitarr;
+  errno = ENOMEM;
+  return NULL;
+}
+
+void bit_array_dealloc(BIT_ARRAY* bitarr)
+{
+  free(bitarr->words);                  // release the word buffer only; the struct itself is not freed
+  memset(bitarr, 0, sizeof(BIT_ARRAY)); // zero every field so no stale pointer or length remains
+}
+
+// Heap-allocate a BIT_ARRAY holding `nbits` zero bits; NULL and errno=ENOMEM on failure
+BIT_ARRAY* bit_array_create(bit_index_t nbits)
+{
+  BIT_ARRAY* fresh = (BIT_ARRAY*)malloc(sizeof(BIT_ARRAY));
+
+  // Struct allocated but its word buffer was not: give the struct back
+  if(fresh != NULL && bit_array_alloc(fresh, nbits) == NULL)
+  {
+    free(fresh);
+    fresh = NULL;
+  }
+
+  if(fresh == NULL) { errno = ENOMEM; return NULL; }
+  DEBUG_PRINT("Creating BIT_ARRAY (bits: %lu; allocated words: %lu; "
+              "using words: %lu; WORD_SIZE: %i)\n",
+              (unsigned long)nbits, (unsigned long)fresh->capacity_in_words,
+              (unsigned long)roundup_bits2words64(nbits), (int)WORD_SIZE);
+
+  DEBUG_VALIDATE(fresh);
+
+  return fresh;
+}
+
+//
+// Destructor
+//
+void bit_array_free(BIT_ARRAY* bitarr)
+{
+  // Mirror free(): a NULL array is a no-op instead of a NULL dereference
+  if(bitarr == NULL) return;
+
+  free(bitarr->words); // free(NULL) is defined, so no separate check is needed
+  free(bitarr);
+}
+
+bit_index_t bit_array_length(const BIT_ARRAY* bit_arr)
+{
+  return bit_arr->num_of_bits; // length in bits, not in words or bytes
+}
+
+// Change the size of a bit array. Enlarging an array will add zeros to the end
+// of it. Returns 1 on success; 0 on failure (errno = ENOMEM, array left unchanged)
+char bit_array_resize(BIT_ARRAY* bitarr, bit_index_t new_num_of_bits)
+{
+  word_addr_t old_num_of_words = bitarr->num_of_words;
+  word_addr_t new_num_of_words = roundup_bits2words64(new_num_of_bits);
+
+  DEBUG_PRINT("Resize: old_num_of_words: %i; new_num_of_words: %i capacity: %i\n",
+              (int)old_num_of_words, (int)new_num_of_words,
+              (int)bitarr->capacity_in_words);
+
+  if(new_num_of_words > bitarr->capacity_in_words)
+  {
+    // Need to change the amount of memory used
+    word_addr_t old_capacity_in_words = bitarr->capacity_in_words;
+    size_t old_capacity_in_bytes = old_capacity_in_words * sizeof(word_t);
+    word_addr_t new_capacity_in_words = roundup2pow(new_num_of_words);
+    if(new_capacity_in_words < 8) new_capacity_in_words = 8;
+    size_t new_capacity_in_bytes = new_capacity_in_words * sizeof(word_t);
+
+    // Realloc into a temporary: on failure realloc() leaves the old block
+    // allocated, so storing its NULL result in bitarr->words would leak it
+    word_t *new_words = (word_t*)realloc(bitarr->words, new_capacity_in_bytes);
+    if(new_words == NULL)
+    {
+      // error - could not allocate enough memory; bitarr has not been modified
+      perror("resize realloc");
+      errno = ENOMEM;
+      return 0;
+    }
+
+    bitarr->words = new_words;
+    bitarr->capacity_in_words = new_capacity_in_words;
+    // Need to zero new memory
+    size_t num_bytes_to_zero = new_capacity_in_bytes - old_capacity_in_bytes;
+    memset(bitarr->words + old_capacity_in_words, 0, num_bytes_to_zero);
+
+    DEBUG_PRINT("zeroing from word %i for %i bytes\n", (int)old_capacity_in_words,
+                (int)num_bytes_to_zero);
+  }
+  else if(new_num_of_words < old_num_of_words)
+  {
+    // Shrunk -- need to zero old memory
+    size_t num_bytes_to_zero = (old_num_of_words - new_num_of_words)*sizeof(word_t);
+    memset(bitarr->words + new_num_of_words, 0, num_bytes_to_zero);
+  }
+
+  // Commit the new size only now that nothing can fail, then mask the top word
+  bitarr->num_of_bits = new_num_of_bits;
+  bitarr->num_of_words = new_num_of_words;
+  _mask_top_word(bitarr);
+  DEBUG_VALIDATE(bitarr);
+  return 1;
+}
+
+void bit_array_resize_critical(BIT_ARRAY* bitarr, bit_index_t num_of_bits)
+{
+  // Remember the length before resizing so a failure can report both sizes
+  const bit_index_t before = bitarr->num_of_bits;
+  if(bit_array_resize(bitarr, num_of_bits)) return;
+
+  // Resize failed: report and terminate -- this variant never returns an error
+  fprintf(stderr, "Ran out of memory resizing [%lu -> %lu]",
+          (unsigned long)before, (unsigned long)num_of_bits);
+  abort();
+}
+
+// Grow the array to `ensure_num_of_bits` if it is shorter; never shrinks it
+char bit_array_ensure_size(BIT_ARRAY* bitarr, bit_index_t ensure_num_of_bits)
+{
+  // Already long enough: report success without touching the array
+  if(bitarr->num_of_bits >= ensure_num_of_bits)
+  {
+    return 1;
+  }
+  return bit_array_resize(bitarr, ensure_num_of_bits); // 1 on success, 0 on failure
+}
+
+void bit_array_ensure_size_critical(BIT_ARRAY* bitarr, bit_index_t num_of_bits)
+{
+  // Nothing to do unless the array is shorter than requested
+  if(bitarr->num_of_bits >= num_of_bits) return;
+
+  // Grow the array; the process is aborted if memory runs out
+  bit_array_resize_critical(bitarr, num_of_bits);
+}
+
+static inline
+void _bit_array_ensure_nwords(BIT_ARRAY* bitarr, word_addr_t nwords,
+                              const char *file, int lineno, const char *func)
+{
+  // Make sure the buffer can hold `nwords` words; abort() if it cannot be grown.
+  // Added capacity is not zeroed here; file/lineno/func label the error message.
+  if(bitarr->capacity_in_words >= nwords) return;
+
+  const size_t bytes_before = bitarr->capacity_in_words * sizeof(word_t);
+  bitarr->capacity_in_words = roundup2pow(nwords);
+  const size_t bytes_after = bitarr->capacity_in_words * sizeof(word_t);
+
+  bitarr->words = (word_t*)realloc(bitarr->words, bytes_after);
+  if(bitarr->words == NULL) {
+    fprintf(stderr, "[%s:%i:%s()] Ran out of memory resizing [%zu -> %zu]",
+            file, lineno, func, bytes_before, bytes_after);
+    abort();
+  }
+  DEBUG_PRINT("Ensure nwords realloc %zu -> %zu\n", bytes_before, bytes_after);
+}
+
+
+//
+// Get, set, clear, assign and toggle individual bits
+//
+
+// Get the value of a bit (returns 0 or 1); the index is checked with assert() only
+char bit_array_get_bit(const BIT_ARRAY* bitarr, bit_index_t b)
+{
+  assert(b < bitarr->num_of_bits); // no check at all when NDEBUG is defined
+  return bit_array_get(bitarr, b); // bit_array_get is not defined in this part of the file -- presumably from bit_array.h
+}
+
+// Set a bit (to 1) at position b; the index is checked with assert() only
+void bit_array_set_bit(BIT_ARRAY* bitarr, bit_index_t b)
+{
+  assert(b < bitarr->num_of_bits); // no check at all when NDEBUG is defined
+  bit_array_set(bitarr,b);
+  DEBUG_VALIDATE(bitarr); // expands to nothing unless DEBUG is defined
+}
+
+// Clear a bit (to 0) at position b; the index is checked with assert() only
+void bit_array_clear_bit(BIT_ARRAY* bitarr, bit_index_t b)
+{
+  assert(b < bitarr->num_of_bits); // no check at all when NDEBUG is defined
+  bit_array_clear(bitarr, b);
+  DEBUG_VALIDATE(bitarr); // expands to nothing unless DEBUG is defined
+}
+
+// If bit is 0 -> 1, if bit is 1 -> 0.  AKA 'flip'. The index is checked with assert() only
+void bit_array_toggle_bit(BIT_ARRAY* bitarr, bit_index_t b)
+{
+  assert(b < bitarr->num_of_bits); // no check at all when NDEBUG is defined
+  bit_array_toggle(bitarr, b);
+  DEBUG_VALIDATE(bitarr); // expands to nothing unless DEBUG is defined
+}
+
+// If char c != 0, set bit; otherwise clear bit. The index is checked with assert() only
+void bit_array_assign_bit(BIT_ARRAY* bitarr, bit_index_t b, char c)
+{
+  assert(b < bitarr->num_of_bits); // no check at all when NDEBUG is defined
+  bit_array_assign(bitarr, b, c ? 1 : 0); // any non-zero c is normalised to 1
+  DEBUG_VALIDATE(bitarr);
+}
+
+//
+// Get, set etc with resize
+//
+
+// Get the value of a bit (returns 0 or 1), first growing the array so that bit b exists
+char bit_array_rget(BIT_ARRAY* bitarr, bit_index_t b)
+{
+  bit_array_ensure_size_critical(bitarr, b+1); // grows to b+1 bits if shorter; aborts if that fails
+  return bit_array_get(bitarr, b);
+}
+
+// Set a bit (to 1) at position b, first growing the array so that bit b exists
+void bit_array_rset(BIT_ARRAY* bitarr, bit_index_t b)
+{
+  bit_array_ensure_size_critical(bitarr, b+1); // grows to b+1 bits if shorter; aborts if that fails
+  bit_array_set(bitarr,b);
+  DEBUG_VALIDATE(bitarr);
+}
+
+// Clear a bit (to 0) at position b, first growing the array so that bit b exists
+void bit_array_rclear(BIT_ARRAY* bitarr, bit_index_t b)
+{
+  bit_array_ensure_size_critical(bitarr, b+1); // grows to b+1 bits if shorter; aborts if that fails
+  bit_array_clear(bitarr, b);
+  DEBUG_VALIDATE(bitarr);
+}
+
+// If bit is 0 -> 1, if bit is 1 -> 0.  AKA 'flip'. Grows the array first so that bit b exists
+void bit_array_rtoggle(BIT_ARRAY* bitarr, bit_index_t b)
+{
+  bit_array_ensure_size_critical(bitarr, b+1); // grows to b+1 bits if shorter; aborts if that fails
+  bit_array_toggle(bitarr, b);
+  DEBUG_VALIDATE(bitarr);
+}
+
+// If char c != 0, set bit; otherwise clear bit. Grows the array first so that bit b exists
+void bit_array_rassign(BIT_ARRAY* bitarr, bit_index_t b, char c)
+{
+  bit_array_ensure_size_critical(bitarr, b+1); // grows to b+1 bits if shorter; aborts if that fails
+  bit_array_assign(bitarr, b, c ? 1 : 0); // any non-zero c is normalised to 1
+  DEBUG_VALIDATE(bitarr);
+}
+
+//
+// Set, clear and toggle several bits at once
+//
+
+// Set several bits in one call; `n` is the number of bit indices that follow.
+// e.g. set bits 1, 20 & 31: bit_array_set_bits(bitarr, 3, 1,20,31);
+void bit_array_set_bits(BIT_ARRAY* bitarr, size_t n, ...)
+{
+  va_list indices;
+  size_t remaining = n;
+
+  // Each variadic argument is read as an unsigned int bit index
+  va_start(indices, n);
+  while(remaining-- > 0)
+  {
+    bit_array_set_bit(bitarr, va_arg(indices, unsigned int));
+  }
+  va_end(indices);
+
+  DEBUG_VALIDATE(bitarr);
+}
+
+// Clear several bits in one call; `n` is the number of bit indices that follow.
+// e.g. clear bits 1, 20 & 31: bit_array_clear_bits(bitarr, 3, 1,20,31);
+void bit_array_clear_bits(BIT_ARRAY* bitarr, size_t n, ...)
+{
+  va_list indices;
+  size_t remaining = n;
+
+  // Each variadic argument is read as an unsigned int bit index
+  va_start(indices, n);
+  while(remaining-- > 0)
+  {
+    bit_array_clear_bit(bitarr, va_arg(indices, unsigned int));
+  }
+  va_end(indices);
+
+  DEBUG_VALIDATE(bitarr);
+}
+
+// Invert several bits in one call; `n` is the number of bit indices that follow.
+// e.g. toggle bits 1, 20 & 31: bit_array_toggle_bits(bitarr, 3, 1,20,31);
+void bit_array_toggle_bits(BIT_ARRAY* bitarr, size_t n, ...)
+{
+  va_list indices;
+  size_t remaining = n;
+
+  // Each variadic argument is read as an unsigned int bit index
+  va_start(indices, n);
+  while(remaining-- > 0)
+  {
+    bit_array_toggle_bit(bitarr, va_arg(indices, unsigned int));
+  }
+  va_end(indices);
+
+  DEBUG_VALIDATE(bitarr);
+}
+
+
+//
+// Set, clear and toggle all bits in a region
+//
+
+// Set all the bits in a region of `len` bits starting at `start` (range checked with assert() only)
+void bit_array_set_region(BIT_ARRAY* bitarr, bit_index_t start, bit_index_t len)
+{
+  assert(start + len <= bitarr->num_of_bits); // region must end inside the array
+  SET_REGION(bitarr, start, len); // _set_region(..., FILL_REGION)
+  DEBUG_VALIDATE(bitarr);
+}
+
+
+// Clear all the bits in a region of `len` bits starting at `start` (range checked with assert() only)
+void bit_array_clear_region(BIT_ARRAY* bitarr, bit_index_t start, bit_index_t len)
+{
+  assert(start + len <= bitarr->num_of_bits); // region must end inside the array
+  CLEAR_REGION(bitarr, start, len); // _set_region(..., ZERO_REGION)
+  DEBUG_VALIDATE(bitarr);
+}
+
+// Toggle all the bits in a region of `len` bits starting at `start` (range checked with assert() only)
+void bit_array_toggle_region(BIT_ARRAY* bitarr, bit_index_t start, bit_index_t len)
+{
+  assert(start + len <= bitarr->num_of_bits); // region must end inside the array
+  TOGGLE_REGION(bitarr, start, len); // _set_region(..., SWAP_REGION)
+  DEBUG_VALIDATE(bitarr);
+}
+
+
+//
+// Set, clear and toggle all bits at once
+//
+
+// Set every bit in the array to 1
+void bit_array_set_all(BIT_ARRAY* bitarr)
+{
+  word_addr_t w;
+  for(w = 0; w < bitarr->num_of_words; w++) bitarr->words[w] = WORD_MAX;
+  _mask_top_word(bitarr); // unused bits of the last word must stay zero
+  DEBUG_VALIDATE(bitarr);
+}
+
+// Set every bit in the array to 0
+void bit_array_clear_all(BIT_ARRAY* bitarr)
+{
+  memset(bitarr->words, 0, bitarr->num_of_words * sizeof(word_t)); // zeroes the words in use, not the spare capacity
+  DEBUG_VALIDATE(bitarr);
+}
+
+// Invert every bit of the array (1 -> 0, 0 -> 1). AKA flip
+void bit_array_toggle_all(BIT_ARRAY* bitarr)
+{
+  word_t *cur = bitarr->words;
+  word_t *const end = cur + bitarr->num_of_words;
+
+  for(; cur != end; cur++) *cur = ~*cur;
+
+  // The inversion also set the unused bits of the last word; clear them again
+  _mask_top_word(bitarr);
+  DEBUG_VALIDATE(bitarr);
+}
+
+//
+// Get a word at a time
+//
+
+uint64_t bit_array_get_word64(const BIT_ARRAY* bitarr, bit_index_t start)
+{
+  assert(start < bitarr->num_of_bits); // only the first bit is range-checked
+  return (uint64_t)_get_word(bitarr, start); // the 64 bits beginning at bit `start`
+}
+
+uint32_t bit_array_get_word32(const BIT_ARRAY* bitarr, bit_index_t start)
+{
+  assert(start < bitarr->num_of_bits); // only the first bit is range-checked
+  return (uint32_t)_get_word(bitarr, start); // the cast keeps the low 32 bits of the 64 read
+}
+
+uint16_t bit_array_get_word16(const BIT_ARRAY* bitarr, bit_index_t start)
+{
+  assert(start < bitarr->num_of_bits); // only the first bit is range-checked
+  return (uint16_t)_get_word(bitarr, start); // the cast keeps the low 16 bits of the 64 read
+}
+
+uint8_t bit_array_get_word8(const BIT_ARRAY* bitarr, bit_index_t start)
+{
+  assert(start < bitarr->num_of_bits); // only the first bit is range-checked
+  return (uint8_t)_get_word(bitarr, start); // the cast keeps the low 8 bits of the 64 read
+}
+
+uint64_t bit_array_get_wordn(const BIT_ARRAY* bitarr, bit_index_t start, int n)
+{
+  assert(start < bitarr->num_of_bits); // only the first bit is range-checked
+  assert(n <= 64); // a negative n is not rejected here
+  return (uint64_t)(_get_word(bitarr, start) & bitmask64(n)); // masked with bitmask64(n), presumably the low n bits
+}
+
+//
+// Set a word at a time
+//
+
+void bit_array_set_word64(BIT_ARRAY* bitarr, bit_index_t start, uint64_t word)
+{
+  assert(start < bitarr->num_of_bits); // only the first bit is range-checked
+  _set_word(bitarr, start, (word_t)word); // overwrites 64 bits from `start`; never grows the array
+}
+
+void bit_array_set_word32(BIT_ARRAY* bitarr, bit_index_t start, uint32_t word)
+{
+  assert(start < bitarr->num_of_bits); // only the first bit is range-checked
+  word_t w = _get_word(bitarr, start); // read-modify-write of the 64 bits at `start`
+  _set_word(bitarr, start, (w & ~(word_t)0xffffffff) | word); // replace the low 32 bits, keep the upper 32
+}
+
+void bit_array_set_word16(BIT_ARRAY* bitarr, bit_index_t start, uint16_t word)
+{
+  assert(start < bitarr->num_of_bits); // only the first bit is range-checked
+  word_t w = _get_word(bitarr, start); // read-modify-write of the 64 bits at `start`
+  _set_word(bitarr, start, (w & ~(word_t)0xffff) | word); // replace the low 16 bits, keep the upper 48
+}
+
+void bit_array_set_word8(BIT_ARRAY* bitarr, bit_index_t start, uint8_t byte)
+{
+  assert(start < bitarr->num_of_bits); // only the first bit is range-checked
+  _set_byte(bitarr, start, byte); // replaces the 8 bits at `start` via _get_word/_set_word
+}
+
+void bit_array_set_wordn(BIT_ARRAY* bitarr, bit_index_t start, uint64_t word, int n)
+{
+  assert(start < bitarr->num_of_bits); // only the first bit is range-checked
+  assert(n <= 64); // a negative n is not rejected here
+  word_t w = _get_word(bitarr, start), m = bitmask64(n); // current bits, and the mask selecting what to replace
+  _set_word(bitarr, start, bitmask_merge(word,w,m)); // presumably takes `word` where m is set and `w` elsewhere -- confirm in bit_macros.h
+}
+
+//
+// Number of bits set
+//
+
+// Count the bits that are set to 1 (the hamming weight of the array)
+bit_index_t bit_array_num_bits_set(const BIT_ARRAY* bitarr)
+{
+  const word_t *cur = bitarr->words;
+  const word_t *const end = cur + bitarr->num_of_words;
+  bit_index_t total = 0;
+
+  while(cur != end)
+  {
+    const word_t w = *cur++;
+    // Skip the popcount for all-zero words
+    if(w == 0) continue;
+    total += POPCOUNT(w);
+  }
+
+  return total;
+}
+
+// Get the number of bits not set (array length in bits - hamming weight)
+bit_index_t bit_array_num_bits_cleared(const BIT_ARRAY* bitarr)
+{
+  return bitarr->num_of_bits - bit_array_num_bits_set(bitarr);
+}
+
+
+// Count the bit positions at which the two arrays differ. For equal lengths this
+// is the hamming weight of the XOR; surplus words of the longer array count as-is.
+// e.g. 10101 vs 00111 => hamming distance 2 (XOR is 10010)
+bit_index_t bit_array_hamming_distance(const BIT_ARRAY* arr1,
+                                       const BIT_ARRAY* arr2)
+{
+  // Sort the two operands by word count; on a tie either order works because
+  // the second loop then has nothing to do
+  const BIT_ARRAY* longer = (arr1->num_of_words > arr2->num_of_words) ? arr1 : arr2;
+  const BIT_ARRAY* shorter = (longer == arr1) ? arr2 : arr1;
+
+  bit_index_t distance = 0;
+  word_addr_t w = 0;
+
+  // Words present in both arrays: count the differing bits
+  while(w < shorter->num_of_words)
+  {
+    distance += POPCOUNT(longer->words[w] ^ shorter->words[w]);
+    w++;
+  }
+
+  // Words only the longer array has: every set bit is a difference
+  while(w < longer->num_of_words)
+  {
+    distance += POPCOUNT(longer->words[w]);
+    w++;
+  }
+
+  return distance;
+}
+
+// Parity of the whole array: returns 1 if an odd number of bits are set, 0 if even
+char bit_array_parity(const BIT_ARRAY* bitarr)
+{
+  word_t folded = 0;
+  word_addr_t idx;
+  // XOR-ing the words together preserves the parity of the combined bit count
+  for(idx = 0; idx < bitarr->num_of_words; idx++)
+  {
+    folded ^= bitarr->words[idx];
+  }
+
+  return (char)PARITY(folded);
+}
+
+//
+// Find indices of set/clear bits
+//
+
+// Generates FUNC(bitarr, offset, result): find the first bit at or after `offset`
+// that is 1 once GET has been applied to each word (GET_WORD: set, NEG_WORD: clear).
+// Returns 1 and stores the index in *result if found; else 0, *result unchanged.
+// NB: with assertions enabled, offset >= num_of_bits (incl. an empty array) aborts.
+#define _next_bit_func_def(FUNC,GET) \
+char FUNC(const BIT_ARRAY* bitarr, bit_index_t offset, bit_index_t* result) \
+{ \
+  assert(offset < bitarr->num_of_bits); \
+  if(bitarr->num_of_bits == 0 || offset >= bitarr->num_of_bits) { return 0; } \
+ \
+  /* Find first word that is greater than zero */ \
+  word_addr_t i = bitset64_wrd(offset); \
+  word_t w = GET(bitarr->words[i]) & ~bitmask64(bitset64_idx(offset)); \
+ \
+  while(1) { \
+    if(w > 0) { \
+      bit_index_t pos = i * WORD_SIZE + trailing_zeros(w); \
+      if(pos < bitarr->num_of_bits) { *result = pos; return 1; } \
+      else { return 0; } \
+    } \
+    i++; \
+    if(i >= bitarr->num_of_words) break; \
+    w = GET(bitarr->words[i]); \
+  } \
+ \
+  return 0; \
+}
+
+// Generates FUNC(bitarr, offset, result): find the last bit strictly before `offset`
+// that is 1 once GET has been applied to each word (GET_WORD: set, NEG_WORD: clear).
+// Returns 1 and stores the index in *result if found; else 0, *result unchanged.
+// offset may equal num_of_bits (search the whole array); offset == 0 returns 0.
+#define _prev_bit_func_def(FUNC,GET) \
+char FUNC(const BIT_ARRAY* bitarr, bit_index_t offset, bit_index_t* result) \
+{ \
+  assert(offset <= bitarr->num_of_bits); \
+  if(bitarr->num_of_bits == 0 || offset == 0) { return 0; } \
+ \
+  /* Find prev word that is greater than zero */ \
+  word_addr_t i = bitset64_wrd(offset-1); \
+  word_t w = GET(bitarr->words[i]) & bitmask64(bitset64_idx(offset-1)+1); \
+ \
+  if(w > 0) { *result = (i+1) * WORD_SIZE - leading_zeros(w) - 1; return 1; } \
+ \
+  /* i is unsigned: the loop stops once decrementing past 0 yields BIT_INDEX_MAX */ \
+  for(--i; i != BIT_INDEX_MAX; i--) { \
+    w = GET(bitarr->words[i]); \
+    if(w > 0) { \
+      *result = (i+1) * WORD_SIZE - leading_zeros(w) - 1; \
+      return 1; \
+    } \
+  } \
+ \
+  return 0; \
+}
+
+// Word filters for the search templates: GET_WORD examines words as stored
+// (finds set bits), NEG_WORD examines the bitwise complement (finds clear bits)
+#define GET_WORD(x) (x)
+#define NEG_WORD(x) (~(x))
+// Generate the four public search functions from the two templates
+_next_bit_func_def(bit_array_find_next_set_bit,  GET_WORD);
+_next_bit_func_def(bit_array_find_next_clear_bit,NEG_WORD);
+_prev_bit_func_def(bit_array_find_prev_set_bit,  GET_WORD);
+_prev_bit_func_def(bit_array_find_prev_clear_bit,NEG_WORD);
+
+// Find the index of the first bit that is set.
+// Returns 1 if a bit is set, otherwise 0
+// Index of first set bit is stored in the integer pointed to by result
+// If no bits are set, value at `result` is not changed
+// Implemented as a forward search for a set bit starting at index 0.
+char bit_array_find_first_set_bit(const BIT_ARRAY* bitarr, bit_index_t* result)
+{
+  return bit_array_find_next_set_bit(bitarr, 0, result);
+}
+
+// Find the index of the first bit that is clear.
+// Returns 1 if a clear bit is found, otherwise 0
+// Index of first clear bit is stored in the integer pointed to by `result`
+// If no bits are clear, value at `result` is not changed
+// Implemented as a forward search for a clear bit starting at index 0.
+char bit_array_find_first_clear_bit(const BIT_ARRAY* bitarr, bit_index_t* result)
+{
+  return bit_array_find_next_clear_bit(bitarr, 0, result);
+}
+
+// Find the index of the last bit that is set.
+// Returns 1 if a bit is set, otherwise 0
+// Index of last set bit is stored in the integer pointed to by `result`
+// If no bits are set, value at `result` is not changed
+// Implemented as a backward search for a set bit starting from the array
+// length, i.e. every bit of the array is considered.
+char bit_array_find_last_set_bit(const BIT_ARRAY* bitarr, bit_index_t* result)
+{
+  return bit_array_find_prev_set_bit(bitarr, bitarr->num_of_bits, result);
+}
+
+// Find the index of the last bit that is clear.
+// Returns 1 if a clear bit is found, otherwise 0
+// Index of last clear bit is stored in the integer pointed to by `result`
+// If no bits are clear, value at `result` is not changed
+// Implemented as a backward search for a clear bit starting from the array
+// length, i.e. every bit of the array is considered.
+char bit_array_find_last_clear_bit(const BIT_ARRAY* bitarr, bit_index_t* result)
+{
+  return bit_array_find_prev_clear_bit(bitarr, bitarr->num_of_bits, result);
+}
+
+//
+// "Sorting" bits
+//
+
+// Put all the 0s before all the 1s
+// Counts the clear bits first, then fills the array with 1s and clears a
+// region of that many bits starting at index 0.
+void bit_array_sort_bits(BIT_ARRAY* bitarr)
+{
+  const bit_index_t ones = bit_array_num_bits_set(bitarr);
+  const bit_index_t zeros = bitarr->num_of_bits - ones;
+
+  bit_array_set_all(bitarr);
+  CLEAR_REGION(bitarr, 0, zeros);
+  DEBUG_VALIDATE(bitarr);
+}
+
+// Put all the 1s before all the 0s
+// Counts the set bits first, then empties the array and sets a region of
+// that many bits starting at index 0.
+void bit_array_sort_bits_rev(BIT_ARRAY* bitarr)
+{
+  const bit_index_t ones = bit_array_num_bits_set(bitarr);
+
+  bit_array_clear_all(bitarr);
+  SET_REGION(bitarr, 0, ones);
+  DEBUG_VALIDATE(bitarr);
+}
+
+
+//
+// Strings and printing
+//
+
+// Construct a BIT_ARRAY from a substring with given on and off characters.
+// Reads `len` characters of `str` into the bits starting at `offset`, growing
+// the array if needed.  A character found in `on` sets its bit; any other
+// character leaves the bit clear and is asserted to be one of `off`.
+// With `left_to_right` non-zero str[0] maps to bit `offset`, otherwise the
+// string is read in reverse so that str[len-1] maps to bit `offset`.
+void bit_array_from_substr(BIT_ARRAY* bitarr, bit_index_t offset,
+                           const char *str, size_t len,
+                           const char *on, const char *off,
+                           char left_to_right)
+{
+  size_t pos;
+  bit_index_t target;
+
+  // Make room, then start from an all-zero region so only 1s need writing
+  bit_array_ensure_size(bitarr, offset + len);
+  bit_array_clear_region(bitarr, offset, len);
+
+  for(pos = 0; pos < len; pos++)
+  {
+    if(strchr(on, str[pos]) == NULL)
+    {
+      // Not an "on" character: it must be a recognised "off" character
+      assert(strchr(off, str[pos]) != NULL);
+      continue;
+    }
+
+    target = offset + (left_to_right ? pos : len - pos - 1);
+    bit_array_set(bitarr, target);
+  }
+
+  DEBUG_VALIDATE(bitarr);
+}
+
+// Load the array from a NUL-terminated string of '1' and '0' characters.
+// The whole string is read left to right starting at bit 0, i.e. str[0]
+// becomes bit 0.
+void bit_array_from_str(BIT_ARRAY* bitarr, const char* str)
+{
+  bit_array_from_substr(bitarr, 0, str, strlen(str), "1", "0", 1);
+}
+
+// Write the array as '0'/'1' characters, bit 0 first.
+// `str` must have room for bitarr->num_of_bits+1 characters.
+// The string is terminated with '\0'.  Returns `str`.
+char* bit_array_to_str(const BIT_ARRAY* bitarr, char* str)
+{
+  const bit_index_t nbits = bitarr->num_of_bits;
+  char* out = str;
+  bit_index_t pos;
+
+  for(pos = 0; pos < nbits; pos++)
+  {
+    *out++ = bit_array_get(bitarr, pos) ? '1' : '0';
+  }
+
+  *out = '\0';
+
+  return str;
+}
+
+// Write the array as '0'/'1' characters, highest bit index first.
+// `str` must have room for bitarr->num_of_bits+1 characters.
+// The string is terminated with '\0'.  Returns `str`.
+char* bit_array_to_str_rev(const BIT_ARRAY* bitarr, char* str)
+{
+  bit_index_t remaining = bitarr->num_of_bits;
+  char* out = str;
+
+  // Walk the bit index down from the top while the output moves forward
+  while(remaining > 0)
+  {
+    remaining--;
+    *out++ = bit_array_get(bitarr, remaining) ? '1' : '0';
+  }
+
+  *out = '\0';
+
+  return str;
+}
+
+
+// Get a string representation for a given region, using given on/off
+// characters.  Writes exactly `length` characters to `str`.
+// With `left_to_right` non-zero str[0] is bit `start`, otherwise str[0] is
+// bit start+length-1.
+// Note: does not null-terminate
+void bit_array_to_substr(const BIT_ARRAY* bitarr,
+                         bit_index_t start, bit_index_t length,
+                         char* str, char on, char off,
+                         char left_to_right)
+{
+  assert(start + length <= bitarr->num_of_bits);
+
+  const bit_index_t last = start + length - 1;
+  bit_index_t n, bit;
+
+  for(n = 0; n < length; n++)
+  {
+    if(left_to_right)
+    {
+      bit = start + n;
+    }
+    else
+    {
+      bit = last - n;
+    }
+
+    str[n] = bit_array_get(bitarr, bit) ? on : off;
+  }
+}
+
+// Print this array to a file stream, bit 0 first, as '0' and '1' characters.
+// Doesn't print newline.
+void bit_array_print(const BIT_ARRAY* bitarr, FILE* fout)
+{
+  const bit_index_t nbits = bitarr->num_of_bits;
+  bit_index_t pos = 0;
+
+  while(pos < nbits)
+  {
+    const char ch = bit_array_get(bitarr, pos) ? '1' : '0';
+    fprintf(fout, "%c", ch);
+    pos++;
+  }
+}
+
+// Print a string representation for a given region, using given on/off
+// characters.  Writes exactly `length` characters to `fout`.
+// With `left_to_right` non-zero the first character printed is bit `start`,
+// otherwise it is bit start+length-1.
+void bit_array_print_substr(const BIT_ARRAY* bitarr,
+                            bit_index_t start, bit_index_t length,
+                            FILE* fout, char on, char off,
+                            char left_to_right)
+{
+  assert(start + length <= bitarr->num_of_bits);
+
+  const bit_index_t last = start + length - 1;
+  bit_index_t n, bit;
+
+  for(n = 0; n < length; n++)
+  {
+    if(left_to_right)
+    {
+      bit = start + n;
+    }
+    else
+    {
+      bit = last - n;
+    }
+
+    fprintf(fout, "%c", bit_array_get(bitarr, bit) ? on : off);
+  }
+}
+
+//
+// Decimal
+//
+
+// Get bit array as decimal str (e.g. 0b1101 -> "13")
+// `len` is the size of the `str` buffer -- at most len-1 digits are written,
+// followed by '\0' (a zero value is only written when len >= 2).
+// With len == 0 nothing is written and `str` is never dereferenced.
+// If the buffer is too small for every digit, only the least significant
+// len-1 digits are stored.
+// Returns the number of digits needed to print the whole value (the same as
+// strlen(str) when the buffer was large enough), or 0 if the temporary
+// working copy of the array could not be allocated.
+size_t bit_array_to_decimal(const BIT_ARRAY *bitarr, char *str, size_t len)
+{
+  size_t i = 0;
+
+  // Digits that fit in front of the terminator; 0 when there is no buffer.
+  // Computed once so that len == 0 cannot wrap around in `len-1` below.
+  const size_t max_digits = (len > 0 ? len - 1 : 0);
+
+  if(bit_array_cmp_uint64(bitarr, 0) == 0)
+  {
+    if(len >= 2)
+    {
+      *str = '0';
+      *(str+1) = '\0';
+    }
+
+    return 1;
+  }
+
+  // The digits are extracted by dividing repeatedly, which is done on a
+  // scratch copy because the caller's array is const
+  BIT_ARRAY *tmp = bit_array_clone(bitarr);
+
+  if(tmp == NULL)
+  {
+    return 0;
+  }
+
+  uint64_t rem;
+
+  if(len > 0)
+  {
+    str[len-1] = '\0';
+  }
+
+  while(bit_array_cmp_uint64(tmp, 0) != 0)
+  {
+    bit_array_div_uint64(tmp, 10, &rem);
+
+    // Digits come out least significant first: fill the buffer from the back
+    // (max_digits >= 1 here implies len >= 2, so len-2-i cannot wrap)
+    if(i < max_digits)
+    {
+      str[len-2-i] = (char)('0' + rem);
+    }
+
+    i++;
+  }
+
+  if(i < max_digits)
+  {
+    // Number is shorter than the buffer: slide it to the front.
+    // Moves null-terminator as well
+    memmove(str, str+len-i-1, i+1);
+  }
+
+  bit_array_free(tmp);
+
+  return i;
+}
+
+// Get bit array from decimal str (e.g. "13" -> 0b1101)
+// The array is cleared first, then the leading run of '0'-'9' characters is
+// accumulated digit by digit (multiply by 10, add the digit).
+// Returns number of characters used (0 if `decimal` does not start with a
+// digit)
+size_t bit_array_from_decimal(BIT_ARRAY *bitarr, const char* decimal)
+{
+  size_t used = 0;
+
+  bit_array_clear_all(bitarr);
+
+  // The terminating '\0' is not a digit, so it ends the loop as well
+  while(decimal[used] >= '0' && decimal[used] <= '9')
+  {
+    if(used > 0)
+    {
+      // Shift the digits read so far up by one decimal place
+      bit_array_mul_uint64(bitarr, 10);
+    }
+
+    bit_array_add_uint64(bitarr, decimal[used] - '0');
+    used++;
+  }
+
+  return used;
+}
+
+//
+// Hexadecimal
+//
+
+// Convert one hexadecimal digit character to its 4-bit value.
+// Accepts '0'-'9', 'a'-'f' and 'A'-'F'.
+// Returns 1 and stores the value (0..15) in `*b` on success.
+// Returns 0 and leaves `*b` untouched if `c` is not a hex digit.
+// The ranges are compared directly rather than through tolower(): passing a
+// negative `char` to the <ctype.h> functions is undefined behaviour, and the
+// result would depend on the current locale.
+char bit_array_hex_to_nibble(char c, uint8_t *b)
+{
+  if(c >= '0' && c <= '9')
+  {
+    *b = (uint8_t)(c - '0');
+    return 1;
+  }
+  else if(c >= 'a' && c <= 'f')
+  {
+    *b = (uint8_t)(0xa + (c - 'a'));
+    return 1;
+  }
+  else if(c >= 'A' && c <= 'F')
+  {
+    *b = (uint8_t)(0xa + (c - 'A'));
+    return 1;
+  }
+  else
+  {
+    return 0;
+  }
+}
+
+// Convert a 4-bit value to its hexadecimal digit character.
+// Values 0..9 give '0'..'9'; larger values count up from 'A' when
+// `uppercase` is non-zero and from 'a' otherwise.
+char bit_array_nibble_to_hex(uint8_t b, char uppercase)
+{
+  if(b > 9)
+  {
+    const char first_letter = uppercase ? 'A' : 'a';
+    return first_letter + (b - 0xa);
+  }
+
+  return '0' + b;
+}
+
+// Loads array from hex string
+// Reads up to `len` characters of `str`, storing one nibble (4 bits) per
+// character at increasing bit positions starting from `offset`; the array is
+// grown as needed.  An optional leading "0x" or "0X" is skipped.  Reading
+// stops at the first character that is not a hex digit.
+// Returns the number of bits loaded (4 per hex digit consumed)
+// (0 on failure)
+bit_index_t bit_array_from_hex(BIT_ARRAY* bitarr, bit_index_t offset,
+                               const char* str, size_t len)
+{
+  // Only treat "0x" as a prefix when both characters lie inside the `len`
+  // characters supplied: otherwise str[1] would be read out of range and
+  // `len -= 2` would wrap around to a huge value
+  if(len >= 2 && str[0] == '0' && (str[1] == 'x' || str[1] == 'X'))
+  {
+    str += 2;
+    len -= 2;
+  }
+
+  size_t i;
+  for(i = 0; i < len; i++, offset += 4)
+  {
+    uint8_t b;
+    if(bit_array_hex_to_nibble(str[i], &b))
+    {
+      bit_array_ensure_size(bitarr, offset + 4);
+      _set_nibble(bitarr, offset, b);
+    }
+    else
+    {
+      break;
+    }
+  }
+
+  return 4 * i;
+}
+
+// Write the region [start, start+length) as hex digits, least significant
+// nibble first, followed by '\0'.  A final group of fewer than 4 bits is
+// written as one digit.  `str` must be large enough for all the digits plus
+// the terminator.
+// Returns number of characters written (not counting the terminator)
+size_t bit_array_to_hex(const BIT_ARRAY* bitarr,
+                        bit_index_t start, bit_index_t length,
+                        char* str, char uppercase)
+{
+  assert(start + length <= bitarr->num_of_bits);
+
+  const bit_index_t stop = start + length;
+  bit_index_t pos = start;
+  char* out = str;
+
+  // Whole 64-bit words: sixteen nibbles each
+  while(pos + WORD_SIZE <= stop)
+  {
+    word_t chunk = _get_word(bitarr, pos);
+    int nibble;
+
+    for(nibble = 0; nibble < 16; nibble++)
+    {
+      *out++ = bit_array_nibble_to_hex(chunk & 0xf, uppercase);
+      chunk >>= 4;
+    }
+
+    pos += WORD_SIZE;
+  }
+
+  if(pos < stop)
+  {
+    // Remaining full nibbles (4 bits)
+    word_t chunk = _get_word(bitarr, pos);
+
+    while(pos + 4 <= stop)
+    {
+      *out++ = bit_array_nibble_to_hex(chunk & 0xf, uppercase);
+      chunk >>= 4;
+      pos += 4;
+    }
+
+    if(pos < stop)
+    {
+      // Remaining bits (fewer than 4)
+      *out++ = bit_array_nibble_to_hex(chunk & bitmask64(stop - pos), uppercase);
+    }
+  }
+
+  *out = '\0';
+
+  // Return number of characters written
+  return (size_t)(out - str);
+}
+
+// Print bit array as hex
+// Prints the region [start, start+length) to `fout`, least significant
+// nibble first.  A final group of fewer than 4 bits is printed as one digit.
+// Returns the number of characters printed.
+size_t bit_array_print_hex(const BIT_ARRAY* bitarr,
+                           bit_index_t start, bit_index_t length,
+                           FILE* fout, char uppercase)
+{
+  assert(start + length <= bitarr->num_of_bits);
+
+  const bit_index_t stop = start + length;
+  bit_index_t pos = start;
+  size_t printed = 0;
+
+  // Whole 64-bit words: sixteen nibbles each
+  while(pos + WORD_SIZE <= stop)
+  {
+    word_t chunk = _get_word(bitarr, pos);
+    int nibble;
+
+    for(nibble = 0; nibble < 16; nibble++)
+    {
+      fprintf(fout, "%c", bit_array_nibble_to_hex(chunk & 0xf, uppercase));
+      chunk >>= 4;
+      printed++;
+    }
+
+    pos += WORD_SIZE;
+  }
+
+  if(pos < stop)
+  {
+    // Remaining full nibbles (4 bits)
+    word_t chunk = _get_word(bitarr, pos);
+
+    while(pos + 4 <= stop)
+    {
+      fprintf(fout, "%c", bit_array_nibble_to_hex(chunk & 0xf, uppercase));
+      chunk >>= 4;
+      printed++;
+      pos += 4;
+    }
+
+    if(pos < stop)
+    {
+      // Remaining bits (fewer than 4)
+      char hex = bit_array_nibble_to_hex(chunk & bitmask64(stop - pos), uppercase);
+      fprintf(fout, "%c", hex);
+      printed++;
+    }
+  }
+
+  return printed;
+}
+
+//
+// Clone and copy
+//
+
+// Create a new array of the same length as `bitarr` holding the same words.
+// Returns NULL if cannot malloc
+BIT_ARRAY* bit_array_clone(const BIT_ARRAY* bitarr)
+{
+  BIT_ARRAY* copy = bit_array_create(bitarr->num_of_bits);
+
+  if(copy != NULL)
+  {
+    // Copy across bits
+    const size_t nbytes = bitarr->num_of_words * sizeof(word_t);
+    memcpy(copy->words, bitarr->words, nbytes);
+
+    DEBUG_VALIDATE(copy);
+  }
+
+  return copy;
+}
+
+// Copy `length` bits from `src` starting at `srcindx` into `dst` starting at
+// `dstindx`, a word at a time.
+// destination and source may be the same bit_array
+// and src/dst regions may overlap
+// No bounds checking and no resizing is done here.
+static void _array_copy(BIT_ARRAY* dst, bit_index_t dstindx,
+                        const BIT_ARRAY* src, bit_index_t srcindx,
+                        bit_index_t length)
+{
+  DEBUG_PRINT("bit_array_copy(dst: %zu, src: %zu, length: %zu)\n",
+              (size_t)dstindx, (size_t)srcindx, (size_t)length);
+
+  // Num of full words to copy
+  word_addr_t num_of_full_words = length / WORD_SIZE;
+  word_addr_t i;
+
+  // Bit count used for the final masked merge, from bits_in_top_word(length)
+  word_offset_t bits_in_last_word = bits_in_top_word(length);
+
+  // The copy direction is chosen so that, when copying within one array,
+  // source bits are read before the copy overwrites them
+  if(dst == src && srcindx > dstindx)
+  {
+    // Work left to right
+    DEBUG_PRINT("work left to right\n");
+
+    for(i = 0; i < num_of_full_words; i++)
+    {
+      word_t word = _get_word(src, srcindx+i*WORD_SIZE);
+      _set_word(dst, dstindx+i*WORD_SIZE, word);
+    }
+
+    if(bits_in_last_word > 0)
+    {
+      // i == num_of_full_words here: the merge happens just after the full
+      // words, at the high end of the region
+      word_t src_word = _get_word(src, srcindx+i*WORD_SIZE);
+      word_t dst_word = _get_word(dst, dstindx+i*WORD_SIZE);
+
+      // Merge so that destination bits outside the mask are preserved
+      word_t mask = bitmask64(bits_in_last_word);
+      word_t word = bitmask_merge(src_word, dst_word, mask);
+
+      _set_word(dst, dstindx+num_of_full_words*WORD_SIZE, word);
+    }
+  }
+  else
+  {
+    // Work right to left
+    DEBUG_PRINT("work right to left\n");
+
+    // Full words are taken from the high end of the region downwards
+    for(i = 0; i < num_of_full_words; i++)
+    {
+      word_t word = _get_word(src, srcindx+length-(i+1)*WORD_SIZE);
+      _set_word(dst, dstindx+length-(i+1)*WORD_SIZE, word);
+    }
+
+    DEBUG_PRINT("Copy %i,%i to %i\n", (int)srcindx, (int)bits_in_last_word,
+                                      (int)dstindx);
+
+    if(bits_in_last_word > 0)
+    {
+      // In this direction the merge happens at the low end of the region
+      word_t src_word = _get_word(src, srcindx);
+      word_t dst_word = _get_word(dst, dstindx);
+
+      // Merge so that destination bits outside the mask are preserved
+      word_t mask = bitmask64(bits_in_last_word);
+      word_t word = bitmask_merge(src_word, dst_word, mask);
+      _set_word(dst, dstindx, word);
+    }
+  }
+
+  // NOTE(review): presumably clears any bits beyond num_of_bits in dst's top
+  // word -- confirm against _mask_top_word
+  _mask_top_word(dst);
+}
+
+// Copy `length` bits from `src` (starting at `srcindx`) into `dst` (starting
+// at `dstindx`).
+// destination and source may be the same bit_array
+// and src/dst regions may overlap
+// Asserts that the source region lies inside `src` and that `dstindx` is not
+// beyond the end of `dst`.
+// NOTE(review): nothing here checks or ensures dstindx + length <=
+// dst->num_of_bits -- confirm whether callers must size `dst` beforehand.
+void bit_array_copy(BIT_ARRAY* dst, bit_index_t dstindx,
+                    const BIT_ARRAY* src, bit_index_t srcindx,
+                    bit_index_t length)
+{
+  assert(srcindx + length <= src->num_of_bits);
+  assert(dstindx <= dst->num_of_bits);
+  _array_copy(dst, dstindx, src, srcindx, length);
+  DEBUG_VALIDATE(dst);
+}
+
+// Clone `src` into `dst`. Resizes `dst` to the length of `src`, then copies
+// every word of `src` across.
+void bit_array_copy_all(BIT_ARRAY* dst, const BIT_ARRAY* src)
+{
+  bit_array_resize_critical(dst, src->num_of_bits);
+  // memmove rather than memcpy, so dst == src is harmless
+  memmove(dst->words, src->words, src->num_of_words * sizeof(word_t));
+  DEBUG_VALIDATE(dst);
+}
+
+
+//
+// Logic operators
+//
+
+// dst = src1 AND src2
+// `dst` is grown, if needed, to the length of the longer source.  Words
+// present in both sources are ANDed; every other word of `dst` is zeroed.
+// Destination can be the same as one or both of the sources
+void bit_array_and(BIT_ARRAY* dst, const BIT_ARRAY* src1, const BIT_ARRAY* src2)
+{
+  // Ensure dst array is big enough
+  word_addr_t needed_bits = MAX(src1->num_of_bits, src2->num_of_bits);
+  bit_array_ensure_size_critical(dst, needed_bits);
+
+  // Measured after the resize, since dst may be one of the sources
+  const word_addr_t shared_words = MIN(src1->num_of_words, src2->num_of_words);
+  word_addr_t w = 0;
+
+  while(w < shared_words)
+  {
+    dst->words[w] = src1->words[w] & src2->words[w];
+    w++;
+  }
+
+  // Set remaining bits to zero
+  while(w < dst->num_of_words)
+  {
+    dst->words[w] = (word_t)0;
+    w++;
+  }
+
+  DEBUG_VALIDATE(dst);
+}
+
+// Shared implementation of OR and XOR: dst = src1 (OR|XOR) src2, with XOR
+// selected by a non-zero `use_xor`.
+// `dst` is grown, if needed, to the length of the longer source.  Words
+// present in both sources are combined, the extra words of the longer source
+// are copied unchanged, and any further words of `dst` are zeroed.
+// Destination can be the same as one or both of the sources
+static void _logical_or_xor(BIT_ARRAY* dst,
+                            const BIT_ARRAY* src1,
+                            const BIT_ARRAY* src2,
+                            char use_xor)
+{
+  // Ensure dst array is big enough
+  bit_array_ensure_size_critical(dst, MAX(src1->num_of_bits, src2->num_of_bits));
+
+  // Measured after the resize, since dst may be one of the sources
+  const word_addr_t nwords1 = src1->num_of_words;
+  const word_addr_t nwords2 = src2->num_of_words;
+  const BIT_ARRAY* longer = (nwords1 > nwords2 ? src1 : src2);
+  const word_addr_t shared_words = MIN(nwords1, nwords2);
+  const word_addr_t total_words = longer->num_of_words;
+
+  word_addr_t w;
+
+  for(w = 0; w < shared_words; w++)
+  {
+    const word_t a = src1->words[w];
+    const word_t b = src2->words[w];
+    dst->words[w] = use_xor ? (a ^ b) : (a | b);
+  }
+
+  // Copy remaining bits from longer src array
+  for(; w < total_words; w++)
+  {
+    dst->words[w] = longer->words[w];
+  }
+
+  // Set remaining bits to zero
+  memset(dst->words + total_words, 0,
+         (dst->num_of_words - total_words) * sizeof(word_t));
+
+  DEBUG_VALIDATE(dst);
+}
+
+// dst = src1 OR src2, via the shared OR/XOR helper with XOR switched off
+void bit_array_or(BIT_ARRAY* dst, const BIT_ARRAY* src1, const BIT_ARRAY* src2)
+{
+  _logical_or_xor(dst, src1, src2, 0);
+}
+
+// dst = src1 XOR src2, via the shared OR/XOR helper with XOR switched on
+// Destination can be the same as one or both of the sources
+void bit_array_xor(BIT_ARRAY* dst, const BIT_ARRAY* src1, const BIT_ARRAY* src2)
+{
+  _logical_or_xor(dst, src1, src2, 1);
+}
+
+// dst = NOT src
+// `dst` is grown, if needed, to the length of `src`.
+// If dst is longer than src, top bits are set to 1
+void bit_array_not(BIT_ARRAY* dst, const BIT_ARRAY* src)
+{
+  bit_array_ensure_size_critical(dst, src->num_of_bits);
+
+  const word_addr_t src_words = src->num_of_words;
+  word_addr_t w = 0;
+
+  // Invert every word that src has
+  while(w < src_words)
+  {
+    dst->words[w] = ~(src->words[w]);
+    w++;
+  }
+
+  // Set remaining words to 1s
+  while(w < dst->num_of_words)
+  {
+    dst->words[w] = WORD_MAX;
+    w++;
+  }
+
+  _mask_top_word(dst);
+
+  DEBUG_VALIDATE(dst);
+}
+
+//
+// Comparisons
+//
+
+// Compare two bit arrays by value stored, with index 0 being the Least
+// Significant Bit (LSB). Arrays do not have to be the same length.
+// Example: ..0101 (5) > ...0011 (3) [index 0 is LSB at right hand side]
+// If the values are equal the shorter array sorts first: (0,0) < (0,0,0),
+// and an empty array sorts before any non-empty array holding only zeros.
+// returns:
+//  >0 iff bitarr1 > bitarr2
+//   0 iff bitarr1 == bitarr2
+//  <0 iff bitarr1 < bitarr2
+int bit_array_cmp(const BIT_ARRAY* bitarr1, const BIT_ARRAY* bitarr2)
+{
+  const word_addr_t nwords1 = bitarr1->num_of_words;
+  const word_addr_t nwords2 = bitarr2->num_of_words;
+  const word_addr_t min_words = (nwords1 < nwords2 ? nwords1 : nwords2);
+  word_addr_t i;
+  word_t word1, word2;
+
+  // The loops count down with `i` one past the word being examined, so the
+  // unsigned index never has to go below zero.
+
+  // Words only bitarr1 has: any set bit makes bitarr1 the larger value
+  for(i = nwords1; i > min_words; i--)
+  {
+    if(bitarr1->words[i-1]) return 1;
+  }
+
+  // Words only bitarr2 has: any set bit makes bitarr2 the larger value
+  for(i = nwords2; i > min_words; i--)
+  {
+    if(bitarr2->words[i-1]) return -1;
+  }
+
+  // Words both arrays have, most significant first
+  for(i = min_words; i > 0; i--)
+  {
+    word1 = bitarr1->words[i-1];
+    word2 = bitarr2->words[i-1];
+    if(word1 != word2) return (word1 > word2 ? 1 : -1);
+  }
+
+  // Same value: order by length
+  if(bitarr1->num_of_bits == bitarr2->num_of_bits) return 0;
+  return bitarr1->num_of_bits > bitarr2->num_of_bits ? 1 : -1;
+}
+
+// Compare two bit arrays by value stored, with index 0 being the Most
+// Significant Bit (MSB). Arrays do not have to be the same length.
+// Example: 10.. > 01.. [index 0 is MSB at left hand side]
+// Sorts on length if all zeros: (0,0) < (0,0,0)
+// returns:
+//  >0 iff bitarr1 > bitarr2
+//   0 iff bitarr1 == bitarr2
+//  <0 iff bitarr1 < bitarr2
+int bit_array_cmp_big_endian(const BIT_ARRAY* bitarr1, const BIT_ARRAY* bitarr2)
+{
+  // Only the words both arrays have can be compared pairwise; going up to
+  // the larger word count would read past the end of the shorter array
+  word_addr_t min_words = MIN(bitarr1->num_of_words, bitarr2->num_of_words);
+
+  word_addr_t i;
+  word_t word1, word2;
+
+  // Index 0 is the most significant bit, so each word is bit-reversed before
+  // being compared as a number
+  for(i = 0; i < min_words; i++) {
+    word1 = _reverse_word(bitarr1->words[i]);
+    word2 = _reverse_word(bitarr2->words[i]);
+    if(word1 != word2) return (word1 > word2 ? 1 : -1);
+  }
+
+  // Check remaining words. Only one of these loops will execute
+  for(; i < bitarr1->num_of_words; i++)
+    if(bitarr1->words[i]) return 1;
+  for(; i < bitarr2->num_of_words; i++)
+    if(bitarr2->words[i]) return -1;
+
+  // Same value: order by length
+  if(bitarr1->num_of_bits == bitarr2->num_of_bits) return 0;
+  return bitarr1->num_of_bits > bitarr2->num_of_bits ? 1 : -1;
+}
+
+// Compare arr1 with (arr2 << pos), i.e. arr2 shifted up by `pos` bits
+// Equivalent in intent to bit_array_cmp(arr1, arr2<<pos), but without
+// building the shifted array.  Unlike bit_array_cmp, array lengths are not
+// used as a tie-break: only the positions of the set bits matter.
+// returns:
+//  >0 iff arr1 > (arr2 << pos)
+//   0 iff arr1 == (arr2 << pos)
+//  <0 iff arr1 < (arr2 << pos)
+int bit_array_cmp_words(const BIT_ARRAY *arr1,
+                        bit_index_t pos, const BIT_ARRAY *arr2)
+{
+  if(arr1->num_of_bits == 0 && arr2->num_of_bits == 0)
+  {
+    return 0;
+  }
+
+  bit_index_t top_bit1 = 0, top_bit2 = 0;
+
+  // An array with no set bit has the value zero
+  char arr1_zero = !bit_array_find_last_set_bit(arr1, &top_bit1);
+  char arr2_zero = !bit_array_find_last_set_bit(arr2, &top_bit2);
+
+  if(arr1_zero && arr2_zero) return 0;
+  if(arr1_zero) return -1;
+  if(arr2_zero) return 1;
+
+  // Position of arr2's highest set bit after the shift
+  bit_index_t top_bit2_offset = top_bit2 + pos;
+
+  // Whichever value has the higher top set bit is larger
+  if(top_bit1 != top_bit2_offset) {
+    return top_bit1 > top_bit2_offset ? 1 : -1;
+  }
+
+  word_addr_t i;
+  word_t word1, word2;
+
+  // Top bits line up: compare arr2's words, highest first, against the
+  // words read from arr1 at the shifted bit positions
+  for(i = top_bit2 / WORD_SIZE; i > 0; i--)
+  {
+    word1 = _get_word(arr1, pos + i * WORD_SIZE);
+    word2 = arr2->words[i];
+
+    if(word1 > word2) return 1;
+    if(word1 < word2) return -1;
+  }
+
+  // Word 0 of arr2 is handled outside the loop because i is unsigned
+  word1 = _get_word(arr1, pos);
+  word2 = arr2->words[0];
+
+  if(word1 > word2) return 1;
+  if(word1 < word2) return -1;
+
+  // Everything from bit `pos` upwards matches.  The shifted arr2 is zero
+  // below `pos`, so arr1 is larger if it has any bit set there.
+  // return 1 if arr1[0..pos] != 0, 0 otherwise
+
+  // Whole words
+  word_addr_t num_words = pos / WORD_SIZE;
+
+  for(i = 0; i < num_words; i++)
+  {
+    if(arr1->words[i] > 0)
+    {
+      return 1;
+    }
+  }
+
+  // Bits of the last, partially covered word below `pos`
+  word_offset_t bits_remaining = pos - num_words * WORD_SIZE;
+
+  if(arr1->words[num_words] & bitmask64(bits_remaining))
+  {
+    return 1;
+  }
+
+  return 0;
+}
+
+
+//
+// Reverse -- coords may wrap around
+//
+
+// Reverse the order of the `length` bits starting at `start`.
+// Words are read and written with the cyclic accessors, and positions are
+// reduced modulo num_of_bits, so the region may wrap around the array end.
+// No bounds checking
+// length cannot be zero
+static void _reverse_region(BIT_ARRAY* bitarr,
+                            bit_index_t start,
+                            bit_index_t length)
+{
+  // `left` is the first bit of the region; `right` is the start of the last
+  // WORD_SIZE bits of the region.  `right` is only used once at least a full
+  // word remains.
+  bit_index_t left = start;
+  bit_index_t right = (start + length - WORD_SIZE) % bitarr->num_of_bits;
+
+  // Work inwards from both ends while two non-overlapping words remain
+  while(length >= 2 * WORD_SIZE)
+  {
+    // Swap entire words
+    word_t left_word = _get_word_cyclic(bitarr, left);
+    word_t right_word = _get_word_cyclic(bitarr, right);
+
+    // reverse words individually
+    left_word = _reverse_word(left_word);
+    right_word = _reverse_word(right_word);
+
+    // Swap
+    _set_word_cyclic(bitarr, left, right_word);
+    _set_word_cyclic(bitarr, right, left_word);
+
+    // Update
+    left = (left + WORD_SIZE) % bitarr->num_of_bits;
+    // Step `right` down one word, wrapping below zero back to the array end
+    right = (right < WORD_SIZE ? right + bitarr->num_of_bits : right) - WORD_SIZE;
+    length -= 2 * WORD_SIZE;
+  }
+
+  // Fewer than two words remain.  `word` is the word that will receive the
+  // final bits, `rev` holds the reversed bits destined for it.
+  word_t word, rev;
+
+  if(length == 0)
+  {
+    return;
+  }
+  else if(length > WORD_SIZE)
+  {
+    // Words overlap
+    // Both words are read before anything is written, because they share bits
+    word_t left_word = _get_word_cyclic(bitarr, left);
+    word_t right_word = _get_word_cyclic(bitarr, right);
+
+    rev = _reverse_word(left_word);
+    right_word = _reverse_word(right_word);
+
+    // fill left 64 bits with right word rev
+    _set_word_cyclic(bitarr, left, right_word);
+
+    // Now do remaining bits (length is between 1 and 64 bits)
+    left += WORD_SIZE;
+    length -= WORD_SIZE;
+
+    word = _get_word_cyclic(bitarr, left);
+  }
+  else
+  {
+    // A single word or less remains
+    word = _get_word_cyclic(bitarr, left);
+    rev = _reverse_word(word);
+  }
+
+  // Bring the wanted reversed bits down to the bottom of `rev`, then merge
+  // only the low `length` bits into `word`, leaving its other bits untouched
+  rev >>= WORD_SIZE - length;
+  word_t mask = bitmask64(length);
+
+  word = bitmask_merge(rev, word, mask);
+
+  _set_word_cyclic(bitarr, left, word);
+}
+
+// Reverse the order of the `len` bits starting at `start`.
+// Asserts that the region lies inside the array; a zero-length region is
+// left untouched.
+void bit_array_reverse_region(BIT_ARRAY* bitarr, bit_index_t start, bit_index_t len)
+{
+  assert(start + len <= bitarr->num_of_bits);
+  if(len > 0) _reverse_region(bitarr, start, len);
+  DEBUG_VALIDATE(bitarr);
+}
+
+// Reverse the order of all the bits in the array.
+// An empty array is left untouched.
+void bit_array_reverse(BIT_ARRAY* bitarr)
+{
+  if(bitarr->num_of_bits > 0) _reverse_region(bitarr, 0, bitarr->num_of_bits);
+  DEBUG_VALIDATE(bitarr);
+}
+
+//
+// Shift left / right
+//
+
+// Shift towards MSB / higher index
+// Bits move up by `shift_dist` positions; bits shifted past the end are
+// lost.  The vacated low positions are set to 1 if `fill` is non-zero and to
+// 0 otherwise.  Shifting by the array length or more fills the whole array.
+void bit_array_shift_left(BIT_ARRAY* bitarr, bit_index_t shift_dist, char fill)
+{
+  const bit_index_t nbits = bitarr->num_of_bits;
+
+  if(shift_dist >= nbits)
+  {
+    // Nothing survives the shift: the whole array becomes fill
+    if(fill)
+    {
+      bit_array_set_all(bitarr);
+    }
+    else
+    {
+      bit_array_clear_all(bitarr);
+    }
+    return;
+  }
+
+  if(shift_dist == 0)
+  {
+    return;
+  }
+
+  // Move the surviving bits up, then fill the gap left at the bottom
+  _array_copy(bitarr, shift_dist, bitarr, 0, nbits - shift_dist);
+  _set_region(bitarr, 0, shift_dist, fill ? FILL_REGION : ZERO_REGION);
+}
+
+// shift left extend - don't truncate bits when shifting UP, instead
+// make room for them.
+// The array grows by `shift_dist` bits, the existing bits move up by that
+// amount, and the vacated low positions are set to 1 if `fill` is non-zero
+// and to 0 otherwise.
+void bit_array_shift_left_extend(BIT_ARRAY* bitarr, bit_index_t shift_dist,
+                                 char fill)
+{
+  // Remember the length before growing: that many bits have to be moved
+  const bit_index_t old_len = bitarr->num_of_bits;
+
+  if(shift_dist == 0)
+  {
+    return;
+  }
+
+  bit_array_resize_critical(bitarr, old_len + shift_dist);
+
+  _array_copy(bitarr, shift_dist, bitarr, 0, old_len);
+  _set_region(bitarr, 0, shift_dist, fill ? FILL_REGION : ZERO_REGION);
+}
+
+// Shift towards LSB / lower index
+// Bits move down by `shift_dist` positions; bits shifted below index 0 are
+// lost.  The vacated high positions are set to 1 if `fill` is non-zero and
+// to 0 otherwise.  Shifting by the array length or more fills the whole
+// array.
+void bit_array_shift_right(BIT_ARRAY* bitarr, bit_index_t shift_dist, char fill)
+{
+  const bit_index_t nbits = bitarr->num_of_bits;
+
+  if(shift_dist >= nbits)
+  {
+    // Nothing survives the shift: the whole array becomes fill
+    if(fill)
+    {
+      bit_array_set_all(bitarr);
+    }
+    else
+    {
+      bit_array_clear_all(bitarr);
+    }
+    return;
+  }
+
+  if(shift_dist == 0)
+  {
+    return;
+  }
+
+  const bit_index_t kept = nbits - shift_dist;
+
+  // Move the surviving bits down, then fill the gap left at the top
+  bit_array_copy(bitarr, 0, bitarr, shift_dist, kept);
+  _set_region(bitarr, kept, shift_dist, fill ? FILL_REGION : ZERO_REGION);
+}
+
+//
+// Cycle
+//
+
+// Cycle towards index 0
+// `cycle_dist` is taken modulo the array length.  The rotation is carried
+// out as three reversals: the first `cycle_dist` bits, the remaining bits,
+// then the whole array.
+void bit_array_cycle_right(BIT_ARRAY* bitarr, bit_index_t cycle_dist)
+{
+  const bit_index_t nbits = bitarr->num_of_bits;
+
+  if(nbits == 0)
+  {
+    return;
+  }
+
+  const bit_index_t head = cycle_dist % nbits;
+
+  if(head == 0)
+  {
+    return;
+  }
+
+  _reverse_region(bitarr, 0, head);
+  _reverse_region(bitarr, head, nbits - head);
+  bit_array_reverse(bitarr);
+}
+
+// Cycle away from index 0
+// `cycle_dist` is taken modulo the array length.  The rotation is carried
+// out as three reversals: the first (length - cycle_dist) bits, the last
+// `cycle_dist` bits, then the whole array.
+void bit_array_cycle_left(BIT_ARRAY* bitarr, bit_index_t cycle_dist)
+{
+  const bit_index_t nbits = bitarr->num_of_bits;
+
+  if(nbits == 0)
+  {
+    return;
+  }
+
+  const bit_index_t tail = cycle_dist % nbits;
+
+  if(tail == 0)
+  {
+    return;
+  }
+
+  const bit_index_t head = nbits - tail;
+
+  _reverse_region(bitarr, 0, head);
+  _reverse_region(bitarr, head, tail);
+  bit_array_reverse(bitarr);
+}
+
+//
+// Next permutation
+//
+
+// Next bit permutation of a single word, computed with the bit trick from
+// http://graphics.stanford.edu/~seander/bithacks.html#NextBitPermutation
+static word_t _next_permutation(word_t v)
+{
+  // v with its least significant 0 bits set to 1
+  const word_t filled = v | (v - 1);
+  // Sets to 1 the most significant bit to change and clears the bits below
+  const word_t bumped = filled + 1;
+  // The 1 bits that have to be added back at the least significant end
+  const word_t low_ones = ((~filled & bumped) - 1) >> (trailing_zeros(v) + 1);
+
+  return bumped | low_ones;
+}
+
+// Get the next permutation of an array with a fixed size and given number of
+// bits set.  Also known as next lexicographic permutation.
+// Given a bit array find the next lexicographic organisation of the bits
+// Number of possible combinations given by (size choose bits_set) i.e. nCk
+// 00011 -> 00101 -> 00110 -> 01001 -> 01010 ->
+// 01100 -> 10001 -> 10010 -> 10100 -> 11000 -> 00011 (back to start)
+// The array is modified in place; an empty array is left untouched.
+void bit_array_next_permutation(BIT_ARRAY* bitarr)
+{
+  if(bitarr->num_of_bits == 0)
+  {
+    return;
+  }
+
+  word_addr_t w;
+
+  // carry == 1 means the words seen so far cannot be advanced on their own
+  char carry = 0;
+  word_offset_t top_bits = bitset64_idx(bitarr->num_of_bits);
+
+  // Scan upwards from the least significant word
+  for(w = 0; w < bitarr->num_of_words; w++)
+  {
+    // Bits of this word that belong to the array: the whole word, except
+    // for a partially used top word
+    word_t mask
+      = (w < bitarr->num_of_words - 1 || top_bits == 0) ? WORD_MAX
+                                                        : bitmask64(top_bits);
+
+    // x | (x-1) sets every bit below the lowest set bit of x, so equality
+    // with `mask` means the set bits already occupy the top of the word
+    if(bitarr->words[w] > 0 &&
+       (bitarr->words[w] | (bitarr->words[w]-1)) == mask)
+    {
+      // Bits in this word cannot be moved forward
+      carry = 1;
+    }
+    else if(carry)
+    {
+      // The lower words are exhausted: increase this word by one instead
+      // 0111 -> 1000, 1000 -> 1001
+      word_t tmp = bitarr->words[w] + 1;
+
+      // Count bits previously set
+      bit_index_t bits_previously_set = POPCOUNT(bitarr->words[w]);
+
+      // set new word
+      bitarr->words[w] = tmp;
+
+      // note: w is unsigned
+      // Zero words while counting bits set
+      while(w > 0)
+      {
+        bits_previously_set += POPCOUNT(bitarr->words[w-1]);
+        bitarr->words[w-1] = 0;
+        w--;
+      }
+
+      // Set bits at the beginning
+      // The bits not accounted for by the incremented word go to the lowest
+      // positions, keeping the number of set bits unchanged
+      SET_REGION(bitarr, 0, bits_previously_set - POPCOUNT(tmp));
+
+      carry = 0;
+      break;
+    }
+    else if(bitarr->words[w] > 0)
+    {
+      // This word can be advanced on its own
+      bitarr->words[w] = _next_permutation(bitarr->words[w]);
+      break;
+    }
+  }
+
+  if(carry)
+  {
+    // Loop around
+    // No word could absorb the carry: restart with all the set bits at the
+    // lowest positions
+    bit_index_t num_bits_set = bit_array_num_bits_set(bitarr);
+    bit_array_clear_all(bitarr);
+    SET_REGION(bitarr, 0, num_bits_set);
+  }
+
+  DEBUG_VALIDATE(bitarr);
+}
+
+
+//
+// Interleave
+//
+
+// Interleave the bits of two equal-length arrays into `dst`, which is set to
+// twice their length.
+// dst cannot point to the same bit array as src1 or src2
+// src1, src2 may point to the same bit array
+// abcd 1234 -> a1b2c3d4
+// 0011 0000 -> 00001010
+// 1111 0000 -> 10101010
+// 0101 1010 -> 01100110
+void bit_array_interleave(BIT_ARRAY* dst,
+                          const BIT_ARRAY* src1,
+                          const BIT_ARRAY* src2)
+{
+  // dst cannot be either src1 or src2
+  assert(dst != src1 && dst != src2);
+  // Behaviour undefined when src1 length != src2 length",
+  assert(src1->num_of_bits == src2->num_of_bits);
+
+  // Need at least src1->num_of_words + src2->num_of_words: the loop below
+  // writes two destination words for every source word.  (Taking the
+  // minimum with 2 here would cap the capacity at two words and let the loop
+  // write past the end of dst for longer inputs.)
+  size_t nwords = MAX(src1->num_of_words + src2->num_of_words, 2);
+  _bit_array_ensure_nwords(dst, nwords, __FILE__, __LINE__, __func__);
+  dst->num_of_bits = src1->num_of_bits + src2->num_of_bits;
+  dst->num_of_words = roundup_bits2words64(dst->num_of_bits);
+
+  word_addr_t i, j;
+
+  for(i = 0, j = 0; i < src1->num_of_words; i++)
+  {
+    word_t a = src1->words[i];
+    word_t b = src2->words[i];
+
+    // Each source byte is spread over 16 bits by a table lookup.
+    // Low halves of a and b fill the first destination word...
+    dst->words[j++] =  morton_table0[(a      ) & 0xff] |
+                       morton_table1[(b      ) & 0xff] |
+                      (morton_table0[(a >>  8) & 0xff] << 16) |
+                      (morton_table1[(b >>  8) & 0xff] << 16) |
+                      (morton_table0[(a >> 16) & 0xff] << 32) |
+                      (morton_table1[(b >> 16) & 0xff] << 32) |
+                      (morton_table0[(a >> 24) & 0xff] << 48) |
+                      (morton_table1[(b >> 24) & 0xff] << 48);
+
+    // ...and the high halves fill the second
+    dst->words[j++] =  morton_table0[(a >> 32) & 0xff] |
+                       morton_table1[(b >> 32) & 0xff] |
+                      (morton_table0[(a >> 40) & 0xff] << 16) |
+                      (morton_table1[(b >> 40) & 0xff] << 16) |
+                      (morton_table0[(a >> 48) & 0xff] << 32) |
+                      (morton_table1[(b >> 48) & 0xff] << 32) |
+                      (morton_table0[(a >> 56)       ] << 48) |
+                      (morton_table1[(b >> 56)       ] << 48);
+  }
+
+  DEBUG_VALIDATE(dst);
+}
+
+//
+// Random
+//
+
+// Set bits randomly with probability prob : 0 <= prob <= 1
+// Every bit of the array is overwritten.  prob == 1 sets every bit and
+// prob == 0 clears every bit, both without consuming any random numbers.
+// Otherwise each bit is decided by its own call to rand().
+void bit_array_random(BIT_ARRAY* bitarr, float prob)
+{
+  assert(prob >= 0 && prob <= 1);
+
+  if(bitarr->num_of_bits == 0)
+  {
+    return;
+  }
+  else if(prob == 1)
+  {
+    bit_array_set_all(bitarr);
+    return;
+  }
+  else if(prob == 0)
+  {
+    // The `rand() <= p` test below still succeeds whenever rand() returns 0,
+    // so a probability of exactly zero has to be handled separately
+    bit_array_clear_all(bitarr);
+    return;
+  }
+
+  // rand() generates number between 0 and RAND_MAX inclusive
+  // therefore we want to check if rand() <= p
+  long p = RAND_MAX * prob;
+
+  _seed_rand();
+
+  word_addr_t w;
+  word_offset_t o;
+
+  // Initialise to zero
+  memset(bitarr->words, 0, bitarr->num_of_words * sizeof(word_t));
+
+  // All words except the top one are used in full
+  for(w = 0; w < bitarr->num_of_words - 1; w++)
+  {
+    for(o = 0; o < WORD_SIZE; o++)
+    {
+      if(rand() <= p)
+      {
+        bitarr->words[w] |= ((word_t)0x1 << o);
+      }
+    }
+  }
+
+  // Top word
+  // Only the bits that belong to the array are touched
+  word_offset_t bits_in_last_word = bits_in_top_word(bitarr->num_of_bits);
+  w = bitarr->num_of_words - 1;
+
+  for(o = 0; o < bits_in_last_word; o++)
+  {
+    if(rand() <= p)
+    {
+      bitarr->words[w] |= ((word_t)0x1 << o);
+    }
+  }
+
+  DEBUG_VALIDATE(bitarr);
+}
+
+// Shuffle the bits in an array randomly
+// Fisher-Yates shuffle: working down from the top, each bit i is swapped
+// with a bit j chosen from 0..i inclusive.  The number of set bits is
+// unchanged.
+void bit_array_shuffle(BIT_ARRAY* bitarr)
+{
+  if(bitarr->num_of_bits == 0)
+    return;
+
+  _seed_rand();
+
+  bit_index_t i, j;
+
+  for(i = bitarr->num_of_bits - 1; i > 0; i--)
+  {
+    // j may equal i, so a bit is allowed to stay where it is.  Choosing from
+    // 0..i-1 only would force every bit to move (e.g. a two-bit array would
+    // always end up swapped).
+    j = (bit_index_t)rand() % (i + 1);
+
+    // Swap i and j
+    char x = (bitarr->words[bitset64_wrd(i)] >> bitset64_idx(i)) & 0x1;
+    char y = (bitarr->words[bitset64_wrd(j)] >> bitset64_idx(j)) & 0x1;
+
+    // Bit i receives the old value of bit j...
+    if(!y)
+      bitarr->words[bitset64_wrd(i)] &= ~((word_t)0x1 << bitset64_idx(i));
+    else
+      bitarr->words[bitset64_wrd(i)] |= (word_t)0x1 << bitset64_idx(i);
+
+    // ...and bit j receives the old value of bit i
+    if(!x)
+      bitarr->words[bitset64_wrd(j)] &= ~((word_t)0x1 << bitset64_idx(j));
+    else
+      bitarr->words[bitset64_wrd(j)] |= (word_t)0x1 << bitset64_idx(j);
+  }
+
+  DEBUG_VALIDATE(bitarr);
+}
+
+//
+// Arithmetic
+//
+
+// Read the array as an unsigned 64 bit number (index 0 is the LSB).
+// An empty array reads as 0.
+// Returns 1 on success and stores the number in `*result`.
+// Returns 0 if value in array is too big, leaving `*result` unchanged.
+char bit_array_as_num(const BIT_ARRAY* bitarr, uint64_t* result)
+{
+  word_addr_t w;
+
+  if(bitarr->num_of_bits == 0)
+  {
+    *result = 0;
+    return 1;
+  }
+
+  // The value only fits if every word above the first is zero
+  for(w = 1; w < bitarr->num_of_words; w++)
+  {
+    if(bitarr->words[w] != 0)
+    {
+      return 0;
+    }
+  }
+
+  *result = bitarr->words[0];
+  return 1;
+}
+
+
+// Compare the number held in bitarr with a 64 bit value.
+//  1 iff bitarr > value
+//  0 iff bitarr == value
+// -1 iff bitarr < value
+int bit_array_cmp_uint64(const BIT_ARRAY* bitarr, uint64_t value)
+{
+  uint64_t as_word = 0;
+
+  // An array that does not fit in a uint64 is bigger than any uint64
+  if(!bit_array_as_num(bitarr, &as_word))
+    return 1;
+
+  // (a > b) - (a < b) evaluates to 1, 0 or -1
+  return (as_word > value) - (as_word < value);
+}
+
+// Add an unsigned 64 bit value to the number held in bitarr, growing the
+// array when the sum needs more bits than it currently has.
+// If value is zero, no change is made
+void bit_array_add_uint64(BIT_ARRAY* bitarr, uint64_t value)
+{
+  if(value == 0)
+  {
+    return;
+  }
+  else if(bitarr->num_of_bits == 0)
+  {
+    // Empty array: make it just long enough to hold value
+    bit_array_resize_critical(bitarr, WORD_SIZE - leading_zeros(value));
+    bitarr->words[0] = (word_t)value;
+    return;
+  }
+
+  char carry = 0;
+  word_addr_t i;
+
+  // `value` itself is only added to word 0.  If that overflows, each higher
+  // word receives just the carry (1) until a word absorbs it.
+  for(i = 0; i < bitarr->num_of_words; i++)
+  {
+    // Unsigned overflow test for words[i] + value
+    carry = (WORD_MAX - bitarr->words[i] < value);
+    bitarr->words[i] += value;
+
+    if(!carry)
+    {
+      // Carry is absorbed
+      break;
+    }
+
+    // Only the carry bit moves on to the next word
+    value = 1;
+  }
+
+  if(carry)
+  {
+    // Bit array full, need another bit after all words filled
+    bit_array_resize_critical(bitarr, bitarr->num_of_words * WORD_SIZE + 1);
+
+    // Set top word to 1
+    bitarr->words[bitarr->num_of_words-1] = 1;
+  }
+  else
+  {
+    // The sum may have spilled into previously unused bits of the top word
+    word_t final_word = bitarr->words[bitarr->num_of_words-1];
+    word_offset_t expected_bits = bits_in_top_word(bitarr->num_of_bits);
+    word_offset_t actual_bits = WORD_SIZE - leading_zeros(final_word);
+
+    if(actual_bits > expected_bits)
+    {
+      // num_of_bits has increased -- num_of_words has not
+      bitarr->num_of_bits += (actual_bits - expected_bits);
+    }
+  }
+}
+
+// Subtract an unsigned 64 bit value from the number held in bitarr.
+// If value is greater than bitarr, bitarr is not changed and 0 is returned
+// Returns 1 on success, 0 if value > bitarr
+char bit_array_sub_uint64(BIT_ARRAY* bitarr, uint64_t value)
+{
+  if(value == 0)
+  {
+    return 1;
+  }
+  else if(bitarr->num_of_words == 0)
+  {
+    // An empty array holds zero, which is smaller than any non-zero value
+    return 0;
+  }
+  else if(bitarr->words[0] >= value)
+  {
+    bitarr->words[0] -= value;
+    return 1;
+  }
+
+  // words[0] < value: borrow from the lowest non-zero word above word 0
+  word_addr_t i, j;
+
+  for(i = 1; i < bitarr->num_of_words; i++)
+  {
+    if(bitarr->words[i] > 0)
+    {
+      // deduct one
+      bitarr->words[i]--;
+
+      // The (all zero) words strictly between word 0 and word i turn into
+      // all ones; word i itself keeps its decremented value
+      for(j = i - 1; j > 0; j--)
+      {
+        bitarr->words[j] = WORD_MAX;
+      }
+
+      // Unsigned wrap-around gives 2^64 + words[0] - value, which is the
+      // low word once 2^64 has been borrowed from above
+      bitarr->words[0] -= value;
+
+      return 1;
+    }
+  }
+
+  // subtract value is greater than array
+  return 0;
+}
+
+//
+// Arithmetic between bit arrays
+//
+
+// Shared word-by-word implementation behind bit_array_add (subtract == 0)
+// and bit_array_subtract (subtract != 0).
+// Subtraction is done by two's complement: each word of src2 is inverted and
+// the carry starts at 1, i.e. dst = src1 + ~src2 + 1.
+// Both callers in this file size dst before calling (see the notes below).
+// src1, src2 and dst can all be the same BIT_ARRAY
+static void _arithmetic(BIT_ARRAY* dst,
+                        const BIT_ARRAY* src1,
+                        const BIT_ARRAY* src2,
+                        char subtract)
+{
+  word_addr_t max_words = MAX(src1->num_of_words, src2->num_of_words);
+
+  // Adding: dst_words >= max(src1 words, src2 words)
+  // Subtracting: dst_words is >= src1->num_of_words
+  // NOTE(review): when subtracting, the loop below still runs to max_words, so
+  // a src2 with more words than src1 writes past the size the caller ensured
+  // -- confirm dst always has capacity for that.
+
+  char carry = subtract ? 1 : 0;
+
+  word_addr_t i;
+  word_t word1, word2;
+
+  for(i = 0; i < max_words; i++)
+  {
+    // Both source words are read before dst->words[i] is written, which is
+    // what makes it safe for dst to alias a source.  The shorter source is
+    // treated as zero extended.
+    word1 = (i < src1->num_of_words ? src1->words[i] : 0);
+    word2 = (i < src2->num_of_words ? src2->words[i] : 0);
+
+    if(subtract)
+      word2 = ~word2;
+
+    dst->words[i] = word1 + word2 + carry;
+    // Update carry
+    // Overflow if word1 + word2 wraps, or if adding the incoming carry wraps
+    carry = WORD_MAX - word1 < word2 || WORD_MAX - word1 - word2 < (word_t)carry;
+  }
+
+  if(subtract)
+  {
+    // The carry out of the top word is discarded when subtracting
+    carry = 0;
+  }
+  else
+  {
+    // Check last word
+    word_offset_t bits_on_last_word = bits_in_top_word(dst->num_of_bits);
+
+    if(bits_on_last_word < WORD_SIZE)
+    {
+      word_t mask = bitmask64(bits_on_last_word);
+
+      // NOTE(review): the mask is derived from dst's top word but the word
+      // tested is max_words-1 -- assumes dst->num_of_words == max_words here,
+      // TODO confirm for a dst longer than both sources.
+      if(dst->words[max_words-1] > mask)
+      {
+        // Array has overflowed, increase size
+        dst->num_of_bits++;
+      }
+    }
+    else if(carry)
+    {
+      // Carry onto a new word
+      if(dst->num_of_words == max_words)
+      {
+        // Need to resize for the carry bit
+        bit_array_resize_critical(dst, dst->num_of_bits+1);
+      }
+
+      dst->words[max_words] = (word_t)1;
+    }
+  }
+
+  // Zero the rest of dst array
+  // (starting one word later when a carry word was just written above)
+  for(i = max_words+carry; i < dst->num_of_words; i++)
+  {
+    dst->words[i] = (word_t)0;
+  }
+
+  DEBUG_VALIDATE(dst);
+}
+
+// dst = src1 + src2
+// Any of src1, src2 and dst may refer to the same BIT_ARRAY.
+// dst is enlarged first if it is shorter than the longer of the two sources.
+void bit_array_add(BIT_ARRAY* dst, const BIT_ARRAY* src1, const BIT_ARRAY* src2)
+{
+  const char subtract = 0;
+  bit_index_t longest = MAX(src1->num_of_bits, src2->num_of_bits);
+
+  bit_array_ensure_size_critical(dst, longest);
+  _arithmetic(dst, src1, src2, subtract);
+}
+
+// dst = src1 - src2
+// Any of src1, src2 and dst may refer to the same BIT_ARRAY.
+// dst is enlarged to the length of src1 if it is shorter.
+// Precondition (checked with assert): src1 >= src2.
+void bit_array_subtract(BIT_ARRAY* dst,
+                          const BIT_ARRAY* src1, const BIT_ARRAY* src2)
+{
+  // Method of complements, carried out by _arithmetic:
+  //   a - b = a + ~b + 1
+  const char subtract = 1;
+
+  assert(bit_array_cmp(src1, src2) >= 0); // Require src1 >= src2
+
+  bit_array_ensure_size_critical(dst, src1->num_of_bits);
+  _arithmetic(dst, src1, src2, subtract);
+}
+
+
+// Add `add` to `bitarr` at `pos`
+// i.e. bit 0 of the 64 bit value `add` is lined up with bit `pos` of the
+// array before adding.  The array is enlarged when the sum needs more bits.
+// Bounds checking not needed as out of bounds is valid
+void bit_array_add_word(BIT_ARRAY *bitarr, bit_index_t pos, uint64_t add)
+{
+  DEBUG_VALIDATE(bitarr);
+
+  if(add == 0)
+  {
+    return;
+  }
+  else if(pos >= bitarr->num_of_bits)
+  {
+    // Resize and add!
+    // The array ends at or below `pos`, so the result is simply `add` stored
+    // at `pos`; the new length stops at add's highest set bit.
+    bit_index_t num_bits_required = pos + (WORD_SIZE - leading_zeros(add));
+    bit_array_resize_critical(bitarr, num_bits_required);
+    _set_word(bitarr, pos, (word_t)add);
+    return;
+  }
+
+  /*
+  char str[1000];
+  printf(" add_word: %s\n", bit_array_to_str_rev(bitarr, str));
+  printf("     word: %s [pos: %i]\n", _word_to_str(add, str), (int)pos);
+  */
+
+  // 64 bits of the array starting at `pos` (_get_word is defined elsewhere;
+  // presumably zero padded past the end of the array -- confirm)
+  word_t w = _get_word(bitarr, pos);
+  word_t sum = w + add;
+  // Unsigned overflow test for w + add
+  char carry = WORD_MAX - w < add;
+
+  // Ensure array is big enough
+  bit_index_t num_bits_required = pos + (carry ? WORD_SIZE + 1
+                                               : (WORD_SIZE - leading_zeros(sum)));
+
+  // NOTE(review): the result of bit_array_ensure_size is not checked here
+  bit_array_ensure_size(bitarr, num_bits_required);
+
+  _set_word(bitarr, pos, sum);
+  pos += WORD_SIZE;
+
+  if(carry)
+  {
+    // The carry is worth one unit at bit `pos` (now 64 bits higher).  From
+    // here on work directly on whole storage words.
+    word_offset_t offset = pos % WORD_SIZE;
+    word_addr_t addr = bitset64_wrd(pos);
+
+    add = (word_t)0x1 << offset;
+    carry = (WORD_MAX - bitarr->words[addr] < add);
+    sum = bitarr->words[addr] + add;
+
+    num_bits_required = addr * WORD_SIZE +
+                        (carry ? WORD_SIZE + 1 : (WORD_SIZE - leading_zeros(sum)));
+
+    bit_array_ensure_size(bitarr, num_bits_required);
+
+    bitarr->words[addr++] = sum;
+
+    if(carry)
+    {
+      // Words that are all ones roll over to zero and pass the carry upwards
+      while(addr < bitarr->num_of_words && bitarr->words[addr] == WORD_MAX)
+      {
+        bitarr->words[addr++] = 0;
+      }
+
+      if(addr == bitarr->num_of_words)
+      {
+        // Carry ran off the top word: grow so that a new word receives it
+        bit_array_resize_critical(bitarr, addr * WORD_SIZE + 1);
+      }
+      else if(addr == bitarr->num_of_words-1 &&
+              bitarr->words[addr] == bitmask64(bits_in_top_word(bitarr->num_of_bits)))
+      {
+        // Carry lands in the top word and all of its used bits are already
+        // set: one more bit is needed
+        bit_array_resize_critical(bitarr, bitarr->num_of_bits + 1);
+      }
+
+      bitarr->words[addr]++;
+    }
+  }
+
+  DEBUG_VALIDATE(bitarr);
+}
+
+// Add `add` to `bitarr` at `pos`
+// i.e. bit 0 of `add` is lined up with bit `pos` of `bitarr` before adding.
+// `bitarr` is enlarged when the sum needs more bits.  `add` must be a
+// different array from `bitarr` (checked with assert).
+// Bounds checking not needed as out of bounds is valid
+void bit_array_add_words(BIT_ARRAY *bitarr, bit_index_t pos, const BIT_ARRAY *add)
+{
+  assert(bitarr != add); // bitarr and add cannot point to the same bit array
+
+  bit_index_t add_top_bit_set;
+
+  if(!bit_array_find_last_set_bit(add, &add_top_bit_set))
+  {
+    // No bits set in add
+    return;
+  }
+  else if(pos >= bitarr->num_of_bits)
+  {
+    // Just resize and copy!
+    bit_index_t num_bits_required = pos + add_top_bit_set + 1;
+    bit_array_resize_critical(bitarr, num_bits_required);
+    // Copy only up to add's top set bit.  bitarr has just been sized to end
+    // there, so copying all of add->num_of_bits (which may include leading
+    // zeros) could run past the end of bitarr.
+    _array_copy(bitarr, pos, add, 0, add_top_bit_set + 1);
+    return;
+  }
+  else if(pos == 0)
+  {
+    bit_array_add(bitarr, bitarr, add);
+    return;
+  }
+
+  bit_index_t num_bits_required = pos + add_top_bit_set + 1;
+  // NOTE(review): the result of bit_array_ensure_size is not checked here
+  bit_array_ensure_size(bitarr, num_bits_required);
+
+  word_addr_t first_word = bitset64_wrd(pos);
+  word_offset_t first_offset = bitset64_idx(pos);
+
+  // First storage word: the low bits of add, shifted up to start at `pos`
+  word_t w = add->words[0] << first_offset;
+  unsigned char carry = (WORD_MAX - bitarr->words[first_word] < w);
+
+  bitarr->words[first_word] += w;
+
+  // `offset` is the index within `add` of the bit that lines up with bit 0
+  // of storage word `i`
+  word_addr_t i = first_word + 1;
+  bit_index_t offset = WORD_SIZE - first_offset;
+
+  // Keep going while there are bits of `add` left or a carry to propagate
+  for(; carry || offset <= add_top_bit_set; i++, offset += WORD_SIZE)
+  {
+    w = offset < add->num_of_bits ? _get_word(add, offset) : (word_t)0;
+
+    if(i >= bitarr->num_of_words)
+    {
+      // Extend by a word
+      bit_array_resize_critical(bitarr, (bit_index_t)(i+1)*WORD_SIZE+1);
+    }
+
+    word_t prev = bitarr->words[i];
+
+    bitarr->words[i] += w + carry;
+
+    // Overflow if prev + w wraps, or if prev + w is all ones and a carry
+    // came in
+    carry = (WORD_MAX - prev < w || (carry && prev + w == WORD_MAX)) ? 1 : 0;
+  }
+
+  // The sum may have set bits in the top word above the recorded length
+  word_offset_t top_bits
+    = WORD_SIZE - leading_zeros(bitarr->words[bitarr->num_of_words-1]);
+
+  bit_index_t min_bits = (bitarr->num_of_words-1)*WORD_SIZE + top_bits;
+
+  if(bitarr->num_of_bits < min_bits)
+  {
+    // Extend within the last word
+    bitarr->num_of_bits = min_bits;
+  }
+
+  DEBUG_VALIDATE(bitarr);
+}
+
+// Subtract the 64 bit value `minus`, lined up at bit `pos`, from `bitarr`.
+// Returns 1 on success.  Returns 0, leaving the array unchanged, when the
+// word at `pos` is too small and no higher word is available to borrow from.
+char bit_array_sub_word(BIT_ARRAY* bitarr, bit_index_t pos, word_t minus)
+{
+  DEBUG_VALIDATE(bitarr);
+
+  if(minus == 0)
+  {
+    return 1;
+  }
+
+  char done = 0;
+  word_t low = _get_word(bitarr, pos);
+
+  if(low >= minus)
+  {
+    // No borrow needed
+    _set_word(bitarr, pos, low - minus);
+    done = 1;
+  }
+  else
+  {
+    // Amount still owed once the word at pos has been used up
+    word_t shortfall = minus - low;
+    bit_index_t donor_pos = pos + WORD_SIZE;
+
+    while(donor_pos < bitarr->num_of_bits)
+    {
+      word_t donor = _get_word(bitarr, donor_pos);
+
+      if(donor != 0)
+      {
+        // Take one from the donor word ...
+        _set_word(bitarr, donor_pos, donor - 1);
+
+        // ... everything between pos and the donor becomes ones ...
+        SET_REGION(bitarr, pos, donor_pos-pos);
+
+        // ... and the word at pos is WORD_MAX less what is still owed
+        // (one unit of the debt was paid by the borrow itself)
+        _set_word(bitarr, pos, WORD_MAX - (shortfall - 1));
+
+        done = 1;
+        break;
+      }
+
+      donor_pos += WORD_SIZE;
+    }
+  }
+
+  DEBUG_VALIDATE(bitarr);
+
+  return done;
+}
+
+// Subtract `minus`, lined up at bit `pos`, from `bitarr`.
+// Returns 1 on success.  Returns 0 without touching bitarr when
+// bit_array_cmp_words() reports a negative result (presumably: bitarr is
+// smaller than minus shifted up by pos -- that helper is defined elsewhere).
+// `minus` is inverted in place while the subtraction runs and inverted back
+// before returning, which is why it is not declared const.
+char bit_array_sub_words(BIT_ARRAY* bitarr, bit_index_t pos, BIT_ARRAY* minus)
+{
+  assert(bitarr != minus); // bitarr and minus cannot point to the same bit array
+
+  int cmp = bit_array_cmp_words(bitarr, pos, minus);
+
+  if(cmp == 0)
+  {
+    // Operands compare equal, so the difference is zero
+    bit_array_clear_all(bitarr);
+    return 1;
+  }
+  else if(cmp < 0)
+  {
+    return 0;
+  }
+
+  // Remember the length so it can be restored after the additions below,
+  // which may enlarge the array
+  bit_index_t bitarr_length = bitarr->num_of_bits;
+
+  // NOTE(review): bitarr_top_bit_set is filled in but never read afterwards
+  bit_index_t bitarr_top_bit_set;
+  bit_array_find_last_set_bit(bitarr, &bitarr_top_bit_set);
+
+  // subtraction by method of complements:
+  // a - b = a + ~b + 1 = src1 + ~src2 +1
+
+  bit_array_not(minus, minus);
+
+  bit_array_add_words(bitarr, pos, minus);
+  bit_array_add_word(bitarr, pos, (word_t)1);
+
+  // Adding the complement plus one overshoots by one unit at bit
+  // pos + minus->num_of_bits (assuming bit_array_not flips exactly
+  // minus->num_of_bits bits); take that unit back off
+  bit_array_sub_word(bitarr, pos+minus->num_of_bits, 1);
+  // NOTE(review): the result of bit_array_resize is not checked here
+  bit_array_resize(bitarr, bitarr_length);
+
+  // Restore minus to its original value
+  bit_array_not(minus, minus);
+
+  DEBUG_VALIDATE(bitarr);
+
+  return 1;
+}
+
+// Multiply the number held in bitarr by a 64 bit multiplier, in place.
+// Each set bit b of the original value is replaced by multiplier << b,
+// working from the top bit downwards so that the bits still to be visited
+// are never disturbed by the additions.
+void bit_array_mul_uint64(BIT_ARRAY *bitarr, uint64_t multiplier)
+{
+  // Nothing to do for an empty array or when multiplying by one
+  if(bitarr->num_of_bits == 0 || multiplier == 1)
+    return;
+
+  if(multiplier == 0)
+  {
+    bit_array_clear_all(bitarr);
+    return;
+  }
+
+  // The length is read once; bits added above it while multiplying are
+  // never visited
+  bit_index_t remaining = bitarr->num_of_bits;
+
+  while(remaining > 0)
+  {
+    bit_index_t bit = remaining - 1;
+
+    if(bit_array_get(bitarr, bit))
+    {
+      bit_array_clear(bitarr, bit);
+      bit_array_add_word(bitarr, bit, multiplier);
+    }
+
+    remaining--;
+  }
+
+  DEBUG_VALIDATE(bitarr);
+}
+
+// dst = src1 * src2, by shift-and-add long multiplication.
+// dst may be the same array as src1 or as src2, but not as both (checked
+// with assert).  If either source has length zero, dst is cleared.
+void bit_array_multiply(BIT_ARRAY *dst, BIT_ARRAY *src1, BIT_ARRAY *src2)
+{
+  if(src1->num_of_bits == 0 || src2->num_of_bits == 0)
+  {
+    bit_array_clear_all(dst);
+    return;
+  }
+
+  // Cannot pass the same array as dst, src1 AND src2
+  assert(dst != src1 || dst != src2);
+
+  // Dev: multiplier == 1?
+
+  // read_arr supplies the bits to walk over, add_arr is what gets added in
+  // at each set bit.  When dst aliases a source, that source has to be the
+  // one that is read, because bit_array_add_words needs dst != add_arr.
+  BIT_ARRAY *read_arr, *add_arr;
+
+  if(src1 == dst)
+  {
+    read_arr = src1;
+    add_arr = src2;
+  }
+  else
+  {
+    read_arr = src2;
+    add_arr = src1;
+  }
+
+  // True when the product overwrites one of its own operands
+  const char in_place = (dst == read_arr);
+
+  if(!in_place)
+  {
+    bit_array_clear_all(dst);
+  }
+
+  bit_index_t i;
+
+  // Top bit first, so additions (which only touch bits >= i-1) never disturb
+  // the bits of read_arr that are still to be visited
+  for(i = read_arr->num_of_bits; i > 0; i--)
+  {
+    if(bit_array_get(read_arr, i-1))
+    {
+      if(in_place)
+      {
+        // Replace this bit of the operand by add_arr << (i-1).
+        // Not needed for a separate dst: it started out cleared and earlier
+        // additions only touched higher bits.  Skipping it there also avoids
+        // an unchecked write to a dst that is shorter than read_arr.
+        bit_array_clear(dst, i-1);
+      }
+
+      bit_array_add_words(dst, i-1, add_arr);
+    }
+  }
+
+  DEBUG_VALIDATE(dst);
+}
+
+// Divide the number held in bitarr by a 64 bit divisor, in place, using
+// binary long division.
+// bitarr = round_down(bitarr / divisor)
+// rem = bitarr % divisor
+// divisor must not be zero (checked with assert).
+void bit_array_div_uint64(BIT_ARRAY *bitarr, uint64_t divisor, uint64_t *rem)
+{
+  assert(divisor != 0); // cannot divide by zero
+
+  bit_index_t div_top_bit = 63 - leading_zeros(divisor);
+  bit_index_t bitarr_top_bit;
+
+  if(!bit_array_find_last_set_bit(bitarr, &bitarr_top_bit))
+  {
+    // Dividend is zero
+    *rem = 0;
+    return;
+  }
+
+  if(bitarr_top_bit < div_top_bit)
+  {
+    // Dividend < divisor: the quotient is zero and the whole dividend (which
+    // fits in the lowest word) is the remainder
+    *rem = bitarr->words[0];
+    bit_array_clear_all(bitarr);
+    return;
+  }
+
+  // When div is shifted by offset, their top set bits are aligned
+  bit_index_t offset = bitarr_top_bit - div_top_bit;
+
+  // tmp holds the running remainder; the bits it was loaded from are zeroed
+  // so that quotient bits can be written in their place
+  uint64_t tmp = _get_word(bitarr, offset);
+  _set_word(bitarr, offset, (word_t)0);
+
+  // Carry is 1 if the top bit was set before left shift
+  char carry = 0;
+
+  // offset unsigned so break when offset == 0
+  while(1)
+  {
+    if(carry)
+    {
+      // (carry:tmp) - divisor = (WORD_MAX+1+tmp)-divisor
+      tmp = WORD_MAX - divisor + tmp + 1;
+      bit_array_set(bitarr, offset);
+    }
+    else if(tmp >= divisor)
+    {
+      tmp -= divisor;
+      bit_array_set(bitarr, offset);
+    }
+    else
+    {
+      bit_array_clear(bitarr, offset);
+    }
+
+    if(offset == 0)
+      break;
+
+    offset--;
+
+    // Is the top bit set (that we're about to shift off)?
+    // Shift it down to bit 0 first: storing the masked 64 bit value straight
+    // into a char would keep only its low byte, which is always zero.
+    carry = (char)((tmp >> 63) & 0x1);
+
+    // Bring down the next lower bit of the dividend
+    tmp <<= 1;
+    tmp |= bit_array_get(bitarr, offset);
+  }
+
+  *rem = tmp;
+}
+
+// Long division of one bit array by another.  On return:
+//   quotient = dividend / divisor
+//   dividend = dividend % divisor
+// (dividend is overwritten with the remainder)
+// divisor must not be zero (checked with assert).
+void bit_array_divide(BIT_ARRAY *dividend, BIT_ARRAY *quotient, BIT_ARRAY *divisor)
+{
+  assert(bit_array_cmp_uint64(divisor, 0) != 0); // Cannot divide by zero
+
+  bit_array_clear_all(quotient);
+
+  int order = bit_array_cmp(dividend, divisor);
+
+  if(order < 0)
+  {
+    // dividend < divisor: quotient stays zero, dividend is the remainder
+    return;
+  }
+
+  if(order == 0)
+  {
+    // Equal operands: quotient is one, remainder is zero
+    bit_array_ensure_size(quotient, 1);
+    bit_array_set(quotient, 0);
+    bit_array_clear_all(dividend);
+    return;
+  }
+
+  // From here on: dividend > divisor > 0 and quotient is all zeros
+  bit_index_t top_of_dividend = 0, top_of_divisor = 0;
+
+  bit_array_find_last_set_bit(dividend, &top_of_dividend);
+  bit_array_find_last_set_bit(divisor, &top_of_divisor);
+
+  // Shifting divisor up by `shift` lines up the two top set bits
+  bit_index_t shift = top_of_dividend - top_of_divisor;
+
+  // shift is unsigned, so test it before decrementing past zero
+  do
+  {
+    if(bit_array_cmp_words(dividend, shift, divisor) >= 0)
+    {
+      bit_array_sub_words(dividend, shift, divisor);
+      bit_array_ensure_size(quotient, shift+1);
+      bit_array_set(quotient, shift);
+    }
+  }
+  while(shift-- != 0);
+}
+
+//
+// Read/Write from files
+//
+// file format is [8 bytes: for number of elements in array][data]
+// data is written in little endian order (least sig byte first)
+//
+
+// Write a bit array to a stream: 8 bytes holding the number of bits, then the
+// data bytes, both least significant byte first whatever the host byte order.
+// Returns the number of bytes written, which should be
+// 8+(bitarr->num_of_bits+7)/8 when every write succeeds.
+bit_index_t bit_array_save(const BIT_ARRAY* bitarr, FILE* f)
+{
+  const bit_index_t payload_bytes = roundup_bits2bytes(bitarr->num_of_bits);
+  bit_index_t total = 0;
+
+  // Look at the first byte of an int holding 1 to find the host byte order
+  const int probe = 1;
+  const char host_is_little_endian = (*(const uint8_t*)&probe == 1);
+
+  if(host_is_little_endian)
+  {
+    // Memory layout already matches the file format: write it as it is
+    total += fwrite(&bitarr->num_of_bits, 1, 8, f);
+    total += fwrite(bitarr->words, 1, payload_bytes, f);
+    return total;
+  }
+
+  // Big endian machine: byte swap each value before writing it out
+  uint64_t full_words = payload_bytes / sizeof(word_t);
+  uint64_t tail_bytes = payload_bytes % sizeof(word_t);
+  uint64_t swapped = byteswap64(bitarr->num_of_bits);
+  uint64_t idx;
+
+  // Length header
+  total += fwrite(&swapped, 1, 8, f);
+
+  // Whole words
+  for(idx = 0; idx < full_words; idx++)
+  {
+    swapped = byteswap64(bitarr->words[idx]);
+    total += fwrite(&swapped, 1, 8, f);
+  }
+
+  // Leading bytes of a partly used final word
+  if(tail_bytes > 0)
+  {
+    swapped = byteswap64(bitarr->words[full_words]);
+    total += fwrite(&swapped, 1, tail_bytes, f);
+  }
+
+  return total;
+}
+
+// Assemble a uint64 from 8 bytes stored least significant byte first.
+// Byte values are combined arithmetically, so the result is the same on big
+// and little endian hosts.
+static inline uint64_t le64_to_cpu(const uint8_t *x)
+{
+  uint64_t value = 0;
+  int byte;
+
+  // Start with the most significant byte (x[7]) and shift earlier bytes in
+  for(byte = 7; byte >= 0; byte--)
+  {
+    value = (value << 8) | (uint64_t)x[byte];
+  }
+
+  return value;
+}
+
+// Read a bit array from a stream: an 8 byte little endian bit count followed
+// by the data bytes.  bitarr is resized to the stored length and filled.
+// Returns 1 on success, 0 if either read comes up short.
+char bit_array_load(BIT_ARRAY* bitarr, FILE* f)
+{
+  // Length header, decoded independently of the host byte order
+  uint8_t header[8];
+
+  if(fread(header, 1, 8, f) != 8)
+    return 0;
+
+  bit_index_t stored_bits = le64_to_cpu(header);
+
+  bit_array_resize_critical(bitarr, stored_bits);
+
+  // The file holds whole bytes, which can be fewer than
+  // num_of_words * sizeof(word_t)
+  bit_index_t payload_bytes = roundup_bits2bytes(bitarr->num_of_bits);
+
+  if(fread(bitarr->words, 1, payload_bytes, f) != payload_bytes)
+    return 0;
+
+  // Convert each word from file (little endian) order to host order
+  word_t *cursor = bitarr->words;
+  word_t *end = bitarr->words + bitarr->num_of_words;
+
+  for(; cursor < end; cursor++)
+    *cursor = le64_to_cpu((uint8_t*)cursor);
+
+  // Clear anything above num_of_bits in the top word
+  _mask_top_word(bitarr);
+  DEBUG_VALIDATE(bitarr);
+  return 1;
+}
+
+//
+// Hash function
+//
+
+/* From: lookup3.c, by Bob Jenkins, May 2006, Public Domain. */
+/* hashsize(n): 2^n as a uint32_t.  hashmask(n): a mask of the n low bits. */
+/* rot(x,k): rotate x left by k bits, treating x as a 32 bit value; both */
+/* shifts are only defined for 0 < k < 32. */
+#define hashsize(n) ((uint32_t)1<<(n))
+#define hashmask(n) (hashsize(n)-1)
+#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
+
+/* From: lookup3.c, by Bob Jenkins, May 2006, Public Domain. */
+/* mix(a,b,c): scramble three 32 bit state variables in place.  Expands to a */
+/* braced block, so a, b and c must be modifiable lvalues. */
+#define mix(a,b,c) \
+{ \
+  a -= c;  a ^= rot(c, 4);  c += b; \
+  b -= a;  b ^= rot(a, 6);  a += c; \
+  c -= b;  c ^= rot(b, 8);  b += a; \
+  a -= c;  a ^= rot(c,16);  c += b; \
+  b -= a;  b ^= rot(a,19);  a += c; \
+  c -= b;  c ^= rot(b, 4);  b += a; \
+}
+
+/* From: lookup3.c, by Bob Jenkins, May 2006, Public Domain. */
+/* final(a,b,c): last scrambling step applied to the three state variables, */
+/* in place, before results are read from them.  Expands to a braced block. */
+#define final(a,b,c) \
+{ \
+  c ^= b; c -= rot(b,14); \
+  a ^= c; a -= rot(c,11); \
+  b ^= a; b -= rot(a,25); \
+  c ^= b; c -= rot(b,16); \
+  a ^= c; a -= rot(c,4);  \
+  b ^= a; b -= rot(a,14); \
+  c ^= b; c -= rot(b,24); \
+}
+
+/*
+From: lookup3.c, by Bob Jenkins, May 2006, Public Domain.
+--------------------------------------------------------------------
+hashword2() -- hash an array of uint32_t values using two seeds and
+produce two 32-bit results.  pc and pb must both be nonnull, and *pc and
+*pb must both hold seeds on entry; on return *pc holds the primary hash
+value and *pb the secondary one.  Per lookup3.c, passing (*pb)==0 makes
+the output (*pc) equal to the return value of hashword().
+--------------------------------------------------------------------
+*/
+static void hashword2 (
+const uint32_t *k,                   /* the key, an array of uint32_t values */
+size_t          length,               /* the length of the key, in uint32_ts */
+uint32_t       *pc,                      /* IN: seed OUT: primary hash value */
+uint32_t       *pb)               /* IN: more seed OUT: secondary hash value */
+{
+  uint32_t a, b, c;
+
+  /* All three state words start from the same value; c also takes *pb */
+  a = 0xdeadbeef + ((uint32_t)(length<<2)) + *pc;
+  b = a;
+  c = a + *pb;
+
+  /* Consume the key three words at a time, always leaving 1..3 words (or */
+  /* none, for an empty key) for the tail below */
+  for(; length > 3; length -= 3, k += 3)
+  {
+    a += k[0];
+    b += k[1];
+    c += k[2];
+    mix(a,b,c);
+  }
+
+  /* Tail: add in whatever is left, highest index first, then finalise. */
+  /* With nothing left the state is reported without the final step. */
+  if(length != 0)
+  {
+    if(length > 2) c += k[2];
+    if(length > 1) b += k[1];
+    a += k[0];
+    final(a,b,c);
+  }
+
+  /* Report the result */
+  *pc = c;
+  *pb = b;
+}
+
+// Hash the contents and length of a bit array to a 64 bit value.
+// Pass seed as 0 on first call, pass previous hash value if rehashing due
+// to a collision
+// Using bob jenkins hash lookup3
+uint64_t bit_array_hash(const BIT_ARRAY* bitarr, uint64_t seed)
+{
+  // Split the 64 bit seed into the two 32 bit seeds hashword2 works with
+  uint32_t seed32[2];
+  memcpy(seed32, &seed, sizeof(uint32_t)*2);
+
+  // Round up length to number 32bit words
+  hashword2((uint32_t*)bitarr->words, (bitarr->num_of_bits + 31) / 32,
+            &seed32[0], &seed32[1]);
+
+  // hashword2 returns its two results through seed32; join them back into
+  // the 64 bit value.  Without this the result would not depend on the
+  // array contents at all.
+  memcpy(&seed, seed32, sizeof(uint32_t)*2);
+
+  // XOR with array length. This ensures arrays with different length but same
+  // contents have different hash values
+  seed ^= bitarr->num_of_bits;
+
+  return seed;
+}
+
+
+//
+// Generally useful functions
+//
+
+// Write the first num_of_bits bits found at ptr into str as '0'/'1'
+// characters, least significant bit first, followed by a terminating '\0'.
+// e.g. 0b11010 (26 in decimal) with num_of_bits == 5 comes out as "01011"
+// str needs room for num_of_bits+1 chars.  Returns str.
+char* bit_array_word2str(const void *ptr, size_t num_of_bits, char *str)
+{
+  const uint8_t *bytes = (const uint8_t*)ptr;
+  char *out = str;
+  size_t pos;
+
+  for(pos = 0; pos < num_of_bits; pos++)
+  {
+    // Bit (pos % 8) of byte (pos / 8); '0' + 1 is '1'
+    *out++ = (char)('0' + ((bytes[pos >> 3] >> (pos & 7)) & 0x1));
+  }
+
+  *out = '\0';
+  return str;
+}
+
+// Same as bit_array_word2str but most significant bit first, i.e. the way a
+// binary number is normally written: 0b11010 with num_of_bits == 5 comes out
+// as "11010".  str needs room for num_of_bits+1 chars.  Returns str.
+char* bit_array_word2str_rev(const void *ptr, size_t num_of_bits, char *str)
+{
+  const uint8_t *bytes = (const uint8_t*)ptr;
+  char *out = str + num_of_bits;
+  size_t pos;
+
+  // Bits are read lowest first and written from the end of the string
+  // backwards
+  for(pos = 0; pos < num_of_bits; pos++)
+  {
+    *--out = (bytes[pos / 8] & (1u << (pos % 8))) ? '1' : '0';
+  }
+
+  str[num_of_bits] = '\0';
+  return str;
+}
diff --git a/debian/rapmap/bit_array.h b/debian/rapmap/bit_array.h
new file mode 100644
index 0000000..70b50ad
--- /dev/null
+++ b/debian/rapmap/bit_array.h
@@ -0,0 +1,552 @@
+/*
+ bit_array.h
+ project: bit array C library
+ url: https://github.com/noporpoise/BitArray/
+ maintainer: Isaac Turner <turner.isaac at gmail.com>
+ license: Public Domain, no warranty
+ date: Sep 2014
+*/
+
+#ifndef BIT_ARRAY_HEADER_SEEN
+#define BIT_ARRAY_HEADER_SEEN
+
+#include <stdio.h>
+#include <inttypes.h>
+
+#include "bit_macros.h"
+
+typedef struct BIT_ARRAY BIT_ARRAY;
+
+// 64 bit words
+typedef uint64_t word_t, word_addr_t, bit_index_t;
+typedef uint8_t word_offset_t; // Offset within a 64 bit word
+
+#define BIT_INDEX_MIN 0
+#define BIT_INDEX_MAX (~(bit_index_t)0)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//
+// Structs
+//
+
+// A resizable array of bits stored in 64 bit words.
+struct BIT_ARRAY
+{
+  // Storage for the bits; the arithmetic functions treat words[0] as the
+  // least significant word
+  word_t* words;
+  // Length of the array in bits
+  bit_index_t num_of_bits;
+  // Number of words used -- this is just round_up(num_of_bits / 64)
+  // if num_of_bits == 0, this is 0
+  word_addr_t num_of_words;
+  // For more efficient allocation we use realloc only to double size --
+  // not for adding every word.  Initial size is INIT_CAPACITY_WORDS.
+  word_addr_t capacity_in_words;
+};
+
+//
+// Basics: Constructor, destructor, get length, resize
+//
+
+// Constructor - create a new bit array of length nbits
+BIT_ARRAY* bit_array_create(bit_index_t nbits);
+
+// Destructor - free the memory used for a bit array
+void bit_array_free(BIT_ARRAY* bitarray);
+
+// Allocate using existing struct
+BIT_ARRAY* bit_array_alloc(BIT_ARRAY* bitarr, bit_index_t nbits);
+void bit_array_dealloc(BIT_ARRAY* bitarr);
+
+// Get length of bit array
+bit_index_t bit_array_length(const BIT_ARRAY* bit_arr);
+
+// Change the size of a bit array. Enlarging an array will add zeros
+// to the end of it. Returns 1 on success, 0 on failure (e.g. not enough memory)
+char bit_array_resize(BIT_ARRAY* bitarr, bit_index_t new_num_of_bits);
+
+// If bitarr length < num_bits, resizes to num_bits
+char bit_array_ensure_size(BIT_ARRAY* bitarr, bit_index_t ensure_num_of_bits);
+
+// Same as above but exit with an error message if out of memory
+void bit_array_resize_critical(BIT_ARRAY* bitarr, bit_index_t num_of_bits);
+void bit_array_ensure_size_critical(BIT_ARRAY* bitarr, bit_index_t num_of_bits);
+
+
+//
+// Macros
+//
+
+//
+// Get, set, clear, assign and toggle individual bits
+// Macros for fast access -- beware: no bounds checking
+//
+
+#define bit_array_get(arr,i)      bitset_get((arr)->words, i)
+#define bit_array_set(arr,i)      bitset_set((arr)->words, i)
+#define bit_array_clear(arr,i)    bitset_del((arr)->words, i)
+#define bit_array_toggle(arr,i)   bitset_tgl((arr)->words, i)
+// c must be 0 or 1
+#define bit_array_assign(arr,i,c) bitset_cpy((arr)->words,i,c)
+
+//
+// Get, set, clear, assign and toggle individual bits
+// "Safe": use assert() to check bounds
+//
+
+// Get the value of a bit (returns 0 or 1)
+char bit_array_get_bit(const BIT_ARRAY* bitarr, bit_index_t b);
+void bit_array_set_bit(BIT_ARRAY* bitarr, bit_index_t b);
+void bit_array_clear_bit(BIT_ARRAY* bitarr, bit_index_t b);
+void bit_array_toggle_bit(BIT_ARRAY* bitarr, bit_index_t b);
+// If char c != 0, set bit; otherwise clear bit
+void bit_array_assign_bit(BIT_ARRAY* bitarr, bit_index_t b, char c);
+
+//
+// "Resizing": enlarge array if needed
+//
+
+char bit_array_rget(BIT_ARRAY* bitarr, bit_index_t b);
+void bit_array_rset(BIT_ARRAY* bitarr, bit_index_t b);
+void bit_array_rclear(BIT_ARRAY* bitarr, bit_index_t b);
+void bit_array_rtoggle(BIT_ARRAY* bitarr, bit_index_t b);
+void bit_array_rassign(BIT_ARRAY* bitarr, bit_index_t b, char c);
+
+//
+// Set, clear and toggle several bits at once
+//
+
+// Set multiple bits at once.
+// e.g. set bits 1, 20 & 31: bit_array_set_bits(bitarr, 3, 1,20,31);
+// Note: variable args are of type unsigned int
+void bit_array_set_bits(BIT_ARRAY* bitarr, size_t n, ...);
+
+// Clear multiple bits at once.
+// e.g. clear bits 1, 20 & 31: bit_array_clear_bits(bitarr, 3, 1,20,31);
+// Note: variable args are of type unsigned int
+void bit_array_clear_bits(BIT_ARRAY* bitarr, size_t n, ...);
+
+// Toggle multiple bits at once
+// e.g. toggle bits 1, 20 & 31: bit_array_toggle_bits(bitarr, 3, 1,20,31);
+// Note: variable args are of type unsigned int
+void bit_array_toggle_bits(BIT_ARRAY* bitarr, size_t n, ...);
+
+//
+// Set, clear and toggle all bits in a region
+//
+
+// Set all the bits in a region
+void bit_array_set_region(BIT_ARRAY* bitarr, bit_index_t start, bit_index_t len);
+
+// Clear all the bits in a region
+void bit_array_clear_region(BIT_ARRAY* bitarr, bit_index_t start, bit_index_t len);
+
+// Toggle all the bits in a region
+void bit_array_toggle_region(BIT_ARRAY* bitarr, bit_index_t start, bit_index_t len);
+
+//
+// Set, clear and toggle all bits at once
+//
+
+// Set all bits in this array to 1
+void bit_array_set_all(BIT_ARRAY* bitarr);
+
+// Set all bits in this array to 0
+void bit_array_clear_all(BIT_ARRAY* bitarr);
+
+// Set all 1 bits to 0, and all 0 bits to 1
+void bit_array_toggle_all(BIT_ARRAY* bitarr);
+
+//
+// Get / set a word of a given size
+//
+
+// First bit is in the least significant bit position
+// start index must be within the range of the bit array (0 <= x < length)
+uint64_t bit_array_get_word64(const BIT_ARRAY* bitarr, bit_index_t start);
+uint32_t bit_array_get_word32(const BIT_ARRAY* bitarr, bit_index_t start);
+uint16_t bit_array_get_word16(const BIT_ARRAY* bitarr, bit_index_t start);
+uint8_t  bit_array_get_word8(const BIT_ARRAY* bitarr, bit_index_t start);
+uint64_t bit_array_get_wordn(const BIT_ARRAY* bitarr, bit_index_t start, int n);
+
+// Set a word of the given size at once, starting at a particular position
+void bit_array_set_word64(BIT_ARRAY* bitarr, bit_index_t start, uint64_t word);
+void bit_array_set_word32(BIT_ARRAY* bitarr, bit_index_t start, uint32_t word);
+void bit_array_set_word16(BIT_ARRAY* bitarr, bit_index_t start, uint16_t word);
+void bit_array_set_word8(BIT_ARRAY* bitarr, bit_index_t start, uint8_t byte);
+void bit_array_set_wordn(BIT_ARRAY* bitarr, bit_index_t start, uint64_t word, int n);
+
+//
+// Number of bits set
+//
+
+// Get the number of bits set (hamming weight)
+bit_index_t bit_array_num_bits_set(const BIT_ARRAY* bitarr);
+
+// Get the number of bits not set (length - hamming weight)
+bit_index_t bit_array_num_bits_cleared(const BIT_ARRAY* bitarr);
+
+// Get the number of bits set in one array and not the other.  This is equivalent
+// to hamming weight of the XOR when the two arrays are the same length.
+// e.g. 10101 vs 00111 => hamming distance 2 (XOR is 10010)
+bit_index_t bit_array_hamming_distance(const BIT_ARRAY* arr1,
+                                       const BIT_ARRAY* arr2);
+
+// Parity - returns 1 if odd number of bits set, 0 if even
+char bit_array_parity(const BIT_ARRAY* bitarr);
+
+//
+// Find indices of set/clear bits
+//
+
+// Find the index of the next bit that is set, at or after `offset`
+// Returns 1 if a bit is set, otherwise 0
+// Index of next set bit is stored in the integer pointed to by result
+// If no next bit is set result is not changed
+char bit_array_find_next_set_bit(const BIT_ARRAY* bitarr, bit_index_t offset,
+                                 bit_index_t* result);
+
+// Find the index of the next bit that is NOT set, at or after `offset`
+// Returns 1 if a bit is NOT set, otherwise 0
+// Index of next zero bit is stored in the integer pointed to by `result`
+// If no next bit is zero, value at `result` is not changed
+char bit_array_find_next_clear_bit(const BIT_ARRAY* bitarr, bit_index_t offset,
+                                 bit_index_t* result);
+
+// Find the index of the previous bit that is set, before offset.
+// Returns 1 if a bit is set, otherwise 0
+// Index of previous set bit is stored in the integer pointed to by `result`
+// If no previous bit is set result is not changed
+char bit_array_find_prev_set_bit(const BIT_ARRAY* bitarr, bit_index_t offset,
+                                 bit_index_t* result);
+
+// Find the index of the previous bit that is NOT set, before offset.
+// Returns 1 if a bit is clear, otherwise 0
+// Index of previous zero bit is stored in the integer pointed to by `result`
+// If no previous bit is zero result is not changed
+char bit_array_find_prev_clear_bit(const BIT_ARRAY* bitarr, bit_index_t offset,
+                                   bit_index_t* result);
+
+// Find the index of the first bit that is set.
+// Returns 1 if a bit is set, otherwise 0
+// Index of first set bit is stored in the integer pointed to by `result`
+// If no bit is set result is not changed
+char bit_array_find_first_set_bit(const BIT_ARRAY* bitarr, bit_index_t* result);
+
+// Find the index of the first bit that is NOT set.
+// Returns 1 if a bit is clear, otherwise 0
+// Index of first zero bit is stored in the integer pointed to by `result`
+// If no bit is zero result is not changed
+char bit_array_find_first_clear_bit(const BIT_ARRAY* bitarr, bit_index_t* result);
+
+// Find the index of the last bit that is set.
+// Returns 1 if a bit is set, otherwise 0
+// Index of last set bit is stored in the integer pointed to by `result`
+// If no bit is set result is not changed
+char bit_array_find_last_set_bit(const BIT_ARRAY* bitarr, bit_index_t* result);
+
+// Find the index of the last bit that is NOT set.
+// Returns 1 if a bit is clear, otherwise 0
+// Index of last zero bit is stored in the integer pointed to by `result`
+// If no bit is zero result is not changed
+char bit_array_find_last_clear_bit(const BIT_ARRAY* bitarr, bit_index_t* result);
+
+
+//
+// Sorting
+//
+
+// Put all the 0s before all the 1s
+void bit_array_sort_bits(BIT_ARRAY* bitarr);
+
+// Put all the 1s before all the 0s
+void bit_array_sort_bits_rev(BIT_ARRAY* bitarr);
+
+
+//
+// String and printing methods
+//
+
+// Construct a BIT_ARRAY from a string.
+void bit_array_from_str(BIT_ARRAY* bitarr, const char* bitstr);
+
+// Construct a BIT_ARRAY from a substring with given on and off characters.
+void bit_array_from_substr(BIT_ARRAY* bitarr, bit_index_t offset,
+                           const char* str, size_t len,
+                           const char *on, const char *off, char left_to_right);
+
+// Takes a char array to write to.  `str` must be bitarr->num_of_bits+1 in
+// length. Terminates string with '\0'
+char* bit_array_to_str(const BIT_ARRAY* bitarr, char* str);
+char* bit_array_to_str_rev(const BIT_ARRAY* bitarr, char* str);
+
+// Get a string representation for a given region, using given on/off
+// characters.
+// Note: does not null-terminate
+void bit_array_to_substr(const BIT_ARRAY* bitarr,
+                         bit_index_t start, bit_index_t length,
+                         char* str, char on, char off, char left_to_right);
+
+// Print this array to a file stream.  Prints '0's and '1'.  Doesn't print
+// newline.
+void bit_array_print(const BIT_ARRAY* bitarr, FILE* fout);
+
+// Print a string representation for a given region, using given on/off
+// characters. Reverse prints from highest to lowest -- this is useful for
+// printing binary numbers
+void bit_array_print_substr(const BIT_ARRAY* bitarr,
+                            bit_index_t start, bit_index_t length,
+                            FILE* fout, char on, char off, char left_to_right);
+
+//
+// Decimal
+//
+
+// Get bit array as decimal str (e.g. 0b1101 -> "13")
+size_t bit_array_to_decimal(const BIT_ARRAY *bitarr, char *str, size_t len);
+
+// Return number of characters used
+size_t bit_array_from_decimal(BIT_ARRAY *bitarr, const char* decimal);
+
+//
+// Hexadecimal
+//
+
+// Loads array from hex string
+// Returns the number of bits loaded (will be chars rounded up to multiple of 8)
+// (0 on failure)
+bit_index_t bit_array_from_hex(BIT_ARRAY* bitarr, bit_index_t offset,
+                               const char* str, size_t len);
+
+// Returns number of characters written
+size_t bit_array_to_hex(const BIT_ARRAY* bitarr,
+                        bit_index_t start, bit_index_t length,
+                        char* str, char uppercase);
+
+// Print bit array as hex
+size_t bit_array_print_hex(const BIT_ARRAY* bitarr,
+                           bit_index_t start, bit_index_t length,
+                           FILE* fout, char uppercase);
+
+//
+// Clone and copy
+//
+
+// Copy a BIT_ARRAY struct and the data it holds - returns pointer to new object
+#define bit_array_dup	bit_array_clone
+BIT_ARRAY* bit_array_clone(const BIT_ARRAY* bitarr);
+
+// Copy bits from one array to another
+// Note: use MACRO bit_array_copy
+// Destination and source can be the same bit_array and
+// src/dst regions can overlap
+void bit_array_copy(BIT_ARRAY* dst, bit_index_t dstindx,
+                    const BIT_ARRAY* src, bit_index_t srcindx,
+                    bit_index_t length);
+
+// copy all of src to dst. dst is resized to match src.
+void bit_array_copy_all(BIT_ARRAY* dst, const BIT_ARRAY* src);
+
+//
+// Logic operators
+//
+
+// BIT_ARRAYs can all be different or the same object
+// dest array will be resized if it is too short
+//
+void bit_array_and(BIT_ARRAY* dest, const BIT_ARRAY* src1, const BIT_ARRAY* src2);
+void bit_array_or (BIT_ARRAY* dest, const BIT_ARRAY* src1, const BIT_ARRAY* src2);
+void bit_array_xor(BIT_ARRAY* dest, const BIT_ARRAY* src1, const BIT_ARRAY* src2);
+void bit_array_not(BIT_ARRAY* dest, const BIT_ARRAY* src);
+
+//
+// Comparisons
+//
+
+// Note: (bit_array_cmp(a,b) == 0) <=> (bit_array_cmp_big_endian(a,b) == 0)
+
+// comparison functions return:
+//   1 iff bitarr1 > bitarr2
+//   0 iff bitarr1 == bitarr2
+//  -1 iff bitarr1 < bitarr2
+
+// Compare two bit arrays by value stored, with index 0 being the Least
+// Significant Bit (LSB). Arrays do not have to be the same length.
+// Example: ..0101 (5) > ...0011 (3) [index 0 is LSB at right hand side]
+int bit_array_cmp(const BIT_ARRAY* bitarr1, const BIT_ARRAY* bitarr2);
+
+// Compare two bit arrays by value stored, with index 0 being the Most
+// Significant Bit (MSB). Arrays do not have to be the same length.
+// Example: 10.. > 01.. [index 0 is MSB at left hand side]
+int bit_array_cmp_big_endian(const BIT_ARRAY* bitarr1, const BIT_ARRAY* bitarr2);
+
+// compare bitarr with (bitarr2 << pos)
+int bit_array_cmp_words(const BIT_ARRAY *bitarr,
+                        bit_index_t pos, const BIT_ARRAY *bitarr2);
+
+//
+// Shift, interleave, reverse
+//
+
+// Shift array left/right.  If fill is zero, filled with 0, otherwise 1
+void bit_array_shift_right(BIT_ARRAY* bitarr, bit_index_t shift_dist, char fill);
+void bit_array_shift_left (BIT_ARRAY* bitarr, bit_index_t shift_dist, char fill);
+
+// shift left without losing any bits. Resizes bitarr.
+void bit_array_shift_left_extend(BIT_ARRAY* bitarr, bit_index_t shift_dist,
+                                 char fill);
+
+// Cyclic shift
+void bit_array_cycle_right(BIT_ARRAY* bitarr, bit_index_t dist);
+void bit_array_cycle_left (BIT_ARRAY* bitarr, bit_index_t dist);
+
+// Interleave
+// dst cannot point to the same bit array as src1 or src2
+// src1, src2 may point to the same bit array
+// abcd 1234 -> a1b2c3d4
+// 0011 0000 -> 00001010
+// 1111 0000 -> 10101010
+// 0101 1010 -> 01100110
+// Extends dst if it is too short, but does not shrink it if it is too long
+// if dst is longer than length(src1)+length(src2), the end bits are not altered
+void bit_array_interleave(BIT_ARRAY* dst,
+                          const BIT_ARRAY* src1,
+                          const BIT_ARRAY* src2);
+
+// Reverse the whole array or part of it
+void bit_array_reverse(BIT_ARRAY* bitarr);
+void bit_array_reverse_region(BIT_ARRAY* bitarr, bit_index_t start, bit_index_t len);
+
+//
+// Numeric
+//
+
+// Returns 1 on success, 0 if value in array is too big
+char bit_array_as_num(const BIT_ARRAY* bitarr, uint64_t* result);
+
+// 1 iff bitarr > value
+// 0 iff bitarr == value
+// -1 iff bitarr < value
+int bit_array_cmp_uint64(const BIT_ARRAY* bitarr, uint64_t value);
+
+//
+// Arithmetic
+//
+
+// bitarr will be extended if needed
+void bit_array_add_uint64(BIT_ARRAY* bitarr, uint64_t value);
+
+// Add `add` to `bitarr` at `pos` -- same as:
+//   bitarr + (add << pos)
+// where pos can be bigger than the length of the array (bitarr will be resized)
+void bit_array_add_word(BIT_ARRAY *bitarr, bit_index_t pos, uint64_t add);
+
+// Add `add` to `bitarr` at `pos`
+void bit_array_add_words(BIT_ARRAY *bitarr, bit_index_t pos, const BIT_ARRAY *add);
+
+// If value is greater than bitarr, bitarr is not changed and 0 is returned
+// Returns 1 on success, 0 if value > bitarr
+char bit_array_sub_uint64(BIT_ARRAY* bitarr, uint64_t value);
+
+// Subtract `minus` from `bitarr` at `pos` -- same as:
+//   bitarr - (minus << pos)
+// Returns 1 on success, 0 if value > bitarr
+char bit_array_sub_word(BIT_ARRAY *bitarr, bit_index_t pos, word_t minus);
+
+// minus `minus` from `bitarr` at `pos`
+// Returns 1 on success, 0 if value > bitarr
+char bit_array_sub_words(BIT_ARRAY* bitarr, bit_index_t pos, BIT_ARRAY* minus);
+
+// Multiply by some value
+void bit_array_mul_uint64(BIT_ARRAY *bitarr, uint64_t multiplier);
+
+// bitarr = round_down(bitarr / divisor)
+// rem = bitarr % divisor
+void bit_array_div_uint64(BIT_ARRAY *bitarr, uint64_t divisor, uint64_t *rem);
+
+//
+// Arithmetic between arrays
+//
+
+// dst = src1 + src2
+// src1, src2 and dst can all be the same BIT_ARRAY
+// If dst is shorter than either of src1, src2, it is enlarged
+void bit_array_add(BIT_ARRAY* dst, const BIT_ARRAY* src1, const BIT_ARRAY* src2);
+
+// dst = src1 - src2
+// src1, src2 and dst can all be the same BIT_ARRAY
+// If dst is shorter than src1, it will be extended to be as long as src1
+// src1 must be greater than or equal to src2 (src1 >= src2)
+void bit_array_subtract(BIT_ARRAY* dst,
+                        const BIT_ARRAY* src1, const BIT_ARRAY* src2);
+
+// dst = src1 * src2
+// Pointers cannot all point to the same BIT_ARRAY
+void bit_array_multiply(BIT_ARRAY *dst, BIT_ARRAY *src1, BIT_ARRAY *src2);
+
+// Results in:
+//   quotient = dividend / divisor
+//   dividend = dividend % divisor
+// (dividend is used to return the remainder)
+void bit_array_divide(BIT_ARRAY *dividend, BIT_ARRAY *quotient, BIT_ARRAY *divisor);
+
+//
+// Read/Write bit_array to a file
+//
+// File format is [8 bytes: for number of elements in array][data]
+// Number of bytes of data is: (int)((num_of_bits + 7) / 8)
+//
+
+// Saves bit array to a file
+// returns the number of bytes written
+bit_index_t bit_array_save(const BIT_ARRAY* bitarr, FILE* f);
+
+// Reads bit array from a file. bitarr is resized and filled.
+// Returns 1 on success, 0 on failure
+char bit_array_load(BIT_ARRAY* bitarr, FILE* f);
+
+
+//
+// Hash function
+//
+
+// Pass seed as 0 on first call, pass previous hash value if rehashing due
+// to a collision
+// Using bob jenkins hash lookup3
+uint64_t bit_array_hash(const BIT_ARRAY* bitarr, uint64_t seed);
+
+//
+// Randomness
+//
+
+// Set bits randomly with probability prob : 0 <= prob <= 1
+void bit_array_random(BIT_ARRAY* bitarr, float prob);
+
+// Shuffle the bits in an array randomly
+void bit_array_shuffle(BIT_ARRAY* bitarr);
+
+// Get the next permutation of an array with a fixed size and given number of
+// bits set.  Also known as next lexicographic permutation.
+// Given a bit array find the next lexicographic organisation of the bits
+// Number of possible combinations given by (size choose bits_set) i.e. nCk
+// 00011 -> 00101 -> 00110 -> 01001 -> 01010 ->
+// 01100 -> 10001 -> 10010 -> 10100 -> 11000 -> 00011 (back to start)
+void bit_array_next_permutation(BIT_ARRAY* bitarr);
+
+//
+// Generally useful functions
+//
+
+// Generalised 'binary to string' function
+// Adds bits to the string in order of lsb to msb
+// e.g. 0b11010 (26 in decimal) would come out as "01011"
+char* bit_array_word2str(const void *ptr, size_t num_of_bits, char *str);
+
+// Same as above but in reverse
+char* bit_array_word2str_rev(const void *ptr, size_t num_of_bits, char *str);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/debian/rapmap/bit_macros.h b/debian/rapmap/bit_macros.h
new file mode 100644
index 0000000..7eebfcb
--- /dev/null
+++ b/debian/rapmap/bit_macros.h
@@ -0,0 +1,205 @@
+/*
+ bit_macros.h
+ project: bit array C library
+ url: https://github.com/noporpoise/BitArray/
+ author: Isaac Turner <turner.isaac at gmail.com>
+ license: Public Domain, no warranty
+ date: Dec 2013
+*/
+
+#ifndef BITSET_H_
+#define BITSET_H_
+
+#include <inttypes.h>
+#include <sched.h>
+
+// trailing_zeros is number of least significant zeros; leading_zeros is number
+// of most significant zeros in a 64-bit word. Both give sizeof(x)*8 for x == 0.
+#if defined(_WIN32) /* _BitScanForward64 finds the lowest set bit, _BitScanReverse64 the highest */
+  #define trailing_zeros(x) ({ unsigned long _r = 0; _BitScanForward64(&_r, (x)) ? (__typeof(x))_r : (__typeof(x))sizeof(x)*8; })
+  #define leading_zeros(x) ({ unsigned long _r = 0; _BitScanReverse64(&_r, (x)) ? (__typeof(x))(63 - _r) : (__typeof(x))sizeof(x)*8; })
+#else
+  #define trailing_zeros(x) ((x) ? (__typeof(x))__builtin_ctzll(x) : (__typeof(x))sizeof(x)*8)
+  #define leading_zeros(x) ((x) ? (__typeof(x))__builtin_clzll(x) : (__typeof(x))sizeof(x)*8)
+#endif
+
+// Get index of top set bit. If x is 0 return nbits
+#define top_set_bit(x) ((x) ? sizeof(x)*8-leading_zeros(x)-1 : sizeof(x)*8)
+
+#define roundup_bits2bytes(bits)   (((bits)+7)/8)
+#define roundup_bits2words32(bits) (((bits)+31)/32)
+#define roundup_bits2words64(bits) (((bits)+63)/64)
+
+// Round up to a power of two: for non-zero x gives the smallest power of two strictly greater than x (5 -> 8, 8 -> 16)
+#define roundup2pow(x) (1UL << (64 - leading_zeros(x)))
+
+#define rot32(x,r) (((x)<<(r)) | ((x)>>(32-(r))))
+#define rot64(x,r) (((x)<<(r)) | ((x)>>(64-(r))))
+
+// need to check for length == 0, undefined behaviour if uint64_t >> 64 etc
+#define bitmask(nbits,type) ((nbits) ? ~(type)0 >> (sizeof(type)*8-(nbits)): (type)0)
+#define bitmask32(nbits) bitmask(nbits,uint32_t)
+#define bitmask64(nbits) bitmask(nbits,uint64_t)
+
+// A possibly faster way to combine two words with a mask: takes the bits of `a`
+// where `abits` is set and the bits of `b` elsewhere, i.e. ((a) & (abits)) | ((b) & ~(abits))
+#define bitmask_merge(a,b,abits) ((b) ^ (((a) ^ (b)) & (abits)))
+
+// Swap lowest four bits. A nibble is 4 bits (i.e. half a byte)
+#define rev_nibble(x) ((((x)&1)<<3)|(((x)&2)<<1)|(((x)&4)>>1)|(((x)&8)>>3))
+
+//
+// Bit array (bitset)
+//
+// bitsetX_wrd(): get word for a given position
+// bitsetX_idx(): get index within word for a given position
+#define _VOLPTR(x) ((volatile __typeof(x) *)(&(x)))
+#define _VOLVALUE(x) (*_VOLPTR(x))
+
+#define _TYPESHIFT(arr,word,shift) \
+        ((__typeof(*(arr)))((__typeof(*(arr)))(word) << (shift)))
+
+#define bitsetX_wrd(wrdbits,pos) ((pos) / (wrdbits))
+#define bitsetX_idx(wrdbits,pos) ((pos) % (wrdbits))
+
+#define bitset32_wrd(pos) ((pos) >> 5)
+#define bitset32_idx(pos) ((pos) & 31)
+
+#define bitset64_wrd(pos) ((pos) >> 6)
+#define bitset64_idx(pos) ((pos) & 63)
+
+//
+// Bit functions on arrays
+//
+#define bitset2_get(arr,wrd,idx)     (((arr)[wrd] >> (idx)) & 0x1)
+#define bitset2_set(arr,wrd,idx)     ((arr)[wrd] |=  _TYPESHIFT(arr,1,idx))
+#define bitset2_del(arr,wrd,idx)     ((arr)[wrd] &=~ _TYPESHIFT(arr,1,idx))
+#define bitset2_tgl(arr,wrd,idx)     ((arr)[wrd] ^=  _TYPESHIFT(arr,1,idx))
+#define bitset2_or(arr,wrd,idx,bit)  ((arr)[wrd] |=  _TYPESHIFT(arr,bit,idx))
+#define bitset2_xor(arr,wrd,idx,bit) ((arr)[wrd]  = ~((arr)[wrd] ^ (~_TYPESHIFT(arr,bit,idx))))
+#define bitset2_and(arr,wrd,idx,bit) ((arr)[wrd] &= (_TYPESHIFT(arr,bit,idx) | ~_TYPESHIFT(arr,1,idx)))
+#define bitset2_cpy(arr,wrd,idx,bit) ((arr)[wrd]  = ((arr)[wrd] &~ _TYPESHIFT(arr,1,idx)) | _TYPESHIFT(arr,bit,idx))
+
+//
+// Thread safe versions
+//
+// They return the value of the bit (0 or 1) before it was updated
+#define bitset2_get_mt(arr,wrd,idx)     bitset2_get(_VOLPTR(*(arr)),wrd,idx)
+#define bitset2_set_mt(arr,wrd,idx)     ((__sync_fetch_and_or (_VOLPTR((arr)[wrd]),  _TYPESHIFT(arr,1,idx)) >> (idx))&1)
+#define bitset2_del_mt(arr,wrd,idx)     ((__sync_fetch_and_and(_VOLPTR((arr)[wrd]), ~_TYPESHIFT(arr,1,idx)) >> (idx))&1)
+#define bitset2_tgl_mt(arr,wrd,idx)     ((__sync_fetch_and_xor(_VOLPTR((arr)[wrd]),  _TYPESHIFT(arr,1,idx)) >> (idx))&1)
+#define bitset2_or_mt(arr,wrd,idx,bit)  ((__sync_fetch_and_or (_VOLPTR((arr)[wrd]),  _TYPESHIFT(arr,bit,idx)) >> (idx))&1)
+#define bitset2_xor_mt(arr,wrd,idx,bit) ((__sync_fetch_and_xor(_VOLPTR((arr)[wrd]),  _TYPESHIFT(arr,bit,idx)) >> (idx))&1)
+#define bitset2_and_mt(arr,wrd,idx,bit) ((__sync_fetch_and_and(_VOLPTR((arr)[wrd]), (_TYPESHIFT(arr,bit,idx) | ~_TYPESHIFT(arr,1,idx))) >> (idx))&1)
+#define bitset2_cpy_mt(arr,wrd,idx,bit) ((bit) ? bitset2_set_mt(arr,wrd,idx) : bitset2_del_mt(arr,wrd,idx))
+
+//
+// Auto detect size of type from pointer
+//
+#define bitset_wrd(arr,pos) bitsetX_wrd(sizeof(*(arr))*8,pos)
+#define bitset_idx(arr,pos) bitsetX_idx(sizeof(*(arr))*8,pos)
+#define bitset_op(func,arr,pos)      func(arr, bitset_wrd(arr,pos), bitset_idx(arr,pos))
+#define bitset_op2(func,arr,pos,bit) func(arr, bitset_wrd(arr,pos), bitset_idx(arr,pos), bit)
+
+// Auto-detect type size: bit functions
+#define bitset_get(arr,pos)     bitset_op(bitset2_get, arr, pos)
+#define bitset_set(arr,pos)     bitset_op(bitset2_set, arr, pos)
+#define bitset_del(arr,pos)     bitset_op(bitset2_del, arr, pos)
+#define bitset_tgl(arr,pos)     bitset_op(bitset2_tgl, arr, pos)
+#define bitset_or(arr,pos,bit)  bitset_op2(bitset2_or, arr, pos, bit)
+#define bitset_xor(arr,pos,bit) bitset_op2(bitset2_xor, arr, pos, bit)
+#define bitset_and(arr,pos,bit) bitset_op2(bitset2_and, arr, pos, bit)
+#define bitset_cpy(arr,pos,bit) bitset_op2(bitset2_cpy, arr, pos, bit)
+
+// Auto-detect type size: thread safe bit functions
+// They return the value of the bit (0 or 1) before it was updated
+#define bitset_get_mt(arr,pos)     bitset_op(bitset2_get_mt,  arr, pos)
+#define bitset_set_mt(arr,pos)     bitset_op(bitset2_set_mt,  arr, pos)
+#define bitset_del_mt(arr,pos)     bitset_op(bitset2_del_mt,  arr, pos)
+#define bitset_tgl_mt(arr,pos)     bitset_op(bitset2_tgl_mt,  arr, pos)
+#define bitset_or_mt(arr,pos,bit)  bitset_op2(bitset2_or_mt,  arr, pos, bit)
+#define bitset_xor_mt(arr,pos,bit) bitset_op2(bitset2_xor_mt, arr, pos, bit)
+#define bitset_and_mt(arr,pos,bit) bitset_op2(bitset2_and_mt, arr, pos, bit)
+#define bitset_cpy_mt(arr,pos,bit) bitset_op2(bitset2_cpy_mt, arr, pos, bit)
+
+// Clearing a word does not return a meaningful value
+#define bitset_clear_word(arr,pos) ((arr)[bitset_wrd(arr,pos)] = 0)
+#define bitset_clear_word_mt(arr,pos) (_VOLVALUE((arr)[bitset_wrd(arr,pos)]) = 0)
+
+//
+// Compact bit array of spin locks
+// These are most efficient when arr is of type: volatile char*
+//
+// Acquire a lock
+#define bitlock_acquire_block(arr,pos,wait,abandon) do {                       \
+  size_t _w = bitset_wrd(arr,pos);                                             \
+  __typeof(*(arr)) _o, _n, _b = _TYPESHIFT(arr, 1, bitset_idx(arr,pos));       \
+  do {                                                                         \
+    while((_o = _VOLVALUE((arr)[_w])) & _b) { wait }                           \
+    abandon                                                                    \
+    _n = _o | _b;                                                              \
+  } while(!__sync_bool_compare_and_swap(_VOLPTR((arr)[_w]), _o, _n));          \
+  __sync_synchronize(); /* Must not move commands to before acquiring lock */  \
+} while(0)
+
+// Undefined behaviour if you do not already hold the lock
+#define bitlock_release(arr,pos) do {                                          \
+  size_t _w = bitset_wrd(arr,pos);                                             \
+  __typeof(*(arr)) _mask = ~_TYPESHIFT(arr, 1, bitset_idx(arr,pos));           \
+  __sync_synchronize(); /* Must get the lock before releasing it */            \
+  __sync_and_and_fetch(_VOLPTR((arr)[_w]), _mask);                             \
+} while(0)
+
+#define bitlock_acquire(arr,pos) bitlock_acquire_block(arr,pos,{},{})
+
+// calls yield if cannot acquire the lock
+#define bitlock_yield_acquire(arr,pos) bitlock_acquire_block(arr,pos,sched_yield();,{})
+
+// Try once to take the lock: gives up as soon as the bit is seen held by someone else
+// sets the memory pointed to by retptr to 1 if we got the lock, 0 otherwise
+#define bitlock_try_acquire(arr,pos,retptr) do {                               \
+  *(retptr) = 1; /* default to success, set to zero if locked */               \
+  bitlock_acquire_block(arr,pos,{*(retptr)=0;break;},if(!*(retptr)){break;});  \
+} while(0)
+
+/*
+ * Byteswapping
+ */
+
+/* clang uses these to check for features */
+#ifndef __has_feature
+#define __has_feature(x) 0
+#endif
+
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
+/* GCC versions < 4.3 do not have __builtin_bswapX() */
+#if ( defined(__clang__) && !__has_builtin(__builtin_bswap64) ) ||             \
+    ( !defined(__clang__) && defined(__GNUC__) && defined(__GNUC_MINOR__) &&   \
+      ( (__GNUC__ < 4)  || (__GNUC__ == 4 && __GNUC_MINOR__ < 3)) )
+  #define byteswap64(x) ( (((uint64_t)(x) << 56))                       | \
+                          (((uint64_t)(x) << 40) & 0xff000000000000ULL) | \
+                          (((uint64_t)(x) << 24) & 0xff0000000000ULL)   | \
+                          (((uint64_t)(x) <<  8) & 0xff00000000ULL)     | \
+                          (((uint64_t)(x) >>  8) & 0xff000000ULL)       | \
+                          (((uint64_t)(x) >> 24) & 0xff0000ULL)         | \
+                          (((uint64_t)(x) >> 40) & 0xff00ULL)           | \
+                          (((uint64_t)(x) >> 56)) )
+
+  #define byteswap32(x) ( (((uint32_t)(x) << 24))                       | \
+                          (((uint32_t)(x) <<  8) & 0xff0000U)           | \
+                          (((uint32_t)(x) >>  8) & 0xff00U)             | \
+                          (((uint32_t)(x) >> 24)) )
+
+  /* uint16_t type might be bigger than 2 bytes, so need to mask */
+  #define byteswap16(x) ( (((uint16_t)(x) & 0xff) << 8) | \
+                          (((uint16_t)(x) >> 8) & 0xff) )
+#else
+  #define byteswap64(x) __builtin_bswap64(x)
+  #define byteswap32(x) __builtin_bswap32(x)
+  #define byteswap16(x) ((uint16_t)(__builtin_bswap32((uint32_t)(x)) >> 16)) /* swap low 2 bytes without relying on a bswap16 builtin */
+#endif
+
+#endif /* BITSET_H_ */
diff --git a/debian/rapmap/kseq.h b/debian/rapmap/kseq.h
new file mode 100644
index 0000000..b2238d1
--- /dev/null
+++ b/debian/rapmap/kseq.h
@@ -0,0 +1,235 @@
+/* The MIT License
+
+   Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor at live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/* Last Modified: 05MAR2012 */
+
+#ifndef AC_KSEQ_H
+#define AC_KSEQ_H
+
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+
+#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
+#define KS_SEP_TAB   1 // isspace() && !' '
+#define KS_SEP_LINE  2 // line separator: "\n" (Unix) or "\r\n" (Windows)
+#define KS_SEP_MAX   2
+
+#define __KS_TYPE(type_t)						\
+	typedef struct __kstream_t {				\
+		unsigned char *buf;						\
+		int begin, end, is_eof;					\
+		type_t f;								\
+	} kstream_t;
+
+#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
+#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
+
+#define __KS_BASIC(type_t, __bufsize)								\
+	static inline kstream_t *ks_init(type_t f)						\
+	{																\
+		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t));	\
+		ks->f = f;													\
+		ks->buf = (unsigned char*)malloc(__bufsize);				\
+		return ks;													\
+	}																\
+	static inline void ks_destroy(kstream_t *ks)					\
+	{																\
+		if (ks) {													\
+			free(ks->buf);											\
+			free(ks);												\
+		}															\
+	}
+
+#define __KS_GETC(__read, __bufsize) /* defines ks_getc(): next byte of the stream, or -1 at end of input */ \
+	static inline int ks_getc(kstream_t *ks)				\
+	{														\
+		if (ks->is_eof && ks->begin >= ks->end) return -1;	\
+		if (ks->begin >= ks->end) { /* buffer drained: refill it */ \
+			ks->begin = 0;									\
+			ks->end = __read(ks->f, ks->buf, __bufsize);	\
+			if (ks->end <= 0) { /* 0 = EOF, < 0 = read error: both end the stream */ ks->end = 0; ks->is_eof = 1; return -1;}	\
+		}													\
+		return (int)ks->buf[ks->begin++];					\
+	}
+
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+	size_t l, m;
+	char *s;
+} kstring_t;
+#endif
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#define __KS_GETUNTIL(__read, __bufsize) /* defines ks_getuntil2()/ks_getuntil(): read up to a delimiter into str */ \
+	static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
+	{																	\
+		int gotany = 0;													\
+		if (dret) *dret = 0;											\
+		str->l = append? str->l : 0;									\
+		for (;;) {														\
+			int i;														\
+			if (ks->begin >= ks->end) {									\
+				if (!ks->is_eof) {										\
+					ks->begin = 0;										\
+					ks->end = __read(ks->f, ks->buf, __bufsize);		\
+					if (ks->end <= 0) { /* 0 = EOF, < 0 = read error: stop instead of rescanning an empty buffer */ ks->end = 0; ks->is_eof = 1; break; } \
+				} else break;											\
+			}															\
+			if (delimiter == KS_SEP_LINE) { \
+				for (i = ks->begin; i < ks->end; ++i) \
+					if (ks->buf[i] == '\n') break; \
+			} else if (delimiter > KS_SEP_MAX) {						\
+				for (i = ks->begin; i < ks->end; ++i)					\
+					if (ks->buf[i] == delimiter) break;					\
+			} else if (delimiter == KS_SEP_SPACE) {						\
+				for (i = ks->begin; i < ks->end; ++i)					\
+					if (isspace(ks->buf[i])) break;						\
+			} else if (delimiter == KS_SEP_TAB) {						\
+				for (i = ks->begin; i < ks->end; ++i)					\
+					if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
+			} else i = 0; /* never come to here! */						\
+			if (str->m - str->l < (size_t)(i - ks->begin + 1)) { /* grow str to hold the chunk plus the terminator */ \
+				str->m = str->l + (i - ks->begin) + 1;					\
+				kroundup32(str->m);										\
+				str->s = (char*)realloc(str->s, str->m);				\
+			}															\
+			gotany = 1;													\
+			memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
+			str->l = str->l + (i - ks->begin);							\
+			ks->begin = i + 1;											\
+			if (i < ks->end) { /* delimiter found inside the buffer */	\
+				if (dret) *dret = ks->buf[i];							\
+				break;													\
+			}															\
+		}																\
+		if (!gotany && ks_eof(ks)) return -1;							\
+		if (str->s == 0) {												\
+			str->m = 1;													\
+			str->s = (char*)calloc(1, 1);								\
+		} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
+		str->s[str->l] = '\0';											\
+		return str->l;													\
+	} \
+	static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
+	{ return ks_getuntil2(ks, delimiter, str, dret, 0); }
+
+#define KSTREAM_INIT(type_t, __read, __bufsize) \
+	__KS_TYPE(type_t)							\
+	__KS_BASIC(type_t, __bufsize)				\
+	__KS_GETC(__read, __bufsize)				\
+	__KS_GETUNTIL(__read, __bufsize)
+
+#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
+
+#define __KSEQ_BASIC(SCOPE, type_t)										\
+	SCOPE kseq_t *kseq_init(type_t fd)									\
+	{																	\
+		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t));					\
+		s->f = ks_init(fd);												\
+		return s;														\
+	}																	\
+	SCOPE void kseq_destroy(kseq_t *ks)									\
+	{																	\
+		if (!ks) return;												\
+		free(ks->name.s); free(ks->comment.s); free(ks->seq.s);	free(ks->qual.s); \
+		ks_destroy(ks->f);												\
+		free(ks);														\
+	}
+
+/* Return value:
+   >=0  length of the sequence (normal)
+   -1   end-of-file
+   -2   truncated quality string
+ */
+#define __KSEQ_READ(SCOPE) \
+	SCOPE int kseq_read(kseq_t *seq) \
+	{ \
+		int c; \
+		kstream_t *ks = seq->f; \
+		if (seq->last_char == 0) { /* then jump to the next header line */ \
+			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
+			if (c == -1) return -1; /* end of file */ \
+			seq->last_char = c; \
+		} /* else: the first header char has been read in the previous call */ \
+		seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
+		if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
+		if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
+		if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
+			seq->seq.m = 256; \
+			seq->seq.s = (char*)malloc(seq->seq.m); \
+		} \
+		while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
+			if (c == '\n') continue; /* skip empty lines */ \
+			seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
+			ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
+		} \
+		if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */	\
+		if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
+			seq->seq.m = seq->seq.l + 2; \
+			kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
+			seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
+		} \
+		seq->seq.s[seq->seq.l] = 0;	/* null terminated string */ \
+		if (c != '+') return seq->seq.l; /* FASTA */ \
+		if (seq->qual.m < seq->seq.m) {	/* allocate memory for qual in case insufficient */ \
+			seq->qual.m = seq->seq.m; \
+			seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
+		} \
+		while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
+		if (c == -1) return -2; /* error: no quality string */ \
+		while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
+		seq->last_char = 0;	/* we have not come to the next header line */ \
+		if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
+		return seq->seq.l; \
+	}
+
+#define __KSEQ_TYPE(type_t)						\
+	typedef struct {							\
+		kstring_t name, comment, seq, qual;		\
+		int last_char;							\
+		kstream_t *f;							\
+	} kseq_t;
+
+#define KSEQ_INIT2(SCOPE, type_t, __read)		\
+	KSTREAM_INIT(type_t, __read, 16384)			\
+	__KSEQ_TYPE(type_t)							\
+	__KSEQ_BASIC(SCOPE, type_t)					\
+	__KSEQ_READ(SCOPE)
+
+#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
+
+#define KSEQ_DECLARE(type_t) \
+	__KS_TYPE(type_t) \
+	__KSEQ_TYPE(type_t) \
+	extern kseq_t *kseq_init(type_t fd); \
+	void kseq_destroy(kseq_t *ks); \
+	int kseq_read(kseq_t *seq);
+
+#endif
diff --git a/debian/rapmap/macros.h b/debian/rapmap/macros.h
new file mode 100644
index 0000000..8a0853d
--- /dev/null
+++ b/debian/rapmap/macros.h
@@ -0,0 +1,59 @@
+/*		 
+ * Sux: Succinct data structures
+ *
+ * Copyright (C) 2007-2013 Sebastiano Vigna 
+ *
+ *  This library is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU Lesser General Public License as published by the Free
+ *  Software Foundation; either version 3 of the License, or (at your option)
+ *  any later version.
+ *
+ *  This library is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef ranksel_macros_h
+#define ranksel_macros_h
+
+#define ONES_STEP_4 ( 0x1111111111111111ULL )
+#define ONES_STEP_8 ( 0x0101010101010101ULL )
+#define ONES_STEP_9 ( 1ULL << 0 | 1ULL << 9 | 1ULL << 18 | 1ULL << 27 | 1ULL << 36 | 1ULL << 45 | 1ULL << 54 )
+#define ONES_STEP_16 ( 1ULL << 0 | 1ULL << 16 | 1ULL << 32 | 1ULL << 48 )
+#define MSBS_STEP_4 ( 0x8ULL * ONES_STEP_4 )
+#define MSBS_STEP_8 ( 0x80ULL * ONES_STEP_8 )
+#define MSBS_STEP_9 ( 0x100ULL * ONES_STEP_9 )
+#define MSBS_STEP_16 ( 0x8000ULL * ONES_STEP_16 )
+#define INCR_STEP_8 ( 0x80ULL << 56 | 0x40ULL << 48 | 0x20ULL << 40 | 0x10ULL << 32 | 0x8ULL << 24 | 0x4ULL << 16 | 0x2ULL << 8 | 0x1 )
+
+#define ONES_STEP_32 ( 0x0000000100000001ULL )
+#define MSBS_STEP_32 ( 0x8000000080000000ULL )
+	
+#define COMPARE_STEP_8(x,y) ( ( ( ( ( (x) | MSBS_STEP_8 ) - ( (y) & ~MSBS_STEP_8 ) ) ^ (x) ^ ~(y) ) & MSBS_STEP_8 ) >> 7 )
+#define LEQ_STEP_8(x,y) ( ( ( ( ( (y) | MSBS_STEP_8 ) - ( (x) & ~MSBS_STEP_8 ) ) ^ (x) ^ (y) ) & MSBS_STEP_8 ) >> 7 )
+
+#define UCOMPARE_STEP_9(x,y) ( ( ( ( ( ( (x) | MSBS_STEP_9 ) - ( (y) & ~MSBS_STEP_9 ) ) | ( (x) ^ (y) ) ) ^ ( (x) | ~(y) ) ) & MSBS_STEP_9 ) >> 8 )
+#define UCOMPARE_STEP_16(x,y) ( ( ( ( ( ( (x) | MSBS_STEP_16 ) - ( (y) & ~MSBS_STEP_16 ) ) | ( (x) ^ (y) ) ) ^ ( (x) | ~(y) ) ) & MSBS_STEP_16 ) >> 15 )
+#define ULEQ_STEP_9(x,y) ( ( ( ( ( ( (y) | MSBS_STEP_9 ) - ( (x) & ~MSBS_STEP_9 ) ) | ( (x) ^ (y) ) ) ^ ( (x) & ~(y) ) ) & MSBS_STEP_9 ) >> 8 )
+#define ULEQ_STEP_16(x,y) ( ( ( ( ( ( (y) | MSBS_STEP_16 ) - ( (x) & ~MSBS_STEP_16 ) ) | ( (x) ^ (y) ) ) ^ ( (x) & ~(y) ) ) & MSBS_STEP_16 ) >> 15 )
+#define ZCOMPARE_STEP_8(x) ( ( ( (x) | ( ( (x) | MSBS_STEP_8 ) - ONES_STEP_8 ) ) & MSBS_STEP_8 ) >> 7 )
+
+#define EASY_LEQ_STEP_8(x,y) ( ( ( ( ( (y) | MSBS_STEP_8 ) - ( x ) ) ) & MSBS_STEP_8 ) >> 7 )
+#define EASY_LEQ_STEP_8_MSBS(x,y) ( ( ( ( (y) | MSBS_STEP_8 ) - ( x ) ) ) & MSBS_STEP_8 )
+
+__inline static int ceil_log2( const uint64_t x ) {
+	return x > 2 ? 64 - __builtin_clzll( x - 1 ) : x - 1; // small inputs: 1 -> 0, 2 -> 1
+}
+
+__inline static int msb( const uint64_t x ) {
+	// Index of the highest set bit; zero has no set bit and yields -1.
+	return x ? 63 - __builtin_clzll( x ) : -1;
+}
+
+
+#endif
diff --git a/debian/rapmap/rank9b.cpp b/debian/rapmap/rank9b.cpp
new file mode 100644
index 0000000..58756a4
--- /dev/null
+++ b/debian/rapmap/rank9b.cpp
@@ -0,0 +1,67 @@
+/*		 
+ * Sux: Succinct data structures
+ *
+ * Copyright (C) 2007-2013 Sebastiano Vigna 
+ *
+ *  This library is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU Lesser General Public License as published by the Free
+ *  Software Foundation; either version 3 of the License, or (at your option)
+ *  any later version.
+ *
+ *  This library is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include <cassert>
+#include <cstring>
+#include "rank9b.h"
+
+// Default constructor: null pointers and zero sizes, so destroying the object runs delete[] on a null pointer rather than an indeterminate one. rank() must not be called on such an object (it dereferences counts and bits).
+rank9b::rank9b() : bits( NULL ), counts( NULL ), inventory( NULL ), num_words( 0 ), num_counts( 0 ), inventory_size( 0 ), ones_per_inventory( 0 ), log2_ones_per_inventory( 0 ), num_ones( 0 ) {}
+rank9b::rank9b( const uint64_t * const bits, const uint64_t num_bits ) {
+	// Builds the rank directory over bits (the pointer is stored, not copied): one pair of 64-bit entries per block of 8 words (512 bits).
+	this->bits = bits;
+	num_words = ( num_bits + 63 ) / 64; // 64-bit words covering num_bits, rounded up
+	num_counts = 2 * ( ( num_bits + 511 ) / 512 ); // two directory entries per 512-bit block
+	// One extra trailing entry holds the grand total; the () value-initializes every entry to zero.
+	counts = new uint64_t[ num_counts + 1 ]();
+
+	uint64_t ones = 0; // set bits counted so far
+	for( uint64_t first = 0, slot = 0; first < num_words; first += 8, slot += 2 ) {
+		const uint64_t base = ones;
+		counts[ slot ] = base; // absolute count at the start of this block
+		ones += __builtin_popcountll( bits[ first ] );
+		uint64_t packed = 0;
+		for( int w = 1; w < 8; w++ ) {
+			// 9-bit field at bit offset 63 - 9 * w: ones in words 0 .. w - 1 of this block.
+			packed |= ( ones - base ) << ( 63 - 9 * w );
+			if ( first + w < num_words ) ones += __builtin_popcountll( bits[ first + w ] );
+		}
+		counts[ slot + 1 ] = packed;
+	}
+	counts[ num_counts ] = ones;
+	assert( ones <= num_bits );
+}
+
+rank9b::~rank9b() {
+	delete [] counts; // frees the directory array allocated with new[] by the building constructor; bits is not touched
+}
+
+
+uint64_t rank9b::rank( const uint64_t k ) {
+	const uint64_t w = k >> 6, pair = ( w >> 3 ) << 1; // word holding bit k, and the index in counts of its block's entry pair; no bounds check is done on k
+	const uint64_t in_block = ( counts[ pair + 1 ] >> ( 63 - 9 * ( w & 7 ) ) ) & 0x1FF; // ones in the block's earlier words; for word 0 the shift is 63, which reads the never-set top bit and gives 0
+	const uint64_t in_word = __builtin_popcountll( bits[ w ] & ( ( 1ULL << ( k & 63 ) ) - 1 ) ); // ones strictly below bit k inside its own word
+	return counts[ pair ] + in_block + in_word;
+}
+
+uint64_t rank9b::bit_count() {
+	return 64 * num_counts; // bits occupied by the num_counts directory entries (the extra trailing total entry is not included)
+}
+
+void rank9b::print_counts() {} // no-op: the body is empty, nothing is printed
diff --git a/debian/rapmap/rank9b.h b/debian/rapmap/rank9b.h
new file mode 100644
index 0000000..080d69a
--- /dev/null
+++ b/debian/rapmap/rank9b.h
@@ -0,0 +1,42 @@
+/*		 
+ * Sux: Succinct data structures
+ *
+ * Copyright (C) 2007-2013 Sebastiano Vigna 
+ *
+ *  This library is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU Lesser General Public License as published by the Free
+ *  Software Foundation; either version 3 of the License, or (at your option)
+ *  any later version.
+ *
+ *  This library is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef rank9b_h
+#define rank9b_h
+#include <stdint.h>
+#include "macros.h"
+
+class rank9b { // rank directory over an externally supplied array of 64-bit words
+private:
+	const uint64_t *bits; // the indexed words; pointer stored by the building constructor, never freed by this class
+	uint64_t *counts, *inventory; // counts: directory array, deleted in the destructor; inventory: not used by rank() or the building constructor
+	uint64_t num_words, num_counts, inventory_size, ones_per_inventory, log2_ones_per_inventory, num_ones; // only num_words and num_counts are computed by the building constructor
+
+public:
+	rank9b(); // default constructor; attaches no bit vector
+	rank9b( const uint64_t * const bits, const uint64_t num_bits ); // builds the directory for num_bits bits stored in bits
+	~rank9b(); // delete[]s counts
+	uint64_t rank( const uint64_t pos ); // number of set bits at positions strictly below pos
+	// Just for analysis purposes
+	void print_counts(); // empty body in rank9b.cpp
+	uint64_t bit_count(); // num_counts * 64
+};
+
+#endif
diff --git a/debian/rules b/debian/rules
index cf9d3a2..fd283e7 100755
--- a/debian/rules
+++ b/debian/rules
@@ -11,6 +11,28 @@ VERSION        := $(shell echo '$(DEBVERS)' | sed -e 's/^[0-9]*://' -e 's/-.*//'
 %:
 	dh $@ --with sphinxdoc --parallel
 
+# Stage the bundled RapMap headers and sources from debian/rapmap/ into include/ and src/, then run the normal configure step.
+override_dh_auto_configure:
+	cp -v debian/rapmap/RapMap* \
+	      debian/rapmap/Boo???.hpp \
+	      debian/rapmap/IndexHeader.hpp \
+	      debian/rapmap/HitManager.hpp \
+	      debian/rapmap/JFRaw.hpp \
+	      debian/rapmap/SA*.hpp \
+	      debian/rapmap/kseq.h \
+	      debian/rapmap/SpinLock.hpp \
+	      debian/rapmap/ScopedTimer.hpp \
+	      debian/rapmap/bit_*.h \
+	      debian/rapmap/rank9b.h debian/rapmap/macros.h \
+	      include/
+	cp -v debian/rapmap/bit_array.c \
+	      debian/rapmap/rank9b.cpp \
+	      debian/rapmap/RapMapSA*.cpp \
+	      debian/rapmap/RapMapFileSystem.cpp \
+	      debian/rapmap/HitManager.cpp \
+	      src/
+	dh_auto_configure
+
 override_dh_auto_build:
 	dh_auto_build
 	mv doc/source/license.rst doc/ # unused
@@ -26,6 +48,23 @@ override_dh_auto_build:
 
 override_dh_auto_clean:
 	dh_auto_clean
+	# Remove the RapMap files that override_dh_auto_configure copied into include/ and src/.
+	rm -f include/RapMap* \
+	      include/Boo???.hpp \
+	      include/IndexHeader.hpp \
+	      include/HitManager.hpp \
+	      include/JFRaw.hpp \
+	      include/SA*.hpp \
+	      include/kseq.h \
+	      include/bit_*.h \
+	      include/rank9b.h \
+	      include/macros.h \
+	      include/SpinLock.hpp include/ScopedTimer.hpp
+	rm -f src/bit_array.c \
+	      src/rank9b.cpp \
+	      src/RapMapSA*.cpp \
+	      src/RapMapFileSystem.cpp \
+	      src/HitManager.cpp
 	rm -f debian/*.1
 	rm -Rf sample_data
 	cd doc && $(MAKE) clean

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/salmon.git