[med-svn] [salmon] 01/07: New upstream version 0.8.0+ds1

Andreas Tille tille at debian.org
Tue Jan 24 13:17:28 UTC 2017


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository salmon.

commit 993416c4338d85a6b64cf0eeefeb56c6dbeb42a6
Author: Andreas Tille <tille at debian.org>
Date:   Tue Jan 24 09:45:35 2017 +0100

    New upstream version 0.8.0+ds1
---
 .drone.yml                             |   26 +
 .drone.yml.sig                         |    1 +
 .drone/build.sh                        |   19 +
 .drone/copy_build.sh                   |   10 +
 .drone/test_quant.sh                   |   27 +
 CMakeLists.txt                         |  162 +-
 doc/source/salmon.rst                  |    8 +-
 include/AlignmentLibrary.hpp           |   96 +
 include/CollapsedGibbsSampler.hpp      |   11 +-
 include/EquivalenceClassBuilder.hpp    |    2 +-
 include/FragmentLengthDistribution.hpp |   11 +-
 include/GCFragModel.hpp                |  334 +--
 include/GZipWriter.hpp                 |    7 +-
 include/PartitionRefiner.hpp           |   42 -
 include/PerfectHashIndex.hpp           |  174 --
 include/ReadExperiment.hpp             |  132 +-
 include/ReadPair.hpp                   |    7 +-
 include/SailfishUtils.hpp              |  153 --
 include/SalmonConfig.hpp               |    9 +-
 include/SalmonIndex.hpp                |    9 +-
 include/SalmonOpts.hpp                 |   29 +-
 include/SalmonUtils.hpp                |    6 +-
 include/Sampler.hpp                    |   86 +-
 include/SimplePosBias.hpp              |    6 +
 include/Transcript.hpp                 |  101 +-
 include/TranscriptGeneMap.hpp          |   16 +-
 include/UnpairedRead.hpp               |    2 +-
 include/btree.h                        | 2394 ---------------------
 include/btree_container.h              |  350 ----
 include/btree_map.h                    |  130 --
 include/btree_set.h                    |  121 --
 include/concurrentqueue.h              |    4 +-
 include/count_main_cmdline.hpp         |  711 -------
 include/cuckoohash_config.hh           |   14 +-
 include/cuckoohash_map.hh              | 3572 ++++++++++++++++++--------------
 include/cuckoohash_util.hh             |   81 +-
 include/lazy_array.hh                  |  119 --
 include/libcuckoo_lazy_array.hh        |  202 ++
 include/merge_files.hpp                |   30 -
 include/pcg_extras.hpp                 |  638 ++++++
 include/pcg_random.hpp                 | 1756 ++++++++++++++++
 include/safe_btree.h                   |  395 ----
 include/safe_btree_map.h               |   89 -
 include/safe_btree_set.h               |   88 -
 scripts/ConvertBootstrapsToTSV.py      |    9 +-
 scripts/Dockerfile                     |   34 +
 scripts/fetchRapMap.sh                 |    4 +-
 scripts/make-release.sh                |    3 +
 scripts/test_sim_corr.py               |   48 +
 src/BuildSalmonIndex.cpp               |   14 +-
 src/CMakeLists.txt                     |   40 +-
 src/CollapsedEMOptimizer.cpp           |  227 +-
 src/CollapsedGibbsSampler.cpp          |  774 +++++--
 src/FASTAParser.cpp                    |   38 +-
 src/FragmentLengthDistribution.cpp     |   51 +-
 src/GZipWriter.cpp                     |  158 +-
 src/JellyfishMerCounter.cpp            |  379 ----
 src/LookUpTableUtils.cpp               |  255 ---
 src/SailfishUtils.cpp                  |  699 -------
 src/Salmon.cpp                         |  123 +-
 src/SalmonQuantify.cpp                 |  382 +++-
 src/SalmonQuantifyAlignments.cpp       |  213 +-
 src/SalmonUtils.cpp                    |  347 +++-
 src/SimplePosBias.cpp                  |   20 +-
 tests/test_quant.nf                    |   71 +
 65 files changed, 7508 insertions(+), 8561 deletions(-)

diff --git a/.drone.yml b/.drone.yml
new file mode 100644
index 0000000..1157fe1
--- /dev/null
+++ b/.drone.yml
@@ -0,0 +1,26 @@
+ pipeline:
+  setup:
+   image: hbb:salmon_build
+   commands:
+    - echo "Starting build"
+    - ./.drone/build.sh
+  test_indexing:
+   image: hbb:salmon_build
+   commands:
+    - echo "[Testing quant]"
+    - ./.drone/test_quant.sh 
+   volumes:
+    - /mnt/scratch6/avi/data:/mnt/data
+    - /mnt/scratch6/salmon_ci:/mnt/ci_res
+  copy_build:
+    image: hbb:salmon_build
+    commands:
+     - echo "[Packaging binary]"
+     - ./.drone/copy_build.sh
+    volumes:
+     - /mnt/scratch6/avi/data:/mnt/data
+     - /mnt/scratch6/salmon_ci:/mnt/ci_res
+  notify_gitter:
+    image: plugins/gitter
+    commands:
+     - echo "[Notifying gitter]"
diff --git a/.drone.yml.sig b/.drone.yml.sig
new file mode 100644
index 0000000..c95a130
--- /dev/null
+++ b/.drone.yml.sig
@@ -0,0 +1 @@
+eyJhbGciOiJIUzI1NiJ9.IHBpcGVsaW5lOgogIHNldHVwOgogICBpbWFnZTogaGJiOnNhbG1vbl9idWlsZAogICBjb21tYW5kczoKICAgIC0gZWNobyAiU3RhcnRpbmcgYnVpbGQiCiAgICAtIC4vLmRyb25lL2J1aWxkLnNoCiAgdGVzdF9pbmRleGluZzoKICAgaW1hZ2U6IGhiYjpzYWxtb25fYnVpbGQKICAgY29tbWFuZHM6CiAgICAtIGVjaG8gIltUZXN0aW5nIHF1YW50XSIKICAgIC0gLi8uZHJvbmUvdGVzdF9xdWFudC5zaCAKICAgdm9sdW1lczoKICAgIC0gL21udC9zY3JhdGNoNi9hdmkvZGF0YTovbW50L2RhdGEKICAgIC0gL21udC9zY3JhdGNoNi9zYWxtb25fY2k6L21udC9jaV9yZXMKICBjb3B5X2J1aWxkOgogICAgaW1hZ2U6IGhiYjpzYWx [...]
\ No newline at end of file
diff --git a/.drone/build.sh b/.drone/build.sh
new file mode 100755
index 0000000..0b5ecec
--- /dev/null
+++ b/.drone/build.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+source /hbb_exe/activate
+
+set -e
+
+CPATH=`pwd`
+echo "[Drone build] current path : ${CPATH}"
+echo "[Drone build] making build directory"
+
+mkdir build
+cd build
+
+echo "[Drone build] cmake configuration"
+
+cmake -DDO_QUIET_MAKE=TRUE ..
+
+echo "[Drone build] making salmon and installing locally (this could take a while)"
+
+make -j8 -s install
diff --git a/.drone/copy_build.sh b/.drone/copy_build.sh
new file mode 100755
index 0000000..0cbd66b
--- /dev/null
+++ b/.drone/copy_build.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+source /hbb_exe/activate
+
+set -e
+
+cd scripts
+bash make-release.sh -v latest -n linux_x86_64
+cd ../RELEASES
+mkdir -p "/mnt/ci_res/${DRONE_REPO}/${DRONE_COMMIT_BRANCH}/build"
+cp *.tar.gz "/mnt/ci_res/${DRONE_REPO}/${DRONE_COMMIT_BRANCH}/build/"
diff --git a/.drone/test_quant.sh b/.drone/test_quant.sh
new file mode 100755
index 0000000..ac93fbc
--- /dev/null
+++ b/.drone/test_quant.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+source /hbb_exe/activate
+
+set -e
+
+CPATH=`pwd`
+echo "[Drone test] current path : ${CPATH}"
+echo "[Drone test] making quant test directory"
+
+export PATH=/root/miniconda2/bin:$PATH
+export PYTHONPATH=/root/miniconda2/lib/python2.7/site-packages
+
+echo "[Drone test] run nextflow pipeline"
+
+# `|| nextflowret=$?` captures a failure without tripping `set -e`
+nextflowret=0
+nextflow tests/test_quant.nf || nextflowret=$?
+if [ $nextflowret -ne 0 ]; then
+    echo "[Drone test]: nextflow pipeline test_quant.nf failed!"
+    exit 1
+fi
+
+echo "[Drone test] echoing quants here"
+grep "spearman" sim/*.json
+
+mkdir -p "/mnt/ci_res/${DRONE_REPO}/${DRONE_COMMIT_BRANCH}/${DRONE_COMMIT_SHA}/sim"
+cp sim/*.json "/mnt/ci_res/${DRONE_REPO}/${DRONE_COMMIT_BRANCH}/${DRONE_COMMIT_SHA}/sim/"
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0b84540..6a7013e 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,10 +4,10 @@ enable_testing()
 
 project (Salmon)
 
-set(CPACK_PACKAGE_VERSION "0.7.2")
+set(CPACK_PACKAGE_VERSION "0.8.0")
 set(CPACK_PACKAGE_VERSION_MAJOR "0")
-set(CPACK_PACKAGE_VERSION_MINOR "7")
-set(CPACK_PACKAGE_VERSION_PATCH "2")
+set(CPACK_PACKAGE_VERSION_MINOR "8")
+set(CPACK_PACKAGE_VERSION_PATCH "0")
 set(PROJECT_VERSION ${CPACK_PACKAGE_VERSION})
 set(CPACK_GENERATOR "TGZ")
 set(CPACK_SOURCE_GENERATOR "TGZ")
@@ -20,14 +20,19 @@ set(CPACK_SOURCE_PACKAGE_FILE_NAME
 
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
 
+if (APPLE)
 set (WARNING_IGNORE_FLAGS "-Wno-deprecated-register")
-set (BOOST_CXX_FLAGS "-Wno-deprecated-register -std=c++11")
+else()
+set (WARNING_IGNORE_FLAGS "")
+endif()
+
+set (BOOST_CXX_FLAGS "${WARNING_IGNORE_FLAGS} -std=c++11")
 ## Prefer static to dynamic libraries
 SET(CMAKE_FIND_LIBRARY_SUFFIXES .a ${CMAKE_FIND_LIBRARY_SUFFIXES})
 
 ## Set the standard required compile flags
 # Nov 18th --- removed -DHAVE_CONFIG_H
-set (CMAKE_CXX_FLAGS "-pthread -ftree-vectorize -funroll-loops -fPIC -fomit-frame-pointer -Ofast -DRAPMAP_SALMON_SUPPORT -DHAVE_ANSI_TERM -DHAVE_SSTREAM -Wall -Wno-unknown-pragmas -Wno-reorder -Wno-unused-variable -std=c++11 -Wreturn-type -Werror=return-type")
+set (CMAKE_CXX_FLAGS "-pthread -ftree-vectorize -funroll-loops -fPIC -fomit-frame-pointer -O3 -DRAPMAP_SALMON_SUPPORT -DHAVE_ANSI_TERM -DHAVE_SSTREAM -Wall -Wno-unknown-pragmas -Wno-reorder -Wno-unused-variable -std=c++11 -Wreturn-type -Werror=return-type")
 
 ##
 # OSX is strange (some might say, stupid in this regard).  Deal with its quirkiness here.
@@ -50,6 +55,11 @@ set( BOOST_EXTRA_FLAGS "--layout=tagged" )
 ## this gets set differently below if we
 ## are on clang & apple
 set (NON_APPLECLANG_LIBS gomp rt)
+
+if(UNIX AND NOT APPLE)
+	set(LIBRT rt)
+endif()
+
 set (PTHREAD_LIB)
 
 ##
@@ -123,6 +133,12 @@ else ()
     message(FATAL_ERROR "Your C++ compiler does not support C++11.")
 endif ()
 
+if (DO_QUIET_MAKE)
+    set( QUIET_MAKE "--silent" )
+else()
+    set( QUIET_MAKE "")
+endif()
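+# (enabled from the command line, e.g. cmake -DDO_QUIET_MAKE=TRUE .., as in .drone/build.sh)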
+
 ## TODO: Figure out how to detect this automatically
 # If the "assembler" is too old, tell TBB not to compile
 # with -mrtm
@@ -153,7 +169,7 @@ endif ()
 set (GAT_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 
 # Have CMake tell us what it's doing
-set (CMAKE_VERBOSE_MAKEFILE true)
+# set (CMAKE_VERBOSE_MAKEFILE true)
 
 ###
 #
@@ -176,7 +192,12 @@ endif ( DEFINED CUSTOM_BOOST_PATH )
 ##
 # We want static, multithreaded boost libraries
 ##
-set (Boost_USE_STATIC_LIBS ON)
+if(CONDA_BUILD)
+  set (Boost_USE_STATIC_LIBS OFF)
+else ()
+  set (Boost_USE_STATIC_LIBS ON)
+endif(CONDA_BUILD)
+
 set (Boost_USE_MULTITHREADED ON)
 #set (Boost_USE_STATIC_RUNTIME OFF)
 
@@ -196,14 +217,16 @@ ExternalProject_Add(liblzma
     INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
     BUILD_IN_SOURCE TRUE
     CONFIGURE_COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/external/xz-5.2.2/configure --prefix=<INSTALL_DIR> CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER}
-    BUILD_COMMAND make
-    INSTALL_COMMAND make install
+    BUILD_COMMAND make ${QUIET_MAKE}
+    INSTALL_COMMAND make ${QUIET_MAKE} install
 )
+
 # Tell cmake that the external project generated a library so we can
 # add dependencies here instead of later
 set (LIBLZMA_LIBRARIES ${GAT_SOURCE_DIR}/external/install/lib/liblzma.a)
 set (LIBSTADEN_LDFLAGS "-L${GAT_SOURCE_DIR}/external/install/lib")
 set (LIBSTADEN_CFLAGS "-I${GAT_SOURCE_DIR}/external/install/include")
+set (FETCHED_LIBLZMA TRUE)
 else()
     message("Found liblzma library: ${LIBLZMA_LIBRARIES}")
     message("===========================================")
@@ -220,14 +243,15 @@ ExternalProject_Add(libbz2
     INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
     BUILD_IN_SOURCE TRUE
     CONFIGURE_COMMAND ""
-    BUILD_COMMAND make CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER}
-    INSTALL_COMMAND make install PREFIX=<INSTALL_DIR>
+    BUILD_COMMAND make ${QUIET_MAKE} CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER}
+    INSTALL_COMMAND make ${QUIET_MAKE} install PREFIX=<INSTALL_DIR>
 )
 # Tell cmake that the external project generated a library so we can
 # add dependencies here instead of later
 set (BZIP2_LIBRARIES ${GAT_SOURCE_DIR}/external/install/lib/libbz2.a)
 set (LIBSTADEN_LDFLAGS "-L${GAT_SOURCE_DIR}/external/install/lib -I${GAT_SOURCE_DIR}/external/install/include")
 set (LIBSTADEN_CFLAGS "-I${GAT_SOURCE_DIR}/external/install/include")
+set (FETCHED_LIBBZ2 TRUE)
 else()
     message("Found libbz2 library: ${BZIP2_LIBRARIES}")
     message("===========================================")
@@ -237,7 +261,7 @@ endif()
 # Set the latest version and look for what we need
 ##
 set(Boost_ADDITIONAL_VERSIONS "1.53" "1.53.0" "1.54" "1.55" "1.56" "1.57.0" "1.58" "1.59" "1.60" "1.61")
-find_package(Boost 1.53.0 COMPONENTS iostreams filesystem system thread timer chrono program_options serialization)
+find_package(Boost 1.53.0 COMPONENTS iostreams filesystem system thread timer chrono program_options)
 message("BOOST_INCLUDEDIR = ${BOOST_INCLUDEDIR}")
 message("BOOST_LIBRARYDIR = ${BOOST_LIBRARYDIR}")
 message("Boost_FOUND = ${Boost_FOUND}")
@@ -311,6 +335,7 @@ elseif(FETCH_BOOST)
         COMMAND ${CMAKE_COMMAND} ${CMAKE_CURRENT_SOURCE_DIR} ${RECONFIG_FLAGS}
         DEPENDEES install
     )
+    set (FETCHED_BOOST TRUE)
 endif()
 
 ##
@@ -341,12 +366,19 @@ ExternalProject_Add(libdivsufsort
     URL ${CMAKE_CURRENT_SOURCE_DIR}/external/libdivsufsort.zip
     SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/libdivsufsort-master
     INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
-    UPDATE_COMMAND sh -c "mkdir -p <SOURCE_DIR>/build"
+    #UPDATE_COMMAND sh -c "mkdir -p <SOURCE_DIR>/build"
     BINARY_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/libdivsufsort-master/build
     CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR> -DBUILD_DIVSUFSORT64=TRUE -DUSE_OPENMP=TRUE -DBUILD_SHARED_LIBS=FALSE
 )
+ExternalProject_Add_Step(libdivsufsort makedir
+  COMMAND mkdir -p <SOURCE_DIR>/build 
+  COMMENT "Make build directory"
+  DEPENDEES download 
+  DEPENDERS configure)
+ 
 set(SUFFARRAY_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/external/install/include)
 
+
 message("Build system will fetch and build the Cereal serialization library")
 message("==================================================================")
 include(ExternalProject)
@@ -356,12 +388,17 @@ ExternalProject_Add(libcereal
 		tar -xzvf cereal-v1.1.2.tar.gz
 	SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/cereal-1.1.2
     INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
-    UPDATE_COMMAND sh -c "mkdir -p <SOURCE_DIR>/build"
+    #UPDATE_COMMAND sh -c "mkdir -p <SOURCE_DIR>/build"
     BINARY_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/cereal-1.1.2/build
     CONFIGURE_COMMAND ""
     BUILD_COMMAND ""
     INSTALL_COMMAND sh -c "mkdir -p <INSTALL_DIR>/include && cp -r <SOURCE_DIR>/include/cereal <INSTALL_DIR>/include"
 )
+ExternalProject_Add_Step(libcereal makedir
+  COMMAND mkdir -p <SOURCE_DIR>/build 
+  COMMENT "Make build directory"
+  DEPENDEES download 
+  DEPENDERS configure)
 
 message("Build system will fetch and build BWA (for Salmon)")
 message("==================================================================")
@@ -374,7 +411,7 @@ ExternalProject_Add(libbwa
     SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/bwa-master
     INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
     CONFIGURE_COMMAND ""
-    BUILD_COMMAND sh -c "make CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER}"
+    BUILD_COMMAND sh -c "make ${QUIET_MAKE} CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER}"
     INSTALL_COMMAND sh -c "mkdir -p <INSTALL_DIR>/lib && mkdir -p <INSTALL_DIR>/include/bwa && cp libbwa.a <INSTALL_DIR>/lib && cp *.h <INSTALL_DIR>/include/bwa && cp is.c bwtindex.c bwt_gen.c QSufSort.c ${CMAKE_CURRENT_SOURCE_DIR}/src/"
     BUILD_IN_SOURCE TRUE
 )
@@ -394,18 +431,20 @@ ExternalProject_Add(libjellyfish
     CONFIGURE_COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/external/jellyfish-2.2.6/configure --prefix=<INSTALL_DIR> CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} CXXFLAGS=${JELLYFISH_CXX_FLAGS}
     BUILD_COMMAND ${MAKE} CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} CXXFLAGS=${JELLYFISH_CXX_FLAGS}
     BUILD_IN_SOURCE 1
-    INSTALL_COMMAND make install
+    INSTALL_COMMAND make ${QUIET_MAKE} install
 )
+set (FETCHED_JELLYFISH TRUE)
 endif()
 
-find_package(TBB)
+## Try and find TBB first
+find_package(TBB 4.4 COMPONENTS tbb tbbmalloc tbbmalloc_proxy )
 
 ##
 #
 # Fetch and build Intel's Threading Building Blocks library.
 #
 ##
-if(NOT TBB_FOUND)
+if((NOT TBB_FOUND) OR (TBB_FOUND AND (TBB_VERSION VERSION_LESS 4.4)))
 
 set(TBB_WILL_RECONFIGURE TRUE)
 # Set the appropriate compiler
@@ -418,7 +457,7 @@ endif()
 message("Build system will fetch and build Intel Threading Building Blocks")
 message("==================================================================")
 # These are useful for the custom install step we'll do later
-set(TBB_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/tbb44_20160526oss)
+set(TBB_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/tbb-2017_U3)
 set(TBB_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install)
 
 if ("${TBB_COMPILER}" STREQUAL "gcc")
@@ -429,14 +468,14 @@ endif()
 
 ExternalProject_Add(libtbb
 	DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
-    URL https://www.threadingbuildingblocks.org/sites/default/files/software_releases/source/tbb44_20160526oss_src_0.tgz
-    DOWNLOAD_COMMAND curl -k -L https://www.threadingbuildingblocks.org/sites/default/files/software_releases/source/tbb44_20160526oss_src_0.tgz -o tbb_20160526oss_src.tgz &&
-                    tar -xzvf tbb_20160526oss_src.tgz
-    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/tbb44_20160526oss
+    #URL https://github.com/01org/tbb/archive/2017_U3.tar.gz 
+    DOWNLOAD_COMMAND curl -k -L https://github.com/01org/tbb/archive/2017_U3.tar.gz -o tbb-2017_U3.tgz &&
+                    tar -xzvf tbb-2017_U3.tgz
+    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/tbb-2017_U3
     INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
     PATCH_COMMAND "${TBB_PATCH_STEP}"
     CONFIGURE_COMMAND ""
-    BUILD_COMMAND make CXXFLAGS=${TBB_CXXFLAGS} lambdas=1 compiler=${TBB_COMPILER} cfg=release tbb_build_prefix=LIBS
+    BUILD_COMMAND make ${QUIET_MAKE} CXXFLAGS=${TBB_CXXFLAGS} lambdas=1 compiler=${TBB_COMPILER} cfg=release tbb_build_prefix=LIBS
     INSTALL_COMMAND sh -c "cp ${TBB_SOURCE_DIR}/build/LIBS_release/*.${SHARED_LIB_EXTENSION}* ${TBB_INSTALL_DIR}/lib && cp -r ${TBB_SOURCE_DIR}/include/* ${TBB_INSTALL_DIR}/include"
     BUILD_IN_SOURCE 1
 )
@@ -446,6 +485,8 @@ ExternalProject_Add_Step(libtbb reconfigure
         COMMAND ${CMAKE_COMMAND} ${CMAKE_CURRENT_SOURCE_DIR} ${RECONFIG_FLAGS}
         DEPENDEES install
 )
+
+set (FETCHED_TBB TRUE)
 endif()
 
 ##
@@ -483,24 +524,34 @@ message("Build system will compile libgff")
 message("==================================================================")
 ExternalProject_Add(libgff
     DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
-    DOWNLOAD_COMMAND curl -k -L https://github.com/Kingsford-Group/libgff/archive/v1.0.tar.gz -o libgff.tgz &&
+    DOWNLOAD_COMMAND curl -k -L https://github.com/COMBINE-lab/libgff/archive/v1.1.tar.gz -o libgff.tgz &&
     tar -xzvf libgff.tgz &&
     rm -fr libgff &&
-    mv libgff-1.0 libgff
+    mv libgff-1.1 libgff
     SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/libgff
-    UPDATE_COMMAND sh -c "mkdir -p <SOURCE_DIR>/build"
+    #UPDATE_COMMAND sh -c "mkdir -p <SOURCE_DIR>/build"
     INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
     BINARY_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/libgff/build
     CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${CMAKE_CURRENT_SOURCE_DIR}/external/install
 )
+ExternalProject_Add_Step(libgff makedir
+  COMMAND mkdir -p <SOURCE_DIR>/build 
+  COMMENT "Make build directory"
+  DEPENDEES download 
+  DEPENDERS configure)
 
-# Because of the way that Apple has changed SIP 
+# Because of the way that Apple has changed SIP
 # in el capitan, some headers may be in a new location
 if (APPLE)
     set(STADEN_INC "-I/usr/local/include")
     set(STADEN_LIB "-L/usr/local/lib")
 endif()
 
+if (CONDA_BUILD)
+  set(LZFLAG "-lz")
+else ()
+  set(LZFLAG "")
+endif (CONDA_BUILD)
 
 message("Build system will compile Staden IOLib")
 message("==================================================================")
@@ -514,7 +565,7 @@ ExternalProject_Add(libstadenio
     SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/staden-io_lib
     INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
     CONFIGURE_COMMAND ./configure --enable-shared=no --without-libcurl --prefix=<INSTALL_DIR> LDFLAGS=${LIBSTADEN_LDFLAGS} CFLAGS=${LIBSTADEN_CFLAGS} CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER}
-    BUILD_COMMAND make CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} CFLAGS+=${STADEN_INC} CFLAGS+=${STADEN_LIB}
+    BUILD_COMMAND make ${QUIET_MAKE} CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} CFLAGS+=${STADEN_INC} CFLAGS+=${STADEN_LIB} CFLAGS+=${LZFLAG}
 
     BUILD_IN_SOURCE 1
     INSTALL_COMMAND make install
@@ -524,10 +575,10 @@ message("Build system will fetch SPDLOG")
 message("==================================================================")
 ExternalProject_Add(libspdlog
     DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
-    DOWNLOAD_COMMAND curl -k -L https://github.com/COMBINE-lab/spdlog/archive/v1.12.tar.gz -o spdlog-v1.12.tar.gz &&
-                     tar -xzf spdlog-v1.12.tar.gz &&
+    DOWNLOAD_COMMAND curl -k -L https://github.com/COMBINE-lab/spdlog/archive/v0.11.0.tar.gz -o spdlog-v0.11.0.tar.gz &&
+                     tar -xzf spdlog-v0.11.0.tar.gz &&
                      rm -fr spdlog &&
-                     mv -f  spdlog-1.12 spdlog
+                     mv -f  spdlog-0.11.0 spdlog
     SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/spdlog
     INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
     CONFIGURE_COMMAND ""
@@ -558,40 +609,55 @@ if (NOT HAVE_FAST_MALLOC)
     endif()
 endif()
 
+if(CONDA_BUILD)
+  set (JEMALLOC_FLAGS "CC=${CMAKE_C_COMPILER} CFLAGS=-fPIC CPPFLAGS=-fPIC")
+else ()
+  set (JEMALLOC_FLAGS "CC=${CMAKE_C_COMPILER}")
+endif()
+
 if (NOT HAVE_FAST_MALLOC)
     message("Build system will fetch and use JEMalloc")
     message("==================================================================")
     ExternalProject_Add(libjemalloc
         DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
-        DOWNLOAD_COMMAND curl -k -L https://github.com/COMBINE-lab/jemalloc/archive/4.0.4.tar.gz -o jemalloc-4.0.4.tar.gz &&
-        tar -xzf jemalloc-4.0.4.tar.gz
-        SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/jemalloc-4.0.4
+        DOWNLOAD_COMMAND curl -k -L https://github.com/COMBINE-lab/jemalloc/archive/4.4.0.tar.gz -o jemalloc-4.4.0.tar.gz &&
+        tar -xzf jemalloc-4.4.0.tar.gz
+        SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/jemalloc-4.4.0
         BUILD_IN_SOURCE TRUE
         INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
-        CONFIGURE_COMMAND sh -c "CC=${CMAKE_C_COMPILER} ./autogen.sh --prefix=<INSTALL_DIR>"
+        CONFIGURE_COMMAND sh -c "${JEMALLOC_FLAGS} ./autogen.sh --prefix=<INSTALL_DIR>"
         INSTALL_COMMAND cp -r lib <INSTALL_DIR>/ && cp -r include <INSTALL_DIR>/
         )
 
     set (FAST_MALLOC_LIB ${CMAKE_CURRENT_SOURCE_DIR}/external/install/lib/libjemalloc.a)
     set (HAVE_FAST_MALLOC TRUE)
+    set (FETCHED_JEMALLOC TRUE)
+    if (FETCHED_LIBBZ2)
+       add_dependencies(libjemalloc libbz2)
+    endif()
+
+    if (FETCHED_LIBLZMA)
+       add_dependencies(libjemalloc liblzma)
+    endif()
+
 endif ()
 
 
 ##
 ## This dependency is for RapMap
 ##
-message("Build system will fetch and build SparseHash")
-message("==================================================================")
-ExternalProject_Add(libsparsehash
-    DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
-    DOWNLOAD_COMMAND curl -k -L https://github.com/COMBINE-lab/sparsehash/archive/sparsehash-2.0.2.tar.gz -o sparsehash-2.0.2.tar.gz &&
-        tar -xzf sparsehash-2.0.2.tar.gz
-    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/sparsehash-sparsehash-2.0.2
-    BUILD_IN_SOURCE TRUE
-    INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
-    CONFIGURE_COMMAND sh -c "CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} ./configure --prefix=<INSTALL_DIR>"
-    INSTALL_COMMAND make install
-)
+#message("Build system will fetch and build SparseHash")
+#message("==================================================================")
+#ExternalProject_Add(libsparsehash
+#    DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
+#    DOWNLOAD_COMMAND curl -k -L https://github.com/COMBINE-lab/sparsehash/archive/sparsehash-2.0.2.tar.gz -o sparsehash-2.0.2.tar.gz &&
+#        tar -xzf sparsehash-2.0.2.tar.gz
+#    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/sparsehash-sparsehash-2.0.2
+#    BUILD_IN_SOURCE TRUE
+#    INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
+#    CONFIGURE_COMMAND sh -c "CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} ./configure --prefix=<INSTALL_DIR>"
+#    INSTALL_COMMAND make install
+#)
 
 ###
 #
diff --git a/doc/source/salmon.rst b/doc/source/salmon.rst
index 6f6dc30..4e0002c 100644
--- a/doc/source/salmon.rst
+++ b/doc/source/salmon.rst
@@ -492,7 +492,13 @@ compatible format. If no options are provided to this argument, then
 the output will be written to stdout (so that e.g. it can be piped to
 samtools and directly converted into BAM format).  Otherwise, this 
 argument can optionally be provided with a filename, and the mapping 
-information will be written to that file.
+information will be written to that file. **Note:** Because of the way
+that the boost options parser that we use works, and the fact that 
+``--writeMappings`` has an implicit argument of ``stdout``, if you 
+provide an explicit argument to ``--writeMappings``, you must do so 
+with the syntax ``--writeMappings=<outfile>`` rather than the syntax 
+``--writeMappings <outfile>``.  This is due to a limitation of the 
+parser in how the latter could be interpreted.
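+
+For example (``out.sam`` here is only a hypothetical file name)::
+
+    salmon quant -i transcripts_index -l A -1 reads_1.fq -2 reads_2.fq \
+        -o quants --writeMappings=out.sam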
 
 .. note:: Compatible mappings
 
diff --git a/include/AlignmentLibrary.hpp b/include/AlignmentLibrary.hpp
index 970dd60..509f773 100644
--- a/include/AlignmentLibrary.hpp
+++ b/include/AlignmentLibrary.hpp
@@ -65,6 +65,8 @@ class AlignmentLibrary {
     	fragStartDists_(5),
         posBiasFW_(5),
         posBiasRC_(5),
+	posBiasExpectFW_(5),
+	posBiasExpectRC_(5),
         seqBiasModel_(1.0),
     	eqBuilder_(salmonOpts.jointLog),
         quantificationPasses_(0),
@@ -123,6 +125,7 @@ class AlignmentLibrary {
             fmt::print(stderr, "Populating targets from aln = {}, fasta = {} . . .",
                        alnFiles.front(), transcriptFile_);
             fp.populateTargets(transcripts_, salmonOpts);
+            /*
 	    for (auto& txp : transcripts_) {
 		    // Length classes taken from
 		    // ======
@@ -143,6 +146,15 @@ class AlignmentLibrary {
 			    txp.lengthClassIndex(0);
 		    }
 	    }
+        */
+
+            std::vector<uint32_t> lengths;
+            lengths.reserve(transcripts_.size());
+            for (auto& txp : transcripts_) {
+               lengths.push_back(txp.RefLength);
+            }
+            setTranscriptLengthClasses_(lengths, posBiasFW_.size());
+
             fmt::print(stderr, "done\n");
 
             // Create the cluster forest for this set of transcripts
@@ -165,6 +177,34 @@ class AlignmentLibrary {
             alnMod_.reset(new AlignmentModel(1.0, salmonOpts.numErrorBins));
             alnMod_->setLogger(salmonOpts.jointLog);
 
+            if (libFmt.type == ReadType::SINGLE_END) {
+                // Convert the PMF to non-log scale
+                std::vector<double> logPMF;
+                size_t minVal;
+                size_t maxVal;
+                flDist_->dumpPMF(logPMF, minVal, maxVal);
+                double sum = salmon::math::LOG_0;
+                for (auto v : logPMF) {
+                    sum = salmon::math::logAdd(sum, v);
+                }
+                for (auto& v : logPMF) {
+                    v -= sum;
+                }
+
+                // Create the non-logged distribution.
+                // Here, we multiply by 100 to discourage small
+                // numbers in the correctionFactorsFromMass call
+                // below.
+                std::vector<double> pmf(maxVal + 1, 0.0);
+                for (size_t i = minVal; i < maxVal; ++i) {
+                    pmf[i] = 100.0 * std::exp(logPMF[i - minVal]);
+                }
+
+                using distribution_utils::DistributionSpace;
+                // We compute the factors in linear space (since we've de-logged the pmf)
+                conditionalMeans_ = distribution_utils::correctionFactorsFromMass(pmf, DistributionSpace::LINEAR);
+            }
+
             // Start parsing the alignments
            NullFragmentFilter<FragT>* nff = nullptr;
            bool onlyProcessAmbiguousAlignments = false;
@@ -222,6 +262,8 @@ class AlignmentLibrary {
         }
     }
 
+    const std::vector<double>& condMeans() const { return conditionalMeans_; }
+
     std::vector<Transcript>& transcripts() { return transcripts_; }
     const std::vector<Transcript>& transcripts() const { return transcripts_; }
 
@@ -351,6 +393,14 @@ class AlignmentLibrary {
         return (dir == salmon::utils::Direction::FORWARD) ? posBiasFW_ : posBiasRC_; 
     }
 
+    std::vector<SimplePosBias>& posBiasExpected(salmon::utils::Direction dir) {
+	return (dir == salmon::utils::Direction::FORWARD) ? posBiasExpectFW_ : posBiasExpectRC_;
+    }
+
+    const std::vector<SimplePosBias>& posBiasExpected(salmon::utils::Direction dir) const {
+	return (dir == salmon::utils::Direction::FORWARD) ? posBiasExpectFW_ : posBiasExpectRC_;
+    }
+
     ReadKmerDist<6, std::atomic<uint32_t>>& readBias(salmon::utils::Direction dir) { 
         return (dir == salmon::utils::Direction::FORWARD) ? readBias_[0] : readBias_[1]; 
     }
@@ -377,7 +427,49 @@ class AlignmentLibrary {
 	readBiasModelExpected_[idx] = std::move(model);
     }
  
+    const std::vector<uint32_t>& getLengthQuantiles() const { return lengthQuantiles_; }
+    
     private:
+    
+    void setTranscriptLengthClasses_(std::vector<uint32_t>& lengths, size_t nbins) {
+        auto n = lengths.size();
+        if ( n > nbins) {
+            lengthQuantiles_.clear();
+            lengthQuantiles_.reserve(nbins);
+      
+            size_t step = lengths.size() / nbins;
+            size_t cumStep = 0;
+            for (size_t i = 0; i < nbins; ++i) {
+                cumStep += step;
+                size_t ind = std::min(cumStep, n-1);
+                std::nth_element(lengths.begin(), lengths.begin() + ind, lengths.end());
+                // Find the proper quantile 
+                lengthQuantiles_.push_back(*(lengths.begin() + ind));
+            }
+        } else {
+            lengthQuantiles_.clear();
+            lengthQuantiles_.reserve(n);
+            std::sort(lengths.begin(), lengths.end());
+            for (auto l : lengths) {
+                lengthQuantiles_.push_back(l);
+            }
+            posBiasFW_.resize(n);
+            posBiasRC_.resize(n);
+            posBiasExpectFW_.resize(n);
+            posBiasExpectRC_.resize(n);
+        }
+
+        auto qb = lengthQuantiles_.begin();
+        auto qe = lengthQuantiles_.end();
+        auto maxQuant = std::distance(qb, qe) - 1;
+        for (auto& t : transcripts_) {
+            auto ind = std::min(maxQuant, std::distance(qb, std::upper_bound(qb, qe, t.RefLength)));
+            // the index is that of the smallest quantile strictly longer than this length
+            t.lengthClassIndex(ind);
+        }
+    }
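+    // Illustrative example: with nbins = 5 and 100 transcript lengths,
+    // step = 20, so the stored quantiles are the sorted lengths at
+    // indices 20, 40, 60, 80 and 99 (the last index is clamped to n-1).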
+
+
     /**
      * The file from which the alignments will be read.
      * This can be a SAM or BAM file, and can be a regular
@@ -438,8 +530,11 @@ class AlignmentLibrary {
     EquivalenceClassBuilder eqBuilder_;
 
     /** Positional bias things**/
+    std::vector<uint32_t> lengthQuantiles_;
     std::vector<SimplePosBias> posBiasFW_;
     std::vector<SimplePosBias> posBiasRC_;
+    std::vector<SimplePosBias> posBiasExpectFW_;
+    std::vector<SimplePosBias> posBiasExpectRC_;
  
     /** GC-fragment bias things **/
     // One bin for each percentage GC content
@@ -456,6 +551,7 @@ class AlignmentLibrary {
     //ReadKmerDist<6, std::atomic<uint32_t>> readBias_;
     std::vector<double> expectedBias_;
     std::unique_ptr<LibraryTypeDetector> detector_{nullptr};
+    std::vector<double> conditionalMeans_;
 };
 
 #endif // ALIGNMENT_LIBRARY_HPP
diff --git a/include/CollapsedGibbsSampler.hpp b/include/CollapsedGibbsSampler.hpp
index aa617cb..16bca52 100644
--- a/include/CollapsedGibbsSampler.hpp
+++ b/include/CollapsedGibbsSampler.hpp
@@ -22,8 +22,17 @@ class CollapsedGibbsSampler {
         template <typename ExpT>
         bool sample(ExpT& readExp,
                       SalmonOpts& sopt,
-                      std::function<bool(const std::vector<int>&)>& writeBootstrap,
+                      std::function<bool(const std::vector<double>&)>& writeBootstrap,
                       uint32_t numSamples = 500);
+
+  /*
+        template <typename ExpT>
+        bool sampleMultipleChains(ExpT& readExp,
+              SalmonOpts& sopt,
+              std::function<bool(const std::vector<double>&)>& writeBootstrap,
+              uint32_t numSamples = 500);
+  */
+
 };
 
 #endif // COLLAPSED_EM_OPTIMIZER_HPP
diff --git a/include/EquivalenceClassBuilder.hpp b/include/EquivalenceClassBuilder.hpp
index 1f53ed3..d589c99 100644
--- a/include/EquivalenceClassBuilder.hpp
+++ b/include/EquivalenceClassBuilder.hpp
@@ -69,7 +69,7 @@ class EquivalenceClassBuilder {
             countMap_.reserve(1000000);
         }
 
-        ~EquivalenceClassBuilder() {}
+  //~EquivalenceClassBuilder() {}
 
         void start() { active_ = true; }
 
diff --git a/include/FragmentLengthDistribution.hpp b/include/FragmentLengthDistribution.hpp
index 5aa1800..e057d5e 100644
--- a/include/FragmentLengthDistribution.hpp
+++ b/include/FragmentLengthDistribution.hpp
@@ -16,6 +16,8 @@
 #include <string>
 #include <mutex>
 
+#include "SpinLock.hpp" // RapMap's with try_lock
+
 /**
  * The LengthDistribution class keeps track of the observed length distribution.
  * It is initialized with a Gaussian prior with parameters specified by the
@@ -36,9 +38,12 @@ class FragmentLengthDistribution {
    /**
    * A private vector that stores the observed (logged) mass for each length.
    */
-    std::vector<double> cachedCMF_;
+  std::vector<double> cachedCMF_;
+  std::vector<double> cachedPMF_;
     volatile bool haveCachedCMF_;
-    std::mutex fldMut_;
+  //std::mutex fldMut_;
+  SpinLock sl_;
+
 
   /**
    * A private double that stores the total observed (logged) mass.
@@ -127,6 +132,8 @@ public:
    * @return (Logged) cmf of bins.
    */
   std::vector<double> cmf() const;
+  // Same as above, but compute the CMF from the supplied PMF
+  std::vector<double> cmf(const std::vector<double>& pmf) const;
 
 
   /**
diff --git a/include/GCFragModel.hpp b/include/GCFragModel.hpp
index 67fcafa..d2ccbe9 100644
--- a/include/GCFragModel.hpp
+++ b/include/GCFragModel.hpp
@@ -2,180 +2,226 @@
 #define __GC_FRAG_MODEL__
 
 #include "DistributionUtils.hpp"
-#include "SalmonMath.hpp"
 #include "Eigen/Dense"
+#include "SalmonMath.hpp"
 
 #include <boost/iostreams/filtering_stream.hpp>
 
-#include <vector>
 #include <iostream>
+#include <vector>
 
 struct GCDesc {
-    int32_t fragFrac;
-    int32_t contextFrac;
-
-    // assumes 101 bins
-    int32_t fragBin() { return fragFrac; }
-    int32_t contextBin() { return contextFrac; }
-
-    int32_t fragBin(int32_t n) {
-        double w = (100.0 / n);
-        return std::min(n-1, static_cast<int32_t>(fragFrac / w));
-    }
-    int32_t contextBin(int32_t n) {
-        double w = (100.0 / n);
-        return std::min(n-1, static_cast<int32_t>(contextFrac / w));
-    }
+  int32_t fragFrac;
+  int32_t contextFrac;
+
+  // assumes 101 bins
+  int32_t fragBin() { return fragFrac; }
+  int32_t contextBin() { return contextFrac; }
+
+  int32_t fragBin(int32_t n) {
+    double w = (100.0 / n);
+    return std::min(n - 1, static_cast<int32_t>(fragFrac / w));
+  }
+  int32_t contextBin(int32_t n) {
+    double w = (100.0 / n);
+    return std::min(n - 1, static_cast<int32_t>(contextFrac / w));
+  }
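+
+  // Illustrative example: with n = 25 bins, w = 100.0 / 25 = 4.0, so a
+  // fragment with 37% GC (fragFrac = 37) maps to bin
+  // min(24, int32_t(37 / 4.0)) = 9.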
 };
 
 class GCFragModel {
 public:
-  GCFragModel(size_t condBins=3,
-	      size_t numGCBins=101,
-	      distribution_utils::DistributionSpace dspace=distribution_utils::DistributionSpace::LOG) : 
-    condBins_(condBins),
-    numGCBins_(numGCBins),
-	dspace_(dspace),
-        normalized_(false)
-    {
-        counts_ = Eigen::MatrixXd(condBins_, numGCBins_);
-	if (dspace_ == distribution_utils::DistributionSpace::LOG) {
-	  counts_.setOnes();
-	  counts_ *= salmon::math::LOG_0;
-	} else {
-	  counts_.setZero();
-	}
+  GCFragModel(size_t condBins = 3, size_t numGCBins = 101,
+              distribution_utils::DistributionSpace dspace =
+                  distribution_utils::DistributionSpace::LOG)
+      : condBins_(condBins), numGCBins_(numGCBins), dspace_(dspace),
+        normalized_(false) {
+    counts_ = Eigen::MatrixXd(condBins_, numGCBins_);
+    if (dspace_ == distribution_utils::DistributionSpace::LOG) {
+      counts_.setOnes();
+      counts_ *= salmon::math::LOG_0;
+    } else {
+      counts_.setZero();
     }
-
-    bool writeBinary(boost::iostreams::filtering_ostream& out) const {
-        auto* mutThis = const_cast<GCFragModel*>(this);
-        int32_t dtype = (dspace_ == distribution_utils::DistributionSpace::LINEAR) ? 0 : 1;
-        out.write(reinterpret_cast<char*>(&dtype), sizeof(dtype));
-        typename Eigen::MatrixXd::Index rows= counts_.rows(), cols= counts_.cols();
-        out.write(reinterpret_cast<char*>(&rows), sizeof(typename Eigen::MatrixXd::Index));
-        out.write(reinterpret_cast<char*>(&cols), sizeof(typename Eigen::MatrixXd::Index));
-        out.write(reinterpret_cast<char*>(mutThis->counts_.data()), rows*cols*sizeof(typename Eigen::MatrixXd::Scalar));
-        return true;
+    // set the total vector to be the right size and full of 0's.
+    modelTotals_.resize(condBins_, 0.0);
+  }
+
+  bool writeBinary(boost::iostreams::filtering_ostream& out) const {
+    auto* mutThis = const_cast<GCFragModel*>(this);
+    int32_t dtype =
+        (dspace_ == distribution_utils::DistributionSpace::LINEAR) ? 0 : 1;
+    out.write(reinterpret_cast<char*>(&dtype), sizeof(dtype));
+    typename Eigen::MatrixXd::Index rows = counts_.rows(),
+                                    cols = counts_.cols();
+    out.write(reinterpret_cast<char*>(&rows),
+              sizeof(typename Eigen::MatrixXd::Index));
+    out.write(reinterpret_cast<char*>(&cols),
+              sizeof(typename Eigen::MatrixXd::Index));
+    out.write(reinterpret_cast<char*>(const_cast<double*>(modelTotals_.data())), sizeof(double) * rows);
+    out.write(reinterpret_cast<char*>(mutThis->counts_.data()),
+              rows * cols * sizeof(typename Eigen::MatrixXd::Scalar));
+    return true;
+  }
+
+  GCFragModel(const GCFragModel&) = default;
+  GCFragModel(GCFragModel&&) = default;
+  GCFragModel& operator=(const GCFragModel&) = default;
+  GCFragModel& operator=(GCFragModel&&) = default;
+
+  /*
+  double likelihood_(uint32_t numBins) {
+  }
+  uint32_t optNumBins() {
+    if (numGCBins_ != 101) {
+      std::cerr << "Selecting the optimal number of bins is currently "
+                << "only supported when the initial histograms are generated "
+                << "using 101 bins.\n";
+      std::exit(1);
     }
+    std::vector<uint32_t> nbins{5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 101};
+    // Using the "BR" penalty
+    // see:
+    // Davies, Laurie, et al. "A comparison of automatic histogram constructions."
+    // ESAIM: Probability and Statistics 13 (2009): 181-196.
+    std::vector<double> scores;
+    for (auto nb : nbins) {
+      likelihood_(nb)
+    }
+  }
+  */
+
+  void reset(distribution_utils::DistributionSpace dspace =
+                 distribution_utils::DistributionSpace::LOG) {
+    normalized_ = false;
+    dspace_ = dspace;
+    if (dspace_ == distribution_utils::DistributionSpace::LOG) {
+      counts_.setOnes();
+      counts_ *= salmon::math::LOG_0;
+    } else {
+      counts_.setZero();
+    }
+  }
 
-
-    GCFragModel(const GCFragModel&) = default;
-    GCFragModel(GCFragModel&&) = default;
-    GCFragModel& operator=(const GCFragModel&) = default;
-    GCFragModel& operator=(GCFragModel&&) = default;
- 
-    void reset(distribution_utils::DistributionSpace dspace=distribution_utils::DistributionSpace::LOG) {
-        normalized_ = false;
-	dspace_=dspace;
-	if (dspace_ == distribution_utils::DistributionSpace::LOG) {
-	  counts_.setOnes();
-	  counts_ *= salmon::math::LOG_0;
-	} else {
-	  counts_.setZero();
-	}
+  GCFragModel ratio(GCFragModel& other, double maxRatio) {
+    if (!normalized_) {
+      normalize();
     }
-    
-    GCFragModel ratio(GCFragModel& other, double maxRatio) {
-        if (!normalized_) { normalize(); }
-        if (!other.normalized_) { other.normalize(); }
-        double minRatio = 1.0 / maxRatio;
-
-        GCFragModel ratioModel(condBins_, numGCBins_, dspace_);
-        for (size_t r = 0; r <condBins_; ++r) {
-            for (size_t c = 0; c < numGCBins_; ++c) {
-                double rat = (counts_(r,c) / other.counts_(r,c));
-                if (rat > maxRatio) { rat = maxRatio; }
-                if (rat < minRatio ) { rat = minRatio; }
-                ratioModel.counts_(r,c) =  rat;
-            }
+    if (!other.normalized_) {
+      other.normalize();
+    }
+    double minRatio = 1.0 / maxRatio;
+
+    GCFragModel ratioModel(condBins_, numGCBins_, dspace_);
+    for (size_t r = 0; r < condBins_; ++r) {
+      for (size_t c = 0; c < numGCBins_; ++c) {
+        double rat = (counts_(r, c) / other.counts_(r, c));
+        if (rat > maxRatio) {
+          rat = maxRatio;
+        }
+        if (rat < minRatio) {
+          rat = minRatio;
         }
-        return ratioModel;
+        ratioModel.counts_(r, c) = rat;
+      }
     }
-
-    void inc(
-             GCDesc desc,
-             double fragWeight    //< the weight associated with this fragment 
-             ) {
-      auto ctx = (condBins_ > 1) ? desc.contextBin(condBins_) : 0;
-        auto frag = (numGCBins_ != 101) ? desc.fragBin(numGCBins_) : desc.fragBin();
-
-	if (dspace_ == distribution_utils::DistributionSpace::LOG) {
-	  counts_(ctx, frag) = salmon::math::logAdd(counts_(ctx, frag), fragWeight);
-	} else {
-	  counts_(ctx, frag) += fragWeight;
-	}
+    return ratioModel;
+  }
+
+  void inc(GCDesc desc,
+           double fragWeight //< the weight associated with this fragment
+           ) {
+    auto ctx = (condBins_ > 1) ? desc.contextBin(condBins_) : 0;
+    auto frag = (numGCBins_ != 101) ? desc.fragBin(numGCBins_) : desc.fragBin();
+
+    if (dspace_ == distribution_utils::DistributionSpace::LOG) {
+      counts_(ctx, frag) = salmon::math::logAdd(counts_(ctx, frag), fragWeight);
+    } else {
+      counts_(ctx, frag) += fragWeight;
     }
+  }
 
   double get(GCDesc desc) {
-      auto ctx = (condBins_ > 1) ? desc.contextBin(condBins_) : 0;
-        auto frag = (numGCBins_ != 101) ? desc.fragBin(numGCBins_) : desc.fragBin();
-        return counts_(ctx, frag); 
+    auto ctx = (condBins_ > 1) ? desc.contextBin(condBins_) : 0;
+    auto frag = (numGCBins_ != 101) ? desc.fragBin(numGCBins_) : desc.fragBin();
+    return counts_(ctx, frag);
+  }
+
+  distribution_utils::DistributionSpace distributionSpace() const {
+    return dspace_;
+  }
+
+  void combineCounts(const GCFragModel& other) {
+    if (dspace_ != other.dspace_) {
+      std::cerr
+          << "Cannot combine distributions that live in a different space!\n";
+      std::exit(1);
     }
-
-  distribution_utils::DistributionSpace distributionSpace() const { return dspace_; }
-
-    void combineCounts(const GCFragModel& other) {
-      if (dspace_ != other.dspace_) {
-	std::cerr << "Cannot combine distributions that live in a different space!\n";
-	std::exit(1);
+    if (dspace_ == distribution_utils::DistributionSpace::LOG) {
+      for (size_t r = 0; r < condBins_; ++r) {
+        for (size_t c = 0; c < numGCBins_; ++c) {
+          counts_(r, c) =
+              salmon::math::logAdd(counts_(r, c), other.counts_(r, c));
+        }
+      }
+    } else {
+      for (size_t r = 0; r < condBins_; ++r) {
+        for (size_t c = 0; c < numGCBins_; ++c) {
+          counts_(r, c) += other.counts_(r, c);
+        }
       }
+    }
+  }
+
+  /**
+   * NOTE: Improve interface --- also converts out of log space
+   */
+  void normalize(double prior = 0.1) {
+    if (!normalized_) {
       if (dspace_ == distribution_utils::DistributionSpace::LOG) {
-	for (size_t r = 0; r <condBins_; ++r) {
-	  for (size_t c = 0; c < numGCBins_; ++c) {
-	    counts_(r,c) = salmon::math::logAdd(counts_(r,c), other.counts_(r,c));
-	  }
-	}
+        prior = std::log(prior);
+        for (size_t r = 0; r < condBins_; ++r) {
+          double rowMass{salmon::math::LOG_0};
+          for (size_t c = 0; c < numGCBins_; ++c) {
+            rowMass = salmon::math::logAdd(
+                prior, salmon::math::logAdd(rowMass, counts_(r, c)));
+          }
+          if (!salmon::math::isLog0(rowMass)) {
+            for (size_t c = 0; c < numGCBins_; ++c) {
+              counts_(r, c) = std::exp(
+                  salmon::math::logAdd(prior, counts_(r, c)) - rowMass);
+            }
+            modelTotals_[r] = std::exp(rowMass);
+          }
+          // if rowMass is LOG_0, then leave modelTotals_[r] as 0.0
+        }
       } else {
-	for (size_t r = 0; r <condBins_; ++r) {
-	  for (size_t c = 0; c < numGCBins_; ++c) {
-	    counts_(r,c) += other.counts_(r,c);
-	  }
-	}
+        for (size_t r = 0; r < condBins_; ++r) {
+          double rowMass = 0.0;
+          for (size_t c = 0; c < numGCBins_; ++c) {
+            rowMass += (prior + counts_(r, c));
+          }
+          if (rowMass > 0.0) {
+            double norm = 1.0 / rowMass;
+            for (size_t c = 0; c < numGCBins_; ++c) {
+              counts_(r, c) = (prior + counts_(r, c)) * norm;
+            }
+            modelTotals_[r] = rowMass;
+          }
+          // if rowMass is 0.0, just leave modelTotals_[r] as 0.0; 
+        }
       }
+      normalized_ = true;
+      dspace_ = distribution_utils::DistributionSpace::LINEAR;
     }
+  }
 
-    /**
-     * NOTE: Improve interface --- also converts out of log space
-     */
-    void normalize(double prior=0.1) {
-        if (!normalized_){
-	  if (dspace_ == distribution_utils::DistributionSpace::LOG) {
-	    prior = std::log(prior);
-	    for (size_t r = 0; r < condBins_; ++r) {
-	      double rowMass{salmon::math::LOG_0};
-	      for (size_t c = 0; c < numGCBins_; ++c) {
-		rowMass = salmon::math::logAdd(prior, salmon::math::logAdd(rowMass, counts_(r,c)));
-	      }
-	      if (!salmon::math::isLog0(rowMass)) {
-		for (size_t c = 0; c < numGCBins_; ++c) {
-		  counts_(r,c) = std::exp(salmon::math::logAdd(prior, counts_(r,c)) - rowMass);
-		}
-	      }
-	    }
-	  } else {
-	    for (size_t r = 0; r < condBins_; ++r) {
-	      double rowMass = 0.0;
-	      for (size_t c = 0; c < numGCBins_; ++c) {
-		rowMass += (prior + counts_(r,c));
-	      }
-	      if (rowMass > 0.0) {
-		double norm = 1.0 / rowMass;
-		for (size_t c = 0; c < numGCBins_; ++c) {
-		  counts_(r,c) = (prior + counts_(r,c)) * norm;
-		}
-	      }
-	    }
-	  }
-	  normalized_ = true;
-	  dspace_ = distribution_utils::DistributionSpace::LINEAR;
-	}
-    }
 private:
   size_t condBins_;
   size_t numGCBins_;
-    distribution_utils::DistributionSpace dspace_;
-    bool normalized_;
-    Eigen::MatrixXd counts_;
+  distribution_utils::DistributionSpace dspace_;
+  bool normalized_;
+  Eigen::MatrixXd counts_;
+  std::vector<double> modelTotals_;
 };
 
 #endif //__GC_FRAG_MODEL__
diff --git a/include/GZipWriter.hpp b/include/GZipWriter.hpp
index deb6e0e..e3975ea 100644
--- a/include/GZipWriter.hpp
+++ b/include/GZipWriter.hpp
@@ -28,9 +28,7 @@ class GZipWriter {
     template <typename ExpT>
     bool writeMeta(
 	const SalmonOpts& opts,
-	const ExpT& experiment,
-    const std::string& tstring  = "now"  // the start time of the run
-	);
+	const ExpT& experiment);
 
     template <typename ExpT>
     bool writeAbundances(
@@ -38,8 +36,9 @@ class GZipWriter {
       ExpT& readExp);
 
     template <typename T>
-    bool writeBootstrap(const std::vector<T>& abund);
+    bool writeBootstrap(const std::vector<T>& abund, bool quiet=false);
 
+  bool setSamplingPath(const SalmonOpts& sopt);
    private:
      boost::filesystem::path path_;
      boost::filesystem::path bsPath_;
diff --git a/include/PartitionRefiner.hpp b/include/PartitionRefiner.hpp
deleted file mode 100644
index eb349a2..0000000
--- a/include/PartitionRefiner.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-/**
->HEADER
-    Copyright (c) 2013 Rob Patro robp at cs.cmu.edu
-
-    This file is part of Sailfish.
-
-    Sailfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    Sailfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with Sailfish.  If not, see <http://www.gnu.org/licenses/>.
-<HEADER
-**/
-
-
-#ifndef __PARTITION_REFINER_HPP__
-#define __PARTITION_REFINER_HPP__
-
-#include <vector>
-#include "LookUpTableUtils.hpp"
-
-class PartitionRefiner {
-public:
-        PartitionRefiner(LUTTools::KmerID numElem);
-        void splitWith(std::vector<LUTTools::KmerID> splitter);
-        void relabel();
-        const std::vector<LUTTools::KmerID>& partitionMembership();
-
-private:
-        LUTTools::KmerID numElem_;
-        std::vector<LUTTools::KmerID> membership_;
-        LUTTools::KmerID maxSetIdx_;
-};
-
-#endif // __PARTITION_REFINER_HPP__
diff --git a/include/PerfectHashIndex.hpp b/include/PerfectHashIndex.hpp
deleted file mode 100644
index 3217858..0000000
--- a/include/PerfectHashIndex.hpp
+++ /dev/null
@@ -1,174 +0,0 @@
-/**
->HEADER
-    Copyright (c) 2013 Rob Patro robp at cs.cmu.edu
-
-    This file is part of Sailfish.
-
-    Sailfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    Sailfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with Sailfish.  If not, see <http://www.gnu.org/licenses/>.
-<HEADER
-**/
-
-
-#ifndef __PERFECT_HASH_INDEX_HPP__
-#define __PERFECT_HASH_INDEX_HPP__
-
-#include <vector>
-#include <chrono>
-#include <iostream>
-#include <cstdio>
-#include <memory>
-#include <functional>
-
-#include <sys/mman.h>
-
-#include "boost/timer/timer.hpp"
-#include "cmph.h"
-
-//template <typename Deleter>
-class PerfectHashIndex {
-  using Kmer = uint64_t;
-  using Count = uint32_t;
-  using AtomicKmer = std::atomic<Kmer>;
-  using AtomicCount = std::atomic<Count>;
-  using Deleter = std::function<void(cmph_t*)>;
-
-  public:
-   // We'll return this invalid id if a kmer is not found in our DB
-   size_t INVALID = std::numeric_limits<size_t>::max();
-
-   PerfectHashIndex( std::vector<Kmer>& kmers, std::unique_ptr<cmph_t, Deleter>& hash, 
-                     uint32_t merSize, bool canonical ) : kmers_(std::move(kmers)), 
-                                                          hash_(std::move(hash)), 
-                                                          hashRaw_(hash_.get()),
-                                                          merSize_(merSize),
-                                                          canonical_(canonical) {}
-
-   PerfectHashIndex( PerfectHashIndex&& ph ) {
-   	merSize_ = ph.merSize_;
-   	hash_ = std::move(ph.hash_);
-    hashRaw_ = hash_.get();
-   	kmers_ = std::move(ph.kmers_);
-    canonical_ = ph.canonical_;
-   }
-
-   void dumpToFile(const std::string& fname) {
-   	FILE* out = fopen(fname.c_str(), "w");
-
-   	// read the key set
-    fwrite( reinterpret_cast<char*>(&merSize_), sizeof(merSize_), 1, out );
-    fwrite( reinterpret_cast<char*>(&canonical_), sizeof(canonical_), 1, out);
-    size_t numCounts = kmers_.size();
-    fwrite( reinterpret_cast<char*>(&numCounts), sizeof(size_t), 1, out );
-    fwrite( reinterpret_cast<char*>(&kmers_[0]), sizeof(Kmer), numCounts, out );
-
-    cmph_dump(hash_.get(), out); 
-    fclose(out);
-
-   }
-
-   static PerfectHashIndex fromFile( const std::string& fname ) {
-   	FILE* in = fopen(fname.c_str(),"r");
-
-   	// read the key set
-    uint32_t merSize;
-    fread( reinterpret_cast<char*>(&merSize), sizeof(merSize), 1, in );
-    bool canonical;
-    fread( reinterpret_cast<char*>(&canonical), sizeof(canonical), 1, in );
-    size_t numCounts;
-    fread( reinterpret_cast<char*>(&numCounts), sizeof(size_t), 1, in );
-    std::vector<Kmer> kmers(numCounts, Kmer(0));
-    fread( reinterpret_cast<char*>(&kmers[0]), sizeof(Kmer), numCounts, in );
-
-    // read the hash
-    std::unique_ptr<cmph_t, Deleter> hash( cmph_load(in), cmph_destroy );
-    PerfectHashIndex index(kmers, hash, merSize, canonical);
-
-    fclose(in);
-
-    return index;
-   }
-
-   inline size_t getKmerIndex( uint64_t kmer ) {
-    return kmer % kmers_.size();
-   }
-
-   inline size_t index( uint64_t kmer ) {
-   	char *key = reinterpret_cast<char*>(&kmer);
-    unsigned int id{cmph_search(hashRaw_, key, sizeof(uint64_t))};
-    return (kmers_[id] == kmer) ? id : INVALID;
-   }
-
-   inline size_t numKeys() { return kmers_.size(); }
-
-   bool verify() {
-   	auto start = std::chrono::steady_clock::now();
-   	for ( auto k : kmers_ ) { 
-   		if( kmers_[index(k)] != k ) { return false; }
-   	}
-   	auto end = std::chrono::steady_clock::now();
-   	auto ms = std::chrono::duration_cast<std::chrono::microseconds>(end-start);
-   	std::cerr << "verified: " << static_cast<double>(ms.count()) / kmers_.size() << " us / key\n";
-   	return true;
-   }
-
-   void will_need(uint32_t threadIdx, uint32_t numThreads) {
-     auto pageSize = sysconf(_SC_PAGESIZE);
-     size_t numPages{0};
-     
-     auto entriesPerPage = pageSize / sizeof(char);
-     auto size = cmph_size(hashRaw_);
-     numPages = (sizeof(char) * size) / entriesPerPage;
-     // number of pages that each thread should touch
-     auto numPagesPerThread = numPages / numThreads;
-     auto entriesPerThread = entriesPerPage * numPagesPerThread;
-     // the page this thread starts touching
-     auto start = entriesPerPage * threadIdx;
-     // the last page this thread touches
-     auto end = start + entriesPerThread;
-
-     for (size_t i = start; i < size; i += numThreads*entriesPerPage) {      
-      *(reinterpret_cast<char*>(hashRaw_)+i) = *(reinterpret_cast<char*>(hashRaw_)+i);
-     }
-
-     // entries per page
-     entriesPerPage = pageSize / sizeof(Kmer);
-     // total number of pages
-     size = kmers_.size();
-     numPages = (sizeof(Kmer) * kmers_.size()) / entriesPerPage;
-     // number of pages that each thread should touch
-     numPagesPerThread = numPages / numThreads;
-     entriesPerThread = entriesPerPage * numPagesPerThread;
-     // the page this thread starts touching
-     start = entriesPerPage * threadIdx;
-     // the last page this thread touches
-     end = start + entriesPerThread;
-     for (size_t i = start; i < size; i += numThreads*entriesPerPage) {
-      //std::cerr << "thread " << threadIdx << " is touching page " << i / entriesPerPage << "\n";
-      kmers_[i] = kmers_[i];
-     }
-   }
-
-   inline bool canonical() { return canonical_; }
-   inline uint32_t kmerLength() { return merSize_; }
-   const std::vector<Kmer>& kmers() { return kmers_; }
-
-   private:
-   	std::vector<Kmer> kmers_;
-   	std::unique_ptr<cmph_t, Deleter> hash_;
-    cmph_t* hashRaw_;
-   	uint32_t merSize_;
-    bool canonical_;
-};
-
-#endif // __PERFECT_HASH_INDEX_HPP__
\ No newline at end of file
diff --git a/include/ReadExperiment.hpp b/include/ReadExperiment.hpp
index 68f709d..225da27 100644
--- a/include/ReadExperiment.hpp
+++ b/include/ReadExperiment.hpp
@@ -65,6 +65,8 @@ class ReadExperiment {
         fragStartDists_(5),
         posBiasFW_(5),
         posBiasRC_(5),
+	posBiasExpectFW_(5),
+	posBiasExpectRC_(5),
         seqBiasModel_(1.0),
 	eqBuilder_(sopt.jointLog),
         expectedBias_(constExprPow(4, readBias_[0].getK()), 1.0),
@@ -88,6 +90,36 @@ class ReadExperiment {
                     fragLenKernelN,
                     fragLenKernelP, 1));
 
+
+            if (readLibraries_.front().getFormat().type == ReadType::SINGLE_END) {
+                // Convert the PMF to non-log scale
+                std::vector<double> logPMF;
+                size_t minVal;
+                size_t maxVal;
+                fragLengthDist_->dumpPMF(logPMF, minVal, maxVal);
+                double sum = salmon::math::LOG_0;
+                for (auto v : logPMF) {
+                    sum = salmon::math::logAdd(sum, v);
+                }
+                for (auto& v : logPMF) {
+                    v -= sum;
+                }
+
+                // Create the non-logged distribution.
+                // Here, we multiply by 100 to discourage small
+                // numbers in the correctionFactorsFromMass call
+                // below.
+                std::vector<double> pmf(maxVal + 1, 0.0);
+                for (size_t i = minVal; i < maxVal; ++i) {
+                    pmf[i] = 100.0 * std::exp(logPMF[i - minVal]);
+                }
+
+                using distribution_utils::DistributionSpace;
+                // We compute the factors in linear space (since we've de-logged the pmf)
+                conditionalMeans_ = distribution_utils::correctionFactorsFromMass(pmf, DistributionSpace::LINEAR);
+            }
+
+
             // Make sure the transcript file exists.
             /*
             if (!bfs::exists(transcriptFile_)) {
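The hunk above normalizes the fragment-length PMF in log space (a logAdd over all
entries, then subtracting the log-sum from each) before exponentiating it into a
linear-scale vector. A minimal self-contained sketch of that conversion, with logAdd
written out via the usual log-sum-exp identity (names illustrative, not salmon's API):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <limits>
    #include <vector>

    // Stable log(exp(a) + exp(b)); -infinity plays the role of log(0).
    double logAdd(double a, double b) {
      if (a == -std::numeric_limits<double>::infinity()) { return b; }
      if (b == -std::numeric_limits<double>::infinity()) { return a; }
      double m = std::max(a, b);
      return m + std::log1p(std::exp(std::min(a, b) - m));
    }

    // Normalize a log-space PMF and de-log it, scaling by `scale`
    // (the hunk uses 100.0 to avoid very small linear-scale values).
    std::vector<double> toLinearPMF(const std::vector<double>& logPMF, double scale) {
      double logSum = -std::numeric_limits<double>::infinity();
      for (double v : logPMF) { logSum = logAdd(logSum, v); }
      std::vector<double> pmf(logPMF.size(), 0.0);
      for (size_t i = 0; i < logPMF.size(); ++i) {
        pmf[i] = scale * std::exp(logPMF[i] - logSum);
      }
      return pmf;
    }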
@@ -152,6 +184,8 @@ class ReadExperiment {
     std::vector<Transcript>& transcripts() { return transcripts_; }
     const std::vector<Transcript>& transcripts() const { return transcripts_; }
 
+        const std::vector<double>& condMeans() const { return conditionalMeans_; }
+
     void updateTranscriptLengthsAtomic(std::atomic<bool>& done) {
         if (sl_.try_lock()) {
             if (!done) {
@@ -230,13 +264,16 @@ class ReadExperiment {
 
     SalmonIndex* getIndex() { return salmonIndex_.get(); }
 
+
     template <typename QuasiIndexT>
     void loadTranscriptsFromQuasi(QuasiIndexT* idx_, const SalmonOpts& sopt) {
 	    size_t numRecords = idx_->txpNames.size();
-        auto log = spdlog::get("jointLog");
+        auto log = sopt.jointLog.get();
 
 	    log->info("Index contained {} targets", numRecords);
 	    //transcripts_.resize(numRecords);
+	    std::vector<uint32_t> lengths;
+	    lengths.reserve(numRecords);
 	    double alpha = 0.005;
 	    for (auto i : boost::irange(size_t(0), numRecords)) {
 		    uint32_t id = i;
@@ -245,33 +282,38 @@ class ReadExperiment {
 		    // copy over the length, then we're done.
 		    transcripts_.emplace_back(id, name, len, alpha);
 		    auto& txp = transcripts_.back();
+        txp.setCompleteLength(idx_->txpCompleteLens[i]);
 		    // The transcript sequence
 		    //auto txpSeq = idx_->seq.substr(idx_->txpOffsets[i], len);
 
 		    // Set the transcript sequence
 		    txp.setSequenceBorrowed(idx_->seq.c_str() + idx_->txpOffsets[i],
                                     sopt.gcBiasCorrect, sopt.gcSampFactor);
+		    lengths.push_back(txp.RefLength);
+		    /*
 		    // Length classes taken from
-            // https://github.com/cole-trapnell-lab/cufflinks/blob/master/src/biascorrection.cpp
+		    // https://github.com/cole-trapnell-lab/cufflinks/blob/master/src/biascorrection.cpp
 		    // ======
 		    // Roberts, Adam, et al.
 		    // "Improving RNA-Seq expression estimates by correcting for fragment bias."
 		    // Genome Biol 12.3 (2011): R22.
 		    // ======
 		    // perhaps, define these in a more data-driven way
-            if (txp.RefLength <= 791) {
-                txp.lengthClassIndex(0);
-            } else if (txp.RefLength <= 1265) {
-                txp.lengthClassIndex(1);
-            } else if (txp.RefLength <= 1707) {
-                txp.lengthClassIndex(2);
-            } else if (txp.RefLength <= 2433) {
-                txp.lengthClassIndex(3);
-            } else {
-                txp.lengthClassIndex(4);
-            }
+		    if (txp.RefLength <= 791) {
+			txp.lengthClassIndex(0);
+		    } else if (txp.RefLength <= 1265) {
+			txp.lengthClassIndex(1);
+		    } else if (txp.RefLength <= 1707) {
+			txp.lengthClassIndex(2);
+		    } else if (txp.RefLength <= 2433) {
+			txp.lengthClassIndex(3);
+		    } else {
+			txp.lengthClassIndex(4);
+		    }
+		    */
       }
 	    // ====== Done loading the transcripts from file
+	    setTranscriptLengthClasses_(lengths, posBiasFW_.size());
     }
 
     void loadTranscriptsFromFMD() {
@@ -303,6 +345,8 @@ class ReadExperiment {
 	    nucTab[0] = 'A'; nucTab[1] = 'C'; nucTab[2] = 'G'; nucTab[3] = 'T';
 	    for (size_t i = 4; i < 256; ++i) { nucTab[i] = 'N'; }
 
+	    std::vector<uint32_t> lengths;
+	    lengths.reserve(transcripts_tmp.size());
         size_t tnum = 0;
 	    // Load the transcript sequence from file
 	    for (auto& t : transcripts_tmp) {
@@ -331,7 +375,8 @@ class ReadExperiment {
             std::strcpy(seqCopy, seq.c_str());
             txp.setSequenceOwned(seqCopy);
 		    txp.setSAMSequenceOwned(salmon::stringtools::encodeSequenceInSAM(seq.c_str(), t.RefLength));
-
+	    lengths.push_back(t.RefLength);
+		    /*
             // Length classes taken from
             // https://github.com/cole-trapnell-lab/cufflinks/blob/master/src/biascorrection.cpp
 		    // ======
@@ -351,6 +396,7 @@ class ReadExperiment {
             } else {
                 txp.lengthClassIndex(4);
             }
+	    */
 		    free(rseq);
 		    /* end BWA code */
             ++tnum;
@@ -363,12 +409,13 @@ class ReadExperiment {
 	    /** END TEST OPT **/
 	    transcripts_tmp.clear();
 	    // ====== Done loading the transcripts from file
+	    setTranscriptLengthClasses_(lengths, posBiasFW_.size());
     }
 
 
     template <typename CallbackT>
     bool processReads(const uint32_t& numThreads, const SalmonOpts& sopt, CallbackT& processReadLibrary) {
-        std::atomic<bool> burnedIn{totalAssignedFragments_ + numAssignedFragments_ > sopt.numBurninFrags};
+        std::atomic<bool> burnedIn{totalAssignedFragments_ + numAssignedFragments_ >= sopt.numBurninFrags};
         for (auto& rl : readLibraries_) {
             processReadLibrary(rl, salmonIndex_.get(), transcripts_, clusterForest(),
                                *(fragLengthDist_.get()), numAssignedFragments_,
@@ -627,6 +674,13 @@ class ReadExperiment {
         return (dir == salmon::utils::Direction::FORWARD) ? posBiasFW_ : posBiasRC_; 
     }
 
+    std::vector<SimplePosBias>& posBiasExpected(salmon::utils::Direction dir) {
+      return (dir == salmon::utils::Direction::FORWARD) ? posBiasExpectFW_ : posBiasExpectRC_;
+    }
+    const std::vector<SimplePosBias>& posBiasExpected(salmon::utils::Direction dir) const {
+      return (dir == salmon::utils::Direction::FORWARD) ? posBiasExpectFW_ : posBiasExpectRC_;
+    }
+
     ReadKmerDist<6, std::atomic<uint32_t>>& readBias(salmon::utils::Direction dir) { 
         return (dir == salmon::utils::Direction::FORWARD) ? readBias_[0] : readBias_[1]; 
     }
@@ -651,8 +705,50 @@ class ReadExperiment {
         size_t idx = (dir == salmon::utils::Direction::FORWARD) ? 0 : 1;
 	readBiasModelExpected_[idx] = std::move(model);
     }
+
+    const std::vector<uint32_t>& getLengthQuantiles() const { return lengthQuantiles_; }
   
     private:
+
+  void setTranscriptLengthClasses_(std::vector<uint32_t>& lengths, size_t nbins) {
+    auto n = lengths.size();
+    if ( n > nbins) {
+      lengthQuantiles_.clear();
+      lengthQuantiles_.reserve(nbins);
+      
+      size_t step = lengths.size() / nbins;
+      size_t cumStep = 0;
+      for (size_t i = 0; i < nbins; ++i) {
+	cumStep += step;
+	size_t ind = std::min(cumStep, n-1);
+	std::nth_element(lengths.begin(), lengths.begin() + ind, lengths.end());
+	// Find the proper quantile 
+	lengthQuantiles_.push_back(*(lengths.begin() + ind));
+      }
+    } else {
+      lengthQuantiles_.clear();
+      lengthQuantiles_.reserve(n);
+      std::sort(lengths.begin(), lengths.end());
+      for (auto l : lengths) {
+	lengthQuantiles_.push_back(l);
+      }
+      posBiasFW_.resize(n);
+      posBiasRC_.resize(n);
+      posBiasExpectFW_.resize(n);
+      posBiasExpectRC_.resize(n);
+    }
+
+    auto qb = lengthQuantiles_.begin();
+    auto qe = lengthQuantiles_.end();
+    auto maxQuant = std::distance(qb, qe) - 1;
+    for (auto& t : transcripts_) {
+      auto ind = std::min(maxQuant, std::distance(qb, std::upper_bound(qb, qe, t.RefLength)));
+      // the index is the smallest quantile longer than this length
+      t.lengthClassIndex(ind);
+    }
+  }
+
+  
     /**
      * The file from which the alignments will be read.
      * This can be a SAM or BAM file, and can be a regular
@@ -705,9 +801,12 @@ class ReadExperiment {
     EquivalenceClassBuilder eqBuilder_;
 
     /** Positional bias things**/
+    std::vector<uint32_t> lengthQuantiles_;
     std::vector<SimplePosBias> posBiasFW_;
     std::vector<SimplePosBias> posBiasRC_;
- 
+    std::vector<SimplePosBias> posBiasExpectFW_;
+    std::vector<SimplePosBias> posBiasExpectRC_;
+
     /** GC-fragment bias things **/
     // One bin for each percentage GC content
     double gcFracFwd_{-1.0};
@@ -722,6 +821,7 @@ class ReadExperiment {
     std::array<SBModel, 2> readBiasModelExpected_;
     //std::array<std::vector<double>, 2> expectedBias_;
     std::vector<double> expectedBias_;
+    std::vector<double> conditionalMeans_;
 };
 
 #endif // EXPERIMENT_HPP
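setTranscriptLengthClasses_ above replaces the hard-coded Cufflinks length classes
with data-driven quantiles: std::nth_element places just the element at each quantile
index into sorted position, and std::upper_bound then maps a transcript length to the
smallest quantile above it. A standalone sketch of the same idea, assuming nbins is at
most the number of lengths (the diff handles the smaller case separately; names
illustrative):

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Compute `nbins` length quantiles; takes `lengths` by value because
    // nth_element reorders its input.
    std::vector<uint32_t> lengthQuantiles(std::vector<uint32_t> lengths, size_t nbins) {
      std::vector<uint32_t> quantiles;
      quantiles.reserve(nbins);
      size_t n = lengths.size();
      size_t step = n / nbins;
      size_t cum = 0;
      for (size_t i = 0; i < nbins; ++i) {
        cum += step;
        size_t ind = std::min(cum, n - 1);
        // Partial sort: only lengths[ind] is guaranteed to land in sorted position.
        std::nth_element(lengths.begin(), lengths.begin() + ind, lengths.end());
        quantiles.push_back(lengths[ind]);
      }
      return quantiles;
    }

    // Class index: the smallest quantile greater than `len`, clamped to the last bin.
    size_t lengthClass(const std::vector<uint32_t>& quantiles, uint32_t len) {
      auto it = std::upper_bound(quantiles.begin(), quantiles.end(), len);
      size_t ind = static_cast<size_t>(std::distance(quantiles.begin(), it));
      return std::min(ind, quantiles.size() - 1);
    }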
diff --git a/include/ReadPair.hpp b/include/ReadPair.hpp
index 1939907..8a6ab7d 100644
--- a/include/ReadPair.hpp
+++ b/include/ReadPair.hpp
@@ -82,6 +82,11 @@ struct ReadPair {
 
     inline int32_t pos() const { return left(); }
     inline bool fwd() const { return !bam_strand(read1); }
+    inline bool isInward() const {
+      bool fw1 = !bam_strand(read1);
+      bool fw2 = !bam_strand(read2);
+      return (fw1 != fw2);
+    }
 
     /**
       * returns 0 on success, -1 on failure.
@@ -112,10 +117,8 @@ struct ReadPair {
     // end of the 3' read (can be less than the length of a single read)
     inline uint32_t fragLengthPedantic(uint32_t txpLen) const { 
         if (!isPaired()) { return 0; }
-
         bool fw1 = !bam_strand(read1);
         bool fw2 = !bam_strand(read2);
-
         if (fw1 != fw2) {
             int32_t p1 = fw1 ? bam_pos(read1) : bam_pos(read2);
             p1 = (p1 < 0) ? 0 : p1;
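isInward() above tests that the mates align to opposite strands; fragLengthPedantic()
(only partially visible in this hunk) then measures the fragment from clamped mate
coordinates. Since the full body is not shown here, the following is only a hedged
sketch of the general computation for an inward pair, not salmon's exact code:

    #include <algorithm>
    #include <cstdint>

    // Fragment length for an inward-oriented pair, clamped to the transcript.
    // fwStart: the leftmost (forward-strand) mate's start; rcEnd: one past the
    // reverse mate's last aligned base. All names are illustrative.
    uint32_t fragLenInward(int32_t fwStart, int32_t rcEnd, uint32_t txpLen) {
      int32_t start = std::max(fwStart, 0);
      int32_t end = std::min(rcEnd, static_cast<int32_t>(txpLen));
      return (end > start) ? static_cast<uint32_t>(end - start) : 0;
    }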
diff --git a/include/SailfishUtils.hpp b/include/SailfishUtils.hpp
deleted file mode 100644
index 749d73a..0000000
--- a/include/SailfishUtils.hpp
+++ /dev/null
@@ -1,153 +0,0 @@
-/**
->HEADER
-    Copyright (c) 2013 Rob Patro robp at cs.cmu.edu
-
-    This file is part of Sailfish.
-
-    Sailfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    Sailfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with Sailfish.  If not, see <http://www.gnu.org/licenses/>.
-<HEADER
-**/
-
-
-#ifndef SAILFISH_UTILS_HPP
-#define SAILFISH_UTILS_HPP
-
-#include <future>
-#include <algorithm>
-#include <iostream>
-#include <tuple>
-#include <unordered_set>
-#include <unordered_map>
-#include <vector>
-#include <boost/filesystem.hpp>
-#include <boost/program_options.hpp>
-
-#include "LibraryFormat.hpp"
-#include "ReadLibrary.hpp"
-#include "TranscriptGeneMap.hpp"
-#include "GenomicFeature.hpp"
-
-namespace sailfish{
-namespace utils {
-using std::string;
-using NameVector = std::vector<string>;
-using IndexVector = std::vector<size_t>;
-using KmerVector = std::vector<uint64_t>;
-
-// Returns a uint64_t where the upper 32-bits
-// contain tid and the lower 32-bits contain offset
-uint64_t encode(uint64_t tid, uint64_t offset);
-
-// Given a uin64_t generated by encode(), return the
-// transcript id --- upper 32-bits
-uint32_t transcript(uint64_t enc);
-
-// Given a uin64_t generated by encode(), return the
-// offset --- lower 32-bits
-uint32_t offset(uint64_t enc);
-
-
-LibraryFormat parseLibraryFormatStringNew(std::string& fmt);
-
-std::vector<ReadLibrary> extractReadLibraries(boost::program_options::parsed_options& orderedOptions);
-
-LibraryFormat parseLibraryFormatString(std::string& fmt);
-
-size_t numberOfReadsInFastaFile(const std::string& fname);
-
-bool readKmerOrder( const std::string& fname, std::vector<uint64_t>& kmers );
-
-template <template<typename> class S, typename T>
-bool overlap( const S<T> &a, const S<T> &b );
-
-template< typename T >
-TranscriptGeneMap transcriptToGeneMapFromFeatures( std::vector<GenomicFeature<T>> &feats ) {
-    using std::unordered_set;
-    using std::unordered_map;
-    using std::vector;
-    using std::tuple;
-    using std::string;
-    using std::get;
-
-    using NameID = tuple<string, size_t>;
-
-    IndexVector t2g;
-    NameVector transcriptNames;
-    NameVector geneNames;
-
-    // holds the mapping from transcript ID to gene ID
-    IndexVector t2gUnordered;
-    // holds the set of gene IDs
-    unordered_map<string, size_t> geneNameToID;
-
-    // To read the input and assign ids
-    size_t geneCounter = 0;
-    string transcript;
-    string gene;
-
-    std::sort( feats.begin(), feats.end(),
-    []( const GenomicFeature<T> & a, const GenomicFeature<T> & b) -> bool {
-        return a.sattr.transcript_id < b.sattr.transcript_id;
-    } );
-
-    std::string currentTranscript = "";
-    for ( auto & feat : feats ) {
-
-        auto &gene = feat.sattr.gene_id;
-        auto &transcript = feat.sattr.transcript_id;
-
-        if ( transcript != currentTranscript ) {
-            auto geneIt = geneNameToID.find(gene);
-            size_t geneID = 0;
-
-            if ( geneIt == geneNameToID.end() ) {
-                // If we haven't seen this gene yet, give it a new ID
-                geneNameToID[gene] = geneCounter;
-                geneID = geneCounter;
-                geneNames.push_back(gene);
-                ++geneCounter;
-            } else {
-                // Otherwise lookup the ID
-                geneID = geneIt->second;
-            }
-
-            transcriptNames.push_back(transcript);
-            t2g.push_back(geneID);
-
-            //++transcriptID;
-            currentTranscript = transcript;
-        }
-
-    }
-
-    return TranscriptGeneMap(transcriptNames, geneNames, t2g);
-}
-
-TranscriptGeneMap transcriptGeneMapFromGTF(const std::string& fname, std::string key="gene_id");
-
-TranscriptGeneMap readTranscriptToGeneMap( std::ifstream &ifile );
-
-TranscriptGeneMap transcriptToGeneMapFromFasta( const std::string& transcriptsFile );
-
-void aggregateEstimatesToGeneLevel(TranscriptGeneMap& tgm, boost::filesystem::path& inputPath);
-
-// NOTE: Throws an invalid_argument exception of the quant or quant_bias_corrected files do
-// not exist!
-void generateGeneLevelEstimates(boost::filesystem::path& geneMapPath,
-                                boost::filesystem::path& estDir,
-                                bool haveBiasCorrectedFile = false);
-
-}
-}
-#endif // UTILS_HPP
diff --git a/include/SalmonConfig.hpp b/include/SalmonConfig.hpp
index 59524d0..a65421f 100644
--- a/include/SalmonConfig.hpp
+++ b/include/SalmonConfig.hpp
@@ -27,10 +27,11 @@
 
 namespace salmon {
 	constexpr char majorVersion[] = "0";
-	constexpr char minorVersion[] = "7";
-	constexpr char patchVersion[] = "2";
-	constexpr char version[] = "0.7.2";
-    constexpr uint32_t indexVersion = 2;
+	constexpr char minorVersion[] = "8";
+	constexpr char patchVersion[] = "0";
+	constexpr char version[] = "0.8.0";
+  constexpr uint32_t indexVersion = 2;
+  constexpr char requiredQuasiIndexVersion[] = "q4";
 }
 
 #endif // SALMON_CONFIG_HPP
diff --git a/include/SalmonIndex.hpp b/include/SalmonIndex.hpp
index d3b2b53..7cdf775 100644
--- a/include/SalmonIndex.hpp
+++ b/include/SalmonIndex.hpp
@@ -18,6 +18,7 @@ extern "C" {
 #include "cereal/types/vector.hpp"
 
 #include "BooMap.hpp"
+#include "FrugalBooMap.hpp"
 #include "RapMapSAIndex.hpp"
 #include "IndexHeader.hpp"
 #include "BWAUtils.hpp"
@@ -32,11 +33,11 @@ int bwa_index(int argc, char* argv[]);
 int rapMapSAIndex(int argc, char* argv[]);
 
 template <typename IndexT> 
-using DenseHash = google::dense_hash_map<uint64_t, 
+using DenseHash = spp::sparse_hash_map<uint64_t, 
                                          rapmap::utils::SAInterval<IndexT>, 
                                          rapmap::utils::KmerKeyHasher>;
 template <typename IndexT> 
-using PerfectHash = BooMap<uint64_t, rapmap::utils::SAInterval<IndexT>>;
+using PerfectHash = FrugalBooMap<uint64_t, rapmap::utils::SAInterval<IndexT>>;
 
 class SalmonIndex{
         public:
@@ -291,6 +292,10 @@ class SalmonIndex{
                   }
                   indexStream.close();
 
+                  if (h.version() != salmon::requiredQuasiIndexVersion) {
+                    fmt::print(stderr, "I found a quasi-index with version {}, but I require {}",
+                               h.version(), salmon::requiredQuasiIndexVersion);
+                  }
                   if (h.indexType() != IndexType::QUASI) {
                     fmt::print(stderr, "The index {} does not appear to be of the "
                                         "appropriate type (quasi)", indexStr);
diff --git a/include/SalmonOpts.hpp b/include/SalmonOpts.hpp
index 0b3a937..8c422d9 100644
--- a/include/SalmonOpts.hpp
+++ b/include/SalmonOpts.hpp
@@ -28,21 +28,35 @@ struct SalmonOpts {
     bool maxMEMIntervals; // If true, don't split (S)MEMs into MEMs
     */
 
-    SalmonOpts() : allowOrphans(false), splitSpanningSeeds(false), noFragLengthDist(false),
+    SalmonOpts() : alternativeInitMode(false), allowOrphans(false), splitSpanningSeeds(false), noFragLengthDist(false),
                    noEffectiveLengthCorrection(false), useReadCompat(false),
                    maxReadOccs(200), extraSeedPass(false),
                    mappingCacheMemoryLimit(5000000), useQuasi(false) {}
 
+    bool alternativeInitMode; // Weigh unique reads more heavily when initializing the optimization.
+
     bool allowOrphans; // Consider orphaned reads when performing lightweight alignment.
 
     std::string auxDir; // The directory where auxiliary files will be written.
 
-    std::string runStartTime; // String representation of the date / time at which the run began.
+  std::string runStartTime; // String representation of the date / time at which the run began.
+
+  std::string runStopTime; // String representation of the date / time at which the run ended.
 
     bool consistentHits;  // Enforce consistency of hits gathered during quasi-mapping.
 
     bool dumpEq; 	     // Dump the equivalence classes and counts to file
 
+    bool dumpEqWeights;      // Dump the equivalence classes along with their rich weights
+
+    bool fasterMapping; // [Developer]: Disables some extra checks during quasi-mapping. This may make mapping a 
+                        // little bit faster at the potential cost of returning too many mappings (i.e. some sub-optimal mappings) 
+                        // for certain reads. Only use this option if you know what it does (enables NIP-skipping)
+
+    bool gencodeRef; // The reference is expected to be from Gencode.
+
+    double quasiCoverage; // [Experimental]: Default of 0.  The coverage by MMPs required for a read to be considered mapped.
+
     bool splitSpanningSeeds; // Attempt to split seeds that span multiple transcripts.
 
     bool noFragLengthDist; // Don't give a fragment assignment a likelihood based on an empirically
@@ -52,6 +66,8 @@ struct SalmonOpts {
                                       // account when computing the probability that a
                                      // fragment was generated from a transcript.
 
+    bool noLengthCorrection; // Don't account for transcript length at all during abundance estimation.
+
     bool noBiasLengthThreshold; // Don't require that the recomputed effective length for a target
                                 // be above a threshold before applying it.
     bool useBiasLengthThreshold; // Don't require that the recomputed effective length for a target
@@ -114,6 +130,8 @@ struct SalmonOpts {
     bool extraSeedPass; // Perform extra pass trying to find seeds to cover the read
 
     bool disableMappingCache; // Don't write mapping results to temporary mapping cache file
+    
+    bool meta; // Set other options to be optimized for metagenomic data
 
     boost::filesystem::path outputDirectory; // Quant output directory
 
@@ -136,11 +154,18 @@ struct SalmonOpts {
 
   std::unique_ptr<std::ofstream> unmappedFile{nullptr};
     bool writeUnmappedNames; // write the names of unmapped reads
+    std::shared_ptr<spdlog::logger> unmappedLog{nullptr};
+    
+    std::unique_ptr<std::ofstream> orphanLinkFile{nullptr};
+    bool writeOrphanLinks; // write orphan links to file
+    std::shared_ptr<spdlog::logger> orphanLinkLog{nullptr};
+
     bool sampleOutput; // Sample alignments according to posterior estimates of transcript abundance.
     bool sampleUnaligned; // Pass along un-aligned reads in the sampling.
 
     uint32_t numGibbsSamples; // Number of rounds of Gibbs sampling to perform
     uint32_t numBootstraps; // Number of bootstrap samples to draw
+    uint32_t thinningFactor; // Gibbs chain thinning factor
 
     bool initUniform{false}; // initialize offline optimization parameters uniformly, rather than with online estimates.
     bool alnMode{false};     // true if we're in alignment based mode, false otherwise
diff --git a/include/SalmonUtils.hpp b/include/SalmonUtils.hpp
index 4dd6428..cd31591 100644
--- a/include/SalmonUtils.hpp
+++ b/include/SalmonUtils.hpp
@@ -119,7 +119,7 @@ Eigen::VectorXd updateEffectiveLengths(
 template <typename AbundanceVecT, typename ReadExpT>
 Eigen::VectorXd updateEffectiveLengths(SalmonOpts& sopt, ReadExpT& readExp,
                                                       Eigen::VectorXd& effLensIn,
-                                                      AbundanceVecT& alphas, bool finalRound=false);
+                                       AbundanceVecT& alphas, std::vector<bool>& available, bool finalRound=false);
 
 
 /*
@@ -163,6 +163,10 @@ inline void incLoop(tbb::atomic<double>& val, double inc) {
         } while (returnedMass != oldMass);
 }
 
+std::string getCurrentTimeAsString();
+
+bool validateOptions(SalmonOpts& sopt);
+
 bool processQuantOptions(SalmonOpts& sopt, boost::program_options::variables_map& vm, int32_t numBiasSamples);
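The incLoop context above shows the compare-and-swap loop salmon uses to accumulate
into a tbb::atomic<double>. The same pattern, sketched with std::atomic<double>
instead (a substitution, not salmon's code); compare_exchange_weak reloads the
expected value on failure, so each retry proposes an increment against fresh state:

    #include <atomic>

    // Atomically perform val += inc via a CAS retry loop.
    void atomicAdd(std::atomic<double>& val, double inc) {
      double oldMass = val.load(std::memory_order_relaxed);
      // On failure, compare_exchange_weak stores the current value back into
      // oldMass, so the next iteration retries against the freshest state.
      while (!val.compare_exchange_weak(oldMass, oldMass + inc)) {
      }
    }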
 
 
diff --git a/include/Sampler.hpp b/include/Sampler.hpp
index 4ed6ec1..70e3102 100644
--- a/include/Sampler.hpp
+++ b/include/Sampler.hpp
@@ -50,7 +50,6 @@ extern "C" {
 #include "AlignmentModel.hpp"
 #include "FragmentLengthDistribution.hpp"
 #include "TranscriptCluster.hpp"
-#include "SailfishUtils.hpp"
 #include "SalmonUtils.hpp"
 #include "SalmonConfig.hpp"
 #include "SalmonOpts.hpp"
@@ -95,10 +94,17 @@ namespace salmon {
                 std::uniform_real_distribution<> uni(0.0, 1.0 + std::numeric_limits<double>::min());
 
                 using salmon::math::LOG_0;
+                using salmon::math::LOG_1;
+                using salmon::math::LOG_EPSILON;
                 using salmon::math::logAdd;
                 using salmon::math::logSub;
 
                 bool useFSPD{salmonOpts.useFSPD};
+                bool noLengthCorrection{salmonOpts.noLengthCorrection};
+                bool useFragLengthDist{!salmonOpts.noFragLengthDist};
+                bool useAuxParams = (processedReads >= salmonOpts.numPreBurninFrags);
+                bool considerCondProb = (useAuxParams or burnedIn);
+
                 auto& refs = alnLib.transcripts();
                 auto& clusterForest = alnLib.clusterForest();
                 auto& fragmentQueue = alnLib.fragmentQueue();
@@ -111,6 +117,10 @@ namespace salmon {
 
                 const auto expectedLibraryFormat = alnLib.format();
 
+                auto isUnexpectedOrphan = [expectedLibraryFormat](FragT* aln) -> bool {
+                  return (expectedLibraryFormat.type == ReadType::PAIRED_END and !aln->isPaired());
+                };
+
                 std::chrono::microseconds sleepTime(1);
                 MiniBatchInfo<AlignmentGroup<FragT*>>* miniBatch = nullptr;
                 size_t numTranscripts = refs.size();
@@ -161,39 +171,71 @@ namespace salmon {
 
                                     double refLength = transcript.RefLength > 0 ? transcript.RefLength : 1.0;
 
+                                    auto flen = aln->fragLen();
+                                    // If we have a properly-paired read then use the "pedantic"
+                                    // definition here.
+                                    if (aln->isPaired() and aln->isInward()) { 
+                                      flen = aln->fragLengthPedantic(refLength); 
+                                    }
+
+                                    // The probability of drawing a fragment of this length.
                                     double logFragProb = salmon::math::LOG_1;
+                                    // If we are expecting a paired-end library, and this is an orphan,
+                                    // then logFragProb should be small
+                                    if (isUnexpectedOrphan(aln)) {
+                                      logFragProb = LOG_EPSILON;
+                                    }
 
-                                    if (!salmonOpts.noFragLengthDist) {
-                                        if(aln->fragLen() == 0) {
-                                            if (aln->isLeft() and transcript.RefLength - aln->left() < fragLengthDist.maxVal()) {
-                                                logFragProb = fragLengthDist.cmf(transcript.RefLength - aln->left());
-                                            } else if (aln->isRight() and aln->right() < fragLengthDist.maxVal()) {
-                                                logFragProb = fragLengthDist.cmf(aln->right());
-                                            }
-                                        } else {
-                                            logFragProb = fragLengthDist.pmf(static_cast<size_t>(aln->fragLen()));
+                                    if (flen > 0.0 and aln->isPaired() and useFragLengthDist and considerCondProb) {
+                                      size_t fl = flen;
+                                      double lenProb = fragLengthDist.pmf(fl); 
+                                      if (burnedIn) {
+                                        /* condition fragment length prob on txp length */
+                                        double refLengthCM = fragLengthDist.cmf(static_cast<size_t>(refLength)); 
+                                        bool computeMass = fl < refLength and !salmon::math::isLog0(refLengthCM);
+                                        logFragProb = (computeMass) ?
+                                                                (lenProb - refLengthCM) :
+                                          salmon::math::LOG_EPSILON;
+                                        if (computeMass and refLengthCM < lenProb) {
+                                          // Threading is hard!  It's possible that an update to the PMF snuck in between when we asked to cache the CMF and when the
+                                          // "burnedIn" variable was last seen as false.
+                                          log->info("reference length = {}, CMF[refLen] = {}, fragLen = {}, PMF[fragLen] = {}",
+                                                    refLength, std::exp(refLengthCM), aln->fragLen(), std::exp(lenProb));
                                         }
+                                      } else if (useAuxParams) {
+                                        logFragProb = lenProb;
+                                      }
+                                    }
+
+                                    if (!salmonOpts.noFragLengthDist and aln->fragLen() > 0.0) {
+                                      logFragProb = fragLengthDist.pmf(static_cast<size_t>(aln->fragLen()));
                                     }
 
                                     // The alignment probability is the product of a
                                     // transcript-level term (based on abundance) and an
                                     // alignment-level term.
                                     double logRefLength{salmon::math::LOG_0};
-                                    if (salmonOpts.noEffectiveLengthCorrection or !burnedIn) {
-                                        logRefLength = std::log(transcript.RefLength);
+                                    if (noLengthCorrection) {
+                                      logRefLength = 1.0;
+                                    } else if (salmonOpts.noEffectiveLengthCorrection or !burnedIn) {
+                                      logRefLength = std::log(transcript.RefLength);
                                     } else {
-                                        logRefLength = transcript.getCachedLogEffectiveLength();
+                                      logRefLength = transcript.getCachedLogEffectiveLength();
                                     }
 
-
-                                    double logAlignCompatProb =
-                                        (salmonOpts.useReadCompat) ?
-                                        (salmon::utils::logAlignFormatProb(
-                                                  aln->libFormat(),
-                                                  expectedLibraryFormat,
-                                                  aln->pos(),
-                                                  aln->fwd(), aln->mateStatus(), salmonOpts.incompatPrior)
-                                        ) : LOG_1;
+                                    // The probability that the fragments align to the given strands in the
+                                    // given orientations.
+                                    bool isCompat = 
+                                      salmon::utils::isCompatible(
+                                                                  aln->libFormat(),
+                                                                  expectedLibraryFormat,
+                                                                  aln->pos(),
+                                                                  aln->fwd(), aln->mateStatus());
+                                    double logAlignCompatProb = isCompat ? LOG_1 : salmonOpts.incompatPrior;
+                                    if (!isCompat and salmonOpts.ignoreIncompat) {
+                                      aln->logProb = salmon::math::LOG_0;
+                                      continue;
+                                    }
 
                                     // Adjustment to the likelihood due to the
                                     // error model
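The rewritten fragment-length term above conditions the length probability on the
transcript being able to produce the fragment: in log space,
log P(len | len <= refLen) = log PMF(len) - log CMF(refLen), with a small-epsilon
fallback when the fragment cannot fit or the CMF mass is zero. A sketch of that
computation over plain log-space lookup tables (illustrative names and epsilon, not
salmon's internals):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <limits>
    #include <vector>

    const double LOG_0 = -std::numeric_limits<double>::infinity();
    const double LOG_EPS_SKETCH = std::log(1e-10); // illustrative floor

    // log P(fragLen | fragLen <= refLen) from log-space PMF/CMF tables.
    double conditionalLogFragProb(const std::vector<double>& logPMF,
                                  const std::vector<double>& logCMF,
                                  size_t fragLen, size_t refLen) {
      size_t cap = std::min(refLen, logCMF.size() - 1);
      bool computable = (fragLen < refLen) && (fragLen < logPMF.size())
                        && (logCMF[cap] != LOG_0);
      return computable ? (logPMF[fragLen] - logCMF[cap]) : LOG_EPS_SKETCH;
    }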
diff --git a/include/SimplePosBias.hpp b/include/SimplePosBias.hpp
index d619ff4..a8ea851 100644
--- a/include/SimplePosBias.hpp
+++ b/include/SimplePosBias.hpp
@@ -4,6 +4,8 @@
 #include "spline.h"
 #include <array>
 #include <vector>
+#include "spdlog/spdlog.h"
+#include <boost/iostreams/filtering_stream.hpp>
 
 class SimplePosBias {
 public:
@@ -28,10 +30,14 @@ public:
   // compute the cdf etc.
   void finalize();
 
+  // Serialize this model.
+  bool writeBinary(boost::iostreams::filtering_ostream& out) const; 
+
 private:
   int32_t numBins_;
   std::vector<double> masses_;
   bool isLogged_{true};
+  bool isFinalized_{false};
   tk::spline s_;
   // position bins taken from Cufflinks:
   // https://github.com/cole-trapnell-lab/cufflinks/blob/master/src/biascorrection.cpp
diff --git a/include/Transcript.hpp b/include/Transcript.hpp
index 5250be5..d739abe 100644
--- a/include/Transcript.hpp
+++ b/include/Transcript.hpp
@@ -18,6 +18,7 @@ public:
 
     Transcript() :
         RefName(nullptr), RefLength(std::numeric_limits<uint32_t>::max()),
+        CompleteLength(std::numeric_limits<uint32_t>::max()),
         EffectiveLength(-1.0), id(std::numeric_limits<uint32_t>::max()),
         logPerBasePrior_(salmon::math::LOG_0),
         priorMass_(salmon::math::LOG_0),
@@ -32,7 +33,7 @@ public:
 
 
     Transcript(size_t idIn, const char* name, uint32_t len, double alpha = 0.05) :
-        RefName(name), RefLength(len), EffectiveLength(-1.0), id(idIn),
+      RefName(name), RefLength(len), CompleteLength(len), EffectiveLength(-1.0), id(idIn),
         logPerBasePrior_(std::log(alpha)),
         priorMass_(std::log(alpha*len)),
         mass_(salmon::math::LOG_0), sharedCount_(0.0),
@@ -53,6 +54,7 @@ public:
 
         RefName = std::move(other.RefName);
         RefLength = other.RefLength;
+        CompleteLength = other.CompleteLength;
         EffectiveLength = other.EffectiveLength;
 
         SAMSequence_ = std::move(other.SAMSequence_);
@@ -82,6 +84,7 @@ public:
 
         RefName = std::move(other.RefName);
         RefLength = other.RefLength;
+        CompleteLength = other.CompleteLength;
         EffectiveLength = other.EffectiveLength;
         SAMSequence_ = std::move(other.SAMSequence_);
         Sequence_ = std::move(other.Sequence_);
@@ -299,44 +302,112 @@ public:
         return hasAnchorFragment_.load();
     }
 
-    inline GCDesc gcDesc(int32_t s, int32_t e) const {
+  inline GCDesc gcDesc(int32_t s, int32_t e, bool& valid) const {
         int outsideContext{3};
         int insideContext{2};
-        
+
         int outside5p = outsideContext + 1;
         int outside3p = outsideContext;
 
         int inside5p = insideContext - 1;
         int inside3p = insideContext;
 
-        int contextSize = outsideContext + insideContext;
+        double contextSize = outsideContext + insideContext;
         int lastPos = RefLength - 1;
         if (gcStep_ == 1) {
-            auto cs = GCCount_[s];
+            auto cs = (s > 0) ? GCCount_[s - 1] : 0;
             auto ce = GCCount_[e];
 
+            
+            int fs = s - outside5p;
+            int fe = s + inside5p;
+            int ts = e - inside3p;
+            int te = e + outside3p;
+
+            bool fpLeftExists = (fs >= 0);
+            bool fpRightExists = (fe <= lastPos);
+            bool tpLeftExists = (ts >= 0);
+            bool tpRightExists = (te <= lastPos);
+
+            /*
+            if (!(fpLeftExists and fpRightExists and tpLeftExists and tpRightExists)) {
+              return GCDesc();
+            }
+            */
+            auto fps = (fpLeftExists) ? GCCount_[fs] : 0;
+            auto fpe = (fpRightExists) ? GCCount_[fe] : ce;
+            auto tps = (tpLeftExists) ? GCCount_[ts] : 0;
+            auto tpe = (tpRightExists) ? GCCount_[te] : ce;
+            
+            // now, clamp to actual bounds
+            fs = (fs < 0) ? 0 : fs;
+            fe = (fe > lastPos) ? lastPos : fe;
+            ts = (ts < 0) ? 0 : ts;
+            te = (te > lastPos) ? lastPos : te;
+            int fpContextSize = (!fpLeftExists) ? (fe + 1) : (fe - fs);
+            int tpContextSize = (!tpLeftExists) ? (te + 1) : (te - ts);
+            contextSize = static_cast<double>(fpContextSize + tpContextSize);
+            if (contextSize == 0) {
+              //std::cerr << "" << std::endl;
+              return GCDesc();
+            }
+            valid = true;
+            /* 
+            
+            int fs = std::max(s - outside5p, 0);
+            int fe = std::min(s + inside5p, lastPos);
+            int ts = std::max(e - inside3p, 0);
+            int te = std::min(e + outside3p, lastPos);
+            
+            //contextSize = static_cast<double>((fe - fs) + (te - ts));
+            auto fps = (s >= outside5p) ? GCCount_[fs] : 0;
+            auto fpe = (inside5p > 0) ? GCCount_[fe] : cs;
+            auto tps = (inside3p > 0) ?
+            ((e >= inside3p) ? GCCount_[e-inside3p] : 0) : ce;
+            auto tpe = GCCount_[te];
+            */
+            
+
+            /* 
             auto fps = (s >= outside5p) ? GCCount_[s-outside5p] : 0;
             auto fpe = (inside5p > 0) ? GCCount_[std::min(s+inside5p, lastPos)] : cs;
-            auto tps = (inside3p > 0) ? 
+            auto tps = (inside3p > 0) ?
                 ((e >= inside3p) ? GCCount_[e-inside3p] : 0) : ce;
             auto tpe = GCCount_[std::min(e+outside3p, lastPos)];
+            */
             
             int32_t fragFrac = std::lrint((100.0 * (ce - cs)) / (e - s + 1));
-            int32_t contextFrac = std::lrint((100.0 * (((fpe - fps) + (tpe - tps)) / (2.0 * contextSize))));
+            //int32_t contextFrac = std::lrint((100.0 * (((fpe - fps) + (tpe - tps)) / (2.0 * contextSize))));
+            int32_t contextFrac = std::lrint((100.0 * (((fpe - fps) + (tpe - tps)) / (contextSize))));
+            //int32_t contextFrac = std::lrint((100.0 * (((fpeCount - fpsCount) + (tpeCount - tpsCount)) / (contextSize))));
+            /*
+            if (contextFrac > 100) {
+              std::cerr << "NOTE : 5' count = " << (fpeCount - fpsCount) << ", 3' count =" << (tpeCount - tpsCount) << ", context size = " << contextSize << std::endl;
+              std::cerr << "s = " << s << ", e = " << e << ", l = " << RefLength << ", fs = " << fs  << ", fe =  " << fe << ", ts = " << ts << ", te = " << te << std::endl;
+              std::cerr << "fpsCount = " << fpsCount << 
+                ", fpeCount = " << fpeCount << 
+                ", tpsCount = " << tpsCount <<
+                ", tpeCount = " << tpeCount  <<
+                ", fpContextSize = " << fpContextSize  <<
+                ", tpContextSize = " << tpContextSize << std::endl;
+
+            } 
+            */
             GCDesc desc = {fragFrac, contextFrac};
             return desc;
         } else {
             auto cs = gcCountInterp_(s);
             auto ce = gcCountInterp_(e);
 
+            valid = true;
 	    auto fps = (s >= outside5p) ? gcCountInterp_(s-outside5p) : 0;
 	    auto fpe = (inside5p > 0) ? gcCountInterp_(std::min(s+inside5p, lastPos)) : cs;
-	    auto tps = (inside3p > 0) ? 
+	    auto tps = (inside3p > 0) ?
 	      ((e >= inside3p) ? gcCountInterp_(e-inside3p) : 0) : ce;
 	    auto tpe = gcCountInterp_(std::min(e+outside3p, lastPos));
-	    
+
             int32_t fragFrac = std::lrint((100.0 * (ce - cs)) / (e - s + 1));
-            int32_t contextFrac = std::lrint((100.0 * (((fpe - fps) + (tpe - tps)) / (10.0))));
+            int32_t contextFrac = std::lrint((100.0 * (((fpe - fps) + (tpe - tps)) / (2.0 * contextSize))));
             GCDesc desc = {fragFrac, contextFrac};
             return desc;
         }
@@ -350,11 +421,11 @@ public:
     // in the interval [s,e] (note; this interval is closed on both sides).
     inline int32_t gcFrac(int32_t s, int32_t e) const {
         if (gcStep_ == 1) {
-            auto cs = GCCount_[s];
+            auto cs = (s > 0) ? GCCount_[s - 1] : 0;
             auto ce = GCCount_[e];
             return std::lrint((100.0 * (ce - cs)) / (e - s + 1));
         } else {
-            auto cs = gcCountInterp_(s);
+            auto cs = (s > 0) ? gcCountInterp_(s - 1) : 0;
             auto ce = gcCountInterp_(e);
             return std::lrint((100.0 * (ce - cs)) / (e - s + 1));
         }
@@ -404,9 +475,13 @@ public:
         return SAMSequence_.get();
     }
 
+  void setCompleteLength(uint32_t completeLengthIn) {
+    CompleteLength = completeLengthIn;
+  }
 
     std::string RefName;
     uint32_t RefLength;
+    uint32_t CompleteLength;
     double EffectiveLength;
     uint32_t id;
 
@@ -426,7 +501,7 @@ private:
     }
 
     inline int32_t closestBin_(int32_t p) const {
-      return static_cast<int32_t>(std::round( static_cast<double>(p) / gcStep_ )); 
+      return static_cast<int32_t>(std::round( static_cast<double>(p) / gcStep_ ));
     }
 
     inline double gcCountInterp_(int32_t p) const {
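The gcFrac fix above changes the left endpoint of the prefix-sum lookup from
GCCount_[s] to GCCount_[s - 1], so that position s itself is counted in the closed
interval [s, e]: with cumulative counts, GC(s, e) = GCCount_[e] - GCCount_[s - 1],
with s = 0 handled explicitly. A self-contained sketch (illustrative names):

    #include <cmath>
    #include <cstddef>
    #include <cstdint>
    #include <string>
    #include <vector>

    // Prefix sums: cum[i] = number of G/C bases in seq[0..i] (inclusive).
    std::vector<uint32_t> gcPrefix(const std::string& seq) {
      std::vector<uint32_t> cum(seq.size(), 0);
      uint32_t c = 0;
      for (size_t i = 0; i < seq.size(); ++i) {
        char b = seq[i];
        if (b == 'G' || b == 'C' || b == 'g' || b == 'c') { ++c; }
        cum[i] = c;
      }
      return cum;
    }

    // Percent GC over the closed interval [s, e]; note the s > 0 guard the hunk
    // introduces (GCCount_[s - 1], not GCCount_[s], as the left bound).
    int32_t gcFrac(const std::vector<uint32_t>& cum, int32_t s, int32_t e) {
      uint32_t cs = (s > 0) ? cum[s - 1] : 0;
      uint32_t ce = cum[e];
      return static_cast<int32_t>(std::lrint((100.0 * (ce - cs)) / (e - s + 1)));
    }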
diff --git a/include/TranscriptGeneMap.hpp b/include/TranscriptGeneMap.hpp
index 68499a0..6c98007 100644
--- a/include/TranscriptGeneMap.hpp
+++ b/include/TranscriptGeneMap.hpp
@@ -95,7 +95,11 @@ public:
         using std::distance;
         using std::lower_bound;
         auto it = lower_bound( _transcriptNames.begin(), _transcriptNames.end(), tname );
-        return ( it == _transcriptNames.end() ) ? INVALID : ( distance(_transcriptNames.begin(), it) );
+        if (it == _transcriptNames.end() or *it != tname) {
+          return INVALID;
+        } else {
+          return distance(_transcriptNames.begin(), it);
+        }
     }
 
     Size numTranscripts() {
@@ -128,15 +132,15 @@ public:
         return _geneNames[_transcriptsToGenes[transcriptID]];
     }
     inline std::string geneName (const std::string& transcriptName,
-                                 bool complain=true) {
+                                 bool& found) {
+        found = false;
         auto tid = findTranscriptID(transcriptName);
         if (tid != INVALID) {
+            found = true;
             return geneName(tid);
         } else {
-            std::cerr << "WARNING: couldn't find transcript named ["
-                      << transcriptName << "]; returning transcript "
-                      << " as it's own gene\n";
-            return transcriptName;
+           found = false;
+           return transcriptName;
         }
     }
 
diff --git a/include/UnpairedRead.hpp b/include/UnpairedRead.hpp
index d855d5b..11cdf8a 100644
--- a/include/UnpairedRead.hpp
+++ b/include/UnpairedRead.hpp
@@ -63,7 +63,7 @@ struct UnpairedRead {
 
    inline int32_t pos() const { return left(); }
    inline bool fwd() const { return !bam_strand(read); }
-
+   inline bool isInward() const { return false; }
     // return 0 on success, -1 on failure
     int writeToFile(scram_fd* fp) {
         return scram_put_seq(fp, read);
diff --git a/include/btree.h b/include/btree.h
deleted file mode 100755
index 49310a2..0000000
--- a/include/btree.h
+++ /dev/null
@@ -1,2394 +0,0 @@
-// Copyright 2013 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-// A btree implementation of the STL set and map interfaces. A btree is both
-// smaller and faster than STL set/map. The red-black tree implementation of
-// STL set/map has an overhead of 3 pointers (left, right and parent) plus the
-// node color information for each stored value. So a set<int32> consumes 20
-// bytes for each value stored. This btree implementation stores multiple
-// values on fixed size nodes (usually 256 bytes) and doesn't store child
-// pointers for leaf nodes. The result is that a btree_set<int32> may use much
-// less memory per stored value. For the random insertion benchmark in
-// btree_test.cc, a btree_set<int32> with node-size of 256 uses 4.9 bytes per
-// stored value.
-//
-// The packing of multiple values on to each node of a btree has another effect
-// besides better space utilization: better cache locality due to fewer cache
-// lines being accessed. Better cache locality translates into faster
-// operations.
-//
-// CAVEATS
-//
-// Insertions and deletions on a btree can cause splitting, merging or
-// rebalancing of btree nodes. And even without these operations, insertions
-// and deletions on a btree will move values around within a node. In both
-// cases, the result is that insertions and deletions can invalidate iterators
-// pointing to values other than the one being inserted/deleted. This is
-// notably different from STL set/map which takes care to not invalidate
-// iterators on insert/erase except, of course, for iterators pointing to the
-// value being erased.  A partial workaround when erasing is available:
-// erase() returns an iterator pointing to the item just after the one that was
-// erased (or end() if none exists).  See also safe_btree.
-
-// PERFORMANCE
-//
-//   btree_bench --benchmarks=. 2>&1 | ./benchmarks.awk
-//
-// Run on pmattis-warp.nyc (4 X 2200 MHz CPUs); 2010/03/04-15:23:06
-// Benchmark                 STL(ns) B-Tree(ns) @    <size>
-// --------------------------------------------------------
-// BM_set_int32_insert        1516      608  +59.89%  <256>    [40.0,  5.2]
-// BM_set_int32_lookup        1160      414  +64.31%  <256>    [40.0,  5.2]
-// BM_set_int32_fulllookup     960      410  +57.29%  <256>    [40.0,  4.4]
-// BM_set_int32_delete        1741      528  +69.67%  <256>    [40.0,  5.2]
-// BM_set_int32_queueaddrem   3078     1046  +66.02%  <256>    [40.0,  5.5]
-// BM_set_int32_mixedaddrem   3600     1384  +61.56%  <256>    [40.0,  5.3]
-// BM_set_int32_fifo           227      113  +50.22%  <256>    [40.0,  4.4]
-// BM_set_int32_fwditer        158       26  +83.54%  <256>    [40.0,  5.2]
-// BM_map_int32_insert        1551      636  +58.99%  <256>    [48.0, 10.5]
-// BM_map_int32_lookup        1200      508  +57.67%  <256>    [48.0, 10.5]
-// BM_map_int32_fulllookup     989      487  +50.76%  <256>    [48.0,  8.8]
-// BM_map_int32_delete        1794      628  +64.99%  <256>    [48.0, 10.5]
-// BM_map_int32_queueaddrem   3189     1266  +60.30%  <256>    [48.0, 11.6]
-// BM_map_int32_mixedaddrem   3822     1623  +57.54%  <256>    [48.0, 10.9]
-// BM_map_int32_fifo           151      134  +11.26%  <256>    [48.0,  8.8]
-// BM_map_int32_fwditer        161       32  +80.12%  <256>    [48.0, 10.5]
-// BM_set_int64_insert        1546      636  +58.86%  <256>    [40.0, 10.5]
-// BM_set_int64_lookup        1200      512  +57.33%  <256>    [40.0, 10.5]
-// BM_set_int64_fulllookup     971      487  +49.85%  <256>    [40.0,  8.8]
-// BM_set_int64_delete        1745      616  +64.70%  <256>    [40.0, 10.5]
-// BM_set_int64_queueaddrem   3163     1195  +62.22%  <256>    [40.0, 11.6]
-// BM_set_int64_mixedaddrem   3760     1564  +58.40%  <256>    [40.0, 10.9]
-// BM_set_int64_fifo           146      103  +29.45%  <256>    [40.0,  8.8]
-// BM_set_int64_fwditer        162       31  +80.86%  <256>    [40.0, 10.5]
-// BM_map_int64_insert        1551      720  +53.58%  <256>    [48.0, 20.7]
-// BM_map_int64_lookup        1214      612  +49.59%  <256>    [48.0, 20.7]
-// BM_map_int64_fulllookup     994      592  +40.44%  <256>    [48.0, 17.2]
-// BM_map_int64_delete        1778      764  +57.03%  <256>    [48.0, 20.7]
-// BM_map_int64_queueaddrem   3189     1547  +51.49%  <256>    [48.0, 20.9]
-// BM_map_int64_mixedaddrem   3779     1887  +50.07%  <256>    [48.0, 21.6]
-// BM_map_int64_fifo           147      145   +1.36%  <256>    [48.0, 17.2]
-// BM_map_int64_fwditer        162       41  +74.69%  <256>    [48.0, 20.7]
-// BM_set_string_insert       1989     1966   +1.16%  <256>    [64.0, 44.5]
-// BM_set_string_lookup       1709     1600   +6.38%  <256>    [64.0, 44.5]
-// BM_set_string_fulllookup   1573     1529   +2.80%  <256>    [64.0, 35.4]
-// BM_set_string_delete       2520     1920  +23.81%  <256>    [64.0, 44.5]
-// BM_set_string_queueaddrem  4706     4309   +8.44%  <256>    [64.0, 48.3]
-// BM_set_string_mixedaddrem  5080     4654   +8.39%  <256>    [64.0, 46.7]
-// BM_set_string_fifo          318      512  -61.01%  <256>    [64.0, 35.4]
-// BM_set_string_fwditer       182       93  +48.90%  <256>    [64.0, 44.5]
-// BM_map_string_insert       2600     2227  +14.35%  <256>    [72.0, 55.8]
-// BM_map_string_lookup       2068     1730  +16.34%  <256>    [72.0, 55.8]
-// BM_map_string_fulllookup   1859     1618  +12.96%  <256>    [72.0, 44.0]
-// BM_map_string_delete       3168     2080  +34.34%  <256>    [72.0, 55.8]
-// BM_map_string_queueaddrem  5840     4701  +19.50%  <256>    [72.0, 59.4]
-// BM_map_string_mixedaddrem  6400     5200  +18.75%  <256>    [72.0, 57.8]
-// BM_map_string_fifo          398      596  -49.75%  <256>    [72.0, 44.0]
-// BM_map_string_fwditer       243      113  +53.50%  <256>    [72.0, 55.8]
-
-#ifndef UTIL_BTREE_BTREE_H__
-#define UTIL_BTREE_BTREE_H__
-
-#include <assert.h>
-#include <stddef.h>
-#include <string.h>
-#include <sys/types.h>
-#include <algorithm>
-#include <functional>
-#include <iostream>
-#include <iterator>
-#include <limits>
-#include <type_traits>
-#include <new>
-#include <ostream>
-#include <string>
-#include <utility>
-
-#ifndef NDEBUG
-#define NDEBUG 1
-#endif
-
-namespace btree {
-
-// Inside a btree method, if we just call swap(), it will choose the
-// btree::swap method, which we don't want. And we can't say ::swap
-// because then MSVC won't pickup any std::swap() implementations. We
-// can't just use std::swap() directly because then we don't get the
-// specialization for types outside the std namespace. So the solution
-// is to have a special swap helper function whose name doesn't
-// collide with other swap functions defined by the btree classes.
-template <typename T>
-inline void btree_swap_helper(T &a, T &b) {
-  using std::swap;
-  swap(a, b);
-}
-
-// A template helper used to select A or B based on a condition.
-template<bool cond, typename A, typename B>
-struct if_{
-  typedef A type;
-};
-
-template<typename A, typename B>
-struct if_<false, A, B> {
-  typedef B type;
-};
-
-// Types small_ and big_ are promise that sizeof(small_) < sizeof(big_)
-typedef char small_;
-
-struct big_ {
-  char dummy[2];
-};
-
-// A compile-time assertion.
-template <bool>
-struct CompileAssert {
-};
-
-#define COMPILE_ASSERT(expr, msg) \
-  typedef CompileAssert<(bool(expr))> msg[bool(expr) ? 1 : -1]
-
-// A helper type used to indicate that a key-compare-to functor has been
-// provided. A user can specify a key-compare-to functor by doing:
-//
-//  struct MyStringComparer
-//      : public util::btree::btree_key_compare_to_tag {
-//    int operator()(const string &a, const string &b) const {
-//      return a.compare(b);
-//    }
-//  };
-//
-// Note that the return type is an int and not a bool. There is a
-// COMPILE_ASSERT which enforces this return type.
-struct btree_key_compare_to_tag {
-};
-
-// A helper class that indicates if the Compare parameter is derived from
-// btree_key_compare_to_tag.
-template <typename Compare>
-struct btree_is_key_compare_to
-    : public std::is_convertible<Compare, btree_key_compare_to_tag> {
-};
-
-// A helper class to convert a boolean comparison into a three-way
-// "compare-to" comparison that returns a negative value to indicate
-// less-than, zero to indicate equality and a positive value to
-// indicate greater-than. This helper class is specialized for
-// less<string> and greater<string>. The btree_key_compare_to_adapter
-// class is provided so that btree users automatically get the more
-// efficient compare-to code when using common google string types
-// with common comparison functors.
-template <typename Compare>
-struct btree_key_compare_to_adapter : Compare {
-  btree_key_compare_to_adapter() { }
-  btree_key_compare_to_adapter(const Compare &c) : Compare(c) { }
-  btree_key_compare_to_adapter(const btree_key_compare_to_adapter<Compare> &c)
-      : Compare(c) {
-  }
-};
-
-template <>
-struct btree_key_compare_to_adapter<std::less<std::string> >
-    : public btree_key_compare_to_tag {
-  btree_key_compare_to_adapter() {}
-  btree_key_compare_to_adapter(const std::less<std::string>&) {}
-  btree_key_compare_to_adapter(
-      const btree_key_compare_to_adapter<std::less<std::string> >&) {}
-  int operator()(const std::string &a, const std::string &b) const {
-    return a.compare(b);
-  }
-};
-
-template <>
-struct btree_key_compare_to_adapter<std::greater<std::string> >
-    : public btree_key_compare_to_tag {
-  btree_key_compare_to_adapter() {}
-  btree_key_compare_to_adapter(const std::greater<std::string>&) {}
-  btree_key_compare_to_adapter(
-      const btree_key_compare_to_adapter<std::greater<std::string> >&) {}
-  int operator()(const std::string &a, const std::string &b) const {
-    return b.compare(a);
-  }
-};
-
-// A helper class that allows a compare-to functor to behave like a plain
-// compare functor. This specialization is used when we do not have a
-// compare-to functor.
-template <typename Key, typename Compare, bool HaveCompareTo>
-struct btree_key_comparer {
-  btree_key_comparer() {}
-  btree_key_comparer(Compare c) : comp(c) {}
-  static bool bool_compare(const Compare &comp, const Key &x, const Key &y) {
-    return comp(x, y);
-  }
-  bool operator()(const Key &x, const Key &y) const {
-    return bool_compare(comp, x, y);
-  }
-  Compare comp;
-};
-
-// A specialization of btree_key_comparer when a compare-to functor is
-// present. We need a plain (boolean) comparison in some parts of the btree
-// code, such as insert-with-hint.
-template <typename Key, typename Compare>
-struct btree_key_comparer<Key, Compare, true> {
-  btree_key_comparer() {}
-  btree_key_comparer(Compare c) : comp(c) {}
-  static bool bool_compare(const Compare &comp, const Key &x, const Key &y) {
-    return comp(x, y) < 0;
-  }
-  bool operator()(const Key &x, const Key &y) const {
-    return bool_compare(comp, x, y);
-  }
-  Compare comp;
-};
-
-// A helper function to compare to keys using the specified compare
-// functor. This dispatches to the appropriate btree_key_comparer comparison,
-// depending on whether we have a compare-to functor or not (which depends on
-// whether Compare is derived from btree_key_compare_to_tag).
-template <typename Key, typename Compare>
-static bool btree_compare_keys(
-    const Compare &comp, const Key &x, const Key &y) {
-  typedef btree_key_comparer<Key, Compare,
-      btree_is_key_compare_to<Compare>::value> key_comparer;
-  return key_comparer::bool_compare(comp, x, y);
-}
-
-template <typename Key, typename Compare,
-          typename Alloc, int TargetNodeSize, int ValueSize>
-struct btree_common_params {
-  // If Compare is derived from btree_key_compare_to_tag then use it as the
-  // key_compare type. Otherwise, use btree_key_compare_to_adapter<> which will
-  // fall-back to Compare if we don't have an appropriate specialization.
-  typedef typename if_<
-    btree_is_key_compare_to<Compare>::value,
-    Compare, btree_key_compare_to_adapter<Compare> >::type key_compare;
-  // A type which indicates if we have a key-compare-to functor or a plain old
-  // key-compare functor.
-  typedef btree_is_key_compare_to<key_compare> is_key_compare_to;
-
-  typedef Alloc allocator_type;
-  typedef Key key_type;
-  typedef ssize_t size_type;
-  typedef ptrdiff_t difference_type;
-
-  enum {
-    kTargetNodeSize = TargetNodeSize,
-
-    // Available space for values.  This is largest for leaf nodes,
-    // which has overhead no fewer than two pointers.
-    kNodeValueSpace = TargetNodeSize - 2 * sizeof(void*),
-  };
-
-  // This is an integral type large enough to hold as many
-  // ValueSize-values as will fit a node of TargetNodeSize bytes.
-  typedef typename if_<
-    (kNodeValueSpace / ValueSize) >= 256,
-    uint16_t,
-    uint8_t>::type node_count_type;
-};
-
-// A parameters structure for holding the type parameters for a btree_map.
-template <typename Key, typename Data, typename Compare,
-          typename Alloc, int TargetNodeSize>
-struct btree_map_params
-    : public btree_common_params<Key, Compare, Alloc, TargetNodeSize,
-                                 sizeof(Key) + sizeof(Data)> {
-  typedef Data data_type;
-  typedef Data mapped_type;
-  typedef std::pair<const Key, data_type> value_type;
-  typedef std::pair<Key, data_type> mutable_value_type;
-  typedef value_type* pointer;
-  typedef const value_type* const_pointer;
-  typedef value_type& reference;
-  typedef const value_type& const_reference;
-
-  enum {
-    kValueSize = sizeof(Key) + sizeof(data_type),
-  };
-
-  static const Key& key(const value_type &x) { return x.first; }
-  static const Key& key(const mutable_value_type &x) { return x.first; }
-  static void swap(mutable_value_type *a, mutable_value_type *b) {
-    btree_swap_helper(a->first, b->first);
-    btree_swap_helper(a->second, b->second);
-  }
-};
-
-// A parameters structure for holding the type parameters for a btree_set.
-template <typename Key, typename Compare, typename Alloc, int TargetNodeSize>
-struct btree_set_params
-    : public btree_common_params<Key, Compare, Alloc, TargetNodeSize,
-                                 sizeof(Key)> {
-  typedef std::false_type data_type;
-  typedef std::false_type mapped_type;
-  typedef Key value_type;
-  typedef value_type mutable_value_type;
-  typedef value_type* pointer;
-  typedef const value_type* const_pointer;
-  typedef value_type& reference;
-  typedef const value_type& const_reference;
-
-  enum {
-    kValueSize = sizeof(Key),
-  };
-
-  static const Key& key(const value_type &x) { return x; }
-  static void swap(mutable_value_type *a, mutable_value_type *b) {
-    btree_swap_helper<mutable_value_type>(*a, *b);
-  }
-};
-
-// An adapter class that converts a lower-bound compare into an upper-bound
-// compare.
-template <typename Key, typename Compare>
-struct btree_upper_bound_adapter : public Compare {
-  btree_upper_bound_adapter(Compare c) : Compare(c) {}
-  bool operator()(const Key &a, const Key &b) const {
-    return !static_cast<const Compare&>(*this)(b, a);
-  }
-};
-
-template <typename Key, typename CompareTo>
-struct btree_upper_bound_compare_to_adapter : public CompareTo {
-  btree_upper_bound_compare_to_adapter(CompareTo c) : CompareTo(c) {}
-  int operator()(const Key &a, const Key &b) const {
-    return static_cast<const CompareTo&>(*this)(b, a);
-  }
-};
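-
-// A worked example of the adapters above: for a sorted node holding keys
-// [1, 3, 3, 5] and k = 3, a lower-bound scan advances while key < k and stops
-// at position 1, while the adapted predicate !comp(k, key) (i.e. key <= k)
-// advances past the duplicates and stops at position 3, which is exactly
-// upper_bound semantics.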
-
-// Dispatch helper class for using linear search with plain compare.
-template <typename K, typename N, typename Compare>
-struct btree_linear_search_plain_compare {
-  static int lower_bound(const K &k, const N &n, Compare comp)  {
-    return n.linear_search_plain_compare(k, 0, n.count(), comp);
-  }
-  static int upper_bound(const K &k, const N &n, Compare comp)  {
-    typedef btree_upper_bound_adapter<K, Compare> upper_compare;
-    return n.linear_search_plain_compare(k, 0, n.count(), upper_compare(comp));
-  }
-};
-
-// Dispatch helper class for using linear search with compare-to
-template <typename K, typename N, typename CompareTo>
-struct btree_linear_search_compare_to {
-  static int lower_bound(const K &k, const N &n, CompareTo comp)  {
-    return n.linear_search_compare_to(k, 0, n.count(), comp);
-  }
-  static int upper_bound(const K &k, const N &n, CompareTo comp)  {
-    typedef btree_upper_bound_adapter<K,
-        btree_key_comparer<K, CompareTo, true> > upper_compare;
-    return n.linear_search_plain_compare(k, 0, n.count(), upper_compare(comp));
-  }
-};
-
-// Dispatch helper class for using binary search with plain compare.
-template <typename K, typename N, typename Compare>
-struct btree_binary_search_plain_compare {
-  static int lower_bound(const K &k, const N &n, Compare comp)  {
-    return n.binary_search_plain_compare(k, 0, n.count(), comp);
-  }
-  static int upper_bound(const K &k, const N &n, Compare comp)  {
-    typedef btree_upper_bound_adapter<K, Compare> upper_compare;
-    return n.binary_search_plain_compare(k, 0, n.count(), upper_compare(comp));
-  }
-};
-
-// Dispatch helper class for using binary search with compare-to.
-template <typename K, typename N, typename CompareTo>
-struct btree_binary_search_compare_to {
-  static int lower_bound(const K &k, const N &n, CompareTo comp)  {
-    return n.binary_search_compare_to(k, 0, n.count(), comp);
-  }
-  static int upper_bound(const K &k, const N &n, CompareTo comp)  {
-    typedef btree_upper_bound_adapter<K,
-        btree_key_comparer<K, CompareTo, true> > upper_compare;
-    return n.binary_search_plain_compare(k, 0, n.count(), upper_compare(comp));
-  }
-};
-
-// A node in the btree. The same node type is used for both internal and
-// leaf nodes, though the nodes are allocated in such a way that the
-// children array is only valid in internal nodes.
-template <typename Params>
-class btree_node {
- public:
-  typedef Params params_type;
-  typedef btree_node<Params> self_type;
-  typedef typename Params::key_type key_type;
-  typedef typename Params::data_type data_type;
-  typedef typename Params::value_type value_type;
-  typedef typename Params::mutable_value_type mutable_value_type;
-  typedef typename Params::pointer pointer;
-  typedef typename Params::const_pointer const_pointer;
-  typedef typename Params::reference reference;
-  typedef typename Params::const_reference const_reference;
-  typedef typename Params::key_compare key_compare;
-  typedef typename Params::size_type size_type;
-  typedef typename Params::difference_type difference_type;
-  // Typedefs for the various types of node searches.
-  typedef btree_linear_search_plain_compare<
-    key_type, self_type, key_compare> linear_search_plain_compare_type;
-  typedef btree_linear_search_compare_to<
-    key_type, self_type, key_compare> linear_search_compare_to_type;
-  typedef btree_binary_search_plain_compare<
-    key_type, self_type, key_compare> binary_search_plain_compare_type;
-  typedef btree_binary_search_compare_to<
-    key_type, self_type, key_compare> binary_search_compare_to_type;
-  // If we have a valid key-compare-to type, use linear_search_compare_to,
-  // otherwise use linear_search_plain_compare.
-  typedef typename if_<
-    Params::is_key_compare_to::value,
-    linear_search_compare_to_type,
-    linear_search_plain_compare_type>::type linear_search_type;
-  // If we have a valid key-compare-to type, use binary_search_compare_to,
-  // otherwise use binary_search_plain_compare.
-  typedef typename if_<
-    Params::is_key_compare_to::value,
-    binary_search_compare_to_type,
-    binary_search_plain_compare_type>::type binary_search_type;
-  // If the key is an integral or floating point type, use linear search,
-  // which is faster than binary search for such types. It might be wise to
-  // also configure linear search based on node-size.
-  typedef typename if_<
-    std::is_integral<key_type>::value ||
-    std::is_floating_point<key_type>::value,
-    linear_search_type, binary_search_type>::type search_type;
-
-  struct base_fields {
-    typedef typename Params::node_count_type field_type;
-
-    // A boolean indicating whether the node is a leaf or not.
-    bool leaf;
-    // The position of the node in the node's parent.
-    field_type position;
-    // The maximum number of values the node can hold.
-    field_type max_count;
-    // The count of the number of values in the node.
-    field_type count;
-    // A pointer to the node's parent.
-    btree_node *parent;
-  };
-
-  enum {
-    kValueSize = params_type::kValueSize,
-    kTargetNodeSize = params_type::kTargetNodeSize,
-
-    // Compute how many values we can fit onto a leaf node.
-    kNodeTargetValues = (kTargetNodeSize - sizeof(base_fields)) / kValueSize,
-    // We need a minimum of 3 values per internal node in order to perform
-    // splitting (1 value for the two nodes involved in the split and 1 value
-    // propagated to the parent as the delimiter for the split).
-    kNodeValues = kNodeTargetValues >= 3 ? kNodeTargetValues : 3,
-
-    kExactMatch = 1 << 30,
-    kMatchMask = kExactMatch - 1,
-  };
-
-  struct leaf_fields : public base_fields {
-    // The array of values. Only the first count of these values have been
-    // constructed and are valid.
-    mutable_value_type values[kNodeValues];
-  };
-
-  struct internal_fields : public leaf_fields {
-    // The array of child pointers. The keys in children_[i] are all less than
-    // key(i). The keys in children_[i + 1] are all greater than key(i). There
-    // are always count + 1 children.
-    btree_node *children[kNodeValues + 1];
-  };
-
-  struct root_fields : public internal_fields {
-    btree_node *rightmost;
-    size_type size;
-  };
-
- public:
-  // Getter for whether this is a leaf node or not. This value doesn't
-  // change after the node is created.
-  bool leaf() const { return fields_.leaf; }
-
-  // Getter for the position of this node in its parent.
-  int position() const { return fields_.position; }
-  void set_position(int v) { fields_.position = v; }
-
-  // Getter/setter for the number of values stored in this node.
-  int count() const { return fields_.count; }
-  void set_count(int v) { fields_.count = v; }
-  int max_count() const { return fields_.max_count; }
-
-  // Getter for the parent of this node.
-  btree_node* parent() const { return fields_.parent; }
-  // Getter for whether the node is the root of the tree. The parent of the
-  // root of the tree is the leftmost node in the tree which is guaranteed to
-  // be a leaf.
-  bool is_root() const { return parent()->leaf(); }
-  void make_root() {
-    assert(parent()->is_root());
-    fields_.parent = fields_.parent->parent();
-  }
-
-  // Getter for the rightmost field of root_fields. Only valid on the root node.
-  btree_node* rightmost() const { return fields_.rightmost; }
-  btree_node** mutable_rightmost() { return &fields_.rightmost; }
-
-  // Getter for the size field of root_fields. Only valid on the root node.
-  size_type size() const { return fields_.size; }
-  size_type* mutable_size() { return &fields_.size; }
-
-  // Getters for the key/value at position i in the node.
-  const key_type& key(int i) const {
-    return params_type::key(fields_.values[i]);
-  }
-  reference value(int i) {
-    return reinterpret_cast<reference>(fields_.values[i]);
-  }
-  const_reference value(int i) const {
-    return reinterpret_cast<const_reference>(fields_.values[i]);
-  }
-  mutable_value_type* mutable_value(int i) {
-    return &fields_.values[i];
-  }
-
-  // Swap value i in this node with value j in node x.
-  void value_swap(int i, btree_node *x, int j) {
-    params_type::swap(mutable_value(i), x->mutable_value(j));
-  }
-
-  // Getters/setter for the child at position i in the node.
-  btree_node* child(int i) const { return fields_.children[i]; }
-  btree_node** mutable_child(int i) { return &fields_.children[i]; }
-  void set_child(int i, btree_node *c) {
-    *mutable_child(i) = c;
-    c->fields_.parent = this;
-    c->fields_.position = i;
-  }
-
-  // Returns the position of the first value whose key is not less than k.
-  template <typename Compare>
-  int lower_bound(const key_type &k, const Compare &comp) const {
-    return search_type::lower_bound(k, *this, comp);
-  }
-  // Returns the position of the first value whose key is greater than k.
-  template <typename Compare>
-  int upper_bound(const key_type &k, const Compare &comp) const {
-    return search_type::upper_bound(k, *this, comp);
-  }
-
-  // Returns the position of the first value whose key is not less than k using
-  // linear search performed using plain compare.
-  template <typename Compare>
-  int linear_search_plain_compare(
-      const key_type &k, int s, int e, const Compare &comp) const {
-    while (s < e) {
-      if (!btree_compare_keys(comp, key(s), k)) {
-        break;
-      }
-      ++s;
-    }
-    return s;
-  }
-
-  // Returns the position of the first value whose key is not less than k using
-  // linear search performed using compare-to.
-  template <typename Compare>
-  int linear_search_compare_to(
-      const key_type &k, int s, int e, const Compare &comp) const {
-    while (s < e) {
-      int c = comp(key(s), k);
-      if (c == 0) {
-        return s | kExactMatch;
-      } else if (c > 0) {
-        break;
-      }
-      ++s;
-    }
-    return s;
-  }
-
-  // Returns the position of the first value whose key is not less than k using
-  // binary search performed using plain compare.
-  template <typename Compare>
-  int binary_search_plain_compare(
-      const key_type &k, int s, int e, const Compare &comp) const {
-    while (s != e) {
-      int mid = (s + e) / 2;
-      if (btree_compare_keys(comp, key(mid), k)) {
-        s = mid + 1;
-      } else {
-        e = mid;
-      }
-    }
-    return s;
-  }
-
-  // Returns the position of the first value whose key is not less than k using
-  // binary search performed using compare-to.
-  template <typename CompareTo>
-  int binary_search_compare_to(
-      const key_type &k, int s, int e, const CompareTo &comp) const {
-    while (s != e) {
-      int mid = (s + e) / 2;
-      int c = comp(key(mid), k);
-      if (c < 0) {
-        s = mid + 1;
-      } else if (c > 0) {
-        e = mid;
-      } else {
-        // Need to return the first value whose key is not less than k, which
-        // requires continuing the binary search on [s, mid]. The result is
-        // guaranteed to be an exact match: key(mid) == k, so the first
-        // position whose key is not less than k holds a key equal to k.
-        s = binary_search_compare_to(k, s, mid, comp);
-        return s | kExactMatch;
-      }
-    }
-    return s;
-  }
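-
-  // A sketch of how the compare-to search result above is consumed (editorial
-  // and illustrative; the internal_locate_compare_to() routine declared later
-  // in this header uses the flag in roughly this way). The returned int packs
-  // an exact-match flag into bit 30 alongside the position, so one search
-  // answers both "where" and "was it found":
-  //
-  //   int res = n->binary_search_compare_to(k, 0, n->count(), comp);
-  //   int pos = res & kMatchMask;              // position within the node
-  //   bool exact = (res & kExactMatch) != 0;   // key k was present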
-
-  // Inserts the value x at position i, shifting all existing values and
-  // children at positions >= i to the right by 1.
-  void insert_value(int i, const value_type &x);
-
-  // Removes the value at position i, shifting all existing values and children
-  // at positions > i to the left by 1.
-  void remove_value(int i);
-
-  // Rebalances a node with its right sibling.
-  void rebalance_right_to_left(btree_node *sibling, int to_move);
-  void rebalance_left_to_right(btree_node *sibling, int to_move);
-
-  // Splits a node, moving a portion of the node's values to its right sibling.
-  void split(btree_node *sibling, int insert_position);
-
-  // Merges a node with its right sibling, moving all of the values and the
-  // delimiting key in the parent node onto itself.
-  void merge(btree_node *sibling);
-
-  // Swap the contents of "this" and "src".
-  void swap(btree_node *src);
-
-  // Node allocation/deletion routines.
-  static btree_node* init_leaf(
-      leaf_fields *f, btree_node *parent, int max_count) {
-    btree_node *n = reinterpret_cast<btree_node*>(f);
-    f->leaf = 1;
-    f->position = 0;
-    f->max_count = max_count;
-    f->count = 0;
-    f->parent = parent;
-    if (!NDEBUG) {
-      memset(&f->values, 0, max_count * sizeof(value_type));
-    }
-    return n;
-  }
-  static btree_node* init_internal(internal_fields *f, btree_node *parent) {
-    btree_node *n = init_leaf(f, parent, kNodeValues);
-    f->leaf = 0;
-    if (!NDEBUG) {
-      memset(f->children, 0, sizeof(f->children));
-    }
-    return n;
-  }
-  static btree_node* init_root(root_fields *f, btree_node *parent) {
-    btree_node *n = init_internal(f, parent);
-    f->rightmost = parent;
-    f->size = parent->count();
-    return n;
-  }
-  void destroy() {
-    for (int i = 0; i < count(); ++i) {
-      value_destroy(i);
-    }
-  }
-
- private:
-  void value_init(int i) {
-    new (&fields_.values[i]) mutable_value_type;
-  }
-  void value_init(int i, const value_type &x) {
-    new (&fields_.values[i]) mutable_value_type(x);
-  }
-  void value_destroy(int i) {
-    fields_.values[i].~mutable_value_type();
-  }
-
- private:
-  root_fields fields_;
-
- private:
-  btree_node(const btree_node&);
-  void operator=(const btree_node&);
-};
-
-template <typename Node, typename Reference, typename Pointer>
-struct btree_iterator {
-  typedef typename Node::key_type key_type;
-  typedef typename Node::size_type size_type;
-  typedef typename Node::difference_type difference_type;
-  typedef typename Node::params_type params_type;
-
-  typedef Node node_type;
-  typedef typename std::remove_const<Node>::type normal_node;
-  typedef const Node const_node;
-  typedef typename params_type::value_type value_type;
-  typedef typename params_type::pointer normal_pointer;
-  typedef typename params_type::reference normal_reference;
-  typedef typename params_type::const_pointer const_pointer;
-  typedef typename params_type::const_reference const_reference;
-
-  typedef Pointer pointer;
-  typedef Reference reference;
-  typedef std::bidirectional_iterator_tag iterator_category;
-
-  typedef btree_iterator<
-    normal_node, normal_reference, normal_pointer> iterator;
-  typedef btree_iterator<
-    const_node, const_reference, const_pointer> const_iterator;
-  typedef btree_iterator<Node, Reference, Pointer> self_type;
-
-  btree_iterator()
-      : node(NULL),
-        position(-1) {
-  }
-  btree_iterator(Node *n, int p)
-      : node(n),
-        position(p) {
-  }
-  btree_iterator(const iterator &x)
-      : node(x.node),
-        position(x.position) {
-  }
-
-  // Increment/decrement the iterator.
-  void increment() {
-    if (node->leaf() && ++position < node->count()) {
-      return;
-    }
-    increment_slow();
-  }
-  void increment_by(int count);
-  void increment_slow();
-
-  void decrement() {
-    if (node->leaf() && --position >= 0) {
-      return;
-    }
-    decrement_slow();
-  }
-  void decrement_slow();
-
-  bool operator==(const const_iterator &x) const {
-    return node == x.node && position == x.position;
-  }
-  bool operator!=(const const_iterator &x) const {
-    return node != x.node || position != x.position;
-  }
-
-  // Accessors for the key/value the iterator is pointing at.
-  const key_type& key() const {
-    return node->key(position);
-  }
-  reference operator*() const {
-    return node->value(position);
-  }
-  pointer operator->() const {
-    return &node->value(position);
-  }
-
-  self_type& operator++() {
-    increment();
-    return *this;
-  }
-  self_type& operator--() {
-    decrement();
-    return *this;
-  }
-  self_type operator++(int) {
-    self_type tmp = *this;
-    ++*this;
-    return tmp;
-  }
-  self_type operator--(int) {
-    self_type tmp = *this;
-    --*this;
-    return tmp;
-  }
-
-  // The node in the tree the iterator is pointing at.
-  Node *node;
-  // The position within the node of the tree the iterator is pointing at.
-  int position;
-};
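-
-// An illustrative use of the iterator (editorial sketch): it stores only a
-// (node, position) pair, and increment() stays on the fast inline path while
-// the next value is in the same leaf. Assuming the btree_map wrapper defined
-// in btree_map.h:
-//
-//   btree::btree_map<int, std::string> m;
-//   m.insert(std::make_pair(1, std::string("one")));
-//   for (btree::btree_map<int, std::string>::iterator it = m.begin();
-//        it != m.end(); ++it) {
-//     std::cout << it->first << " -> " << it->second << "\n";
-//   }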
-
-// Dispatch helper class for using btree::internal_locate with plain compare.
-struct btree_internal_locate_plain_compare {
-  template <typename K, typename T, typename Iter>
-  static std::pair<Iter, int> dispatch(const K &k, const T &t, Iter iter) {
-    return t.internal_locate_plain_compare(k, iter);
-  }
-};
-
-// Dispatch helper class for using btree::internal_locate with compare-to.
-struct btree_internal_locate_compare_to {
-  template <typename K, typename T, typename Iter>
-  static std::pair<Iter, int> dispatch(const K &k, const T &t, Iter iter) {
-    return t.internal_locate_compare_to(k, iter);
-  }
-};
-
-template <typename Params>
-class btree : public Params::key_compare {
-  typedef btree<Params> self_type;
-  typedef btree_node<Params> node_type;
-  typedef typename node_type::base_fields base_fields;
-  typedef typename node_type::leaf_fields leaf_fields;
-  typedef typename node_type::internal_fields internal_fields;
-  typedef typename node_type::root_fields root_fields;
-  typedef typename Params::is_key_compare_to is_key_compare_to;
-
-  friend class btree_internal_locate_plain_compare;
-  friend class btree_internal_locate_compare_to;
-  typedef typename if_<
-    is_key_compare_to::value,
-    btree_internal_locate_compare_to,
-    btree_internal_locate_plain_compare>::type internal_locate_type;
-
-  enum {
-    kNodeValues = node_type::kNodeValues,
-    kMinNodeValues = kNodeValues / 2,
-    kValueSize = node_type::kValueSize,
-    kExactMatch = node_type::kExactMatch,
-    kMatchMask = node_type::kMatchMask,
-  };
-
-  // A helper class to get the empty base class optimization for 0-size
-  // allocators. Base is internal_allocator_type
-  // (e.g. empty_base_handle<internal_allocator_type, node_type*>). If Base is
-  // 0-size, the compiler doesn't have to reserve any space for it and
-  // sizeof(empty_base_handle) will simply be sizeof(Data). Google [empty base
-  // class optimization] for more details.
-  template <typename Base, typename Data>
-  struct empty_base_handle : public Base {
-    empty_base_handle(const Base &b, const Data &d)
-        : Base(b),
-          data(d) {
-    }
-    Data data;
-  };
-
-  struct node_stats {
-    node_stats(ssize_t l, ssize_t i)
-        : leaf_nodes(l),
-          internal_nodes(i) {
-    }
-
-    node_stats& operator+=(const node_stats &x) {
-      leaf_nodes += x.leaf_nodes;
-      internal_nodes += x.internal_nodes;
-      return *this;
-    }
-
-    ssize_t leaf_nodes;
-    ssize_t internal_nodes;
-  };
-
- public:
-  typedef Params params_type;
-  typedef typename Params::key_type key_type;
-  typedef typename Params::data_type data_type;
-  typedef typename Params::mapped_type mapped_type;
-  typedef typename Params::value_type value_type;
-  typedef typename Params::key_compare key_compare;
-  typedef typename Params::pointer pointer;
-  typedef typename Params::const_pointer const_pointer;
-  typedef typename Params::reference reference;
-  typedef typename Params::const_reference const_reference;
-  typedef typename Params::size_type size_type;
-  typedef typename Params::difference_type difference_type;
-  typedef btree_iterator<node_type, reference, pointer> iterator;
-  typedef typename iterator::const_iterator const_iterator;
-  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
-  typedef std::reverse_iterator<iterator> reverse_iterator;
-
-  typedef typename Params::allocator_type allocator_type;
-  typedef typename allocator_type::template rebind<char>::other
-    internal_allocator_type;
-
- public:
-  // Default constructor.
-  btree(const key_compare &comp, const allocator_type &alloc);
-
-  // Copy constructor.
-  btree(const self_type &x);
-
-  // Destructor.
-  ~btree() {
-    clear();
-  }
-
-  // Iterator routines.
-  iterator begin() {
-    return iterator(leftmost(), 0);
-  }
-  const_iterator begin() const {
-    return const_iterator(leftmost(), 0);
-  }
-  iterator end() {
-    return iterator(rightmost(), rightmost() ? rightmost()->count() : 0);
-  }
-  const_iterator end() const {
-    return const_iterator(rightmost(), rightmost() ? rightmost()->count() : 0);
-  }
-  reverse_iterator rbegin() {
-    return reverse_iterator(end());
-  }
-  const_reverse_iterator rbegin() const {
-    return const_reverse_iterator(end());
-  }
-  reverse_iterator rend() {
-    return reverse_iterator(begin());
-  }
-  const_reverse_iterator rend() const {
-    return const_reverse_iterator(begin());
-  }
-
-  // Finds the first element whose key is not less than key.
-  iterator lower_bound(const key_type &key) {
-    return internal_end(
-        internal_lower_bound(key, iterator(root(), 0)));
-  }
-  const_iterator lower_bound(const key_type &key) const {
-    return internal_end(
-        internal_lower_bound(key, const_iterator(root(), 0)));
-  }
-
-  // Finds the first element whose key is greater than key.
-  iterator upper_bound(const key_type &key) {
-    return internal_end(
-        internal_upper_bound(key, iterator(root(), 0)));
-  }
-  const_iterator upper_bound(const key_type &key) const {
-    return internal_end(
-        internal_upper_bound(key, const_iterator(root(), 0)));
-  }
-
-  // Finds the range of values which compare equal to key. The first member of
-  // the returned pair is equal to lower_bound(key). The second member of the
-  // pair is equal to upper_bound(key).
-  std::pair<iterator,iterator> equal_range(const key_type &key) {
-    return std::make_pair(lower_bound(key), upper_bound(key));
-  }
-  std::pair<const_iterator,const_iterator> equal_range(const key_type &key) const {
-    return std::make_pair(lower_bound(key), upper_bound(key));
-  }
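-
-  // An illustrative use of equal_range() (editorial sketch, assuming the
-  // btree_multiset wrapper from btree_set.h): counting duplicates of one key.
-  //
-  //   btree::btree_multiset<int> ms;
-  //   ms.insert(42); ms.insert(42); ms.insert(7);
-  //   std::pair<btree::btree_multiset<int>::iterator,
-  //             btree::btree_multiset<int>::iterator> r = ms.equal_range(42);
-  //   size_t n = std::distance(r.first, r.second);  // n == 2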
-
-  // Inserts a value into the btree only if it does not already exist. The
-  // boolean return value indicates whether insertion succeeded or failed. The
-  // ValuePointer type is used to avoid instantiating the value unless the key
-  // is being inserted. Value is not dereferenced if the key already exists in
-  // the btree. See btree_map::operator[].
-  template <typename ValuePointer>
-  std::pair<iterator,bool> insert_unique(const key_type &key, ValuePointer value);
-
-  // Inserts a value into the btree only if it does not already exist. The
-  // boolean return value indicates whether insertion succeeded or failed.
-  std::pair<iterator,bool> insert_unique(const value_type &v) {
-    return insert_unique(params_type::key(v), &v);
-  }
-
-  // Insert with hint. Check to see if the value should be placed immediately
-  // before position in the tree. If so, the insertion will take amortized
-  // constant time. If not, the insertion will take amortized logarithmic time
-  // as if a call to insert_unique(v) were made.
-  iterator insert_unique(iterator position, const value_type &v);
-
-  // Insert a range of values into the btree.
-  template <typename InputIterator>
-  void insert_unique(InputIterator b, InputIterator e);
-
-  // Inserts a value into the btree. The ValuePointer type is used to avoid
-  // instantiating the value unless the key is being inserted. Value is not
-  // dereferenced if the key already exists in the btree. See
-  // btree_map::operator[].
-  template <typename ValuePointer>
-  iterator insert_multi(const key_type &key, ValuePointer value);
-
-  // Inserts a value into the btree.
-  iterator insert_multi(const value_type &v) {
-    return insert_multi(params_type::key(v), &v);
-  }
-
-  // Insert with hint. Check to see if the value should be placed immediately
-  // before position in the tree. If so, the insertion will take amortized
-  // constant time. If not, the insertion will take amortized logarithmic time
-  // as if a call to insert_multi(v) were made.
-  iterator insert_multi(iterator position, const value_type &v);
-
-  // Insert a range of values into the btree.
-  template <typename InputIterator>
-  void insert_multi(InputIterator b, InputIterator e);
-
-  void assign(const self_type &x);
-
-  // Erase the specified iterator from the btree. The iterator must be valid
-  // (i.e. not equal to end()).  Return an iterator pointing to the node after
-  // the one that was erased (or end() if none exists).
-  iterator erase(iterator iter);
-
-  // Erases the range [begin, end). Returns the number of keys erased.
-  int erase(iterator begin, iterator end);
-
-  // Erases the specified key from the btree. Returns 1 if an element was
-  // erased and 0 otherwise.
-  int erase_unique(const key_type &key);
-
-  // Erases all of the entries matching the specified key from the
-  // btree. Returns the number of elements erased.
-  int erase_multi(const key_type &key);
-
-  // Finds the iterator corresponding to a key or returns end() if the key is
-  // not present.
-  iterator find_unique(const key_type &key) {
-    return internal_end(
-        internal_find_unique(key, iterator(root(), 0)));
-  }
-  const_iterator find_unique(const key_type &key) const {
-    return internal_end(
-        internal_find_unique(key, const_iterator(root(), 0)));
-  }
-  iterator find_multi(const key_type &key) {
-    return internal_end(
-        internal_find_multi(key, iterator(root(), 0)));
-  }
-  const_iterator find_multi(const key_type &key) const {
-    return internal_end(
-        internal_find_multi(key, const_iterator(root(), 0)));
-  }
-
-  // Returns a count of the number of times the key appears in the btree.
-  size_type count_unique(const key_type &key) const {
-    const_iterator begin = internal_find_unique(
-        key, const_iterator(root(), 0));
-    if (!begin.node) {
-      // The key doesn't exist in the tree.
-      return 0;
-    }
-    return 1;
-  }
-  // Returns a count of the number of times the key appears in the btree.
-  size_type count_multi(const key_type &key) const {
-    return distance(lower_bound(key), upper_bound(key));
-  }
-
-  // Clear the btree, deleting all of the values it contains.
-  void clear();
-
-  // Swap the contents of *this and x.
-  void swap(self_type &x);
-
-  // Assign the contents of x to *this.
-  self_type& operator=(const self_type &x) {
-    if (&x == this) {
-      // Don't copy onto ourselves.
-      return *this;
-    }
-    assign(x);
-    return *this;
-  }
-
-  key_compare* mutable_key_comp() {
-    return this;
-  }
-  const key_compare& key_comp() const {
-    return *this;
-  }
-  bool compare_keys(const key_type &x, const key_type &y) const {
-    return btree_compare_keys(key_comp(), x, y);
-  }
-
-  // Dump the btree to the specified ostream. Requires that operator<< is
-  // defined for Key and Value.
-  void dump(std::ostream &os) const {
-    if (root() != NULL) {
-      internal_dump(os, root(), 0);
-    }
-  }
-
-  // Verifies the structure of the btree.
-  void verify() const;
-
-  // Size routines. Note that empty() is slightly faster than doing size()==0.
-  size_type size() const {
-    if (empty()) return 0;
-    if (root()->leaf()) return root()->count();
-    return root()->size();
-  }
-  size_type max_size() const { return std::numeric_limits<size_type>::max(); }
-  bool empty() const { return root() == NULL; }
-
-  // The height of the btree. An empty tree will have height 0.
-  size_type height() const {
-    size_type h = 0;
-    if (root()) {
-      // Count the length of the chain from the leftmost node up to the
-      // root. We actually count from the root back around to the level below
-      // the root, but the calculation is the same because of the circularity
-      // of that traversal.
-      const node_type *n = root();
-      do {
-        ++h;
-        n = n->parent();
-      } while (n != root());
-    }
-    return h;
-  }
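-
-  // A worked example of the traversal above for a tree of height 3 (root R,
-  // one internal level I, leftmost leaf L; recall that the root's parent
-  // pointer refers to the leftmost leaf):
-  //
-  //   n = R: h = 1, n = R->parent() = L
-  //   n = L: h = 2, n = L->parent() = I
-  //   n = I: h = 3, n = I->parent() = R  -> loop ends, h == 3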
-
-  // The number of internal, leaf and total nodes used by the btree.
-  size_type leaf_nodes() const {
-    return internal_stats(root()).leaf_nodes;
-  }
-  size_type internal_nodes() const {
-    return internal_stats(root()).internal_nodes;
-  }
-  size_type nodes() const {
-    node_stats stats = internal_stats(root());
-    return stats.leaf_nodes + stats.internal_nodes;
-  }
-
-  // The total number of bytes used by the btree.
-  size_type bytes_used() const {
-    node_stats stats = internal_stats(root());
-    if (stats.leaf_nodes == 1 && stats.internal_nodes == 0) {
-      return sizeof(*this) +
-          sizeof(base_fields) + root()->max_count() * sizeof(value_type);
-    } else {
-      return sizeof(*this) +
-          sizeof(root_fields) - sizeof(internal_fields) +
-          stats.leaf_nodes * sizeof(leaf_fields) +
-          stats.internal_nodes * sizeof(internal_fields);
-    }
-  }
-
-  // The average number of bytes used per value stored in the btree.
-  static double average_bytes_per_value() {
-    // Returns the number of bytes per value on a leaf node that is 75%
-    // full. Experimentally, this matches up nicely with the computed number of
-    // bytes per value in trees that had their values inserted in random order.
-    return sizeof(leaf_fields) / (kNodeValues * 0.75);
-  }
-
-  // The fullness of the btree. Computed as the number of elements in the btree
-  // divided by the maximum number of elements a tree with the current number
-  // of nodes could hold. A value of 1 indicates perfect space
-  // utilization. Smaller values indicate space wastage.
-  double fullness() const {
-    return double(size()) / (nodes() * kNodeValues);
-  }
-  // The overhead of the btree structure in bytes per node. Computed as the
-  // total number of bytes used by the btree minus the number of bytes used for
-  // storing elements divided by the number of elements.
-  double overhead() const {
-    if (empty()) {
-      return 0.0;
-    }
-    return (bytes_used() - size() * kValueSize) / double(size());
-  }
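-
-  // Worked numbers for the two metrics above (purely illustrative): with
-  // kNodeValues = 15, size() = 900 and nodes() = 80,
-  //   fullness() = 900 / (80 * 15.0) = 0.75.
-  // If those 80 nodes occupy bytes_used() = 20480 bytes and kValueSize = 16,
-  //   overhead() = (20480 - 900 * 16) / 900.0, roughly 6.8 bytes per value.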
-
- private:
-  // Internal accessor routines.
-  node_type* root() { return root_.data; }
-  const node_type* root() const { return root_.data; }
-  node_type** mutable_root() { return &root_.data; }
-
-  // The rightmost node is stored in the root node.
-  node_type* rightmost() {
-    return (!root() || root()->leaf()) ? root() : root()->rightmost();
-  }
-  const node_type* rightmost() const {
-    return (!root() || root()->leaf()) ? root() : root()->rightmost();
-  }
-  node_type** mutable_rightmost() { return root()->mutable_rightmost(); }
-
-  // The leftmost node is stored as the parent of the root node.
-  node_type* leftmost() { return root() ? root()->parent() : NULL; }
-  const node_type* leftmost() const { return root() ? root()->parent() : NULL; }
-
-  // The size of the tree is stored in the root node.
-  size_type* mutable_size() { return root()->mutable_size(); }
-
-  // Allocator routines.
-  internal_allocator_type* mutable_internal_allocator() {
-    return static_cast<internal_allocator_type*>(&root_);
-  }
-  const internal_allocator_type& internal_allocator() const {
-    return *static_cast<const internal_allocator_type*>(&root_);
-  }
-
-  // Node creation/deletion routines.
-  node_type* new_internal_node(node_type *parent) {
-    internal_fields *p = reinterpret_cast<internal_fields*>(
-        mutable_internal_allocator()->allocate(sizeof(internal_fields)));
-    return node_type::init_internal(p, parent);
-  }
-  node_type* new_internal_root_node() {
-    root_fields *p = reinterpret_cast<root_fields*>(
-        mutable_internal_allocator()->allocate(sizeof(root_fields)));
-    return node_type::init_root(p, root()->parent());
-  }
-  node_type* new_leaf_node(node_type *parent) {
-    leaf_fields *p = reinterpret_cast<leaf_fields*>(
-        mutable_internal_allocator()->allocate(sizeof(leaf_fields)));
-    return node_type::init_leaf(p, parent, kNodeValues);
-  }
-  node_type* new_leaf_root_node(int max_count) {
-    leaf_fields *p = reinterpret_cast<leaf_fields*>(
-        mutable_internal_allocator()->allocate(
-            sizeof(base_fields) + max_count * sizeof(value_type)));
-    return node_type::init_leaf(p, reinterpret_cast<node_type*>(p), max_count);
-  }
-  void delete_internal_node(node_type *node) {
-    node->destroy();
-    assert(node != root());
-    mutable_internal_allocator()->deallocate(
-        reinterpret_cast<char*>(node), sizeof(internal_fields));
-  }
-  void delete_internal_root_node() {
-    root()->destroy();
-    mutable_internal_allocator()->deallocate(
-        reinterpret_cast<char*>(root()), sizeof(root_fields));
-  }
-  void delete_leaf_node(node_type *node) {
-    node->destroy();
-    mutable_internal_allocator()->deallocate(
-        reinterpret_cast<char*>(node),
-        sizeof(base_fields) + node->max_count() * sizeof(value_type));
-  }
-
-  // Rebalances or splits the node iter points to.
-  void rebalance_or_split(iterator *iter);
-
-  // Merges the values of left, right and the delimiting key on their parent
-  // onto left, removing the delimiting key and deleting right.
-  void merge_nodes(node_type *left, node_type *right);
-
-  // Tries to merge node with its left or right sibling, and failing that,
-  // rebalance with its left or right sibling. Returns true if a merge
-  // occurred, at which point it is no longer valid to access node. Returns
-  // false if no merging took place.
-  bool try_merge_or_rebalance(iterator *iter);
-
-  // Tries to shrink the height of the tree by 1.
-  void try_shrink();
-
-  iterator internal_end(iterator iter) {
-    return iter.node ? iter : end();
-  }
-  const_iterator internal_end(const_iterator iter) const {
-    return iter.node ? iter : end();
-  }
-
-  // Inserts a value into the btree immediately before iter. Requires that
-  // key(v) <= iter.key() and (--iter).key() <= key(v).
-  iterator internal_insert(iterator iter, const value_type &v);
-
-  // Returns an iterator pointing to the first value >= the value "iter" is
-  // pointing at. Note that "iter" might be pointing to an invalid location as
-  // iter.position == iter.node->count(). This routine simply moves iter up in
-  // the tree to a valid location.
-  template <typename IterType>
-  static IterType internal_last(IterType iter);
-
-  // Returns an iterator pointing to the leaf position at which key would
-  // reside in the tree. We provide 2 versions of internal_locate. The first
-  // version (internal_locate_plain_compare) always returns 0 for the second
-  // field of the pair. The second version (internal_locate_compare_to) is for
-  // the key-compare-to specialization and returns either kExactMatch (if the
-  // key was found in the tree) or -kExactMatch (if it wasn't) in the second
-  // field of the pair. The compare_to specialization allows the caller to
-  // avoid a subsequent comparison to determine if an exact match was made,
-  // speeding up string keys.
-  template <typename IterType>
-  std::pair<IterType, int> internal_locate(
-      const key_type &key, IterType iter) const;
-  template <typename IterType>
-  std::pair<IterType, int> internal_locate_plain_compare(
-      const key_type &key, IterType iter) const;
-  template <typename IterType>
-  std::pair<IterType, int> internal_locate_compare_to(
-      const key_type &key, IterType iter) const;
-
-  // Internal routine which implements lower_bound().
-  template <typename IterType>
-  IterType internal_lower_bound(
-      const key_type &key, IterType iter) const;
-
-  // Internal routine which implements upper_bound().
-  template <typename IterType>
-  IterType internal_upper_bound(
-      const key_type &key, IterType iter) const;
-
-  // Internal routine which implements find_unique().
-  template <typename IterType>
-  IterType internal_find_unique(
-      const key_type &key, IterType iter) const;
-
-  // Internal routine which implements find_multi().
-  template <typename IterType>
-  IterType internal_find_multi(
-      const key_type &key, IterType iter) const;
-
-  // Deletes a node and all of its children.
-  void internal_clear(node_type *node);
-
-  // Dumps a node and all of its children to the specified ostream.
-  void internal_dump(std::ostream &os, const node_type *node, int level) const;
-
-  // Verifies the tree structure of node.
-  int internal_verify(const node_type *node,
-                      const key_type *lo, const key_type *hi) const;
-
-  node_stats internal_stats(const node_type *node) const {
-    if (!node) {
-      return node_stats(0, 0);
-    }
-    if (node->leaf()) {
-      return node_stats(1, 0);
-    }
-    node_stats res(0, 1);
-    for (int i = 0; i <= node->count(); ++i) {
-      res += internal_stats(node->child(i));
-    }
-    return res;
-  }
-
- private:
-  empty_base_handle<internal_allocator_type, node_type*> root_;
-
- private:
-  // A never-instantiated helper function that returns big_ if the comparator
-  // returns the expected type (int for a key-compare-to functor, bool for a
-  // plain compare functor) and small_ otherwise.
-  template <typename R>
-  static typename if_<
-   if_<is_key_compare_to::value,
-             std::is_same<R, int>,
-             std::is_same<R, bool> >::type::value,
-   big_, small_>::type key_compare_checker(R);
-
-  // A never instantiated helper function that returns the key comparison
-  // functor.
-  static key_compare key_compare_helper();
-
-  // Verify that key_compare returns a bool. This is similar to the way
-  // is_convertible in base/type_traits.h works. Note that key_compare_checker
-  // is never actually invoked. The compiler will select which
-  // key_compare_checker() to instantiate and then figure out the size of the
-  // return type of key_compare_checker() at compile time which we then check
-  // against the sizeof of big_.
-  COMPILE_ASSERT(
-      sizeof(key_compare_checker(key_compare_helper()(key_type(), key_type()))) ==
-      sizeof(big_),
-      key_comparison_function_must_return_bool);
-
-  // Note: we insist that kNodeValues, which is computed from
-  // Params::kTargetNodeSize, fit in base_fields::field_type.
-  COMPILE_ASSERT(kNodeValues <
-                 (1 << (8 * sizeof(typename base_fields::field_type))),
-                 target_node_size_too_large);
-
-  // Test the assumption made in setting kNodeValueSpace.
-  COMPILE_ASSERT(sizeof(base_fields) >= 2 * sizeof(void*),
-                 node_space_assumption_incorrect);
-};
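-
-// An editorial sketch of the sizeof-based check above (a pre-C++11 idiom;
-// big_ and small_ are defined earlier in this header and are merely assumed
-// here to have distinct sizes). The checker is only named inside sizeof, so
-// it is never called and needs no definition:
-//
-//   template <typename R>
-//   static typename if_<std::is_same<R, bool>::value, big_, small_>::type
-//   checker(R);  // R is deduced from the comparator's return type
-//
-//   COMPILE_ASSERT(sizeof(checker(comp(key, key))) == sizeof(big_),
-//                  comparator_must_return_bool);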
-
-////
-// btree_node methods
-template <typename P>
-inline void btree_node<P>::insert_value(int i, const value_type &x) {
-  assert(i <= count());
-  value_init(count(), x);
-  for (int j = count(); j > i; --j) {
-    value_swap(j, this, j - 1);
-  }
-  set_count(count() + 1);
-
-  if (!leaf()) {
-    ++i;
-    for (int j = count(); j > i; --j) {
-      *mutable_child(j) = child(j - 1);
-      child(j)->set_position(j);
-    }
-    *mutable_child(i) = NULL;
-  }
-}
-
-template <typename P>
-inline void btree_node<P>::remove_value(int i) {
-  if (!leaf()) {
-    assert(child(i + 1)->count() == 0);
-    for (int j = i + 1; j < count(); ++j) {
-      *mutable_child(j) = child(j + 1);
-      child(j)->set_position(j);
-    }
-    *mutable_child(count()) = NULL;
-  }
-
-  set_count(count() - 1);
-  for (; i < count(); ++i) {
-    value_swap(i, this, i + 1);
-  }
-  value_destroy(i);
-}
-
-template <typename P>
-void btree_node<P>::rebalance_right_to_left(btree_node *src, int to_move) {
-  assert(parent() == src->parent());
-  assert(position() + 1 == src->position());
-  assert(src->count() >= count());
-  assert(to_move >= 1);
-  assert(to_move <= src->count());
-
-  // Make room in the left node for the new values.
-  for (int i = 0; i < to_move; ++i) {
-    value_init(i + count());
-  }
-
-  // Move the delimiting value to the left node and the new delimiting value
-  // from the right node.
-  value_swap(count(), parent(), position());
-  parent()->value_swap(position(), src, to_move - 1);
-
-  // Move the values from the right to the left node.
-  for (int i = 1; i < to_move; ++i) {
-    value_swap(count() + i, src, i - 1);
-  }
-  // Shift the values in the right node to their correct position.
-  for (int i = to_move; i < src->count(); ++i) {
-    src->value_swap(i - to_move, src, i);
-  }
-  for (int i = 1; i <= to_move; ++i) {
-    src->value_destroy(src->count() - i);
-  }
-
-  if (!leaf()) {
-    // Move the child pointers from the right to the left node.
-    for (int i = 0; i < to_move; ++i) {
-      set_child(1 + count() + i, src->child(i));
-    }
-    for (int i = 0; i <= src->count() - to_move; ++i) {
-      assert(i + to_move <= src->max_count());
-      src->set_child(i, src->child(i + to_move));
-      *src->mutable_child(i + to_move) = NULL;
-    }
-  }
-
-  // Fixup the counts on the src and dest nodes.
-  set_count(count() + to_move);
-  src->set_count(src->count() - to_move);
-}
-
-template <typename P>
-void btree_node<P>::rebalance_left_to_right(btree_node *dest, int to_move) {
-  assert(parent() == dest->parent());
-  assert(position() + 1 == dest->position());
-  assert(count() >= dest->count());
-  assert(to_move >= 1);
-  assert(to_move <= count());
-
-  // Make room in the right node for the new values.
-  for (int i = 0; i < to_move; ++i) {
-    dest->value_init(i + dest->count());
-  }
-  for (int i = dest->count() - 1; i >= 0; --i) {
-    dest->value_swap(i, dest, i + to_move);
-  }
-
-  // Move the delimiting value to the right node and the new delimiting value
-  // from the left node.
-  dest->value_swap(to_move - 1, parent(), position());
-  parent()->value_swap(position(), this, count() - to_move);
-  value_destroy(count() - to_move);
-
-  // Move the values from the left to the right node.
-  for (int i = 1; i < to_move; ++i) {
-    value_swap(count() - to_move + i, dest, i - 1);
-    value_destroy(count() - to_move + i);
-  }
-
-  if (!leaf()) {
-    // Move the child pointers from the left to the right node.
-    for (int i = dest->count(); i >= 0; --i) {
-      dest->set_child(i + to_move, dest->child(i));
-      *dest->mutable_child(i) = NULL;
-    }
-    for (int i = 1; i <= to_move; ++i) {
-      dest->set_child(i - 1, child(count() - to_move + i));
-      *mutable_child(count() - to_move + i) = NULL;
-    }
-  }
-
-  // Fixup the counts on the src and dest nodes.
-  set_count(count() - to_move);
-  dest->set_count(dest->count() + to_move);
-}
-
-template <typename P>
-void btree_node<P>::split(btree_node *dest, int insert_position) {
-  assert(dest->count() == 0);
-
-  // We bias the split based on the position being inserted. If we're
-  // inserting at the beginning of the left node then bias the split to put
-  // more values on the right node. If we're inserting at the end of the
-  // right node then bias the split to put more values on the left node.
-  if (insert_position == 0) {
-    dest->set_count(count() - 1);
-  } else if (insert_position == max_count()) {
-    dest->set_count(0);
-  } else {
-    dest->set_count(count() / 2);
-  }
-  set_count(count() - dest->count());
-  assert(count() >= 1);
-
-  // Move values from the left sibling to the right sibling.
-  for (int i = 0; i < dest->count(); ++i) {
-    dest->value_init(i);
-    value_swap(count() + i, dest, i);
-    value_destroy(count() + i);
-  }
-
-  // The split key is the largest value in the left sibling.
-  set_count(count() - 1);
-  parent()->insert_value(position(), value_type());
-  value_swap(count(), parent(), position());
-  value_destroy(count());
-  parent()->set_child(position() + 1, dest);
-
-  if (!leaf()) {
-    for (int i = 0; i <= dest->count(); ++i) {
-      assert(child(count() + i + 1) != NULL);
-      dest->set_child(i, child(count() + i + 1));
-      *mutable_child(count() + i + 1) = NULL;
-    }
-  }
-}
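-
-// A worked example of the split bias above for a full 15-value leaf
-// (kNodeValues = 15; numbers are illustrative):
-//   insert_position == 8 (middle): dest->count = 15 / 2 = 7, the left node
-//     keeps 8, then the largest left value moves up to the parent as the
-//     delimiter, giving a 7 / 7 split.
-//   insert_position == 0 (front):  dest->count = 14, the left node keeps 1,
-//     that lone value becomes the delimiter, and the incoming value lands in
-//     the now-empty left node.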
-
-template <typename P>
-void btree_node<P>::merge(btree_node *src) {
-  assert(parent() == src->parent());
-  assert(position() + 1 == src->position());
-
-  // Move the delimiting value to the left node.
-  value_init(count());
-  value_swap(count(), parent(), position());
-
-  // Move the values from the right to the left node.
-  for (int i = 0; i < src->count(); ++i) {
-    value_init(1 + count() + i);
-    value_swap(1 + count() + i, src, i);
-    src->value_destroy(i);
-  }
-
-  if (!leaf()) {
-    // Move the child pointers from the right to the left node.
-    for (int i = 0; i <= src->count(); ++i) {
-      set_child(1 + count() + i, src->child(i));
-      *src->mutable_child(i) = NULL;
-    }
-  }
-
-  // Fixup the counts on the src and dest nodes.
-  set_count(1 + count() + src->count());
-  src->set_count(0);
-
-  // Remove the value on the parent node.
-  parent()->remove_value(position());
-}
-
-template <typename P>
-void btree_node<P>::swap(btree_node *x) {
-  assert(leaf() == x->leaf());
-
-  // Swap the values.
-  for (int i = count(); i < x->count(); ++i) {
-    value_init(i);
-  }
-  for (int i = x->count(); i < count(); ++i) {
-    x->value_init(i);
-  }
-  int n = std::max(count(), x->count());
-  for (int i = 0; i < n; ++i) {
-    value_swap(i, x, i);
-  }
-  for (int i = count(); i < x->count(); ++i) {
-    x->value_destroy(i);
-  }
-  for (int i = x->count(); i < count(); ++i) {
-    value_destroy(i);
-  }
-
-  if (!leaf()) {
-    // Swap the child pointers.
-    for (int i = 0; i <= n; ++i) {
-      btree_swap_helper(*mutable_child(i), *x->mutable_child(i));
-    }
-    for (int i = 0; i <= count(); ++i) {
-      x->child(i)->fields_.parent = x;
-    }
-    for (int i = 0; i <= x->count(); ++i) {
-      child(i)->fields_.parent = this;
-    }
-  }
-
-  // Swap the counts.
-  btree_swap_helper(fields_.count, x->fields_.count);
-}
-
-////
-// btree_iterator methods
-template <typename N, typename R, typename P>
-void btree_iterator<N, R, P>::increment_slow() {
-  if (node->leaf()) {
-    assert(position >= node->count());
-    self_type save(*this);
-    while (position == node->count() && !node->is_root()) {
-      assert(node->parent()->child(node->position()) == node);
-      position = node->position();
-      node = node->parent();
-    }
-    if (position == node->count()) {
-      *this = save;
-    }
-  } else {
-    assert(position < node->count());
-    node = node->child(position + 1);
-    while (!node->leaf()) {
-      node = node->child(0);
-    }
-    position = 0;
-  }
-}
-
-template <typename N, typename R, typename P>
-void btree_iterator<N, R, P>::increment_by(int count) {
-  while (count > 0) {
-    if (node->leaf()) {
-      int rest = node->count() - position;
-      position += std::min(rest, count);
-      count = count - rest;
-      if (position < node->count()) {
-        return;
-      }
-    } else {
-      --count;
-    }
-    increment_slow();
-  }
-}
-
-template <typename N, typename R, typename P>
-void btree_iterator<N, R, P>::decrement_slow() {
-  if (node->leaf()) {
-    assert(position <= -1);
-    self_type save(*this);
-    while (position < 0 && !node->is_root()) {
-      assert(node->parent()->child(node->position()) == node);
-      position = node->position() - 1;
-      node = node->parent();
-    }
-    if (position < 0) {
-      *this = save;
-    }
-  } else {
-    assert(position >= 0);
-    node = node->child(position);
-    while (!node->leaf()) {
-      node = node->child(node->count());
-    }
-    position = node->count() - 1;
-  }
-}
-
-////
-// btree methods
-template <typename P>
-btree<P>::btree(const key_compare &comp, const allocator_type &alloc)
-    : key_compare(comp),
-      root_(alloc, NULL) {
-}
-
-template <typename P>
-btree<P>::btree(const self_type &x)
-    : key_compare(x.key_comp()),
-      root_(x.internal_allocator(), NULL) {
-  assign(x);
-}
-
-template <typename P> template <typename ValuePointer>
-std::pair<typename btree<P>::iterator, bool>
-btree<P>::insert_unique(const key_type &key, ValuePointer value) {
-  if (empty()) {
-    *mutable_root() = new_leaf_root_node(1);
-  }
-
-  std::pair<iterator, int> res = internal_locate(key, iterator(root(), 0));
-  iterator &iter = res.first;
-  if (res.second == kExactMatch) {
-    // The key already exists in the tree, do nothing.
-    return std::make_pair(internal_last(iter), false);
-  } else if (!res.second) {
-    iterator last = internal_last(iter);
-    if (last.node && !compare_keys(key, last.key())) {
-      // The key already exists in the tree, do nothing.
-      return std::make_pair(last, false);
-    }
-  }
-
-  return std::make_pair(internal_insert(iter, *value), true);
-}
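-
-// An illustrative use of the unique-insert path above (editorial sketch,
-// assuming the btree_map wrapper from btree_map.h, whose insert() forwards
-// here). With a compare-to comparator, the kExactMatch flag returned by
-// internal_locate() answers "already present?" with no extra key comparison:
-//
-//   btree::btree_map<std::string, int> m;
-//   bool a = m.insert(std::make_pair(std::string("k"), 1)).second; // true
-//   bool b = m.insert(std::make_pair(std::string("k"), 2)).second; // false,
-//                                                 // mapped value stays 1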
-
-template <typename P>
-inline typename btree<P>::iterator
-btree<P>::insert_unique(iterator position, const value_type &v) {
-  if (!empty()) {
-    const key_type &key = params_type::key(v);
-    if (position == end() || compare_keys(key, position.key())) {
-      iterator prev = position;
-      if (position == begin() || compare_keys((--prev).key(), key)) {
-        // prev.key() < key < position.key()
-        return internal_insert(position, v);
-      }
-    } else if (compare_keys(position.key(), key)) {
-      iterator next = position;
-      ++next;
-      if (next == end() || compare_keys(key, next.key())) {
-        // position.key() < key < next.key()
-        return internal_insert(next, v);
-      }
-    } else {
-      // position.key() == key
-      return position;
-    }
-  }
-  return insert_unique(v).first;
-}
-
-template <typename P> template <typename InputIterator>
-void btree<P>::insert_unique(InputIterator b, InputIterator e) {
-  for (; b != e; ++b) {
-    insert_unique(end(), *b);
-  }
-}
-
-template <typename P> template <typename ValuePointer>
-typename btree<P>::iterator
-btree<P>::insert_multi(const key_type &key, ValuePointer value) {
-  if (empty()) {
-    *mutable_root() = new_leaf_root_node(1);
-  }
-
-  iterator iter = internal_upper_bound(key, iterator(root(), 0));
-  if (!iter.node) {
-    iter = end();
-  }
-  return internal_insert(iter, *value);
-}
-
-template <typename P>
-typename btree<P>::iterator
-btree<P>::insert_multi(iterator position, const value_type &v) {
-  if (!empty()) {
-    const key_type &key = params_type::key(v);
-    if (position == end() || !compare_keys(position.key(), key)) {
-      iterator prev = position;
-      if (position == begin() || !compare_keys(key, (--prev).key())) {
-        // prev.key() <= key <= position.key()
-        return internal_insert(position, v);
-      }
-    } else {
-      iterator next = position;
-      ++next;
-      if (next == end() || !compare_keys(next.key(), key)) {
-        // position.key() < key <= next.key()
-        return internal_insert(next, v);
-      }
-    }
-  }
-  return insert_multi(v);
-}
-
-template <typename P> template <typename InputIterator>
-void btree<P>::insert_multi(InputIterator b, InputIterator e) {
-  for (; b != e; ++b) {
-    insert_multi(end(), *b);
-  }
-}
-
-template <typename P>
-void btree<P>::assign(const self_type &x) {
-  clear();
-
-  *mutable_key_comp() = x.key_comp();
-  *mutable_internal_allocator() = x.internal_allocator();
-
-  // Assignment can avoid key comparisons because we know the order of the
-  // values is the same order we'll store them in.
-  for (const_iterator iter = x.begin(); iter != x.end(); ++iter) {
-    if (empty()) {
-      insert_multi(*iter);
-    } else {
-      // If the btree is not empty, we can just insert the new value at the end
-      // of the tree!
-      internal_insert(end(), *iter);
-    }
-  }
-}
-
-template <typename P>
-typename btree<P>::iterator btree<P>::erase(iterator iter) {
-  bool internal_delete = false;
-  if (!iter.node->leaf()) {
-    // Deletion of a value on an internal node. Swap the key with the largest
-    // value in our left subtree. This is easy: we just decrement iter.
-    iterator tmp_iter(iter--);
-    assert(iter.node->leaf());
-    assert(!compare_keys(tmp_iter.key(), iter.key()));
-    iter.node->value_swap(iter.position, tmp_iter.node, tmp_iter.position);
-    internal_delete = true;
-    --*mutable_size();
-  } else if (!root()->leaf()) {
-    --*mutable_size();
-  }
-
-  // Delete the key from the leaf.
-  iter.node->remove_value(iter.position);
-
-  // We want to return the next value after the one we just erased. If we
-  // erased from an internal node (internal_delete == true), then the next
-  // value is ++(++iter). If we erased from a leaf node (internal_delete ==
-  // false) then the next value is ++iter. Note that ++iter may point to an
-  // internal node and the value in the internal node may move to a leaf node
-  // (iter.node) when rebalancing is performed at the leaf level.
-
-  // Merge/rebalance as we walk back up the tree.
-  iterator res(iter);
-  for (;;) {
-    if (iter.node == root()) {
-      try_shrink();
-      if (empty()) {
-        return end();
-      }
-      break;
-    }
-    if (iter.node->count() >= kMinNodeValues) {
-      break;
-    }
-    bool merged = try_merge_or_rebalance(&iter);
-    if (iter.node->leaf()) {
-      res = iter;
-    }
-    if (!merged) {
-      break;
-    }
-    iter.node = iter.node->parent();
-  }
-
-  // Adjust our return value. If we're pointing at the end of a node, advance
-  // the iterator.
-  if (res.position == res.node->count()) {
-    res.position = res.node->count() - 1;
-    ++res;
-  }
-  // If we erased from an internal node, advance the iterator.
-  if (internal_delete) {
-    ++res;
-  }
-  return res;
-}
-
-template <typename P>
-int btree<P>::erase(iterator begin, iterator end) {
-  int count = distance(begin, end);
-  for (int i = 0; i < count; i++) {
-    begin = erase(begin);
-  }
-  return count;
-}
-
-template <typename P>
-int btree<P>::erase_unique(const key_type &key) {
-  iterator iter = internal_find_unique(key, iterator(root(), 0));
-  if (!iter.node) {
-    // The key doesn't exist in the tree; there is nothing to erase.
-    return 0;
-  }
-  erase(iter);
-  return 1;
-}
-
-template <typename P>
-int btree<P>::erase_multi(const key_type &key) {
-  iterator begin = internal_lower_bound(key, iterator(root(), 0));
-  if (!begin.node) {
-    // The key doesn't exist in the tree; there is nothing to erase.
-    return 0;
-  }
-  // Delete all of the keys between begin and upper_bound(key).
-  iterator end = internal_end(
-      internal_upper_bound(key, iterator(root(), 0)));
-  return erase(begin, end);
-}
-
-template <typename P>
-void btree<P>::clear() {
-  if (root() != NULL) {
-    internal_clear(root());
-  }
-  *mutable_root() = NULL;
-}
-
-template <typename P>
-void btree<P>::swap(self_type &x) {
-  std::swap(static_cast<key_compare&>(*this), static_cast<key_compare&>(x));
-  std::swap(root_, x.root_);
-}
-
-template <typename P>
-void btree<P>::verify() const {
-  if (root() != NULL) {
-    assert(size() == internal_verify(root(), NULL, NULL));
-    assert(leftmost() == (++const_iterator(root(), -1)).node);
-    assert(rightmost() == (--const_iterator(root(), root()->count())).node);
-    assert(leftmost()->leaf());
-    assert(rightmost()->leaf());
-  } else {
-    assert(size() == 0);
-    assert(leftmost() == NULL);
-    assert(rightmost() == NULL);
-  }
-}
-
-template <typename P>
-void btree<P>::rebalance_or_split(iterator *iter) {
-  node_type *&node = iter->node;
-  int &insert_position = iter->position;
-  assert(node->count() == node->max_count());
-
-  // First try to make room on the node by rebalancing.
-  node_type *parent = node->parent();
-  if (node != root()) {
-    if (node->position() > 0) {
-      // Try rebalancing with our left sibling.
-      node_type *left = parent->child(node->position() - 1);
-      if (left->count() < left->max_count()) {
-        // We bias rebalancing based on the position being inserted. If we're
-        // inserting at the end of the right node then we bias rebalancing to
-        // fill up the left node.
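-        // For instance, with max_count() == 15 and left->count() == 7 there
-        // are 8 free slots: an insert at the very end of this node
-        // (insert_position == max_count()) moves all 8 to the left sibling,
-        // while any interior position splits the slack and moves 4.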
-        int to_move = (left->max_count() - left->count()) /
-            (1 + (insert_position < left->max_count()));
-        to_move = std::max(1, to_move);
-
-        if (((insert_position - to_move) >= 0) ||
-            ((left->count() + to_move) < left->max_count())) {
-          left->rebalance_right_to_left(node, to_move);
-
-          assert(node->max_count() - node->count() == to_move);
-          insert_position = insert_position - to_move;
-          if (insert_position < 0) {
-            insert_position = insert_position + left->count() + 1;
-            node = left;
-          }
-
-          assert(node->count() < node->max_count());
-          return;
-        }
-      }
-    }
-
-    if (node->position() < parent->count()) {
-      // Try rebalancing with our right sibling.
-      node_type *right = parent->child(node->position() + 1);
-      if (right->count() < right->max_count()) {
-        // We bias rebalancing based on the position being inserted. If we're
-        // inserting at the beginning of the left node then we bias rebalancing
-        // to fill up the right node.
-        int to_move = (right->max_count() - right->count()) /
-            (1 + (insert_position > 0));
-        to_move = std::max(1, to_move);
-
-        if ((insert_position <= (node->count() - to_move)) ||
-            ((right->count() + to_move) < right->max_count())) {
-          node->rebalance_left_to_right(right, to_move);
-
-          if (insert_position > node->count()) {
-            insert_position = insert_position - node->count() - 1;
-            node = right;
-          }
-
-          assert(node->count() < node->max_count());
-          return;
-        }
-      }
-    }
-
-    // Rebalancing failed, make sure there is room on the parent node for a new
-    // value.
-    if (parent->count() == parent->max_count()) {
-      iterator parent_iter(node->parent(), node->position());
-      rebalance_or_split(&parent_iter);
-    }
-  } else {
-    // Rebalancing not possible because this is the root node.
-    if (root()->leaf()) {
-      // The root node is currently a leaf node: create a new root node and set
-      // the current root node as the child of the new root.
-      parent = new_internal_root_node();
-      parent->set_child(0, root());
-      *mutable_root() = parent;
-      assert(*mutable_rightmost() == parent->child(0));
-    } else {
-      // The root node is an internal node. We do not want to create a new root
-      // node because the root node is special and holds the size of the tree
-      // and a pointer to the rightmost node. So we create a new internal node
-      // and move all of the items on the current root into the new node.
-      parent = new_internal_node(parent);
-      parent->set_child(0, parent);
-      parent->swap(root());
-      node = parent;
-    }
-  }
-
-  // Split the node.
-  node_type *split_node;
-  if (node->leaf()) {
-    split_node = new_leaf_node(parent);
-    node->split(split_node, insert_position);
-    if (rightmost() == node) {
-      *mutable_rightmost() = split_node;
-    }
-  } else {
-    split_node = new_internal_node(parent);
-    node->split(split_node, insert_position);
-  }
-
-  if (insert_position > node->count()) {
-    insert_position = insert_position - node->count() - 1;
-    node = split_node;
-  }
-}
-
-template <typename P>
-void btree<P>::merge_nodes(node_type *left, node_type *right) {
-  left->merge(right);
-  if (right->leaf()) {
-    if (rightmost() == right) {
-      *mutable_rightmost() = left;
-    }
-    delete_leaf_node(right);
-  } else {
-    delete_internal_node(right);
-  }
-}
-
-template <typename P>
-bool btree<P>::try_merge_or_rebalance(iterator *iter) {
-  node_type *parent = iter->node->parent();
-  if (iter->node->position() > 0) {
-    // Try merging with our left sibling.
-    node_type *left = parent->child(iter->node->position() - 1);
-    if ((1 + left->count() + iter->node->count()) <= left->max_count()) {
-      iter->position += 1 + left->count();
-      merge_nodes(left, iter->node);
-      iter->node = left;
-      return true;
-    }
-  }
-  if (iter->node->position() < parent->count()) {
-    // Try merging with our right sibling.
-    node_type *right = parent->child(iter->node->position() + 1);
-    if ((1 + iter->node->count() + right->count()) <= right->max_count()) {
-      merge_nodes(iter->node, right);
-      return true;
-    }
-    // Try rebalancing with our right sibling. We don't perform rebalancing if
-    // we deleted the first element from iter->node and the node is not
-    // empty. This is a small optimization for the common pattern of deleting
-    // from the front of the tree.
-    if ((right->count() > kMinNodeValues) &&
-        ((iter->node->count() == 0) ||
-         (iter->position > 0))) {
-      int to_move = (right->count() - iter->node->count()) / 2;
-      to_move = std::min(to_move, right->count() - 1);
-      iter->node->rebalance_right_to_left(right, to_move);
-      return false;
-    }
-  }
-  if (iter->node->position() > 0) {
-    // Try rebalancing with our left sibling. We don't perform rebalancing if
-    // we deleted the last element from iter->node and the node is not
-    // empty. This is a small optimization for the common pattern of deleting
-    // from the back of the tree.
-    node_type *left = parent->child(iter->node->position() - 1);
-    if ((left->count() > kMinNodeValues) &&
-        ((iter->node->count() == 0) ||
-         (iter->position < iter->node->count()))) {
-      int to_move = (left->count() - iter->node->count()) / 2;
-      to_move = std::min(to_move, left->count() - 1);
-      left->rebalance_left_to_right(iter->node, to_move);
-      iter->position += to_move;
-      return false;
-    }
-  }
-  return false;
-}
-
-template <typename P>
-void btree<P>::try_shrink() {
-  if (root()->count() > 0) {
-    return;
-  }
-  // Deleted the last item on the root node, shrink the height of the tree.
-  if (root()->leaf()) {
-    assert(size() == 0);
-    delete_leaf_node(root());
-    *mutable_root() = NULL;
-  } else {
-    node_type *child = root()->child(0);
-    if (child->leaf()) {
-      // The child is a leaf node so simply make it the root node in the tree.
-      child->make_root();
-      delete_internal_root_node();
-      *mutable_root() = child;
-    } else {
-      // The child is an internal node. We want to keep the existing root node
-      // so we move all of the values from the child node into the existing
-      // (empty) root node.
-      child->swap(root());
-      delete_internal_node(child);
-    }
-  }
-}
-
-template <typename P> template <typename IterType>
-inline IterType btree<P>::internal_last(IterType iter) {
-  while (iter.node && iter.position == iter.node->count()) {
-    iter.position = iter.node->position();
-    iter.node = iter.node->parent();
-    if (iter.node->leaf()) {
-      iter.node = NULL;
-    }
-  }
-  return iter;
-}
-
-template <typename P>
-inline typename btree<P>::iterator
-btree<P>::internal_insert(iterator iter, const value_type &v) {
-  if (!iter.node->leaf()) {
-    // We can't insert on an internal node. Instead, we'll insert after the
-    // previous value which is guaranteed to be on a leaf node.
-    --iter;
-    ++iter.position;
-  }
-  if (iter.node->count() == iter.node->max_count()) {
-    // Make room in the leaf for the new item.
-    if (iter.node->max_count() < kNodeValues) {
-      // Insertion into the root where the root is smaller than the full node
-      // size. Simply grow the size of the root node.
-      assert(iter.node == root());
-      iter.node = new_leaf_root_node(
-          std::min<int>(kNodeValues, 2 * iter.node->max_count()));
-      iter.node->swap(root());
-      delete_leaf_node(root());
-      *mutable_root() = iter.node;
-    } else {
-      rebalance_or_split(&iter);
-      ++*mutable_size();
-    }
-  } else if (!root()->leaf()) {
-    ++*mutable_size();
-  }
-  iter.node->insert_value(iter.position, v);
-  return iter;
-}
-
-template <typename P> template <typename IterType>
-inline std::pair<IterType, int> btree<P>::internal_locate(
-    const key_type &key, IterType iter) const {
-  return internal_locate_type::dispatch(key, *this, iter);
-}
-
-template <typename P> template <typename IterType>
-inline std::pair<IterType, int> btree<P>::internal_locate_plain_compare(
-    const key_type &key, IterType iter) const {
-  for (;;) {
-    iter.position = iter.node->lower_bound(key, key_comp());
-    if (iter.node->leaf()) {
-      break;
-    }
-    iter.node = iter.node->child(iter.position);
-  }
-  return std::make_pair(iter, 0);
-}
-
-template <typename P> template <typename IterType>
-inline std::pair<IterType, int> btree<P>::internal_locate_compare_to(
-    const key_type &key, IterType iter) const {
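-  // With a compare-to (three-way) comparator, lower_bound() packs an
-  // exact-match flag into its result: kMatchMask recovers the child/value
-  // position, kExactMatch tells whether the key was found at this level.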
-  for (;;) {
-    int res = iter.node->lower_bound(key, key_comp());
-    iter.position = res & kMatchMask;
-    if (res & kExactMatch) {
-      return std::make_pair(iter, static_cast<int>(kExactMatch));
-    }
-    if (iter.node->leaf()) {
-      break;
-    }
-    iter.node = iter.node->child(iter.position);
-  }
-  return std::make_pair(iter, -kExactMatch);
-}
-
-template <typename P> template <typename IterType>
-IterType btree<P>::internal_lower_bound(
-    const key_type &key, IterType iter) const {
-  if (iter.node) {
-    for (;;) {
-      iter.position =
-          iter.node->lower_bound(key, key_comp()) & kMatchMask;
-      if (iter.node->leaf()) {
-        break;
-      }
-      iter.node = iter.node->child(iter.position);
-    }
-    iter = internal_last(iter);
-  }
-  return iter;
-}
-
-template <typename P> template <typename IterType>
-IterType btree<P>::internal_upper_bound(
-    const key_type &key, IterType iter) const {
-  if (iter.node) {
-    for (;;) {
-      iter.position = iter.node->upper_bound(key, key_comp());
-      if (iter.node->leaf()) {
-        break;
-      }
-      iter.node = iter.node->child(iter.position);
-    }
-    iter = internal_last(iter);
-  }
-  return iter;
-}
-
-template <typename P> template <typename IterType>
-IterType btree<P>::internal_find_unique(
-    const key_type &key, IterType iter) const {
-  if (iter.node) {
-    std::pair<IterType, int> res = internal_locate(key, iter);
-    if (res.second == kExactMatch) {
-      return res.first;
-    }
-    if (!res.second) {
-      iter = internal_last(res.first);
-      if (iter.node && !compare_keys(key, iter.key())) {
-        return iter;
-      }
-    }
-  }
-  return IterType(NULL, 0);
-}
-
-template <typename P> template <typename IterType>
-IterType btree<P>::internal_find_multi(
-    const key_type &key, IterType iter) const {
-  if (iter.node) {
-    iter = internal_lower_bound(key, iter);
-    if (iter.node) {
-      iter = internal_last(iter);
-      if (iter.node && !compare_keys(key, iter.key())) {
-        return iter;
-      }
-    }
-  }
-  return IterType(NULL, 0);
-}
-
-template <typename P>
-void btree<P>::internal_clear(node_type *node) {
-  if (!node->leaf()) {
-    for (int i = 0; i <= node->count(); ++i) {
-      internal_clear(node->child(i));
-    }
-    if (node == root()) {
-      delete_internal_root_node();
-    } else {
-      delete_internal_node(node);
-    }
-  } else {
-    delete_leaf_node(node);
-  }
-}
-
-template <typename P>
-void btree<P>::internal_dump(
-    std::ostream &os, const node_type *node, int level) const {
-  for (int i = 0; i < node->count(); ++i) {
-    if (!node->leaf()) {
-      internal_dump(os, node->child(i), level + 1);
-    }
-    for (int j = 0; j < level; ++j) {
-      os << "  ";
-    }
-    os << node->key(i) << " [" << level << "]\n";
-  }
-  if (!node->leaf()) {
-    internal_dump(os, node->child(node->count()), level + 1);
-  }
-}
-
-template <typename P>
-int btree<P>::internal_verify(
-    const node_type *node, const key_type *lo, const key_type *hi) const {
-  assert(node->count() > 0);
-  assert(node->count() <= node->max_count());
-  if (lo) {
-    assert(!compare_keys(node->key(0), *lo));
-  }
-  if (hi) {
-    assert(!compare_keys(*hi, node->key(node->count() - 1)));
-  }
-  for (int i = 1; i < node->count(); ++i) {
-    assert(!compare_keys(node->key(i), node->key(i - 1)));
-  }
-  int count = node->count();
-  if (!node->leaf()) {
-    for (int i = 0; i <= node->count(); ++i) {
-      assert(node->child(i) != NULL);
-      assert(node->child(i)->parent() == node);
-      assert(node->child(i)->position() == i);
-      count += internal_verify(
-          node->child(i),
-          (i == 0) ? lo : &node->key(i - 1),
-          (i == node->count()) ? hi : &node->key(i));
-    }
-  }
-  return count;
-}
-
-} // namespace btree
-
-#endif  // UTIL_BTREE_BTREE_H__
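
A note on the erase path above before it disappears: because erase() can
merge or rebalance nodes, the element an old iterator pointed at may move,
and the only safe way to continue a scan is from the iterator erase()
returns. A minimal, hypothetical sketch against the btree_map wrapper that
is removed further down in this patch:

    #include <cstdio>
    #include "btree_map.h"

    int main() {
      btree::btree_map<int, int> m;
      for (int i = 0; i < 100; ++i) m[i] = i * i;

      // Erase every even key; resume from the returned iterator, since
      // erase() may have rebalanced the leaf that 'it' pointed into.
      for (btree::btree_map<int, int>::iterator it = m.begin(); it != m.end();) {
        if (it->first % 2 == 0)
          it = m.erase(it);
        else
          ++it;
      }
      std::printf("%lu entries remain\n", (unsigned long)m.size());
      return 0;
    }
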
diff --git a/include/btree_container.h b/include/btree_container.h
deleted file mode 100755
index c9c62c3..0000000
--- a/include/btree_container.h
+++ /dev/null
@@ -1,350 +0,0 @@
-// Copyright 2013 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef UTIL_BTREE_BTREE_CONTAINER_H__
-#define UTIL_BTREE_BTREE_CONTAINER_H__
-
-#include <iosfwd>
-#include <utility>
-
-#include "btree.h"
-
-namespace btree {
-
-// A common base class for btree_set, btree_map, btree_multiset and
-// btree_multimap.
-template <typename Tree>
-class btree_container {
-  typedef btree_container<Tree> self_type;
-
- public:
-  typedef typename Tree::params_type params_type;
-  typedef typename Tree::key_type key_type;
-  typedef typename Tree::value_type value_type;
-  typedef typename Tree::key_compare key_compare;
-  typedef typename Tree::allocator_type allocator_type;
-  typedef typename Tree::pointer pointer;
-  typedef typename Tree::const_pointer const_pointer;
-  typedef typename Tree::reference reference;
-  typedef typename Tree::const_reference const_reference;
-  typedef typename Tree::size_type size_type;
-  typedef typename Tree::difference_type difference_type;
-  typedef typename Tree::iterator iterator;
-  typedef typename Tree::const_iterator const_iterator;
-  typedef typename Tree::reverse_iterator reverse_iterator;
-  typedef typename Tree::const_reverse_iterator const_reverse_iterator;
-
- public:
-  // Default constructor.
-  btree_container(const key_compare &comp, const allocator_type &alloc)
-      : tree_(comp, alloc) {
-  }
-
-  // Copy constructor.
-  btree_container(const self_type &x)
-      : tree_(x.tree_) {
-  }
-
-  // Iterator routines.
-  iterator begin() { return tree_.begin(); }
-  const_iterator begin() const { return tree_.begin(); }
-  iterator end() { return tree_.end(); }
-  const_iterator end() const { return tree_.end(); }
-  reverse_iterator rbegin() { return tree_.rbegin(); }
-  const_reverse_iterator rbegin() const { return tree_.rbegin(); }
-  reverse_iterator rend() { return tree_.rend(); }
-  const_reverse_iterator rend() const { return tree_.rend(); }
-
-  // Lookup routines.
-  iterator lower_bound(const key_type &key) {
-    return tree_.lower_bound(key);
-  }
-  const_iterator lower_bound(const key_type &key) const {
-    return tree_.lower_bound(key);
-  }
-  iterator upper_bound(const key_type &key) {
-    return tree_.upper_bound(key);
-  }
-  const_iterator upper_bound(const key_type &key) const {
-    return tree_.upper_bound(key);
-  }
-  std::pair<iterator,iterator> equal_range(const key_type &key) {
-    return tree_.equal_range(key);
-  }
-  std::pair<const_iterator,const_iterator> equal_range(const key_type &key) const {
-    return tree_.equal_range(key);
-  }
-
-  // Utility routines.
-  void clear() {
-    tree_.clear();
-  }
-  void swap(self_type &x) {
-    tree_.swap(x.tree_);
-  }
-  void dump(std::ostream &os) const {
-    tree_.dump(os);
-  }
-  void verify() const {
-    tree_.verify();
-  }
-
-  // Size routines.
-  size_type size() const { return tree_.size(); }
-  size_type max_size() const { return tree_.max_size(); }
-  bool empty() const { return tree_.empty(); }
-  size_type height() const { return tree_.height(); }
-  size_type internal_nodes() const { return tree_.internal_nodes(); }
-  size_type leaf_nodes() const { return tree_.leaf_nodes(); }
-  size_type nodes() const { return tree_.nodes(); }
-  size_type bytes_used() const { return tree_.bytes_used(); }
-  static double average_bytes_per_value() {
-    return Tree::average_bytes_per_value();
-  }
-  double fullness() const { return tree_.fullness(); }
-  double overhead() const { return tree_.overhead(); }
-
-  bool operator==(const self_type& x) const {
-    if (size() != x.size()) {
-      return false;
-    }
-    for (const_iterator i = begin(), xi = x.begin(); i != end(); ++i, ++xi) {
-      if (*i != *xi) {
-        return false;
-      }
-    }
-    return true;
-  }
-
-  bool operator!=(const self_type& other) const {
-    return !operator==(other);
-  }
-
-
- protected:
-  Tree tree_;
-};
-
-template <typename T>
-inline std::ostream& operator<<(std::ostream &os, const btree_container<T> &b) {
-  b.dump(os);
-  return os;
-}
-
-// A common base class for btree_set and safe_btree_set.
-template <typename Tree>
-class btree_unique_container : public btree_container<Tree> {
-  typedef btree_unique_container<Tree> self_type;
-  typedef btree_container<Tree> super_type;
-
- public:
-  typedef typename Tree::key_type key_type;
-  typedef typename Tree::value_type value_type;
-  typedef typename Tree::size_type size_type;
-  typedef typename Tree::key_compare key_compare;
-  typedef typename Tree::allocator_type allocator_type;
-  typedef typename Tree::iterator iterator;
-  typedef typename Tree::const_iterator const_iterator;
-
- public:
-  // Default constructor.
-  btree_unique_container(const key_compare &comp = key_compare(),
-                         const allocator_type &alloc = allocator_type())
-      : super_type(comp, alloc) {
-  }
-
-  // Copy constructor.
-  btree_unique_container(const self_type &x)
-      : super_type(x) {
-  }
-
-  // Range constructor.
-  template <class InputIterator>
-  btree_unique_container(InputIterator b, InputIterator e,
-                         const key_compare &comp = key_compare(),
-                         const allocator_type &alloc = allocator_type())
-      : super_type(comp, alloc) {
-    insert(b, e);
-  }
-
-  // Lookup routines.
-  iterator find(const key_type &key) {
-    return this->tree_.find_unique(key);
-  }
-  const_iterator find(const key_type &key) const {
-    return this->tree_.find_unique(key);
-  }
-  size_type count(const key_type &key) const {
-    return this->tree_.count_unique(key);
-  }
-
-  // Insertion routines.
-  std::pair<iterator,bool> insert(const value_type &x) {
-    return this->tree_.insert_unique(x);
-  }
-  iterator insert(iterator position, const value_type &x) {
-    return this->tree_.insert_unique(position, x);
-  }
-  template <typename InputIterator>
-  void insert(InputIterator b, InputIterator e) {
-    this->tree_.insert_unique(b, e);
-  }
-
-  // Deletion routines.
-  int erase(const key_type &key) {
-    return this->tree_.erase_unique(key);
-  }
-  // Erase the specified iterator from the btree. The iterator must be valid
-  // (i.e. not equal to end()).  Return an iterator pointing to the node after
-  // the one that was erased (or end() if none exists).
-  iterator erase(const iterator &iter) {
-    return this->tree_.erase(iter);
-  }
-  void erase(const iterator &first, const iterator &last) {
-    this->tree_.erase(first, last);
-  }
-};
-
-// A common base class for btree_map and safe_btree_map.
-template <typename Tree>
-class btree_map_container : public btree_unique_container<Tree> {
-  typedef btree_map_container<Tree> self_type;
-  typedef btree_unique_container<Tree> super_type;
-
- public:
-  typedef typename Tree::key_type key_type;
-  typedef typename Tree::data_type data_type;
-  typedef typename Tree::value_type value_type;
-  typedef typename Tree::mapped_type mapped_type;
-  typedef typename Tree::key_compare key_compare;
-  typedef typename Tree::allocator_type allocator_type;
-
- private:
-  // A pointer-like object which only generates its value when
-  // dereferenced. Used by operator[] to avoid constructing an empty data_type
-  // if the key already exists in the map.
-  struct generate_value {
-    generate_value(const key_type &k)
-        : key(k) {
-    }
-    value_type operator*() const {
-      return std::make_pair(key, data_type());
-    }
-    const key_type &key;
-  };
-
- public:
-  // Default constructor.
-  btree_map_container(const key_compare &comp = key_compare(),
-                      const allocator_type &alloc = allocator_type())
-      : super_type(comp, alloc) {
-  }
-
-  // Copy constructor.
-  btree_map_container(const self_type &x)
-      : super_type(x) {
-  }
-
-  // Range constructor.
-  template <class InputIterator>
-  btree_map_container(InputIterator b, InputIterator e,
-                      const key_compare &comp = key_compare(),
-                      const allocator_type &alloc = allocator_type())
-      : super_type(comp, alloc) {
-    insert(b, e);
-  }
-
-  // Insertion routines.
-  data_type& operator[](const key_type &key) {
-    return this->tree_.insert_unique(key, generate_value(key)).first->second;
-  }
-};
-
-// A common base class for btree_multiset and btree_multimap.
-template <typename Tree>
-class btree_multi_container : public btree_container<Tree> {
-  typedef btree_multi_container<Tree> self_type;
-  typedef btree_container<Tree> super_type;
-
- public:
-  typedef typename Tree::key_type key_type;
-  typedef typename Tree::value_type value_type;
-  typedef typename Tree::size_type size_type;
-  typedef typename Tree::key_compare key_compare;
-  typedef typename Tree::allocator_type allocator_type;
-  typedef typename Tree::iterator iterator;
-  typedef typename Tree::const_iterator const_iterator;
-
- public:
-  // Default constructor.
-  btree_multi_container(const key_compare &comp = key_compare(),
-                        const allocator_type &alloc = allocator_type())
-      : super_type(comp, alloc) {
-  }
-
-  // Copy constructor.
-  btree_multi_container(const self_type &x)
-      : super_type(x) {
-  }
-
-  // Range constructor.
-  template <class InputIterator>
-  btree_multi_container(InputIterator b, InputIterator e,
-                        const key_compare &comp = key_compare(),
-                        const allocator_type &alloc = allocator_type())
-      : super_type(comp, alloc) {
-    insert(b, e);
-  }
-
-  // Lookup routines.
-  iterator find(const key_type &key) {
-    return this->tree_.find_multi(key);
-  }
-  const_iterator find(const key_type &key) const {
-    return this->tree_.find_multi(key);
-  }
-  size_type count(const key_type &key) const {
-    return this->tree_.count_multi(key);
-  }
-
-  // Insertion routines.
-  iterator insert(const value_type &x) {
-    return this->tree_.insert_multi(x);
-  }
-  iterator insert(iterator position, const value_type &x) {
-    return this->tree_.insert_multi(position, x);
-  }
-  template <typename InputIterator>
-  void insert(InputIterator b, InputIterator e) {
-    this->tree_.insert_multi(b, e);
-  }
-
-  // Deletion routines.
-  int erase(const key_type &key) {
-    return this->tree_.erase_multi(key);
-  }
-  // Erase the specified iterator from the btree. The iterator must be valid
-  // (i.e. not equal to end()).  Return an iterator pointing to the node after
-  // the one that was erased (or end() if none exists).
-  iterator erase(const iterator &iter) {
-    return this->tree_.erase(iter);
-  }
-  void erase(const iterator &first, const iterator &last) {
-    this->tree_.erase(first, last);
-  }
-};
-
-} // namespace btree
-
-#endif  // UTIL_BTREE_BTREE_CONTAINER_H__
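
One idiom in btree_map_container worth noting before removal: operator[]
hands insert_unique the pointer-like generate_value object, so the
default-constructed mapped value is only materialized when the key is
genuinely absent. A standalone sketch of the same lazy-construction trick
(these names are illustrative, not part of the removed header):

    #include <iostream>
    #include <string>
    #include <utility>

    // Pointer-like object whose value is built only on dereference,
    // mirroring generate_value in btree_map_container::operator[].
    struct lazy_pair {
      explicit lazy_pair(const int& k) : key(k) {}
      std::pair<int, std::string> operator*() const {
        std::cout << "constructing default value for key " << key << "\n";
        return std::make_pair(key, std::string());
      }
      const int& key;
    };

    int main() {
      lazy_pair lp(42);
      // The empty std::string exists only after *lp is evaluated, i.e.
      // once the container decides the key is new.
      std::pair<int, std::string> v = *lp;
      std::cout << "key " << v.first << ", value empty: " << v.second.empty() << "\n";
      return 0;
    }
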
diff --git a/include/btree_map.h b/include/btree_map.h
deleted file mode 100755
index 07b799e..0000000
--- a/include/btree_map.h
+++ /dev/null
@@ -1,130 +0,0 @@
-// Copyright 2013 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-// A btree_map<> implements the STL unique sorted associative container
-// interface and the pair associative container interface (a.k.a map<>) using a
-// btree. A btree_multimap<> implements the STL multiple sorted associative
-// container interface and the pair associative container interface (a.k.a
-// multimap<>) using a btree. See btree.h for details of the btree
-// implementation and caveats.
-
-#ifndef UTIL_BTREE_BTREE_MAP_H__
-#define UTIL_BTREE_BTREE_MAP_H__
-
-#include <algorithm>
-#include <functional>
-#include <memory>
-#include <string>
-#include <utility>
-
-#include "btree.h"
-#include "btree_container.h"
-
-namespace btree {
-
-// The btree_map class is needed mainly for its constructors.
-template <typename Key, typename Value,
-          typename Compare = std::less<Key>,
-          typename Alloc = std::allocator<std::pair<const Key, Value> >,
-          int TargetNodeSize = 256>
-class btree_map : public btree_map_container<
-  btree<btree_map_params<Key, Value, Compare, Alloc, TargetNodeSize> > > {
-
-  typedef btree_map<Key, Value, Compare, Alloc, TargetNodeSize> self_type;
-  typedef btree_map_params<
-    Key, Value, Compare, Alloc, TargetNodeSize> params_type;
-  typedef btree<params_type> btree_type;
-  typedef btree_map_container<btree_type> super_type;
-
- public:
-  typedef typename btree_type::key_compare key_compare;
-  typedef typename btree_type::allocator_type allocator_type;
-
- public:
-  // Default constructor.
-  btree_map(const key_compare &comp = key_compare(),
-            const allocator_type &alloc = allocator_type())
-      : super_type(comp, alloc) {
-  }
-
-  // Copy constructor.
-  btree_map(const self_type &x)
-      : super_type(x) {
-  }
-
-  // Range constructor.
-  template <class InputIterator>
-  btree_map(InputIterator b, InputIterator e,
-            const key_compare &comp = key_compare(),
-            const allocator_type &alloc = allocator_type())
-      : super_type(b, e, comp, alloc) {
-  }
-};
-
-template <typename K, typename V, typename C, typename A, int N>
-inline void swap(btree_map<K, V, C, A, N> &x,
-                 btree_map<K, V, C, A, N> &y) {
-  x.swap(y);
-}
-
-// The btree_multimap class is needed mainly for its constructors.
-template <typename Key, typename Value,
-          typename Compare = std::less<Key>,
-          typename Alloc = std::allocator<std::pair<const Key, Value> >,
-          int TargetNodeSize = 256>
-class btree_multimap : public btree_multi_container<
-  btree<btree_map_params<Key, Value, Compare, Alloc, TargetNodeSize> > > {
-
-  typedef btree_multimap<Key, Value, Compare, Alloc, TargetNodeSize> self_type;
-  typedef btree_map_params<
-    Key, Value, Compare, Alloc, TargetNodeSize> params_type;
-  typedef btree<params_type> btree_type;
-  typedef btree_multi_container<btree_type> super_type;
-
- public:
-  typedef typename btree_type::key_compare key_compare;
-  typedef typename btree_type::allocator_type allocator_type;
-  typedef typename btree_type::data_type data_type;
-  typedef typename btree_type::mapped_type mapped_type;
-
- public:
-  // Default constructor.
-  btree_multimap(const key_compare &comp = key_compare(),
-                 const allocator_type &alloc = allocator_type())
-      : super_type(comp, alloc) {
-  }
-
-  // Copy constructor.
-  btree_multimap(const self_type &x)
-      : super_type(x) {
-  }
-
-  // Range constructor.
-  template <class InputIterator>
-  btree_multimap(InputIterator b, InputIterator e,
-                 const key_compare &comp = key_compare(),
-                 const allocator_type &alloc = allocator_type())
-      : super_type(b, e, comp, alloc) {
-  }
-};
-
-template <typename K, typename V, typename C, typename A, int N>
-inline void swap(btree_multimap<K, V, C, A, N> &x,
-                 btree_multimap<K, V, C, A, N> &y) {
-  x.swap(y);
-}
-
-} // namespace btree
-
-#endif  // UTIL_BTREE_BTREE_MAP_H__
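
These classes were drop-in replacements for std::map and std::multimap
backed by B-tree nodes. A minimal, hypothetical usage sketch:

    #include <cstdio>
    #include <string>
    #include "btree_map.h"

    int main() {
      // Same interface as std::map, stored in 256-byte B-tree nodes.
      btree::btree_map<std::string, int> counts;
      counts["salmon"] = 1;
      counts["btree"] += 2;  // operator[] default-constructs the int lazily

      btree::btree_map<std::string, int>::iterator it = counts.find("btree");
      if (it != counts.end())
        std::printf("%s -> %d\n", it->first.c_str(), it->second);

      // btree_multimap keeps duplicate keys; count() reports multiplicity.
      btree::btree_multimap<std::string, int> mm;
      mm.insert(std::make_pair(std::string("k"), 1));
      mm.insert(std::make_pair(std::string("k"), 2));
      std::printf("k occurs %lu times\n", (unsigned long)mm.count("k"));
      return 0;
    }
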
diff --git a/include/btree_set.h b/include/btree_set.h
deleted file mode 100755
index 2bc9e58..0000000
--- a/include/btree_set.h
+++ /dev/null
@@ -1,121 +0,0 @@
-// Copyright 2013 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-// A btree_set<> implements the STL unique sorted associative container
-// interface (a.k.a set<>) using a btree. A btree_multiset<> implements the STL
-// multiple sorted associative container interface (a.k.a multiset<>) using a
-// btree. See btree.h for details of the btree implementation and caveats.
-
-#ifndef UTIL_BTREE_BTREE_SET_H__
-#define UTIL_BTREE_BTREE_SET_H__
-
-#include <functional>
-#include <memory>
-#include <string>
-
-#include "btree.h"
-#include "btree_container.h"
-
-namespace btree {
-
-// The btree_set class is needed mainly for its constructors.
-template <typename Key,
-          typename Compare = std::less<Key>,
-          typename Alloc = std::allocator<Key>,
-          int TargetNodeSize = 256>
-class btree_set : public btree_unique_container<
-  btree<btree_set_params<Key, Compare, Alloc, TargetNodeSize> > > {
-
-  typedef btree_set<Key, Compare, Alloc, TargetNodeSize> self_type;
-  typedef btree_set_params<Key, Compare, Alloc, TargetNodeSize> params_type;
-  typedef btree<params_type> btree_type;
-  typedef btree_unique_container<btree_type> super_type;
-
- public:
-  typedef typename btree_type::key_compare key_compare;
-  typedef typename btree_type::allocator_type allocator_type;
-
- public:
-  // Default constructor.
-  btree_set(const key_compare &comp = key_compare(),
-            const allocator_type &alloc = allocator_type())
-      : super_type(comp, alloc) {
-  }
-
-  // Copy constructor.
-  btree_set(const self_type &x)
-      : super_type(x) {
-  }
-
-  // Range constructor.
-  template <class InputIterator>
-  btree_set(InputIterator b, InputIterator e,
-            const key_compare &comp = key_compare(),
-            const allocator_type &alloc = allocator_type())
-      : super_type(b, e, comp, alloc) {
-  }
-};
-
-template <typename K, typename C, typename A, int N>
-inline void swap(btree_set<K, C, A, N> &x, btree_set<K, C, A, N> &y) {
-  x.swap(y);
-}
-
-// The btree_multiset class is needed mainly for its constructors.
-template <typename Key,
-          typename Compare = std::less<Key>,
-          typename Alloc = std::allocator<Key>,
-          int TargetNodeSize = 256>
-class btree_multiset : public btree_multi_container<
-  btree<btree_set_params<Key, Compare, Alloc, TargetNodeSize> > > {
-
-  typedef btree_multiset<Key, Compare, Alloc, TargetNodeSize> self_type;
-  typedef btree_set_params<Key, Compare, Alloc, TargetNodeSize> params_type;
-  typedef btree<params_type> btree_type;
-  typedef btree_multi_container<btree_type> super_type;
-
- public:
-  typedef typename btree_type::key_compare key_compare;
-  typedef typename btree_type::allocator_type allocator_type;
-
- public:
-  // Default constructor.
-  btree_multiset(const key_compare &comp = key_compare(),
-                 const allocator_type &alloc = allocator_type())
-      : super_type(comp, alloc) {
-  }
-
-  // Copy constructor.
-  btree_multiset(const self_type &x)
-      : super_type(x) {
-  }
-
-  // Range constructor.
-  template <class InputIterator>
-  btree_multiset(InputIterator b, InputIterator e,
-                 const key_compare &comp = key_compare(),
-                 const allocator_type &alloc = allocator_type())
-      : super_type(b, e, comp, alloc) {
-  }
-};
-
-template <typename K, typename C, typename A, int N>
-inline void swap(btree_multiset<K, C, A, N> &x,
-                 btree_multiset<K, C, A, N> &y) {
-  x.swap(y);
-}
-
-} // namespace btree
-
-#endif  // UTIL_BTREE_BTREE_SET_H__
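
As with the map headers, the last TargetNodeSize template argument sizes
each node in bytes (256 by default), trading tree height against per-node
shifting cost. A hypothetical sketch that picks a larger node and inspects
the resulting shape through the btree_container statistics:

    #include <cstdio>
    #include <functional>
    #include <memory>
    #include "btree_set.h"

    int main() {
      // Roughly 512 bytes of keys per node instead of the 256-byte default:
      // fewer levels, but more element shifting on insert/erase.
      btree::btree_set<int, std::less<int>, std::allocator<int>, 512> s;
      for (int i = 0; i < 1000; ++i) s.insert(i);
      std::printf("height=%lu nodes=%lu\n",
                  (unsigned long)s.height(), (unsigned long)s.nodes());
      return 0;
    }
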
diff --git a/include/concurrentqueue.h b/include/concurrentqueue.h
index d43514d..4253d61 100644
--- a/include/concurrentqueue.h
+++ b/include/concurrentqueue.h
@@ -237,7 +237,7 @@ namespace details {
 			: static_cast<T>(-1);
 	};
 
-#if (!defined(__clang__) && defined(__GNUC__) && ((__GNUC__ < 4) || (__GNUC__ == 4 && __GNUC_MINOR__ < 9)))
+#if defined(__GNUC__) && !defined( __clang__ )
 	typedef ::max_align_t max_align_t;      // GCC forgot to add it to std:: for a while
 #else
 	typedef std::max_align_t max_align_t;   // Others (e.g. MSVC) insist it can *only* be accessed via std::
@@ -749,7 +749,7 @@ public:
 	{
 		implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed);
 		populate_initial_implicit_producer_hash();
-		size_t blocks = ((((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * (maxExplicitProducers + 1) + 2 * (maxExplicitProducers + maxImplicitProducers)) * BLOCK_SIZE;
+		size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * (maxExplicitProducers + 1) + 2 * (maxExplicitProducers + maxImplicitProducers);
 		populate_initial_block_list(blocks);
 		
 #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
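
The second hunk fixes a unit error: populate_initial_block_list() expects a
count of blocks, but the old expression multiplied that count by BLOCK_SIZE
once more, pre-allocating BLOCK_SIZE times too many blocks. A small check
with illustrative values (not taken from the source):

    #include <cstddef>
    #include <cstdio>

    int main() {
      // Illustrative parameters only.
      const size_t BLOCK_SIZE = 32, minCapacity = 192;
      const size_t maxExplicitProducers = 1, maxImplicitProducers = 1;

      size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1)
                          * (maxExplicitProducers + 1)
                      + 2 * (maxExplicitProducers + maxImplicitProducers);
      // Prints "new: 14, old: 448": the old form requested 32x too many.
      std::printf("new: %zu, old: %zu\n", blocks, blocks * BLOCK_SIZE);
      return 0;
    }
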
diff --git a/include/count_main_cmdline.hpp b/include/count_main_cmdline.hpp
deleted file mode 100644
index 004c886..0000000
--- a/include/count_main_cmdline.hpp
+++ /dev/null
@@ -1,711 +0,0 @@
-/***** This code was generated by Yaggo. Do not edit ******/
-
-/*  This file is part of Jellyfish.
-
-    Jellyfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    Jellyfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with Jellyfish.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef __COUNT_MAIN_CMDLINE_HPP__
-#define __COUNT_MAIN_CMDLINE_HPP__
-
-#include <stdint.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include <getopt.h>
-#include <errno.h>
-#include <string.h>
-#include <stdexcept>
-#include <string>
-#include <limits>
-#include <vector>
-#include <iostream>
-#include <sstream>
-#include <memory>
-
-class count_main_cmdline {
- // Boilerplate: conversions from string to other formats.
-  static bool adjust_double_si_suffix(double &res, const char *suffix) {
-    if(*suffix == '\0')
-      return true;
-    if(*(suffix + 1) != '\0')
-      return false;
-
-    switch(*suffix) {
-    case 'a': res *= 1e-18; break;
-    case 'f': res *= 1e-15; break;
-    case 'p': res *= 1e-12; break;
-    case 'n': res *= 1e-9;  break;
-    case 'u': res *= 1e-6;  break;
-    case 'm': res *= 1e-3;  break;
-    case 'k': res *= 1e3;   break;
-    case 'M': res *= 1e6;   break;
-    case 'G': res *= 1e9;   break;
-    case 'T': res *= 1e12;  break;
-    case 'P': res *= 1e15;  break;
-    case 'E': res *= 1e18;  break;
-    default: return false;
-    }
-    return true;
-  }
-
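-  // The conv_* helpers below parse a number and, when si_suffix is true,
-  // accept one trailing SI letter (e.g. "100M" -> 100000000); this is how
-  // -s/--size and --bf-size take human-friendly sizes.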
-  static double conv_double(const char *str, ::std::string &err, bool si_suffix) {
-    char *endptr = 0;
-    errno = 0;
-    double res = strtod(str, &endptr);
-    if(errno) {
-      err.assign(strerror(errno));
-      return (double)0.0;
-    }
-    bool invalid =
-      si_suffix ? !adjust_double_si_suffix(res, endptr) : *endptr != '\0';
-    if(invalid) {
-      err.assign("Invalid character");
-      return (double)0.0;
-    }
-    return res;
-  }
-
-  static int conv_enum(const char* str, ::std::string& err, const char* const strs[]) {
-    int res = 0;
-    for(const char* const* cstr = strs; *cstr; ++cstr, ++res)
-      if(!strcmp(*cstr, str))
-        return res;
-    err += "Invalid constant '";
-    err += str;
-    err += "'. Expected one of { ";
-    for(const char* const* cstr = strs; *cstr; ++cstr) {
-      if(cstr != strs)
-        err += ", ";
-      err += *cstr;
-    }
-    err += " }";
-    return -1;
-  }
-
-  template<typename T>
-  static bool adjust_int_si_suffix(T &res, const char *suffix) {
-    if(*suffix == '\0')
-      return true;
-    if(*(suffix + 1) != '\0')
-      return false;
-
-    switch(*suffix) {
-    case 'k': res *= (T)1000; break;
-    case 'M': res *= (T)1000000; break;
-    case 'G': res *= (T)1000000000; break;
-    case 'T': res *= (T)1000000000000; break;
-    case 'P': res *= (T)1000000000000000; break;
-    case 'E': res *= (T)1000000000000000000; break;
-    default: return false;
-    }
-    return true;
-  }
-
-  template<typename T>
-  static T conv_int(const char *str, ::std::string &err, bool si_suffix) {
-    char *endptr = 0;
-    errno = 0;
-    long long int res = strtoll(str, &endptr, 0);
-    if(errno) {
-      err.assign(strerror(errno));
-      return (T)0;
-    }
-    bool invalid =
-      si_suffix ? !adjust_int_si_suffix(res, endptr) : *endptr != '\0';
-    if(invalid) {
-      err.assign("Invalid character");
-      return (T)0;
-    }
-    if(res > ::std::numeric_limits<T>::max() ||
-       res < ::std::numeric_limits<T>::min()) {
-      err.assign("Value out of range");
-      return (T)0;
-    }
-    return (T)res;
-  }
-
-  template<typename T>
-  static T conv_uint(const char *str, ::std::string &err, bool si_suffix) {
-    char *endptr = 0;
-    errno = 0;
-    while(isspace(*str)) { ++str; }
-    if(*str == '-') {
-      err.assign("Negative value");
-      return (T)0;
-    }
-    unsigned long long int res = strtoull(str, &endptr, 0);
-    if(errno) {
-      err.assign(strerror(errno));
-      return (T)0;
-    }
-    bool invalid =
-      si_suffix ? !adjust_int_si_suffix(res, endptr) : *endptr != '\0';
-    if(invalid) {
-      err.assign("Invalid character");
-      return (T)0;
-    }
-    if(res > ::std::numeric_limits<T>::max()) {
-      err.assign("Value out of range");
-      return (T)0;
-    }
-    return (T)res;
-  }
-
-  template<typename T>
-  static ::std::string vec_str(const std::vector<T> &vec) {
-    ::std::ostringstream os;
-    for(typename ::std::vector<T>::const_iterator it = vec.begin();
-        it != vec.end(); ++it) {
-      if(it != vec.begin())
-        os << ",";
-      os << *it;
-    }
-    return os.str();
-  }
-
-  class string : public ::std::string {
-  public:
-    string() : ::std::string() {}
-    explicit string(const ::std::string &s) : std::string(s) {}
-    explicit string(const char *s) : ::std::string(s) {}
-    int as_enum(const char* const strs[]) {
-      ::std::string err;
-      int res = conv_enum((const char*)this->c_str(), err, strs);
-      if(!err.empty())
-        throw ::std::runtime_error(err);
-      return res;
-    }
-
-
-    uint32_t as_uint32_suffix() const { return as_uint32(true); }
-    uint32_t as_uint32(bool si_suffix = false) const {
-      ::std::string err;
-      uint32_t res = conv_uint<uint32_t>((const char*)this->c_str(), err, si_suffix);
-      if(!err.empty()) {
-        ::std::string msg("Invalid conversion of '");
-        msg += *this;
-        msg += "' to uint32_t: ";
-        msg += err;
-        throw ::std::runtime_error(msg);
-      }
-      return res;
-    }
-    uint64_t as_uint64_suffix() const { return as_uint64(true); }
-    uint64_t as_uint64(bool si_suffix = false) const {
-      ::std::string err;
-      uint64_t res = conv_uint<uint64_t>((const char*)this->c_str(), err, si_suffix);
-      if(!err.empty()) {
-        ::std::string msg("Invalid conversion of '");
-        msg += *this;
-        msg += "' to uint64_t: ";
-        msg += err;
-        throw ::std::runtime_error(msg);
-      }
-      return res;
-    }
-    int32_t as_int32_suffix() const { return as_int32(true); }
-    int32_t as_int32(bool si_suffix = false) const {
-      ::std::string err;
-      int32_t res = conv_int<int32_t>((const char*)this->c_str(), err, si_suffix);
-      if(!err.empty()) {
-        ::std::string msg("Invalid conversion of '");
-        msg += *this;
-        msg += "' to int32_t: ";
-        msg += err;
-        throw ::std::runtime_error(msg);
-      }
-      return res;
-    }
-    int64_t as_int64_suffix() const { return as_int64(true); }
-    int64_t as_int64(bool si_suffix = false) const {
-      ::std::string err;
-      int64_t res = conv_int<int64_t>((const char*)this->c_str(), err, si_suffix);
-      if(!err.empty()) {
-        ::std::string msg("Invalid conversion of '");
-        msg += *this;
-        msg += "' to int64_t: ";
-        msg += err;
-        throw ::std::runtime_error(msg);
-      }
-      return res;
-    }
-    int as_int_suffix() const { return as_int(true); }
-    int as_int(bool si_suffix = false) const {
-      ::std::string err;
-      int res = conv_int<int>((const char*)this->c_str(), err, si_suffix);
-      if(!err.empty()) {
-        ::std::string msg("Invalid conversion of '");
-        msg += *this;
-        msg += "' to int_t: ";
-        msg += err;
-        throw ::std::runtime_error(msg);
-      }
-      return res;
-    }
-    long as_long_suffix() const { return as_long(true); }
-    long as_long(bool si_suffix = false) const {
-      ::std::string err;
-      long res = conv_int<long>((const char*)this->c_str(), err, si_suffix);
-      if(!err.empty()) {
-        ::std::string msg("Invalid conversion of '");
-        msg += *this;
-        msg += "' to long_t: ";
-        msg += err;
-        throw ::std::runtime_error(msg);
-      }
-      return res;
-    }
-    double as_double_suffix() const { return as_double(true); }
-    double as_double(bool si_suffix = false) const {
-      ::std::string err;
-      double res = conv_double((const char*)this->c_str(), err, si_suffix);
-      if(!err.empty()) {
-        ::std::string msg("Invalid conversion of '");
-        msg += *this;
-        msg += "' to double_t: ";
-        msg += err;
-        throw ::std::runtime_error(msg);
-      }
-      return res;
-    }
-  };
-
-public:
-  uint32_t                       mer_len_arg;
-  bool                           mer_len_given;
-  uint64_t                       size_arg;
-  bool                           size_given;
-  uint32_t                       threads_arg;
-  bool                           threads_given;
-  uint32_t                       Files_arg;
-  bool                           Files_given;
-  const char *                   generator_arg;
-  bool                           generator_given;
-  uint32_t                       Generators_arg;
-  bool                           Generators_given;
-  const char *                   shell_arg;
-  bool                           shell_given;
-  const char *                   output_arg;
-  bool                           output_given;
-  uint32_t                       counter_len_arg;
-  bool                           counter_len_given;
-  uint32_t                       out_counter_len_arg;
-  bool                           out_counter_len_given;
-  bool                           canonical_flag;
-  const char *                   bc_arg;
-  bool                           bc_given;
-  uint64_t                       bf_size_arg;
-  bool                           bf_size_given;
-  double                         bf_fp_arg;
-  bool                           bf_fp_given;
-  ::std::vector<const char *>    if_arg;
-  typedef ::std::vector<const char *>::iterator if_arg_it;
-  typedef ::std::vector<const char *>::const_iterator if_arg_const_it;
-  bool                           if_given;
-  string                         min_qual_char_arg;
-  bool                           min_qual_char_given;
-  uint32_t                       reprobes_arg;
-  bool                           reprobes_given;
-  bool                           text_flag;
-  bool                           disk_flag;
-  bool                           no_merge_flag;
-  bool                           no_unlink_flag;
-  uint64_t                       lower_count_arg;
-  bool                           lower_count_given;
-  uint64_t                       upper_count_arg;
-  bool                           upper_count_given;
-  const char *                   timing_arg;
-  bool                           timing_given;
-  bool                           no_write_flag;
-  ::std::vector<const char *>    file_arg;
-  typedef ::std::vector<const char *>::iterator file_arg_it;
-  typedef ::std::vector<const char *>::const_iterator file_arg_const_it;
-
-  enum {
-    START_OPT = 1000,
-    FULL_HELP_OPT,
-    USAGE_OPT,
-    OUT_COUNTER_LEN_OPT,
-    BC_OPT,
-    BF_SIZE_OPT,
-    BF_FP_OPT,
-    IF_OPT,
-    TEXT_OPT,
-    DISK_OPT,
-    NO_MERGE_OPT,
-    NO_UNLINK_OPT,
-    TIMING_OPT,
-    NO_WRITE_OPT
-  };
-
-  count_main_cmdline() :
-    mer_len_arg(), mer_len_given(false),
-    size_arg(), size_given(false),
-    threads_arg(1), threads_given(false),
-    Files_arg(1), Files_given(false),
-    generator_arg(""), generator_given(false),
-    Generators_arg(1), Generators_given(false),
-    shell_arg(""), shell_given(false),
-    output_arg("mer_counts.jf"), output_given(false),
-    counter_len_arg(7), counter_len_given(false),
-    out_counter_len_arg(4), out_counter_len_given(false),
-    canonical_flag(false),
-    bc_arg(""), bc_given(false),
-    bf_size_arg(), bf_size_given(false),
-    bf_fp_arg(0.01), bf_fp_given(false),
-    if_arg(), if_given(false),
-    min_qual_char_arg(""), min_qual_char_given(false),
-    reprobes_arg(126), reprobes_given(false),
-    text_flag(false),
-    disk_flag(false),
-    no_merge_flag(false),
-    no_unlink_flag(false),
-    lower_count_arg(), lower_count_given(false),
-    upper_count_arg(), upper_count_given(false),
-    timing_arg(""), timing_given(false),
-    no_write_flag(false),
-    file_arg()
-  { }
-
-  count_main_cmdline(int argc, char* argv[]) :
-    mer_len_arg(), mer_len_given(false),
-    size_arg(), size_given(false),
-    threads_arg(1), threads_given(false),
-    Files_arg(1), Files_given(false),
-    generator_arg(""), generator_given(false),
-    Generators_arg(1), Generators_given(false),
-    shell_arg(""), shell_given(false),
-    output_arg("mer_counts.jf"), output_given(false),
-    counter_len_arg(7), counter_len_given(false),
-    out_counter_len_arg(4), out_counter_len_given(false),
-    canonical_flag(false),
-    bc_arg(""), bc_given(false),
-    bf_size_arg(), bf_size_given(false),
-    bf_fp_arg(0.01), bf_fp_given(false),
-    if_arg(), if_given(false),
-    min_qual_char_arg(""), min_qual_char_given(false),
-    reprobes_arg(126), reprobes_given(false),
-    text_flag(false),
-    disk_flag(false),
-    no_merge_flag(false),
-    no_unlink_flag(false),
-    lower_count_arg(), lower_count_given(false),
-    upper_count_arg(), upper_count_given(false),
-    timing_arg(""), timing_given(false),
-    no_write_flag(false),
-    file_arg()
-  { parse(argc, argv); }
-
-  void parse(int argc, char* argv[]) {
-    static struct option long_options[] = {
-      {"mer-len", 1, 0, 'm'},
-      {"size", 1, 0, 's'},
-      {"threads", 1, 0, 't'},
-      {"Files", 1, 0, 'F'},
-      {"generator", 1, 0, 'g'},
-      {"Generators", 1, 0, 'G'},
-      {"shell", 1, 0, 'S'},
-      {"output", 1, 0, 'o'},
-      {"counter-len", 1, 0, 'c'},
-      {"out-counter-len", 1, 0, OUT_COUNTER_LEN_OPT},
-      {"canonical", 0, 0, 'C'},
-      {"bc", 1, 0, BC_OPT},
-      {"bf-size", 1, 0, BF_SIZE_OPT},
-      {"bf-fp", 1, 0, BF_FP_OPT},
-      {"if", 1, 0, IF_OPT},
-      {"min-qual-char", 1, 0, 'Q'},
-      {"reprobes", 1, 0, 'p'},
-      {"text", 0, 0, TEXT_OPT},
-      {"disk", 0, 0, DISK_OPT},
-      {"no-merge", 0, 0, NO_MERGE_OPT},
-      {"no-unlink", 0, 0, NO_UNLINK_OPT},
-      {"lower-count", 1, 0, 'L'},
-      {"upper-count", 1, 0, 'U'},
-      {"timing", 1, 0, TIMING_OPT},
-      {"no-write", 0, 0, NO_WRITE_OPT},
-      {"help", 0, 0, 'h'},
-      {"full-help", 0, 0, FULL_HELP_OPT},
-      {"usage", 0, 0, USAGE_OPT},
-      {"version", 0, 0, 'V'},
-      {0, 0, 0, 0}
-    };
-    static const char *short_options = "hVm:s:t:F:g:G:S:o:c:CQ:p:L:U:";
-
-    ::std::string err;
-#define CHECK_ERR(type,val,which) if(!err.empty()) { ::std::cerr << "Invalid " #type " '" << val << "' for [" which "]: " << err << "\n"; exit(1); }
-    while(true) {
-      int index = -1;
-      int c = getopt_long(argc, argv, short_options, long_options, &index);
-      if(c == -1) break;
-      switch(c) {
-      case ':':
-        ::std::cerr << "Missing required argument for "
-                  << (index == -1 ? ::std::string(1, (char)optopt) : std::string(long_options[index].name))
-                  << ::std::endl;
-        exit(1);
-      case 'h':
-        ::std::cout << usage() << "\n\n" << help() << std::endl;
-        exit(0);
-      case USAGE_OPT:
-        ::std::cout << usage() << "\nUse --help for more information." << std::endl;
-        exit(0);
-      case 'V':
-        print_version();
-        exit(0);
-      case '?':
-        ::std::cerr << "Use --usage or --help for some help\n";
-        exit(1);
-      case FULL_HELP_OPT:
-        ::std::cout << usage() << "\n\n" << help() << "\n\n" << hidden() << std::flush;
-        exit(0);
-      case 'm':
-        mer_len_given = true;
-        mer_len_arg = conv_uint<uint32_t>((const char*)optarg, err, false);
-        CHECK_ERR(uint32_t, optarg, "-m, --mer-len=uint32")
-        break;
-      case 's':
-        size_given = true;
-        size_arg = conv_uint<uint64_t>((const char*)optarg, err, true);
-        CHECK_ERR(uint64_t, optarg, "-s, --size=uint64")
-        break;
-      case 't':
-        threads_given = true;
-        threads_arg = conv_uint<uint32_t>((const char*)optarg, err, false);
-        CHECK_ERR(uint32_t, optarg, "-t, --threads=uint32")
-        break;
-      case 'F':
-        Files_given = true;
-        Files_arg = conv_uint<uint32_t>((const char*)optarg, err, false);
-        CHECK_ERR(uint32_t, optarg, "-F, --Files=uint32")
-        break;
-      case 'g':
-        generator_given = true;
-        generator_arg = optarg;
-        break;
-      case 'G':
-        Generators_given = true;
-        Generators_arg = conv_uint<uint32_t>((const char*)optarg, err, false);
-        CHECK_ERR(uint32_t, optarg, "-G, --Generators=uint32")
-        break;
-      case 'S':
-        shell_given = true;
-        shell_arg = optarg;
-        break;
-      case 'o':
-        output_given = true;
-        output_arg = optarg;
-        break;
-      case 'c':
-        counter_len_given = true;
-        counter_len_arg = conv_uint<uint32_t>((const char*)optarg, err, false);
-        CHECK_ERR(uint32_t, optarg, "-c, --counter-len=Length in bits")
-        break;
-      case OUT_COUNTER_LEN_OPT:
-        out_counter_len_given = true;
-        out_counter_len_arg = conv_uint<uint32_t>((const char*)optarg, err, false);
-        CHECK_ERR(uint32_t, optarg, "    --out-counter-len=Length in bytes")
-        break;
-      case 'C':
-        canonical_flag = true;
-        break;
-      case BC_OPT:
-        bc_given = true;
-        bc_arg = optarg;
-        break;
-      case BF_SIZE_OPT:
-        bf_size_given = true;
-        bf_size_arg = conv_uint<uint64_t>((const char*)optarg, err, true);
-        CHECK_ERR(uint64_t, optarg, "    --bf-size=uint64")
-        break;
-      case BF_FP_OPT:
-        bf_fp_given = true;
-        bf_fp_arg = conv_double((const char*)optarg, err, false);
-        CHECK_ERR(double_t, optarg, "    --bf-fp=double")
-        break;
-      case IF_OPT:
-        if_given = true;
-        if_arg.push_back(optarg);
-        break;
-      case 'Q':
-        min_qual_char_given = true;
-        min_qual_char_arg.assign(optarg);
-        break;
-      case 'p':
-        reprobes_given = true;
-        reprobes_arg = conv_uint<uint32_t>((const char*)optarg, err, false);
-        CHECK_ERR(uint32_t, optarg, "-p, --reprobes=uint32")
-        break;
-      case TEXT_OPT:
-        text_flag = true;
-        break;
-      case DISK_OPT:
-        disk_flag = true;
-        break;
-      case NO_MERGE_OPT:
-        no_merge_flag = true;
-        break;
-      case NO_UNLINK_OPT:
-        no_unlink_flag = true;
-        break;
-      case 'L':
-        lower_count_given = true;
-        lower_count_arg = conv_uint<uint64_t>((const char*)optarg, err, false);
-        CHECK_ERR(uint64_t, optarg, "-L, --lower-count=uint64")
-        break;
-      case 'U':
-        upper_count_given = true;
-        upper_count_arg = conv_uint<uint64_t>((const char*)optarg, err, false);
-        CHECK_ERR(uint64_t, optarg, "-U, --upper-count=uint64")
-        break;
-      case TIMING_OPT:
-        timing_given = true;
-        timing_arg = optarg;
-        break;
-      case NO_WRITE_OPT:
-        no_write_flag = true;
-        break;
-      }
-    }
-
-    // Check that required switches are present
-    if(!mer_len_given)
-      error("[-m, --mer-len=uint32] required switch");
-    if(!size_given)
-      error("[-s, --size=uint64] required switch");
-
-    // Check mutually exclusive switches
-    if(bf_size_given && bc_given)
-      error("Switches [    --bf-size=uint64] and [    --bc=peath] are mutually exclusive");
-
-    // Parse arguments
-    if(argc - optind < 0)
-      error("Requires at least 0 argument.");
-    for( ; optind < argc; ++optind) {
-      file_arg.push_back(argv[optind]);
-    }
-  }
-  static const char * usage() { return "Usage: jellyfish count [options] file:path+"; }
-  class error {
-    int code_;
-    std::ostringstream msg_;
-
-    // Select the correct (GNU or XSI) version of strerror_r. strerror_
-    // behaves like the GNU version of strerror_r, regardless of which
-    // version is provided by the system.
-    static const char* strerror__(char* buf, int res) {
-      return res != -1 ? buf : "Invalid error";
-    }
-    static const char* strerror__(char* buf, char* res) {
-      return res;
-    }
-    static const char* strerror_(int err, char* buf, size_t buflen) {
-      return strerror__(buf, strerror_r(err, buf, buflen));
-    }
-    struct no_t { };
-
-  public:
-    static no_t no;
-    error(int code = EXIT_FAILURE) : code_(code) { }
-    explicit error(const char* msg, int code = EXIT_FAILURE) : code_(code)
-      { msg_ << msg; }
-    error(const std::string& msg, int code = EXIT_FAILURE) : code_(code)
-      { msg_ << msg; }
-    error& operator<<(no_t) {
-      char buf[1024];
-      msg_ << ": " << strerror_(errno, buf, sizeof(buf));
-      return *this;
-    }
-    template<typename T>
-    error& operator<<(const T& x) { msg_ << x; return (*this); }
-    ~error() {
-      ::std::cerr << "Error: " << msg_.str() << "\n"
-                  << usage() << "\n"
-                  << "Use --help for more information"
-                  << ::std::endl;
-      exit(code_);
-    }
-  };
-  static const char * help() { return
-    "Count k-mers in fasta or fastq files\n\n"
-    "Options (default value in (), *required):\n"
-    " -m, --mer-len=uint32                    *Length of mer\n"
-    " -s, --size=uint64                       *Initial hash size\n"
-    " -t, --threads=uint32                     Number of threads (1)\n"
-    " -F, --Files=uint32                       Number files open simultaneously (1)\n"
-    " -g, --generator=path                     File of commands generating fast[aq]\n"
-    " -G, --Generators=uint32                  Number of generators run simultaneously (1)\n"
-    " -S, --shell=string                       Shell used to run generator commands ($SHELL or /bin/sh)\n"
-    " -o, --output=string                      Output file (mer_counts.jf)\n"
-    " -c, --counter-len=Length in bits         Length bits of counting field (7)\n"
-    "     --out-counter-len=Length in bytes    Length in bytes of counter field in output (4)\n"
-    " -C, --canonical                          Count both strand, canonical representation (false)\n"
-    "     --bc=peath                           Bloom counter to filter out singleton mers\n"
-    "     --bf-size=uint64                     Use bloom filter to count high-frequency mers\n"
-    "     --bf-fp=double                       False positive rate of bloom filter (0.01)\n"
-    "     --if=path                            Count only k-mers in this files\n"
-    " -Q, --min-qual-char=string               Any base with quality below this character is changed to N\n"
-    " -p, --reprobes=uint32                    Maximum number of reprobes (126)\n"
-    "     --text                               Dump in text format (false)\n"
-    "     --disk                               Disk operation. Do not do size doubling (false)\n"
-    " -L, --lower-count=uint64                 Don't output k-mer with count < lower-count\n"
-    " -U, --upper-count=uint64                 Don't output k-mer with count > upper-count\n"
-    "     --timing=Timing file                 Print timing information\n"
-    "     --usage                              Usage\n"
-    " -h, --help                               This message\n"
-    "     --full-help                          Detailed help\n"
-    " -V, --version                            Version";
-  }
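
For reference, a typical invocation of the removed counter, using only switches documented in the help text above, might have looked like this (the input file name is a placeholder):

    # count canonical 25-mers with 8 threads, dropping k-mers seen fewer than 2 times
    jellyfish count -m 25 -s 100000000 -t 8 -C -L 2 -o mer_counts.jf reads.fasta
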
-  static const char* hidden() { return
-    "Hidden options:\n"
-    "     --no-merge                           Do not merge files intermediary files (false)\n"
-    "     --no-unlink                          Do not unlink intermediary files after automatic merging (false)\n"
-    "     --no-write                           Don't write database (false)\n"
-    "";
-  }
-  void print_version(::std::ostream &os = std::cout) const {
-#ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.0.0"
-#endif
-    os << PACKAGE_VERSION << "\n";
-  }
-  void dump(::std::ostream &os = std::cout) {
-    os << "mer_len_given:" << mer_len_given << " mer_len_arg:" << mer_len_arg << "\n";
-    os << "size_given:" << size_given << " size_arg:" << size_arg << "\n";
-    os << "threads_given:" << threads_given << " threads_arg:" << threads_arg << "\n";
-    os << "Files_given:" << Files_given << " Files_arg:" << Files_arg << "\n";
-    os << "generator_given:" << generator_given << " generator_arg:" << generator_arg << "\n";
-    os << "Generators_given:" << Generators_given << " Generators_arg:" << Generators_arg << "\n";
-    os << "shell_given:" << shell_given << " shell_arg:" << shell_arg << "\n";
-    os << "output_given:" << output_given << " output_arg:" << output_arg << "\n";
-    os << "counter_len_given:" << counter_len_given << " counter_len_arg:" << counter_len_arg << "\n";
-    os << "out_counter_len_given:" << out_counter_len_given << " out_counter_len_arg:" << out_counter_len_arg << "\n";
-    os << "canonical_flag:" << canonical_flag << "\n";
-    os << "bc_given:" << bc_given << " bc_arg:" << bc_arg << "\n";
-    os << "bf_size_given:" << bf_size_given << " bf_size_arg:" << bf_size_arg << "\n";
-    os << "bf_fp_given:" << bf_fp_given << " bf_fp_arg:" << bf_fp_arg << "\n";
-    os << "if_given:" << if_given << " if_arg:" << vec_str(if_arg) << "\n";
-    os << "min_qual_char_given:" << min_qual_char_given << " min_qual_char_arg:" << min_qual_char_arg << "\n";
-    os << "reprobes_given:" << reprobes_given << " reprobes_arg:" << reprobes_arg << "\n";
-    os << "text_flag:" << text_flag << "\n";
-    os << "disk_flag:" << disk_flag << "\n";
-    os << "no_merge_flag:" << no_merge_flag << "\n";
-    os << "no_unlink_flag:" << no_unlink_flag << "\n";
-    os << "lower_count_given:" << lower_count_given << " lower_count_arg:" << lower_count_arg << "\n";
-    os << "upper_count_given:" << upper_count_given << " upper_count_arg:" << upper_count_arg << "\n";
-    os << "timing_given:" << timing_given << " timing_arg:" << timing_arg << "\n";
-    os << "no_write_flag:" << no_write_flag << "\n";
-    os << "file_arg:" << vec_str(file_arg) << "\n";
-  }
-};
-#endif // __COUNT_MAIN_CMDLINE_HPP__
diff --git a/include/cuckoohash_config.hh b/include/cuckoohash_config.hh
index bfa091a..193220f 100644
--- a/include/cuckoohash_config.hh
+++ b/include/cuckoohash_config.hh
@@ -6,27 +6,29 @@
 #include <cstddef>
 
 //! The default maximum number of keys per bucket
-const size_t DEFAULT_SLOT_PER_BUCKET = 4;
+constexpr size_t LIBCUCKOO_DEFAULT_SLOT_PER_BUCKET = 4;
 
 //! The default number of elements in an empty hash table
-const size_t DEFAULT_SIZE = (1U << 16) * DEFAULT_SLOT_PER_BUCKET;
+constexpr size_t LIBCUCKOO_DEFAULT_SIZE =
+    (1U << 16) * LIBCUCKOO_DEFAULT_SLOT_PER_BUCKET;
 
 //! On a scale of 0 to 16, the memory granularity of the locks array. 0 is the
 //! least granular, meaning the array is a contiguous array and thus offers the
 //! best performance but the greatest memory overhead. 16 is the most granular,
 //! offering the least memory overhead but worse performance.
-const size_t LOCK_ARRAY_GRANULARITY = 0;
+constexpr size_t LIBCUCKOO_LOCK_ARRAY_GRANULARITY = 0;
 
 //! The default minimum load factor that the table allows for automatic
 //! expansion. It must be a number between 0.0 and 1.0. The table will throw
 //! libcuckoo_load_factor_too_low if the load factor falls below this value
 //! during an automatic expansion.
-const double DEFAULT_MINIMUM_LOAD_FACTOR = 0.05;
+constexpr double LIBCUCKOO_DEFAULT_MINIMUM_LOAD_FACTOR = 0.05;
 
 //! An alias for the value that sets no limit on the maximum hashpower. If this
 //! value is set as the maximum hashpower limit, there will be no limit. Since 0
-//! is the only hashpower that can never occur, it should stay at 0.
-const size_t NO_MAXIMUM_HASHPOWER = 0;
+//! is the only hashpower that can never occur, it should stay at 0. This is
+//! also the default initial value for the maximum hashpower in a table.
+constexpr size_t LIBCUCKOO_NO_MAXIMUM_HASHPOWER = 0;
 
 //! set LIBCUCKOO_DEBUG to 1 to enable debug output
 #define LIBCUCKOO_DEBUG 0
diff --git a/include/cuckoohash_map.hh b/include/cuckoohash_map.hh
index fdc0735..08448e8 100644
--- a/include/cuckoohash_map.hh
+++ b/include/cuckoohash_map.hh
@@ -8,427 +8,219 @@
 #include <atomic>
 #include <bitset>
 #include <cassert>
-#include <chrono>
-#include <cmath>
 #include <cstdint>
 #include <cstdlib>
-#include <cstring>
 #include <functional>
 #include <iterator>
 #include <limits>
-#include <list>
 #include <memory>
 #include <mutex>
 #include <stdexcept>
 #include <thread>
-#include <tuple>
 #include <type_traits>
 #include <utility>
 #include <vector>
 
 #include "cuckoohash_config.hh"
 #include "cuckoohash_util.hh"
-#include "lazy_array.hh"
-#include "default_hasher.hh"
-
-//! cuckoohash_map is the hash table class.
+#include "libcuckoo_lazy_array.hh"
+
+/**
+ * A concurrent hash table
+ *
+ * @tparam Key type of keys in the table
+ * @tparam T type of values in the table
+ * @tparam Pred type of equality comparison functor
+ * @tparam Alloc type of key-value pair allocator
+ * @tparam SLOT_PER_BUCKET number of slots for each bucket in the table
+ */
 template < class Key,
            class T,
-           class Hash = DefaultHasher<Key>,
+           class Hash = std::hash<Key>,
            class Pred = std::equal_to<Key>,
            class Alloc = std::allocator<std::pair<const Key, T>>,
-           size_t SLOT_PER_BUCKET = DEFAULT_SLOT_PER_BUCKET
+           std::size_t SLOT_PER_BUCKET = LIBCUCKOO_DEFAULT_SLOT_PER_BUCKET
            >
 class cuckoohash_map {
 public:
-    //! key_type is the type of keys.
-    typedef Key                     key_type;
-    //! value_type is the type of key-value pairs.
-    typedef std::pair<const Key, T> value_type;
-    //! mapped_type is the type of values.
-    typedef T                       mapped_type;
-    //! hasher is the type of the hash function.
-    typedef Hash                    hasher;
-    //! key_equal is the type of the equality predicate.
-    typedef Pred                    key_equal;
-    //! allocator_type is the type of the allocator
-    typedef Alloc                   allocator_type;
-
-    //! slot_per_bucket is the number of items each bucket in the table can hold
-    static const size_t slot_per_bucket = SLOT_PER_BUCKET;
-
-    //! For any update operations, the callable passed in must be convertible to
-    //! the following type
-    typedef std::function<void(mapped_type&)> updater_type;
-
-    //! Class returned by operator[] which wraps an entry in the hash table.
-    //! Note that this reference type behave somewhat differently from an STL
-    //! map reference. Most importantly, running this operator will not insert a
-    //! default key-value pair into the map if the given key is not already in
-    //! the map.
-    class reference {
-        // Note that this implementation is not exactly STL compliant. To
-        // maintain performance and avoid hitting the hash table too many times,
-        // the reference object is *lazy*. In other words,
-        //
-        //  - operator[] does not actually perform an insert. It returns a
-        //    reference object pointing to the requested key.
-        //  - On table[i] = val // reference::operator=(mapped_type)
-        //    an update / insert is called
-        //  - On table[i] = table[j] // reference::operator=(const reference&)
-        //    an update / insert is called with the value of table[j]
-        //  - On val = table[i] // operator mapped_type()
-        //    a find is called
-        //  - On table[i] (i.e. no operation performed)
-        //    the destructor is called immediately (reference::~reference())
-        //    and nothing happens.
-    public:
-        //! Delete the default constructor, which should never be used
-        reference() = delete;
-
-        //! Casting to \p mapped_type runs a find for the stored key. If the
-        //! find fails, it will throw an exception.
-        operator mapped_type() const {
-            return owner_.find(key_);
-        }
-
-        //! The assignment operator will first try to update the value at the
-        //! reference's key. If the key isn't in the table, it will insert the
-        //! key with \p val.
-        reference& operator=(const mapped_type& val) {
-            owner_.upsert(
-                key_, [&val](mapped_type& v) { v = val; }, val);
-            return *this;
-        }
-
-        //! The copy assignment operator doesn't actually copy the passed-in
-        //! reference. Instead, it has the same behavior as operator=(const
-        //! mapped_type& val).
-        reference& operator=(const reference& ref) {
-            *this = (mapped_type) ref;
-            return *this;
-        }
-
-    private:
-        // private constructor which initializes the owner and key
-        reference(
-            cuckoohash_map<Key, T, Hash, Pred, Alloc, slot_per_bucket>& owner,
-            const key_type& key) : owner_(owner), key_(key) {}
-
-        // reference to the hash map instance
-        cuckoohash_map<Key, T, Hash, Pred, Alloc, slot_per_bucket>& owner_;
-        // the referenced key
-        const key_type& key_;
-
-        // cuckoohash_map needs to call the private constructor
-        friend class cuckoohash_map<Key, T, Hash, Pred, Alloc, slot_per_bucket>;
-    };
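
A sketch of what the lazy semantics above meant in practice for the removed reference type (table name illustrative):

    cuckoohash_map<std::string, int> table;
    table["a"] = 1;          // reference::operator=       -> upsert("a", ..., 1)
    int x = table["a"];      // operator mapped_type()     -> find("a")
    table["b"] = table["a"]; // reference::operator=(ref)  -> find("a"), then upsert("b", ...)
    table["c"];              // destructor runs immediately; no table access
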
-
-    typedef const mapped_type const_reference;
-
-    typedef char partial_t;
+    /** @name Type Declarations */
+    /**@{*/
+
+    using key_type = Key;
+    using mapped_type = T;
+    using value_type = std::pair<const Key, T>;
+    using size_type = std::size_t;
+    using difference_type = std::ptrdiff_t;
+    using hasher = Hash;
+    using key_equal = Pred;
+    using allocator_type = Alloc;
 
 private:
-    // Constants used internally
+    using allocator_traits_ = std::allocator_traits<allocator_type>;
 
-    // true if the key is small and simple, which means using partial keys for
-    // lookup would probably slow us down
-    static const bool is_simple =
-        std::is_pod<key_type>::value && sizeof(key_type) <= 8;
+public:
+    using reference = value_type&;
+    using const_reference = const value_type&;
+    using pointer = typename allocator_traits_::pointer;
+    using const_pointer = typename allocator_traits_::const_pointer;
+    class locked_table;
 
-    // We enable certain methods only if the mapped_type is copy-assignable
-    static const bool value_copy_assignable = std::is_copy_assignable<
-        mapped_type>::value;
+    /**@}*/
 
-    // number of locks in the locks array
-    static const size_t kNumLocks = 1 << 16;
+    /** @name Table Parameters */
+    /**@{*/
 
-    // number of cores on the machine
-    static size_t kNumCores() {
-        static size_t cores = std::thread::hardware_concurrency();
-        return cores;
+    /**
+     * The number of slots per hash bucket
+     */
+    static constexpr size_type slot_per_bucket() {
+        return SLOT_PER_BUCKET;
     }
 
-    // A fast, lightweight spinlock
-    LIBCUCKOO_SQUELCH_PADDING_WARNING
-    class LIBCUCKOO_ALIGNAS(64) spinlock {
-        std::atomic_flag lock_;
-    public:
-        spinlock() {
-            lock_.clear();
-        }
-
-        inline void lock() {
-            while (lock_.test_and_set(std::memory_order_acquire));
-        }
-
-        inline void unlock() {
-            lock_.clear(std::memory_order_release);
-        }
-
-        inline bool try_lock() {
-            return !lock_.test_and_set(std::memory_order_acquire);
-        }
-
-    };
-
-    typedef enum {
-        ok,
-        failure,
-        failure_key_not_found,
-        failure_key_duplicated,
-        failure_table_full,
-        failure_under_expansion,
-    } cuckoo_status;
-
-    // The Bucket type holds slot_per_bucket partial keys, key-value pairs, and
-    // an occupied bitset, which indicates whether the slot at the given bit
-    // index is in the table or not. It uses aligned_storage arrays to store the
-    // keys and values to allow constructing and destroying key-value pairs in
-    // place. Internally, the values are stored without the const qualifier in
-    // the key, to enable modifying bucket memory.
-    typedef std::pair<Key, T> storage_value_type;
-    class Bucket {
-    private:
-        std::array<partial_t, slot_per_bucket> partials_;
-        std::bitset<slot_per_bucket> occupied_;
-        std::array<typename std::aligned_storage<
-                       sizeof(storage_value_type),
-                       alignof(storage_value_type)>::type,
-                   slot_per_bucket> kvpairs_;
-
-    public:
-        const partial_t& partial(size_t ind) const {
-            return partials_[ind];
-        }
-
-        partial_t& partial(size_t ind) {
-            return partials_[ind];
-        }
-
-        const value_type& kvpair(size_t ind) const {
-            return *static_cast<const value_type*>(
-                static_cast<const void*>(&kvpairs_[ind]));
-        }
-
-        value_type& kvpair(size_t ind) {
-            return *static_cast<value_type*>(
-                static_cast<void*>(&kvpairs_[ind]));
-        }
-
-        storage_value_type& storage_kvpair(size_t ind) {
-            return *static_cast<storage_value_type*>(
-                static_cast<void*>(&kvpairs_[ind]));
-        }
-
-        bool occupied(size_t ind) const {
-            return occupied_[ind];
-        }
-
-        const key_type& key(size_t ind) const {
-            return kvpair(ind).first;
-        }
-
-        const mapped_type& val(size_t ind) const {
-            return kvpair(ind).second;
-        }
-
-        mapped_type& val(size_t ind) {
-            return kvpair(ind).second;
-        }
-
-        template <typename K, typename... Args>
-        void setKV(size_t ind, K&& k, Args&&... args) {
-            static allocator_type pair_allocator;
-            occupied_[ind] = true;
-            pair_allocator.construct(
-                &storage_kvpair(ind),
-                std::piecewise_construct,
-                std::forward_as_tuple(std::forward<K>(k)),
-                std::forward_as_tuple(std::forward<Args>(args)...));
-        }
-
-        void eraseKV(size_t ind) {
-            occupied_[ind] = false;
-            (&kvpair(ind))->~value_type();
-        }
-
-        void clear() {
-            for (size_t i = 0; i < slot_per_bucket; ++i) {
-                if (occupied(i)) {
-                    eraseKV(i);
-                }
-            }
-        }
-
-        // Moves the item in b1[slot1] into b2[slot2] without copying
-        static void move_to_bucket(
-            Bucket& b1, size_t slot1,
-            Bucket& b2, size_t slot2) {
-            assert(b1.occupied(slot1));
-            assert(!b2.occupied(slot2));
-            storage_value_type& tomove = b1.storage_kvpair(slot1);
-            b2.setKV(slot2, std::move(tomove.first), std::move(tomove.second));
-            b2.partial(slot2) = b1.partial(slot1);
-            b1.occupied_.reset(slot1);
-            b2.occupied_.set(slot2);
-        }
-    };
-
-    // The type of the buckets container
-    typedef std::vector<
-        Bucket, typename allocator_type::template rebind<Bucket>::other>
-    buckets_t;
-
-    // The type of the locks container
-    static_assert(LOCK_ARRAY_GRANULARITY >= 0 && LOCK_ARRAY_GRANULARITY <= 16,
-                  "LOCK_ARRAY_GRANULARITY constant must be between 0 and 16,"
-                  " inclusive");
-    typedef lazy_array<
-        16 - LOCK_ARRAY_GRANULARITY, LOCK_ARRAY_GRANULARITY,
-        spinlock,
-        typename allocator_type::template rebind<spinlock>::other> locks_t;
+    /**@}*/
 
-    // The type of the expansion lock
-    typedef std::mutex expansion_lock_t;
-
-    // cacheint is a cache-aligned atomic integer type.
-    LIBCUCKOO_SQUELCH_PADDING_WARNING
-    struct LIBCUCKOO_ALIGNAS(64) cacheint {
-        std::atomic<size_t> num;
-        cacheint(): num(0) {}
-        cacheint(size_t x): num(x) {}
-        cacheint(const cacheint& x): num(x.num.load()) {}
-        cacheint(cacheint&& x): num(x.num.load()) {}
-        cacheint& operator=(const cacheint& x) {
-            num = x.num.load();
-            return *this;
-        }
-        cacheint& operator=(const cacheint&& x) {
-            num = x.num.load();
-            return *this;
-        }
-    };
+    /** @name Constructors and Destructors */
+    /**@{*/
 
-    // Helper methods to read and write hashpower_ with the correct memory
-    // barriers
-    size_t get_hashpower() const {
-        return hashpower_.load(std::memory_order_acquire);
-    }
+    /**
+     * Creates a new cuckoohash_map instance
+     *
+     * @param n the number of elements to reserve space for initially
+     * @param hf hash function instance to use
+     * @param eql equality function instance to use
+     * @param alloc allocator instance to use
+     */
+    cuckoohash_map(size_type n = LIBCUCKOO_DEFAULT_SIZE,
+                   const hasher& hf = hasher(),
+                   const key_equal& eql = key_equal(),
+                   const allocator_type& alloc = allocator_type())
+        : hashpower_(reserve_calc(n)),
+          hash_fn_(hf),
+          eq_fn_(eql),
+          allocator_(alloc),
+          buckets_(hashsize(hashpower()), alloc),
+          locks_(hashsize(hashpower()), alloc),
+          expansion_lock_(),
+          minimum_load_factor_(LIBCUCKOO_DEFAULT_MINIMUM_LOAD_FACTOR),
+          maximum_hashpower_(LIBCUCKOO_NO_MAXIMUM_HASHPOWER) {}
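
Since the minimum load factor and maximum hashpower are no longer constructor parameters, equivalent setup now takes the setter methods; a sketch:

    cuckoohash_map<int, std::string> m(1 << 20);  // reserve space for ~1M elements
    m.minimum_load_factor(0.05);                  // the default; shown for illustration
    m.maximum_hashpower(30);
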
 
-    void set_hashpower(size_t val) {
-        hashpower_.store(val, std::memory_order_release);
+    /**
+     * Destroys the table. The destructors of all elements stored in the table
+     * are invoked, and then the table storage is deallocated.
+     */
+    ~cuckoohash_map() {
+        cuckoo_clear();
     }
 
-    // get_counterid returns the counterid for the current thread.
-    static inline int get_counterid() {
-        // counterid stores the per-thread counter index of each thread. Each
-        // counter value corresponds to a core on the machine.
-        static LIBCUCKOO_THREAD_LOCAL int counterid = -1;
+    /**@}*/
 
-        if (counterid < 0) {
-            counterid = rand() % kNumCores();
-        }
-        return counterid;
-    }
+    /** @name Table Details
+     *
+     * Methods for getting information about the table. Methods that query
+     * changing properties of the table are not synchronized with concurrent
+     * operations, and may return out-of-date information if the table is being
+     * concurrently modified.
+     *
+     */
+    /**@{*/
 
-    // reserve_calc takes in a parameter specifying a certain number of slots
-    // for a table and returns the smallest hashpower that will hold n elements.
-    static size_t reserve_calc(const size_t n) {
-        size_t buckets = (n + slot_per_bucket - 1) / slot_per_bucket;
-        size_t blog2;
-        if (buckets <= 1) {
-            blog2 = 1;
-        } else {
-            blog2 = 0;
-            for (size_t bcounter = buckets; bcounter > 1; bcounter >>= 1) {
-                ++blog2;
-            }
-            if (hashsize(blog2) < buckets) {
-                ++blog2;
-            }
-        }
-        assert(n <= hashsize(blog2) * slot_per_bucket);
-        return blog2;
+    /**
+     * Returns the function that hashes the keys
+     *
+     * @return the hash function
+     */
+    hasher hash_function() const {
+        return hash_fn_;
     }
 
-public:
     /**
-     * Creates a new cuckoohash_map instance
+     * Returns the function that compares keys for equality
      *
-     * @param n the number of elements to reserve space for initially
-     * @param mlf the minimum load factor required that the
-     * table allows for automatic expansion.
-     * @param mhp the maximum hashpower that the table can take on (pass in 0
-     * for no limit)
-     * @throw std::invalid_argument if the given minimum load factor is invalid,
-     * or if the initial space exceeds the maximum hashpower
+     * @return the key comparison function
      */
-    cuckoohash_map(size_t n = DEFAULT_SIZE,
-                   double mlf = DEFAULT_MINIMUM_LOAD_FACTOR,
-                   size_t mhp = NO_MAXIMUM_HASHPOWER,
-                   const hasher& hf = hasher(),
-                   const key_equal eql = key_equal())
-        : hash_fn(hf), eq_fn(eql) {
-        minimum_load_factor(mlf);
-        maximum_hashpower(mhp);
-        size_t hp = reserve_calc(n);
-        if (mhp != NO_MAXIMUM_HASHPOWER && hp > mhp) {
-            throw std::invalid_argument(
-                "hashpower for initial size " + std::to_string(hp) +
-                " is greater than the maximum hashpower");
-        }
-        set_hashpower(hp);
-        buckets_.resize(hashsize(hp));
-        locks_.allocate(std::min(locks_t::size(), hashsize(hp)));
-        num_inserts_.resize(kNumCores(), 0);
-        num_deletes_.resize(kNumCores(), 0);
+    key_equal key_eq() const {
+        return eq_fn_;
     }
 
-    ~cuckoohash_map() {
-        cuckoo_clear();
+    /**
+     * Returns the allocator associated with the container
+     *
+     * @return the associated allocator
+     */
+    allocator_type get_allocator() const {
+        return allocator_;
     }
 
-    //! clear removes all the elements in the hash table, calling their
-    //! destructors.
-    void clear() noexcept {
-        auto unlocker = snapshot_and_lock_all();
-        cuckoo_clear();
+    /**
+     * Returns the hashpower of the table, which is log<SUB>2</SUB>(@ref
+     * bucket_count()).
+     *
+     * @return the hashpower
+     */
+    size_type hashpower() const {
+        return hashpower_.load(std::memory_order_acquire);
     }
 
-    //! size returns the number of items currently in the hash table. Since it
-    //! doesn't lock the table, elements can be inserted during the computation,
-    //! so the result may not necessarily be exact.
-    size_t size() const noexcept {
-        return cuckoo_size();
+    /**
+     * Returns the number of buckets in the table.
+     *
+     * @return the bucket count
+     */
+    size_type bucket_count() const {
+        return buckets_.size();
     }
 
-    //! empty returns true if the table is empty.
-    bool empty() const noexcept {
-        return size() == 0;
+    /**
+     * Returns whether the table is empty or not.
+     *
+     * @return true if the table is empty, false otherwise
+     */
+    bool empty() const {
+        for (size_type i = 0; i < locks_.size(); ++i) {
+            if (locks_[i].elem_counter() > 0) {
+                return false;
+            }
+        }
+        return true;
     }
 
-    //! hashpower returns the hashpower of the table, which is
-    //! log<SUB>2</SUB>(the number of buckets).
-    size_t hashpower() const noexcept {
-        return get_hashpower();
+    /**
+     * Returns the number of elements in the table.
+     *
+     * @return number of elements in the table
+     */
+    size_type size() const {
+        size_type s = 0;
+        for (size_type i = 0; i < locks_.size(); ++i) {
+            s += locks_[i].elem_counter();
+        }
+        return s;
     }
 
-    //! bucket_count returns the number of buckets in the table.
-    size_t bucket_count() const noexcept {
-        return hashsize(get_hashpower());
+    /** Returns the current capacity of the table, that is, @ref bucket_count()
+     * × @ref slot_per_bucket().
+     *
+     * @return capacity of table
+     */
+    size_type capacity() const {
+        return bucket_count() * slot_per_bucket();
     }
 
-    //! load_factor returns the ratio of the number of items in the table to the
-    //! total number of available slots in the table.
-    double load_factor() const noexcept {
-        return cuckoo_loadfactor(get_hashpower());
+    /**
+     * Returns the fraction of the table that is filled, that is, @ref size() ÷
+     * @ref capacity().
+     *
+     * @return load factor of the table
+     */
+    double load_factor() const {
+        return static_cast<double>(size()) / static_cast<double>(capacity());
     }
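
The size accessors above are related as documented; for instance, the following should hold for any table (a sketch, assuming <cassert> and <cstddef> are included):

    cuckoohash_map<int, int> m;
    assert(m.capacity() == m.bucket_count() * m.slot_per_bucket());
    assert(m.bucket_count() == (std::size_t{1} << m.hashpower()));
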
 
     /**
      * Sets the minimum load factor allowed for automatic expansions. If an
      * expansion is needed when the load factor of the table is lower than this
-     * threshold, the libcuckoo_load_factor_too_low exception is thrown.
+     * threshold, @ref libcuckoo_load_factor_too_low is thrown. It will not be
+     * thrown for an explicitly-triggered expansion.
      *
      * @param mlf the load factor to set the minimum to
      * @throw std::invalid_argument if the given load factor is less than 0.0
@@ -438,160 +230,245 @@ public:
         if (mlf < 0.0) {
             throw std::invalid_argument(
                 "load factor " + std::to_string(mlf) + " cannot be "
-                " less than 0");
+                "less than 0");
         } else if (mlf > 1.0) {
             throw std::invalid_argument(
                 "load factor " + std::to_string(mlf) + " cannot be "
-                " greater than 1");
+                "greater than 1");
         }
         minimum_load_factor_.store(mlf, std::memory_order_release);
     }
 
     /**
-     * @return the minimum load factor of the table
+     * Returns the minimum load factor of the table
+     *
+     * @return the minimum load factor
      */
-    double minimum_load_factor() noexcept {
+    double minimum_load_factor() {
         return minimum_load_factor_.load(std::memory_order_acquire);
     }
 
     /**
-     * Sets the maximum hashpower the table can be. If set to \ref
-     * NO_MAXIMUM_HASHPOWER, there will be no limit on the hashpower.
+     * Sets the maximum hashpower the table can have. If set to @ref
+     * LIBCUCKOO_NO_MAXIMUM_HASHPOWER, there will be no limit on the hashpower.
+     * Otherwise, the table will not be able to expand beyond the given
+     * hashpower, either by an explicit or an automatic expansion.
      *
      * @param mhp the hashpower to set the maximum to
+     * @throw std::invalid_argument if the current hashpower exceeds the limit
      */
-    void maximum_hashpower(size_t mhp) noexcept {
+    void maximum_hashpower(size_type mhp) {
+        if (mhp != LIBCUCKOO_NO_MAXIMUM_HASHPOWER && hashpower() > mhp) {
+            throw std::invalid_argument(
+                "maximum hashpower " + std::to_string(mhp) + " is less than "
+                "current hashpower");
+
+        }
         maximum_hashpower_.store(mhp, std::memory_order_release);
     }
 
     /**
-     * @return the maximum hashpower of the table
+     * Returns the maximum hashpower of the table
+     *
+     * @return the maximum hashpower
      */
-    size_t maximum_hashpower() noexcept {
+    size_type maximum_hashpower() {
         return maximum_hashpower_.load(std::memory_order_acquire);
     }
 
-    //! find searches through the table for \p key, and stores the associated
-    //! value it finds in \p val. must be copy assignable.
-    bool find(const key_type& key, mapped_type& val) const {
-        size_t hv = hashed_key(key);
-        auto b = snapshot_and_lock_two(hv);
-        const cuckoo_status st = cuckoo_find(key, val, hv, b.i[0], b.i[1]);
-        return (st == ok);
-    }
+    /**@}*/
+
+    /** @name Table Operations
+     *
+     * These are operations that affect the data in the table. They are safe to
+     * call concurrently with each other.
+     *
+     */
+    /**@{*/
 
-    //! This version of find does the same thing as the two-argument version,
-    //! except it returns the value it finds, throwing an \p std::out_of_range
-    //! exception if the key isn't in the table.
-    mapped_type find(const key_type& key) const {
-        mapped_type val;
-        bool done = find(key, val);
-        if (done) {
-            return val;
+    /**
+     * Searches the table for @p key, and invokes @p fn on the value. @p fn is
+     * not allowed to modify the contents of the value if found.
+     *
+     * @tparam K type of the key. This can be any type comparable with @c key_type
+     * @tparam F type of the functor. It should implement the method
+     * <tt>void operator()(const mapped_type&)</tt>.
+     * @param key the key to search for
+     * @param fn the functor to invoke if the element is found
+     * @return true if the key was found and functor invoked, false otherwise
+     */
+    template <typename K, typename F>
+    bool find_fn(const K& key, F fn) const {
+        const hash_value hv = hashed_key(key);
+        const auto b = snapshot_and_lock_two<locking_active>(hv);
+        const table_position pos = cuckoo_find(
+            key, hv.partial, b.first(), b.second());
+        if (pos.status == ok) {
+            fn(buckets_[pos.index].val(pos.slot));
+            return true;
         } else {
-            throw std::out_of_range("key not found in table");
+            return false;
         }
     }
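
A sketch of read-only access through find_fn, which runs the functor under the bucket locks instead of copying the value out (names illustrative):

    cuckoohash_map<std::string, std::vector<int>> m;
    std::size_t n = 0;
    bool found = m.find_fn("key", [&n](const std::vector<int>& v) { n = v.size(); });
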
 
-    //! contains searches through the table for \p key, and returns true if it
-    //! finds it in the table, and false otherwise.
-    bool contains(const key_type& key) const {
-        size_t hv = hashed_key(key);
-        auto b = snapshot_and_lock_two(hv);
-        const bool result = cuckoo_contains(key, hv, b.i[0], b.i[1]);
-        return result;
+    /**
+     * Searches the table for @p key, and invokes @p fn on the value. @p fn is
+     * allow to modify the contents of the value if found.
+     *
+     * @tparam K type of the key. This can be any type comparable with @c key_type
+     * @tparam F type of the functor. It should implement the method
+     * <tt>void operator()(mapped_type&)</tt>.
+     * @param key the key to search for
+     * @param fn the functor to invoke if the element is found
+     * @return true if the key was found and functor invoked, false otherwise
+     */
+    template <typename K, typename F>
+    bool update_fn(const K& key, F fn) {
+        const hash_value hv = hashed_key(key);
+        const auto b = snapshot_and_lock_two<locking_active>(hv);
+        const table_position pos = cuckoo_find(
+            key, hv.partial, b.first(), b.second());
+        if (pos.status == ok) {
+            fn(buckets_[pos.index].val(pos.slot));
+            return true;
+        } else {
+            return false;
+        }
     }
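
The functor is typically a small lambda mutating the value in place; a sketch (table name illustrative):

    cuckoohash_map<std::string, int> m;
    bool hit = m.update_fn("counter", [](int& v) { ++v; });  // false if key absent
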
 
     /**
-     * Puts the given key-value pair into the table. If the key cannot be placed
-     * in the table, it may be automatically expanded to fit more items.
+     * Searches for @p key in the table. If the key is not there, it is inserted
+     * with @p val. If the key is there, then @p fn is called on the value. The
+     * key will be immediately constructed as @c key_type(std::forward<K>(key)).
+     * If the insertion succeeds, this constructed key will be moved into the
+     * table and the value constructed from the @p val parameters. If the
+     * insertion fails, the constructed key will be destroyed, and the @p val
+     * parameters will remain valid. If there is no room left in the table, it
+     * will be automatically expanded. Expansion may throw exceptions.
      *
+     * @tparam K type of the key
+     * @tparam F type of the functor. It should implement the method
+     * <tt>void operator()(mapped_type&)</tt>.
+     * @tparam Args list of types for the value constructor arguments
      * @param key the key to insert into the table
-     * @param val the value to insert
-     * @return true if the insertion succeeded, false if there was a duplicate
-     * key
-     * @throw libcuckoo_load_factor_too_low if the load factor is below the
-     * minimum_load_factor threshold, if expansion is required
-     * @throw libcuckoo_maximum_hashpower_exceeded if expansion is required
-     * beyond the maximum hash power, if one was set
+     * @param fn the functor to invoke if the element is found
+     * @param val a list of constructor arguments with which to create the value
+     * @return true if a new key was inserted, false if the key was already in
+     * the table
+     */
+    template <typename K, typename F, typename... Args>
+    bool upsert(K&& key, F fn, Args&&... val) {
+        K k(std::forward<K>(key));
+        hash_value hv = hashed_key(k);
+        auto b = snapshot_and_lock_two<locking_active>(hv);
+        table_position pos = cuckoo_insert_loop(hv, b, k);
+        if (pos.status == ok) {
+            add_to_bucket(pos.index, pos.slot, hv.partial, k,
+                          std::forward<Args>(val)...);
+        } else {
+            fn(buckets_[pos.index].val(pos.slot));
+        }
+        return pos.status == ok;
+    }
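
The canonical use of this signature is an atomic increment-or-insert, e.g. concurrent word counting (names illustrative):

    cuckoohash_map<std::string, int> counts;
    std::vector<std::string> words{"a", "b", "a"};
    for (const std::string& w : words) {
        counts.upsert(w, [](int& n) { ++n; }, 1);  // insert 1, or increment in place
    }
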
+
+    /**
+     * Searches for @p key in the table, and invokes @p fn on the value if the
+     * key is found. The functor can mutate the value, and should return @c true
+     * in order to erase the element, and @c false otherwise.
+     *
+     * @tparam K type of the key
+     * @tparam F type of the functor. It should implement the method
+     * <tt>bool operator()(mapped_type&)</tt>.
+     * @param key the key to possibly erase from the table
+     * @param fn the functor to invoke if the element is found
+     * @return true if @p key was found and @p fn invoked, false otherwise
+     */
+    template <typename K, typename F>
+    bool erase_fn(const K& key, F fn) {
+        const hash_value hv = hashed_key(key);
+        const auto b = snapshot_and_lock_two<locking_active>(hv);
+        const table_position pos = cuckoo_find(
+            key, hv.partial, b.first(), b.second());
+        if (pos.status == ok) {
+            if (fn(buckets_[pos.index].val(pos.slot))) {
+                del_from_bucket(buckets_[pos.index], pos.index, pos.slot);
+            }
+            return true;
+        } else {
+            return false;
+        }
+    }
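
A sketch of a conditional erase, e.g. reference-counted removal (names illustrative):

    cuckoohash_map<std::string, int> refs;
    // decrement under the lock; erase the entry only when the count reaches zero
    refs.erase_fn("job", [](int& n) { return --n == 0; });
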
+
+    /**
+     * Copies the value associated with @p key into @p val. Equivalent to
+     * calling @ref find_fn with a functor that copies the value into @p val. @c
+     * mapped_type must be @c CopyAssignable.
+     */
+    template <typename K>
+    bool find(const K& key, mapped_type& val) const {
+        return find_fn(key, [&val](const mapped_type& v) mutable {
+                val = v;
+            });
+    }
+
+    /** Searches the table for @p key, and returns the associated value it
+     * finds. @c mapped_type must be @c CopyConstructible.
+     *
+     * @tparam K type of the key
+     * @param key the key to search for
+     * @return the value associated with the given key
+     * @throw std::out_of_range if the key is not found
+     */
+    template <typename K>
+    mapped_type find(const K& key) const {
+        const hash_value hv = hashed_key(key);
+        const auto b = snapshot_and_lock_two<locking_active>(hv);
+        const table_position pos = cuckoo_find(
+            key, hv.partial, b.first(), b.second());
+        if (pos.status == ok) {
+            return buckets_[pos.index].val(pos.slot);
+        } else {
+            throw std::out_of_range("key not found in table");
+        }
+    }
+
+    /** Returns whether or not @p key is in the table. Equivalent to @ref
+     * find_fn with a functor that does nothing.
+     */
+    template <typename K>
+    bool contains(const K& key) const {
+        return find_fn(key, [](const mapped_type&) {});
+    }
+
+    /**
+     * Updates the value associated with @p key to @p val. Equivalent to calling
+     * @ref update_fn with a functor that copies @p val into the associated
+     * value. @c mapped_type must be @c MoveAssignable or @c CopyAssignable.
+     */
+    template <typename K, typename V>
+    bool update(const K& key, V&& val) {
+        return update_fn(key, [&val](mapped_type& v) {
+                v = std::forward<V>(val);
+            });
+    }
+
+    /**
+     * Inserts the key-value pair into the table. Equivalent to calling @ref
+     * upsert with a functor that does nothing.
      */
     template <typename K, typename... Args>
     bool insert(K&& key, Args&&... val) {
-        return cuckoo_insert_loop(hashed_key(key), std::forward<K>(key),
-                                  std::forward<Args>(val)...);
-    }
-
-    //! erase removes \p key and it's associated value from the table, calling
-    //! their destructors. If \p key is not there, it returns false, otherwise
-    //! it returns true.
-    bool erase(const key_type& key) {
-        size_t hv = hashed_key(key);
-        auto b = snapshot_and_lock_two(hv);
-        const cuckoo_status st = cuckoo_delete(key, hv, b.i[0], b.i[1]);
-        return (st == ok);
-    }
-
-    //! update changes the value associated with \p key to \p val. If \p key is
-    //! not there, it returns false, otherwise it returns true.
-    template <typename V>
-    bool update(const key_type& key, V&& val) {
-        size_t hv = hashed_key(key);
-        auto b = snapshot_and_lock_two(hv);
-        const cuckoo_status st = cuckoo_update(hv, b.i[0], b.i[1],
-                                               key, std::forward<V>(val));
-        return (st == ok);
-    }
-
-    //! update_fn changes the value associated with \p key with the function \p
-    //! fn. \p fn will be passed one argument of type \p mapped_type& and can
-    //! modify the argument as desired, returning nothing. If \p key is not
-    //! there, it returns false, otherwise it returns true.
-    template <typename Updater>
-    typename std::enable_if<
-        std::is_convertible<Updater, updater_type>::value,
-        bool>::type update_fn(const key_type& key, Updater fn) {
-        size_t hv = hashed_key(key);
-        auto b = snapshot_and_lock_two(hv);
-        const cuckoo_status st = cuckoo_update_fn(key, fn, hv, b.i[0], b.i[1]);
-        return (st == ok);
-    }
-
-    //! upsert is a combination of update_fn and insert. It first tries updating
-    //! the value associated with \p key using \p fn. If \p key is not in the
-    //! table, then it runs an insert with \p key and \p val. It will always
-    //! succeed, since if the update fails and the insert finds the key already
-    //! inserted, it can retry the update.
-    template <typename Updater, typename K, typename... Args>
-    typename std::enable_if<
-        std::is_convertible<Updater, updater_type>::value,
-        void>::type upsert(K&& key, Updater fn, Args&&... val) {
-        size_t hv = hashed_key(key);
-        cuckoo_status st;
-        do {
-            auto b = snapshot_and_lock_two(hv);
-            size_t hp = get_hashpower();
-            st = cuckoo_update_fn(key, fn, hv, b.i[0], b.i[1]);
-            if (st == ok) {
-                break;
-            }
+        return upsert(std::forward<K>(key), [](mapped_type&) {},
+                      std::forward<Args>(val)...);
+    }
 
-            // We run an insert, since the update failed. Since we already have
-            // the locks, we don't run cuckoo_insert_loop immediately, to avoid
-            // releasing and re-grabbing the locks. Recall that the locks will
-            // be released at the end of this call to cuckoo_insert.
-            st = cuckoo_insert(hv, std::move(b), std::forward<K>(key),
-                               std::forward<Args>(val)...);
-            if (st == failure_table_full) {
-                cuckoo_fast_double(hp);
-                // Retry until the insert doesn't fail due to expansion.
-                if (cuckoo_insert_loop(hv, std::forward<K>(key),
-                                       std::forward<Args>(val)...)) {
-                    break;
-                }
-                // The only valid reason for failure is a duplicate key. In this
-                // case, we retry the entire upsert operation.
-            }
-        } while (st != ok);
+    /**
+     * Erases the key from the table. Equivalent to calling @ref erase_fn with a
+     * functor that just returns true.
+     */
+    template <typename K>
+    bool erase(const K& key) {
+        return erase_fn(key, [](mapped_type&) { return true; });
     }
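
Taken together, these wrappers recover the familiar map-like surface on top of the *_fn primitives; a minimal sketch:

    cuckoohash_map<int, std::string> m;
    m.insert(1, "one");         // upsert with a no-op functor
    m.update(1, "uno");         // update_fn copying the new value in
    std::string s;
    if (m.find(1, s)) { /* s == "uno" */ }
    m.contains(1);              // find_fn with a no-op functor
    m.erase(1);                 // erase_fn that always returns true
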
 
     /**
@@ -602,15 +479,9 @@ public:
      *
      * @param n the hashpower to set for the table
      * @return true if the table changed size, false otherwise
-     * @throw libcuckoo_maximum_hashpower_exceeded if the specified hashpower is
-     * greater than the maximum, if one was set
      */
-    bool rehash(size_t n) {
-        size_t hp = get_hashpower();
-        if (n == hp) {
-            return false;
-        }
-        return cuckoo_expand_simple(n, n > hp) == ok;
+    bool rehash(size_type n) {
+        return cuckoo_rehash<locking_active>(n);
     }
 
     /**
@@ -621,125 +492,255 @@ public:
      *
      * @param n the number of elements to reserve space for
      * @return true if the size of the table changed, false otherwise
-     * @throw libcuckoo_maximum_hashpower_exceeded if the specified hashpower is
-     * greater than the maximum, if one was set
      */
-    bool reserve(size_t n) {
-        size_t hp = get_hashpower();
-        size_t new_hp = reserve_calc(n);
-        if (new_hp == hp) {
-            return false;
-        }
-        return cuckoo_expand_simple(new_hp, new_hp > hp) == ok;
-    }
-
-    //! hash_function returns the hash function object used by the table.
-    hasher hash_function() const noexcept {
-        return hash_fn;
+    bool reserve(size_type n) {
+        return cuckoo_reserve<locking_active>(n);
     }
 
-    //! key_eq returns the equality predicate object used by the table.
-    key_equal key_eq() const noexcept {
-        return eq_fn;
+    /**
+     * Removes all elements in the table, calling their destructors.
+     */
+    void clear() {
+        auto unlocker = snapshot_and_lock_all<locking_active>();
+        cuckoo_clear();
     }
 
-    //! Returns a \ref reference to the mapped value stored at the given key.
-    //! Note that the reference behaves somewhat differently from an STL map
-    //! reference (see the \ref reference documentation for details).
-    reference operator[](const key_type& key) {
-        return (reference(*this, key));
+    /**
+     * Construct a @ref locked_table object that owns all the locks in the
+     * table.
+     *
+     * @return a \ref locked_table instance
+     */
+    locked_table lock_table() {
+        return locked_table(*this);
     }
 
-    //! Returns a \ref const_reference to the mapped value stored at the given
-    //! key. This is equivalent to running the overloaded \ref find function
-    //! with no value parameter.
-    const_reference operator[](const key_type& key) const {
-        return find(key);
-    }
+    /**@}*/
 
 private:
+    // Hashing types and functions
 
-    template <size_t N>
-    struct BucketContainer {
-        static_assert(N >= 1 && N <= 3, "BucketContainer should only be used"
-                      " for between 1 and 3 locks");
-        const cuckoohash_map* map;
-        std::array<size_t, N> i;
+    // Type of the partial key
+    using partial_t = uint8_t;
+
+    // true if the key is small and simple, which means using partial keys for
+    // lookup would probably slow us down
+    static constexpr bool is_simple =
+        std::is_pod<key_type>::value && sizeof(key_type) <= 8;
 
-        BucketContainer() : map(nullptr), i() {}
+    // Contains a hash and partial for a given key. The partial key is used for
+    // partial-key cuckoohashing, and for finding the alternate bucket that a
+    // key hashes to.
+    struct hash_value {
+        size_type hash;
+        partial_t partial;
+    };
 
-        template <typename... Args>
-        BucketContainer(const cuckoohash_map* _map, Args&&... inds)
-            : map(_map), i{{inds...}} {}
+    template <typename K>
+    hash_value hashed_key(const K& key) const {
+        const size_type hash = hash_function()(key);
+        return { hash, partial_key(hash) };
+    }
 
-        BucketContainer(const cuckoohash_map* _map, std::array<size_t, N> _i)
-            : map(_map), i(_i) {}
+    template <typename K>
+    size_type hashed_key_only_hash(const K& key) const {
+        return hash_function()(key);
+    }
 
-        BucketContainer(const BucketContainer&) = delete;
-        BucketContainer& operator=(const BucketContainer&) = delete;
+    // hashsize returns the number of buckets corresponding to a given
+    // hashpower.
+    static inline size_type hashsize(const size_type hp) {
+        return size_type(1) << hp;
+    }
+
+    // hashmask returns the bitmask for the buckets array corresponding to a
+    // given hashpower.
+    static inline size_type hashmask(const size_type hp) {
+        return hashsize(hp) - 1;
+    }
+
+    // The partial key must only depend on the hash value. It cannot change with
+    // the hashpower, because, in order for `cuckoo_fast_double` to work
+    // properly, the alt_index must only grow by one bit at the top each time we
+    // expand the table.
+    static partial_t partial_key(const size_type hash) {
+        const uint64_t hash_64bit = hash;
+        const uint32_t hash_32bit = (
+            static_cast<uint32_t>(hash_64bit) ^
+            static_cast<uint32_t>(hash_64bit >> 32));
+        const uint16_t hash_16bit = (
+            static_cast<uint16_t>(hash_32bit) ^
+            static_cast<uint16_t>(hash_32bit >> 16));
+        const uint8_t hash_8bit = (
+            static_cast<uint8_t>(hash_16bit) ^
+            static_cast<uint8_t>(hash_16bit >> 8));
+        return hash_8bit;
+    }
+
+    // index_hash returns the first possible bucket that the given hashed key
+    // could be.
+    static inline size_type index_hash(const size_type hp, const size_type hv) {
+        return hv & hashmask(hp);
+    }
+
+    // alt_index returns the other possible bucket that the given hashed key
+    // could be. It takes the first possible bucket as a parameter. Note that
+    // this function will return the first possible bucket if index is the
+    // second possible bucket, so alt_index(ti, partial, alt_index(ti, partial,
+    // index_hash(ti, hv))) == index_hash(ti, hv).
+    static inline size_type alt_index(const size_type hp, const partial_t partial,
+                                   const size_type index) {
+        // ensure tag is nonzero for the multiply. 0xc6a4a7935bd1e995 is the
+        // hash constant from 64-bit MurmurHash2
+        const size_type nonzero_tag = static_cast<size_type>(partial) + 1;
+        return (index ^ (nonzero_tag * 0xc6a4a7935bd1e995)) & hashmask(hp);
+    }
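
For a fixed hashpower and partial key, alt_index is an involution, which is what lets cuckoo displacement bounce an item between its two candidate buckets. A standalone sketch of the same computation (assuming 64-bit std::size_t):

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    std::size_t alt(std::size_t hp, std::uint8_t partial, std::size_t index) {
        const std::size_t tag = std::size_t(partial) + 1;  // ensure nonzero
        return (index ^ (tag * 0xc6a4a7935bd1e995ULL)) &
               ((std::size_t{1} << hp) - 1);
    }

    int main() {
        const std::size_t hp = 16, i = 12345;
        assert(alt(hp, 0x5b, alt(hp, 0x5b, i)) == i);  // round-trips to itself
    }
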
+
+    // Locking types and functions
+
+    using locking_active = std::integral_constant<bool, true>;
+    using locking_inactive = std::integral_constant<bool, false>;
 
-        // Moving will not invalidate the bucket indices
-        BucketContainer(BucketContainer&& bp) {
-            *this = std::move(bp);
+    // A fast, lightweight spinlock
+    LIBCUCKOO_SQUELCH_PADDING_WARNING
+    class LIBCUCKOO_ALIGNAS(64) spinlock {
+    public:
+        spinlock() noexcept : elem_counter_(0) {
+            lock_.clear();
         }
 
-        BucketContainer& operator=(BucketContainer&& bp) {
-            map = bp.map;
-            i = bp.i;
-            bp.map = nullptr;
-            return *this;
+        void lock(locking_active) {
+            while (lock_.test_and_set(std::memory_order_acq_rel));
         }
 
-        void release() {
-            this->~BucketContainer();
-            map = nullptr;
+        void lock(locking_inactive) {}
+
+        void unlock(locking_active) {
+            lock_.clear(std::memory_order_release);
         }
 
-        bool is_active() const {
-            return map != nullptr;
+        void unlock(locking_inactive) {}
+
+        bool try_lock(locking_active) {
+            return !lock_.test_and_set(std::memory_order_acq_rel);
         }
 
-        ~BucketContainer() {
-            if (map) {
-                unlock(i);
-            }
+        bool try_lock(locking_inactive) {
+            return true;
+        }
+
+        size_type& elem_counter() {
+            return elem_counter_;
         }
 
     private:
-        // unlocks the given bucket index.
-        void unlock(std::array<size_t, 1> inds) const {
-            map->locks_[lock_ind(inds[0])].unlock();
-        }
-
-        // unlocks both of the given bucket indexes, or only one if they are
-        // equal. Order doesn't matter here.
-        void unlock(std::array<size_t, 2> inds) const {
-            const size_t l0 = lock_ind(inds[0]);
-            const size_t l1 = lock_ind(inds[1]);
-            map->locks_[l0].unlock();
-            if (l0 != l1) {
-                map->locks_[l1].unlock();
+        std::atomic_flag lock_;
+        size_type elem_counter_;
+    };
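
The locking_active/locking_inactive tags let one code path compile down to either real spinlock operations or no-ops, presumably for paths that already hold the necessary locks. A generic sketch of the tag-dispatch idea:

    #include <atomic>
    #include <type_traits>

    using active   = std::integral_constant<bool, true>;
    using inactive = std::integral_constant<bool, false>;

    class toggleable_lock {
        std::atomic_flag f_;
    public:
        toggleable_lock() { f_.clear(); }
        void lock(active)     { while (f_.test_and_set(std::memory_order_acquire)) {} }
        void lock(inactive)   {}  // no-op overload, selected at compile time
        void unlock(active)   { f_.clear(std::memory_order_release); }
        void unlock(inactive) {}
    };
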
+
+    // The type of the locks container
+    static_assert(LIBCUCKOO_LOCK_ARRAY_GRANULARITY >= 0 &&
+                  LIBCUCKOO_LOCK_ARRAY_GRANULARITY <= 16,
+                  "LIBCUCKOO_LOCK_ARRAY_GRANULARITY constant must be between "
+                  "0 and 16, inclusive");
+    using locks_t = libcuckoo_lazy_array<
+        16 - LIBCUCKOO_LOCK_ARRAY_GRANULARITY, LIBCUCKOO_LOCK_ARRAY_GRANULARITY,
+        spinlock,
+        typename allocator_traits_::template rebind_alloc<spinlock>
+        >;
+
+    // The type of the expansion lock
+    using expansion_lock_t = std::mutex;
+
+    // Classes for managing locked buckets. By storing and moving around sets of
+    // locked buckets in these classes, we can ensure that they are unlocked
+    // properly.
+
+    template <typename LOCK_T>
+    class OneBucket {
+    public:
+        OneBucket() {}
+        OneBucket(locks_t* locks, size_type i)
+            : locks_(locks, OneUnlocker{i}) {}
+
+    private:
+        struct OneUnlocker {
+            size_type i;
+            void operator()(locks_t* p) const {
+                (*p)[lock_ind(i)].unlock(LOCK_T());
             }
+        };
+
+        std::unique_ptr<locks_t, OneUnlocker> locks_;
+    };
+
+    template <typename LOCK_T>
+    class TwoBuckets {
+    public:
+        TwoBuckets() {}
+        TwoBuckets(locks_t* locks, size_type i1, size_type i2)
+            : locks_(locks, TwoUnlocker{i1, i2}) {}
+
+        size_type first() const {
+            return locks_.get_deleter().i1;
         }
 
-        // unlocks the three given buckets
-        void unlock(std::array<size_t, 3> inds) const {
-            const size_t l0 = lock_ind(inds[0]);
-            const size_t l1 = lock_ind(inds[1]);
-            const size_t l2 = lock_ind(inds[2]);
-            map->locks_[l0].unlock();
-            if (l1 != l0) {
-                map->locks_[l1].unlock();
-            }
-            if (l2 != l0 && l2 != l1) {
-                map->locks_[l2].unlock();
-            }
+        size_type second() const {
+            return locks_.get_deleter().i2;
+        }
+
+        bool is_active() const {
+            return static_cast<bool>(locks_);
+        }
+
+        void unlock() {
+            locks_.reset(nullptr);
         }
+
+    private:
+        struct TwoUnlocker {
+            size_type i1, i2;
+            void operator()(locks_t* p) const {
+                const size_type l1 = lock_ind(i1);
+                const size_type l2 = lock_ind(i2);
+                (*p)[l1].unlock(LOCK_T());
+                if (l1 != l2) {
+                    (*p)[l2].unlock(LOCK_T());
+                }
+            }
+        };
+
+        std::unique_ptr<locks_t, TwoUnlocker> locks_;
     };
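
Storing the lock indices in the deleter makes std::unique_ptr double as a movable scope guard; the same trick works for any lock type. A standalone sketch:

    #include <memory>
    #include <mutex>

    struct MutexUnlocker {
        void operator()(std::mutex* m) const { m->unlock(); }
    };
    using LockGuard = std::unique_ptr<std::mutex, MutexUnlocker>;

    LockGuard acquire(std::mutex& m) {
        m.lock();
        return LockGuard(&m);  // unlocks when the guard (or its move target) dies
    }
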
 
-    typedef BucketContainer<1> OneBucket;
-    typedef BucketContainer<2> TwoBuckets;
-    typedef BucketContainer<3> ThreeBuckets;
+    template <typename LOCK_T>
+    class AllBuckets {
+    public:
+        AllBuckets(locks_t* locks) : locks_(locks) {}
+
+        bool is_active() const {
+            return static_cast<bool>(locks_);
+        }
+
+        void unlock() {
+            locks_.reset(nullptr);
+        }
+
+        void release() {
+            (void)locks_.release();
+        }
+
+    private:
+        struct AllUnlocker {
+            void operator()(locks_t* p) const {
+                for (size_type i = 0; i < p->size(); ++i) {
+                    (*p)[i].unlock(LOCK_T());
+                }
+            }
+        };
+
+        std::unique_ptr<locks_t, AllUnlocker> locks_;
+    };
 
     // This exception is thrown whenever we try to lock a bucket, but the
     // hashpower is not what was expected
@@ -749,9 +750,10 @@ private:
     // check the hashpower to make sure it is the same as what it was before the
     // lock was taken. If it isn't, unlock the bucket and throw a
     // hashpower_changed exception.
-    inline void check_hashpower(const size_t hp, const size_t lock) const {
-        if (get_hashpower() != hp) {
-            locks_[lock].unlock();
+    template <typename LOCK_T>
+    inline void check_hashpower(const size_type hp, const size_type lock) const {
+        if (hashpower() != hp) {
+            locks_[lock].unlock(LOCK_T());
             LIBCUCKOO_DBG("%s", "hashpower changed\n");
             throw hashpower_changed();
         }
@@ -760,30 +762,32 @@ private:
     // locks the given bucket index.
     //
     // throws hashpower_changed if it changed after taking the lock.
-    inline OneBucket lock_one(const size_t hp, const size_t i) const {
-        const size_t l = lock_ind(i);
-        locks_[l].lock();
-        check_hashpower(hp, l);
-        return OneBucket{this, i};
+    template <typename LOCK_T>
+    inline OneBucket<LOCK_T> lock_one(const size_type hp, const size_type i) const {
+        const size_type l = lock_ind(i);
+        locks_[l].lock(LOCK_T());
+        check_hashpower<LOCK_T>(hp, l);
+        return OneBucket<LOCK_T>(&locks_, i);
     }
 
     // locks the two bucket indexes, always locking the earlier index first to
     // avoid deadlock. If the two indexes are the same, it just locks one.
     //
     // throws hashpower_changed if it changed after taking the lock.
-    TwoBuckets lock_two(const size_t hp, const size_t i1,
-                        const size_t i2) const {
-        size_t l1 = lock_ind(i1);
-        size_t l2 = lock_ind(i2);
+    template <typename LOCK_T>
+    TwoBuckets<LOCK_T> lock_two(const size_type hp, const size_type i1,
+                        const size_type i2) const {
+        size_type l1 = lock_ind(i1);
+        size_type l2 = lock_ind(i2);
         if (l2 < l1) {
             std::swap(l1, l2);
         }
-        locks_[l1].lock();
-        check_hashpower(hp, l1);
+        locks_[l1].lock(LOCK_T());
+        check_hashpower<LOCK_T>(hp, l1);
         if (l2 != l1) {
-            locks_[l2].lock();
+            locks_[l2].lock(LOCK_T());
         }
-        return TwoBuckets{this, i1, i2};
+        return TwoBuckets<LOCK_T>(&locks_, i1, i2);
     }
 
     // lock_two_one locks the three bucket indexes in numerical order, returning
@@ -791,26 +795,29 @@ private:
     // active if i3 shares a lock index with i1 or i2.
     //
     // throws hashpower_changed if it changed after taking the lock.
-    std::pair<TwoBuckets, OneBucket>
-    lock_three(const size_t hp, const size_t i1,
-               const size_t i2, const size_t i3) const {
-        std::array<size_t, 3> l{{
-                lock_ind(i1), lock_ind(i2), lock_ind(i3)}};
-        std::sort(l.begin(), l.end());
-        locks_[l[0]].lock();
-        check_hashpower(hp, l[0]);
+    template <typename LOCK_T>
+    std::pair<TwoBuckets<LOCK_T>, OneBucket<LOCK_T>>
+    lock_three(const size_type hp, const size_type i1,
+               const size_type i2, const size_type i3) const {
+        std::array<size_type, 3> l{{lock_ind(i1), lock_ind(i2), lock_ind(i3)}};
+        // Lock in order.
+        if (l[2] < l[1]) std::swap(l[2], l[1]);
+        if (l[2] < l[0]) std::swap(l[2], l[0]);
+        if (l[1] < l[0]) std::swap(l[1], l[0]);
+        locks_[l[0]].lock(LOCK_T());
+        check_hashpower<LOCK_T>(hp, l[0]);
         if (l[1] != l[0]) {
-            locks_[l[1]].lock();
+            locks_[l[1]].lock(LOCK_T());
         }
         if (l[2] != l[1]) {
-            locks_[l[2]].lock();
+            locks_[l[2]].lock(LOCK_T());
         }
         return std::make_pair(
-            TwoBuckets{this, i1, i2},
-            OneBucket{
-                (lock_ind(i3) == lock_ind(i1) ||
-                 lock_ind(i3) == lock_ind(i2)) ?
-                    nullptr : this, i3});
+            TwoBuckets<LOCK_T>(&locks_, i1, i2),
+            OneBucket<LOCK_T>(
+                (lock_ind(i3) == lock_ind(i1) || lock_ind(i3) == lock_ind(i2)) ?
+                nullptr : &locks_, i3)
+            );
     }
 
     // snapshot_and_lock_two loads locks the buckets associated with the given
@@ -819,15 +826,15 @@ private:
     // hash value will stay correct as long as the locks are held. It returns
     // the bucket indices associated with the hash value and the current
     // hashpower.
-    TwoBuckets
-    snapshot_and_lock_two(const size_t hv) const noexcept {
+    template <typename LOCK_T>
+    TwoBuckets<LOCK_T> snapshot_and_lock_two(const hash_value& hv) const {
         while (true) {
             // Store the current hashpower we're using to compute the buckets
-            size_t hp = get_hashpower();
-            size_t i1 = index_hash(hp, hv);
-            size_t i2 = alt_index(hp, partial_key(hv), i1);
+            const size_type hp = hashpower();
+            const size_type i1 = index_hash(hp, hv.hash);
+            const size_type i2 = alt_index(hp, hv.partial, i1);
             try {
-                return lock_two(hp, i1, i2);
+                return lock_two<LOCK_T>(hp, i1, i2);
             } catch (hashpower_changed&) {
                 // The hashpower changed while taking the locks. Try again.
                 continue;
@@ -835,245 +842,487 @@ private:
         }
     }
 
-    // A resource manager which releases all the locks upon destruction. It can
-    // only be moved, not copied.
-    class AllUnlocker {
-    private:
-        // If nullptr, do nothing
-        locks_t* locks_;
+    // snapshot_and_lock_all takes all the locks, and returns a deleter object
+    // that releases the locks upon destruction. Note that after taking all the
+    // locks, it is okay to change the buckets_ vector and the hashpower_, since
+    // no other threads should be accessing the buckets.
+    template <typename LOCK_T>
+    AllBuckets<LOCK_T> snapshot_and_lock_all() const {
+        for (size_type i = 0; i < locks_.size(); ++i) {
+            locks_[i].lock(LOCK_T());
+        }
+        return AllBuckets<LOCK_T>(&locks_);
+    }
+
+    // lock_ind converts an index into buckets to an index into locks.
+    static inline size_type lock_ind(const size_type bucket_ind) {
+        return bucket_ind & (locks_t::max_size() - 1);
+    }
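
For orientation, lock_ind is plain lock striping: many buckets share one
lock, chosen by masking the bucket index against the power-of-two size of
the lock array. A standalone sketch of the same arithmetic (kNumLocks and
lock_index are illustrative names here, not the library's):

    #include <cassert>
    #include <cstddef>

    // Illustrative stand-in for locks_t::max_size(); the real value is a
    // compile-time power of two, which is what makes the mask trick valid.
    constexpr std::size_t kNumLocks = 1024;

    std::size_t lock_index(std::size_t bucket_ind) {
        return bucket_ind & (kNumLocks - 1);  // same as bucket_ind % kNumLocks
    }

    int main() {
        assert(lock_index(5) == 5);     // low indices map directly
        assert(lock_index(1029) == 5);  // bucket 1024 + 5 shares lock 5
        assert(lock_index(2048) == 0);  // every multiple of 1024 -> lock 0
    }
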
+
+    // Data storage types and functions
+
+    // Value type without const Key, used for storage
+    using storage_value_type = std::pair<key_type, mapped_type>;
+
+    // The Bucket type holds slot_per_bucket() partial keys, key-value pairs,
+    // and an occupied bitset, which indicates whether the slot at the given bit
+    // index is in the table or not. It uses aligned_storage arrays to store the
+    // keys and values to allow constructing and destroying key-value pairs in
+    // place. Internally, the values are stored without the const qualifier in
+    // the key, to enable modifying bucket memory.
+    class Bucket {
     public:
-        AllUnlocker(locks_t* locks): locks_(locks) {}
+        Bucket() noexcept {}
+        // The destructor does nothing to the key-value pairs, since we'd need
+        // an allocator to properly destroy the elements.
+        ~Bucket() noexcept {}
+
+        // No move or copy constructors, since we'd need an
+        // instance of the allocator to do any constructions or destructions
+        Bucket(const Bucket&) = delete;
+        Bucket(Bucket&&) = delete;
+        Bucket& operator=(const Bucket&) = delete;
+        Bucket& operator=(Bucket&&) = delete;
+
+        partial_t partial(size_type ind) const {
+            return partials_[ind];
+        }
 
-        AllUnlocker(const AllUnlocker&) = delete;
-        AllUnlocker(AllUnlocker&& au) : locks_(au.locks_) {
-            au.locks_ = nullptr;
+        const value_type& kvpair(size_type ind) const {
+            return *static_cast<const value_type*>(
+                static_cast<const void*>(std::addressof(kvpairs_[ind])));
         }
 
-        AllUnlocker& operator=(const AllUnlocker&) = delete;
-        AllUnlocker& operator=(AllUnlocker&& au) {
-            locks_ = au.locks_;
-            au.locks_ = nullptr;
+        value_type& kvpair(size_type ind) {
+            return *static_cast<value_type*>(
+                static_cast<void*>(std::addressof(kvpairs_[ind])));
         }
 
-        void deactivate() {
-            locks_ = nullptr;
+        storage_value_type& storage_kvpair(size_type ind) {
+            return *static_cast<storage_value_type*>(
+                static_cast<void*>(std::addressof(kvpairs_[ind])));
         }
 
-        void release() {
-            if (locks_) {
-                for (size_t i = 0; i < locks_->allocated_size(); ++i) {
-                    (*locks_)[i].unlock();
-                }
-                deactivate();
-            }
+        bool occupied(size_type ind) const {
+            return occupied_[ind];
         }
 
-        ~AllUnlocker() {
-            release();
+        const key_type& key(size_type ind) const {
+            return kvpair(ind).first;
         }
-    };
 
-    // snapshot_and_lock_all takes all the locks, and returns a deleter object,
-    // that releases the locks upon destruction. Note that after taking all the
-    // locks, it is okay to change the buckets_ vector and the hashpower_, since
-    // no other threads should be accessing the buckets.
-    AllUnlocker snapshot_and_lock_all() const noexcept {
-        for (size_t i = 0; i < locks_.allocated_size(); ++i) {
-            locks_[i].lock();
+        const mapped_type& val(size_type ind) const {
+            return kvpair(ind).second;
         }
-        return AllUnlocker(&locks_);
-    }
 
-    // lock_ind converts an index into buckets to an index into locks.
-    static inline size_t lock_ind(const size_t bucket_ind) {
-        return bucket_ind & (kNumLocks - 1);
-    }
+        mapped_type& val(size_type ind) {
+            return kvpair(ind).second;
+        }
 
-    // hashsize returns the number of buckets corresponding to a given
-    // hashpower.
-    static inline size_t hashsize(const size_t hp) {
-        return size_t(1) << hp;
-    }
+        template <typename K, typename... Args>
+        void setKV(allocator_type& allocator, size_type ind, partial_t p,
+                   K& k, Args&&... args) {
+            partials_[ind] = p;
+            occupied_[ind] = true;
+            allocator_traits_::construct(
+                allocator, &storage_kvpair(ind), std::piecewise_construct,
+                std::forward_as_tuple(std::move(k)),
+                std::forward_as_tuple(std::forward<Args>(args)...));
+        }
 
-    // hashmask returns the bitmask for the buckets array corresponding to a
-    // given hashpower.
-    static inline size_t hashmask(const size_t hp) {
-        return hashsize(hp) - 1;
-    }
+        void eraseKV(allocator_type& allocator, size_type ind) {
+            occupied_[ind] = false;
+            allocator_traits_::destroy(
+                allocator, std::addressof(storage_kvpair(ind)));
+        }
 
-    // hashed_key hashes the given key.
-    inline size_t hashed_key(const key_type &key) const {
-        return hash_function()(key);
-    }
+        void clear(allocator_type& allocator) {
+            for (size_type i = 0; i < slot_per_bucket(); ++i) {
+                if (occupied(i)) {
+                    eraseKV(allocator, i);
+                }
+            }
+        }
 
-    // index_hash returns the first possible bucket that the given hashed key
-    // could be.
-    static inline size_t index_hash(const size_t hp, const size_t hv) {
-        return hv & hashmask(hp);
-    }
+        // Moves the item in b1[slot1] into b2[slot2] without copying
+        static void move_to_bucket(allocator_type& allocator,
+                                   Bucket& b1, size_type slot1,
+                                   Bucket& b2, size_type slot2) {
+            assert(b1.occupied(slot1));
+            assert(!b2.occupied(slot2));
+            storage_value_type& tomove = b1.storage_kvpair(slot1);
+            b2.setKV(allocator, slot2, b1.partial(slot1),
+                     tomove.first, std::move(tomove.second));
+            b1.eraseKV(allocator, slot1);
+        }
 
-    // alt_index returns the other possible bucket that the given hashed key
-    // could be. It takes the first possible bucket as a parameter. Note that
-    // this function will return the first possible bucket if index is the
-    // second possible bucket, so alt_index(ti, partial, alt_index(ti, partial,
-    // index_hash(ti, hv))) == index_hash(ti, hv).
-    static inline size_t alt_index(const size_t hp, const partial_t partial,
-                                   const size_t index) {
-        // ensure tag is nonzero for the multiply.
-        const partial_t nonzero_tag = (partial >> 1 << 1) + 1;
-        // 0xc6a4a7935bd1e995 is the hash constant from 64-bit MurmurHash2
-        const size_t hash_of_tag =
-            static_cast<size_t>(nonzero_tag * 0xc6a4a7935bd1e995);
-        return (index ^ hash_of_tag) & hashmask(hp);
-    }
+        // Moves the contents of b1 to b2
+        static void move_bucket(allocator_type& allocator, Bucket& b1,
+                                Bucket& b2) {
+            for (size_type i = 0; i < slot_per_bucket(); ++i) {
+                if (b1.occupied(i)) {
+                    move_to_bucket(allocator, b1, i, b2, i);
+                }
+            }
+        }
 
-    // partial_key returns a partial_t representing the upper sizeof(partial_t)
-    // bytes of the hashed key. This is used for partial-key cuckoohashing, and
-    // for finding the alternate bucket of that a key hashes to.
-    static inline partial_t partial_key(const size_t hv) {
-        return (partial_t)(hv >> ((sizeof(size_t)-sizeof(partial_t)) * 8));
-    }
+    private:
+        std::array<partial_t, slot_per_bucket()> partials_;
+        std::bitset<slot_per_bucket()> occupied_;
+        std::array<typename std::aligned_storage<
+                       sizeof(storage_value_type),
+                       alignof(storage_value_type)>::type,
+                   slot_per_bucket()> kvpairs_;
+    };
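
The aligned_storage-plus-manual-lifetime idiom above can be seen in
miniature below. This is a simplified sketch, not the library's code: it
uses placement new where Bucket defers to allocator_traits::construct and
destroy, and it holds a single int pair rather than storage_value_type
slots. All names are illustrative.

    #include <new>
    #include <type_traits>
    #include <utility>

    struct OneSlotSketch {
        using value_t = std::pair<int, int>;
        // Raw, correctly aligned bytes; no value lives here until set().
        typename std::aligned_storage<sizeof(value_t),
                                      alignof(value_t)>::type raw_;
        bool occupied_ = false;

        void set(int k, int v) {
            ::new (static_cast<void*>(&raw_)) value_t(k, v);  // construct in place
            occupied_ = true;
        }
        void erase() {
            reinterpret_cast<value_t*>(&raw_)->~value_t();    // destroy in place
            occupied_ = false;
        }
    };

    int main() {
        OneSlotSketch s;
        s.set(1, 2);
        s.erase();
    }
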
 
-    // A constexpr version of pow that we can use for static_asserts
-    static constexpr size_t const_pow(size_t a, size_t b) {
-        return (b == 0) ? 1 : a * const_pow(a, b - 1);
-    }
+    class BucketContainer {
+        using traits_ = typename allocator_traits_::
+            template rebind_traits<Bucket>;
+    public:
+        BucketContainer(size_type n, typename traits_::allocator_type alloc)
+            : buckets_(traits_::allocate(allocator_, n)),
+              allocator_(alloc), size_(n) {
+            // The Bucket default constructor is nothrow, so we don't have to
+            // worry about dealing with exceptions when constructing all the
+            // elements.
+            static_assert(
+                std::is_nothrow_constructible<Bucket>::value,
+                "BucketContainer requires Bucket to be nothrow constructible");
+            for (size_type i = 0; i < size_; ++i) {
+                traits_::construct(allocator_, &buckets_[i]);
+            }
+        }
+
+        BucketContainer(const BucketContainer&) = delete;
+        BucketContainer(BucketContainer&&) = delete;
+        BucketContainer& operator=(const BucketContainer&) = delete;
+        BucketContainer& operator=(BucketContainer&&) = delete;
+
+        ~BucketContainer() noexcept {
+            static_assert(
+                std::is_nothrow_destructible<Bucket>::value,
+                "BucketContainer requires Bucket to be nothrow destructible");
+            for (size_type i = 0; i < size_; ++i) {
+                traits_::destroy(allocator_, &buckets_[i]);
+            }
+            traits_::deallocate(allocator_, buckets_, size());
+        }
 
-    // The maximum number of items in a BFS path.
-    static const uint8_t MAX_BFS_PATH_LEN = 5;
+        size_type size() const {
+            return size_;
+        }
 
-    // CuckooRecord holds one position in a cuckoo path. Since cuckoopath
-    // elements only define a sequence of alternate hashings for different hash
-    // values, we only need to keep track of the hash values being moved, rather
-    // than the keys themselves.
-    typedef struct {
-        size_t bucket;
-        size_t slot;
-        size_t hv;
-    } CuckooRecord;
+        void swap(BucketContainer& other) noexcept {
+            std::swap(buckets_, other.buckets_);
+            // If propagate_on_container_swap is false, we do nothing if the
+            // allocators are equal. If they're not equal, behavior is
+            // undefined, so we may as well do nothing.
+            if (traits_::propagate_on_container_swap::value) {
+                std::swap(allocator_, other.allocator_);
+            }
+            std::swap(size_, other.size_);
+        }
 
-    typedef std::array<CuckooRecord, MAX_BFS_PATH_LEN> CuckooRecords;
+        Bucket& operator[](size_type i) {
+            return buckets_[i];
+        }
 
-    // b_slot holds the information for a BFS path through the table
-    #pragma pack(push,1)
-    struct b_slot {
-        // The bucket of the last item in the path
-        size_t bucket;
-        // a compressed representation of the slots for each of the buckets in
-        // the path. pathcode is sort of like a base-slot_per_bucket number, and
-        // we need to hold at most MAX_BFS_PATH_LEN slots. Thus we need the
-        // maximum pathcode to be at least slot_per_bucket^(MAX_BFS_PATH_LEN)
-        size_t pathcode;
-        static_assert(const_pow(slot_per_bucket, MAX_BFS_PATH_LEN) <
-                      std::numeric_limits<decltype(pathcode)>::max(),
-                      "pathcode may not be large enough to encode a cuckoo"
-                      " path");
-        // The 0-indexed position in the cuckoo path this slot occupies. It must
-        // be less than MAX_BFS_PATH_LEN, and also able to hold negative values.
-        int_fast8_t depth;
-        static_assert(MAX_BFS_PATH_LEN - 1 <=
-                      std::numeric_limits<decltype(depth)>::max(),
-                      "The depth type must able to hold a value of"
-                      " MAX_BFS_PATH_LEN - 1");
-        static_assert(-1 >= std::numeric_limits<decltype(depth)>::min(),
-                      "The depth type must be able to hold a value of -1");
-        b_slot() {}
-        b_slot(const size_t b, const size_t p, const decltype(depth) d)
-            : bucket(b), pathcode(p), depth(d) {
-            assert(d < MAX_BFS_PATH_LEN);
+        const Bucket& operator[](size_type i) const {
+            return buckets_[i];
         }
+
+    private:
+        typename traits_::pointer buckets_;
+        typename allocator_traits_::template rebind_alloc<Bucket> allocator_;
+        size_type size_;
     };
-    #pragma pack(pop)
 
-    // b_queue is the queue used to store b_slots for BFS cuckoo hashing.
-    #pragma pack(push,1)
-    class b_queue {
-        // The maximum size of the BFS queue. Note that unless it's less than
-        // SLOT_PER_BUCKET^MAX_BFS_PATH_LEN, it won't really mean anything.
-        static const size_t MAX_CUCKOO_COUNT = 512;
-        static_assert((MAX_CUCKOO_COUNT & (MAX_CUCKOO_COUNT - 1)) == 0,
-                      "MAX_CUCKOO_COUNT should be a power of 2");
-        // A circular array of b_slots
-        b_slot slots[MAX_CUCKOO_COUNT];
-        // The index of the head of the queue in the array
-        size_t first;
-        // One past the index of the last item of the queue in the array.
-        size_t last;
+    // The type of the buckets container
+    using buckets_t = BucketContainer;
 
-        // returns the index in the queue after ind, wrapping around if
-        // necessary.
-        size_t increment(size_t ind) {
-            return (ind + 1) & (MAX_CUCKOO_COUNT - 1);
-        }
+    // Status codes for internal functions
 
-    public:
-        b_queue() : first(0), last(0) {}
+    enum cuckoo_status {
+        ok,
+        failure,
+        failure_key_not_found,
+        failure_key_duplicated,
+        failure_table_full,
+        failure_under_expansion,
+    };
 
-        void enqueue(b_slot x) {
-            assert(!full());
-            slots[last] = x;
-            last = increment(last);
-        }
 
-        b_slot dequeue() {
-            assert(!empty());
-            b_slot& x = slots[first];
-            first = increment(first);
-            return x;
-        }
+    // A composite type for functions that need to return both a table
+    // position and a status code.
+    struct table_position {
+        size_type index;
+        size_type slot;
+        cuckoo_status status;
+    };
 
-        bool empty() {
-            return first == last;
-        }
+    // Searching types and functions
 
-        bool full() {
-            return increment(last) == first;
+    // cuckoo_find searches the table for the given key, returning the position
+    // of the element found, or a failure status code if the key wasn't found.
+    // It expects the locks to be taken and released outside the function.
+    template <typename K>
+    table_position cuckoo_find(const K &key, const partial_t partial,
+                               const size_type i1, const size_type i2) const {
+        int slot = try_read_from_bucket(buckets_[i1], partial, key);
+        if (slot != -1) {
+            return table_position{i1, static_cast<size_type>(slot), ok};
         }
-    };
-    #pragma pack(pop)
+        slot = try_read_from_bucket(buckets_[i2], partial, key);
+        if (slot != -1) {
+            return table_position{i2, static_cast<size_type>(slot), ok};
+        }
+        return table_position{0, 0, failure_key_not_found};
+    }
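
For context, cuckoo_find is the locked core of the map's lookups: the
public front end hashes the key, pins the two candidate buckets, and then
calls it. A usage sketch from the caller's side (a sketch only, assuming
the header's usual insert(key, value) and find(key, value&) front ends):

    #include <iostream>
    #include <string>
    #include "cuckoohash_map.hh"

    int main() {
        cuckoohash_map<std::string, int> table;
        table.insert("salmon", 42);     // locks both candidate buckets, inserts
        int v;
        if (table.find("salmon", v)) {  // snapshot_and_lock_two + cuckoo_find
            std::cout << "salmon -> " << v << "\n";
        }
    }
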
 
-    // slot_search searches for a cuckoo path using breadth-first search. It
-    // starts with the i1 and i2 buckets, and, until it finds a bucket with an
-    // empty slot, adds each slot of the bucket in the b_slot. If the queue runs
-    // out of space, it fails.
+    // try_read_from_bucket will search the bucket for the given key and return
+    // the index of the slot if found, or -1 if not found.
+    template <typename K>
+    int try_read_from_bucket(const Bucket& b, const partial_t partial,
+                             const K &key) const {
+        // Silence a warning from MSVC about partial being unused if is_simple.
+        (void)partial;
+        for (size_type i = 0; i < slot_per_bucket(); ++i) {
+            if (!b.occupied(i) || (!is_simple && partial != b.partial(i))) {
+                continue;
+            } else if (key_eq()(b.key(i), key)) {
+                return i;
+            }
+        }
+        return -1;
+    }
+
+    // Insertion types and functions
+
+    /**
+     * Runs cuckoo_insert in a loop until it succeeds. Both insert and upsert
+     * need this retry behavior, so the loop is factored out here to avoid
+     * duplicating logic.
+     *
+     * @param hv the hash value of the key
+     * @param b bucket locks
+     * @param key the key to insert
+     * @return table_position of the location to insert the new element, or the
+     * site of the duplicate element with a status code if there was a duplicate.
+     * In either case, the locks will still be held after the function ends.
+     * @throw libcuckoo_load_factor_too_low if expansion is necessary, but the
+     * load factor of the table is below the threshold
+     */
+    template <typename K, typename LOCK_T>
+    table_position cuckoo_insert_loop(hash_value hv, TwoBuckets<LOCK_T>& b,
+                                      K& key) {
+        table_position pos;
+        while (true) {
+            assert(b.is_active());
+            const size_type hp = hashpower();
+            pos = cuckoo_insert(hv, b, key);
+            switch (pos.status) {
+            case ok:
+            case failure_key_duplicated:
+                return pos;
+            case failure_table_full:
+                // Expand the table and try again, re-grabbing the locks
+                cuckoo_fast_double<LOCK_T, automatic_resize>(hp);
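+                // Intentional fall-through: after growing the table we
+                // re-take the locks below and retry the insert.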
+            case failure_under_expansion:
+                b = snapshot_and_lock_two<LOCK_T>(hv);
+                break;
+            default:
+                assert(false);
+            }
+        }
+    }
+
+    // cuckoo_insert tries to find an empty slot in either of the buckets to
+    // insert the given key into, performing cuckoo hashing if necessary. It
+    // expects the locks to be taken outside the function. Before inserting, it
+    // checks that the key isn't already in the table. cuckoo hashing presents
+    // multiple concurrency issues, which are explained in the function. The
+    // following return states are possible:
     //
-    // throws hashpower_changed if it changed during the search
-    b_slot slot_search(const size_t hp, const size_t i1,
-                       const size_t i2) {
-        b_queue q;
-        // The initial pathcode informs cuckoopath_search which bucket the path
-        // starts on
-        q.enqueue(b_slot(i1, 0, 0));
-        q.enqueue(b_slot(i2, 1, 0));
-        while (!q.full() && !q.empty()) {
-            b_slot x = q.dequeue();
-            // Picks a (sort-of) random slot to start from
-            size_t starting_slot = x.pathcode % slot_per_bucket;
-            for (size_t i = 0; i < slot_per_bucket && !q.full();
-                 ++i) {
-                size_t slot = (starting_slot + i) % slot_per_bucket;
-                OneBucket ob = lock_one(hp, x.bucket);
-                Bucket& b = buckets_[x.bucket];
-                if (!b.occupied(slot)) {
-                    // We can terminate the search here
-                    x.pathcode = x.pathcode * slot_per_bucket + slot;
-                    return x;
+    // ok -- Found an empty slot, locks will be held on both buckets after the
+    // function ends, and the position of the empty slot is returned
+    //
+    // failure_key_duplicated -- Found a duplicate key, locks will be held, and
+    // the position of the duplicate key will be returned
+    //
+    // failure_under_expansion -- Failed due to a concurrent expansion
+    // operation. Locks are released. No meaningful position is returned.
+    //
+    // failure_table_full -- Failed to find an empty slot for the table. Locks
+    // are released. No meaningful position is returned.
+    template <typename K, typename LOCK_T>
+    table_position cuckoo_insert(const hash_value hv, TwoBuckets<LOCK_T>& b,
+                                 K& key) {
+        int res1, res2;
+        Bucket& b1 = buckets_[b.first()];
+        if (!try_find_insert_bucket(b1, res1, hv.partial, key)) {
+            return table_position{b.first(), static_cast<size_type>(res1),
+                    failure_key_duplicated};
+        }
+        Bucket& b2 = buckets_[b.second()];
+        if (!try_find_insert_bucket(b2, res2, hv.partial, key)) {
+            return table_position{b.second(), static_cast<size_type>(res2),
+                    failure_key_duplicated};
+        }
+        if (res1 != -1) {
+            return table_position{b.first(), static_cast<size_type>(res1), ok};
+        }
+        if (res2 != -1) {
+            return table_position{b.second(), static_cast<size_type>(res2), ok};
+        }
+
+        // We are unlucky, so let's perform cuckoo hashing.
+        size_type insert_bucket = 0;
+        size_type insert_slot = 0;
+        cuckoo_status st = run_cuckoo<LOCK_T>(b, insert_bucket, insert_slot);
+        if (st == failure_under_expansion) {
+            // The run_cuckoo operation operated on an old version of the table,
+            // so we have to try again. We signal to the calling insert method
+            // to try again by returning failure_under_expansion.
+            return table_position{0, 0, failure_under_expansion};
+        } else if (st == ok) {
+            assert(!locks_[lock_ind(b.first())].try_lock(LOCK_T()));
+            assert(!locks_[lock_ind(b.second())].try_lock(LOCK_T()));
+            assert(!buckets_[insert_bucket].occupied(insert_slot));
+            assert(insert_bucket == index_hash(hashpower(), hv.hash) ||
+                   insert_bucket == alt_index(
+                       hashpower(), hv.partial,
+                       index_hash(hashpower(), hv.hash)));
+            // Since we unlocked the buckets during run_cuckoo, another insert
+            // could have inserted the same key into either b.first() or
+            // b.second(), so we check for that before doing the insert.
+            table_position pos = cuckoo_find(
+                key, hv.partial, b.first(), b.second());
+            if (pos.status == ok) {
+                pos.status = failure_key_duplicated;
+                return pos;
+            }
+            return table_position{insert_bucket, insert_slot, ok};
+        }
+        assert(st == failure);
+        LIBCUCKOO_DBG("hash table is full (hashpower = %zu, hash_items = %zu, "
+                      "load factor = %.2f), need to increase hashpower\n",
+                      hashpower(), size(), load_factor());
+        return table_position{0, 0, failure_table_full};
+    }
+
+    // add_to_bucket will insert the given key-value pair into the slot. The key
+    // and value will be move-constructed into the table, so they are not valid
+    // for use afterwards.
+    template <typename K, typename... Args>
+    void add_to_bucket(const size_type bucket_ind, const size_type slot,
+                       const partial_t partial, K& key, Args&&... val) {
+        Bucket& b = buckets_[bucket_ind];
+        assert(!b.occupied(slot));
+        b.setKV(allocator_, slot, partial,
+                key, std::forward<Args>(val)...);
+        ++locks_[lock_ind(bucket_ind)].elem_counter();
+    }
+
+    // try_find_insert_bucket will search the bucket for the given key, and for
+    // an empty slot. If the key is found, we store the slot of the key in
+    // `slot` and return false. If we find an empty slot, we store its position
+    // in `slot` and return true. If no duplicate key is found and no empty slot
+    // is found, we store -1 in `slot` and return true.
+    template <typename K>
+    bool try_find_insert_bucket(const Bucket& b, int& slot,
+                                const partial_t partial, const K &key) const {
+        // Silence a warning from MSVC about partial being unused if is_simple.
+        (void)partial;
+        slot = -1;
+        for (size_type i = 0; i < slot_per_bucket(); ++i) {
+            if (b.occupied(i)) {
+                if (!is_simple && partial != b.partial(i)) {
+                    continue;
                 }
+                if (key_eq()(b.key(i), key)) {
+                    slot = i;
+                    return false;
+                }
+            } else {
+                slot = i;
+            }
+        }
+        return true;
+    }
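
The three-way contract is easy to misread, so here is a standalone
miniature of it (illustrative names only; a plain vector stands in for the
bucket and -1 marks an empty slot):

    #include <cassert>
    #include <vector>

    // Returns false with slot = the duplicate's index if key is present;
    // otherwise returns true, with slot = an empty index, or -1 when the
    // bucket is full.
    bool try_find_insert(const std::vector<int>& bucket, int key, int& slot) {
        slot = -1;
        for (int i = 0; i < static_cast<int>(bucket.size()); ++i) {
            if (bucket[i] == key) { slot = i; return false; }  // duplicate key
            if (bucket[i] == -1) slot = i;                     // remember an empty slot
        }
        return true;
    }

    int main() {
        int slot;
        assert(!try_find_insert({7, -1, 9, -1}, 9, slot) && slot == 2);  // duplicate
        assert(try_find_insert({7, -1, 9, -1}, 8, slot) && slot == 3);   // empty found
        assert(try_find_insert({7, 8, 9, 10}, 1, slot) && slot == -1);   // bucket full
    }
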
 
-                // If x has less than the maximum number of path components,
-                // create a new b_slot item, that represents the bucket we would
-                // have come from if we kicked out the item at this slot.
-                const partial_t partial = b.partial(slot);
-                if (x.depth < MAX_BFS_PATH_LEN - 1) {
-                    b_slot y(alt_index(hp, partial, x.bucket),
-                             x.pathcode * slot_per_bucket + slot, x.depth+1);
-                    q.enqueue(y);
+    // CuckooRecord holds one position in a cuckoo path. Since cuckoopath
+    // elements only define a sequence of alternate hashings for different hash
+    // values, we only need to keep track of the hash values being moved, rather
+    // than the keys themselves.
+    typedef struct {
+        size_type bucket;
+        size_type slot;
+        hash_value hv;
+    } CuckooRecord;
+
+    // The maximum number of items in a cuckoo BFS path.
+    static constexpr uint8_t MAX_BFS_PATH_LEN = 5;
+
+    // An array of CuckooRecords
+    using CuckooRecords = std::array<CuckooRecord, MAX_BFS_PATH_LEN>;
+
+    // run_cuckoo performs cuckoo hashing on the table in an attempt to free up
+    // a slot on either of the insert buckets, which are assumed to be locked
+    // before the start. On success, the bucket and slot that was freed up is
+    // stored in insert_bucket and insert_slot. In order to perform the search
+    // and the swaps, it has to release the locks, which can lead to certain
+    // concurrency issues, the details of which are explained in the function.
+    // If run_cuckoo returns ok (success), then `b` will be active, otherwise it
+    // will not.
+    template <typename LOCK_T>
+    cuckoo_status run_cuckoo(TwoBuckets<LOCK_T>& b, size_type &insert_bucket,
+                             size_type &insert_slot) {
+        // We must unlock the buckets here, so that cuckoopath_search and
+        // cuckoopath_move can lock buckets as desired without deadlock.
+        // cuckoopath_move has to move something out of one of the original
+        // buckets as its last operation, and it will lock both buckets and
+        // leave them locked after finishing. This way, we know that if
+        // cuckoopath_move succeeds, then the buckets needed for insertion are
+        // still locked. If cuckoopath_move fails, the buckets are unlocked and
+        // we try again. This unlocking does present two problems. The first is
+        // that another insert on the same key runs and, finding that the key
+        // isn't in the table, inserts the key into the table. Then we insert
+        // the key into the table, causing a duplication. To check for this, we
+        // search the buckets for the key we are trying to insert before doing
+        // so (this is done in cuckoo_insert, and requires that both buckets are
+        // locked). Another problem is that an expansion runs and changes the
+        // hashpower, meaning the buckets may not be valid anymore. In this
+        // case, the cuckoopath functions will have thrown a hashpower_changed
+        // exception, which we catch and handle here.
+        size_type hp = hashpower();
+        b.unlock();
+        CuckooRecords cuckoo_path;
+        bool done = false;
+        try {
+            while (!done) {
+                const int depth = cuckoopath_search<LOCK_T>(
+                    hp, cuckoo_path, b.first(), b.second());
+                if (depth < 0) {
+                    break;
+                }
+
+                if (cuckoopath_move(hp, cuckoo_path, depth, b)) {
+                    insert_bucket = cuckoo_path[0].bucket;
+                    insert_slot = cuckoo_path[0].slot;
+                    assert(insert_bucket == b.first() || insert_bucket == b.second());
+                    assert(!locks_[lock_ind(b.first())].try_lock(LOCK_T()));
+                    assert(!locks_[lock_ind(b.second())].try_lock(LOCK_T()));
+                    assert(!buckets_[insert_bucket].occupied(insert_slot));
+                    done = true;
+                    break;
                 }
             }
+        } catch (hashpower_changed&) {
+            // The hashpower changed while we were trying to cuckoo, which means
+            // we want to retry. b.first() and b.second() should not be locked
+            // in this case.
+            return failure_under_expansion;
         }
-        // We didn't find a short-enough cuckoo path, so the queue ran out of
-        // space. Return a failure value.
-        return b_slot(0, 0, -1);
+        return done ? ok : failure;
     }
 
     // cuckoopath_search finds a cuckoo path from one of the starting buckets to
@@ -1083,18 +1332,19 @@ private:
     // cuckoopath_move. Thus cuckoopath_move checks that the data matches the
     // cuckoo path before changing it.
     //
-    // throws hashpower_changed if it changed during the search
-    int cuckoopath_search(const size_t hp,
+    // throws hashpower_changed if it changed during the search.
+    template <typename LOCK_T>
+    int cuckoopath_search(const size_type hp,
                           CuckooRecords& cuckoo_path,
-                          const size_t i1, const size_t i2) {
-        b_slot x = slot_search(hp, i1, i2);
+                          const size_type i1, const size_type i2) {
+        b_slot x = slot_search<LOCK_T>(hp, i1, i2);
         if (x.depth == -1) {
             return -1;
         }
-        // Fill in the cuckoo path slots from the end to the beginning
+        // Fill in the cuckoo path slots from the end to the beginning.
         for (int i = x.depth; i >= 0; i--) {
-            cuckoo_path[i].slot = x.pathcode % slot_per_bucket;
-            x.pathcode /= slot_per_bucket;
+            cuckoo_path[i].slot = x.pathcode % slot_per_bucket();
+            x.pathcode /= slot_per_bucket();
         }
         // Fill in the cuckoo_path buckets and keys from the beginning to the
         // end, using the final pathcode to figure out which bucket the path
@@ -1109,8 +1359,8 @@ private:
             first.bucket = i2;
         }
         {
-            OneBucket ob = lock_one(hp, first.bucket);
-            Bucket& b = buckets_[first.bucket];
+            const auto ob = lock_one<LOCK_T>(hp, first.bucket);
+            const Bucket& b = buckets_[first.bucket];
             if (!b.occupied(first.slot)) {
                 // We can terminate here
                 return 0;
@@ -1119,15 +1369,15 @@ private:
         }
         for (int i = 1; i <= x.depth; ++i) {
             CuckooRecord& curr = cuckoo_path[i];
-            CuckooRecord& prev = cuckoo_path[i-1];
-            assert(prev.bucket == index_hash(hp, prev.hv) ||
-                   prev.bucket == alt_index(hp, partial_key(prev.hv),
-                                            index_hash(hp, prev.hv)));
+            const CuckooRecord& prev = cuckoo_path[i-1];
+            assert(prev.bucket == index_hash(hp, prev.hv.hash) ||
+                   prev.bucket == alt_index(hp, prev.hv.partial,
+                                            index_hash(hp, prev.hv.hash)));
             // We get the bucket that this slot is on by computing the alternate
             // index of the previous bucket
-            curr.bucket = alt_index(hp, partial_key(prev.hv), prev.bucket);
-            OneBucket ob = lock_one(hp, curr.bucket);
-            Bucket& b = buckets_[curr.bucket];
+            curr.bucket = alt_index(hp, prev.hv.partial, prev.bucket);
+            const auto ob = lock_one<LOCK_T>(hp, curr.bucket);
+            const Bucket& b = buckets_[curr.bucket];
             if (!b.occupied(curr.slot)) {
                 // We can terminate here
                 return i;
@@ -1141,12 +1391,13 @@ private:
     // an empty slot in one of the buckets in cuckoo_insert. Before the start of
     // this function, the two insert-locked buckets were unlocked in run_cuckoo.
     // At the end of the function, if the function returns true (success), then
-    // the both insert-locked buckets remain locked. If the function is
+    // both insert-locked buckets remain locked. If the function is
     // unsuccessful, then both insert-locked buckets will be unlocked.
     //
-    // throws hashpower_changed if it changed during the move
-    bool cuckoopath_move(const size_t hp, CuckooRecords& cuckoo_path,
-                         size_t depth, TwoBuckets& b) {
+    // throws hashpower_changed if it changed during the move.
+    template <typename LOCK_T>
+    bool cuckoopath_move(const size_type hp, CuckooRecords& cuckoo_path,
+                         size_type depth, TwoBuckets<LOCK_T>& b) {
         assert(!b.is_active());
         if (depth == 0) {
             // There is a chance that depth == 0, when try_find_insert_bucket sees
@@ -1155,13 +1406,13 @@ private:
             // cuckoopath_search found empty isn't empty anymore, we unlock them
             // and return false. Otherwise, the bucket is empty and insertable,
             // so we hold the locks and return true.
-            const size_t bucket = cuckoo_path[0].bucket;
-            assert(bucket == b.i[0] || bucket == b.i[1]);
-            b = lock_two(hp, b.i[0], b.i[1]);
+            const size_type bucket = cuckoo_path[0].bucket;
+            assert(bucket == b.first() || bucket == b.second());
+            b = lock_two<LOCK_T>(hp, b.first(), b.second());
             if (!buckets_[bucket].occupied(cuckoo_path[0].slot)) {
                 return true;
             } else {
-                b.release();
+                b.unlock();
                 return false;
             }
         }
@@ -1169,24 +1420,25 @@ private:
         while (depth > 0) {
             CuckooRecord& from = cuckoo_path[depth-1];
             CuckooRecord& to   = cuckoo_path[depth];
-            Bucket& fb = buckets_[from.bucket];
-            const size_t fs = from.slot;
-            Bucket& tb = buckets_[to.bucket];
-            const size_t ts = to.slot;
-            TwoBuckets twob;
-            OneBucket extrab;
+            const size_type fs = from.slot;
+            const size_type ts = to.slot;
+            TwoBuckets<LOCK_T> twob;
+            OneBucket<LOCK_T> extrab;
             if (depth == 1) {
                 // Even though we are only swapping out of one of the original
                 // buckets, we have to lock both of them along with the slot we
                 // are swapping to, since at the end of this function, they both
                 // must be locked. We store tb inside the extrab container so it
                 // is unlocked at the end of the loop.
-                std::tie(twob, extrab) = lock_three(hp, b.i[0], b.i[1],
-                                                    to.bucket);
+                std::tie(twob, extrab) = lock_three<LOCK_T>(
+                    hp, b.first(), b.second(), to.bucket);
             } else {
-                twob = lock_two(hp, from.bucket, to.bucket);
+                twob = lock_two<LOCK_T>(hp, from.bucket, to.bucket);
             }
 
+            Bucket& fb = buckets_[from.bucket];
+            Bucket& tb = buckets_[to.bucket];
+
             // We plan to kick out fs, but let's check if it is still there;
             // there's a small chance we've gotten scooped by a later cuckoo. If
             // that happened, just... try again. Also the slot we are filling in
@@ -1195,12 +1447,12 @@ private:
             // We only need to check that the hash value is the same, because,
             // even if the keys are different and have the same hash value, then
             // the cuckoopath is still valid.
-            if (hashed_key(fb.key(fs)) != from.hv || tb.occupied(ts) ||
-                !fb.occupied(fs)) {
+            if (hashed_key_only_hash(fb.key(fs)) != from.hv.hash ||
+                tb.occupied(ts) || !fb.occupied(fs)) {
                 return false;
             }
 
-            Bucket::move_to_bucket(fb, fs, tb, ts);
+            Bucket::move_to_bucket(allocator_, fb, fs, tb, ts);
             if (depth == 1) {
                 // Hold onto the locks contained in twob
                 b = std::move(twob);
@@ -1210,456 +1462,240 @@ private:
         return true;
     }
 
-    // run_cuckoo performs cuckoo hashing on the table in an attempt to free up
-    // a slot on either of the insert buckets, which are assumed to be locked
-    // before the start. On success, the bucket and slot that was freed up is
-    // stored in insert_bucket and insert_slot. In order to perform the search
-    // and the swaps, it has to release the locks, which can lead to certain
-    // concurrency issues, the details of which are explained in the function.
-    // If run_cuckoo returns ok (success), then the bucket container will be
-    // active, otherwise it will not.
-    cuckoo_status run_cuckoo(TwoBuckets& b, size_t &insert_bucket,
-                             size_t &insert_slot) {
-        // We must unlock the buckets here, so that cuckoopath_search and
-        // cuckoopath_move can lock buckets as desired without deadlock.
-        // cuckoopath_move has to move something out of one of the original
-        // buckets as its last operation, and it will lock both buckets and
-        // leave them locked after finishing. This way, we know that if
-        // cuckoopath_move succeeds, then the buckets needed for insertion are
-        // still locked. If cuckoopath_move fails, the buckets are unlocked and
-        // we try again. This unlocking does present two problems. The first is
-        // that another insert on the same key runs and, finding that the key
-        // isn't in the table, inserts the key into the table. Then we insert
-        // the key into the table, causing a duplication. To check for this, we
-        // search the buckets for the key we are trying to insert before doing
-        // so (this is done in cuckoo_insert, and requires that both buckets are
-        // locked). Another problem is that an expansion runs and changes the
-        // hashpower, meaning the buckets may not be valid anymore. In this
-        // case, the cuckoopath functions will have thrown a hashpower_changed
-        // exception, which we catch and handle here.
-        size_t hp = get_hashpower();
-        assert(b.is_active());
-        b.release();
-        CuckooRecords cuckoo_path;
-        bool done = false;
-        try {
-            while (!done) {
-                int depth = cuckoopath_search(hp, cuckoo_path, b.i[0], b.i[1]);
-                if (depth < 0) {
-                    break;
-                }
+    // A constexpr version of pow that we can use for static_asserts
+    static constexpr size_type const_pow(size_type a, size_type b) {
+        return (b == 0) ? 1 : a * const_pow(a, b - 1);
+    }
 
-                if (cuckoopath_move(hp, cuckoo_path, depth, b)) {
-                    insert_bucket = cuckoo_path[0].bucket;
-                    insert_slot = cuckoo_path[0].slot;
-                    assert(insert_bucket == b.i[0] || insert_bucket == b.i[1]);
-                    assert(!locks_[lock_ind(b.i[0])].try_lock());
-                    assert(!locks_[lock_ind(b.i[1])].try_lock());
-                    assert(!buckets_[insert_bucket].occupied(insert_slot));
-                    done = true;
-                    break;
-                }
-            }
-        } catch (hashpower_changed&) {
-            // The hashpower changed while we were trying to cuckoo, which means
-            // we want to retry. b.i[0] and b.i[1] should not be locked in this
-            // case.
-            return failure_under_expansion;
+    // b_slot holds the information for a BFS path through the table.
+    #pragma pack(push, 1)
+    struct b_slot {
+        // The bucket of the last item in the path.
+        size_type bucket;
+        // a compressed representation of the slots for each of the buckets in
+        // the path. pathcode is sort of like a base-slot_per_bucket number, and
+        // we need to hold at most MAX_BFS_PATH_LEN slots. Thus we need the
+        // maximum pathcode to be at least slot_per_bucket()^(MAX_BFS_PATH_LEN).
+        size_type pathcode;
+        static_assert(const_pow(slot_per_bucket(), MAX_BFS_PATH_LEN) <
+                      std::numeric_limits<decltype(pathcode)>::max(),
+                      "pathcode may not be large enough to encode a cuckoo "
+                      "path");
+        // The 0-indexed position in the cuckoo path this slot occupies. It must
+        // be less than MAX_BFS_PATH_LEN, and also able to hold negative values.
+        int_fast8_t depth;
+        static_assert(MAX_BFS_PATH_LEN - 1 <=
+                      std::numeric_limits<decltype(depth)>::max(),
+                      "The depth type must able to hold a value of"
+                      " MAX_BFS_PATH_LEN - 1");
+        static_assert(-1 >= std::numeric_limits<decltype(depth)>::min(),
+                      "The depth type must be able to hold a value of -1");
+        b_slot() {}
+        b_slot(const size_type b, const size_type p, const decltype(depth) d)
+            : bucket(b), pathcode(p), depth(d) {
+            assert(d < MAX_BFS_PATH_LEN);
         }
-        return done ? ok : failure;
-    }
+    };
+    #pragma pack(pop)
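
A worked example makes the pathcode encoding concrete. The sketch below,
assuming 4 slots per bucket, encodes a two-step path and then decodes it
back to front, mirroring the loop in cuckoopath_search:

    #include <array>
    #include <cassert>
    #include <cstddef>

    constexpr std::size_t kSlots = 4;  // assumed slot_per_bucket() for the demo

    int main() {
        // Encoding: start from "digit" 0 (the path began at bucket i1), then
        // append each kicked slot as a base-kSlots digit.
        std::size_t pathcode = 0;
        pathcode = pathcode * kSlots + 2;  // first move kicks slot 2
        pathcode = pathcode * kSlots + 1;  // second move kicks slot 1
        assert(pathcode == 9);             // (0 * 4 + 2) * 4 + 1

        // Decoding, back to front:
        std::array<std::size_t, 2> slots;
        for (int i = 1; i >= 0; --i) {
            slots[i] = pathcode % kSlots;
            pathcode /= kSlots;
        }
        assert(slots[0] == 2 && slots[1] == 1);
        assert(pathcode == 0);  // the remainder names the starting bucket
    }
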
 
-    // try_read_from_bucket will search the bucket for the given key and store
-    // the associated value if it finds it.
-    bool try_read_from_bucket(const partial_t partial, const key_type &key,
-                              mapped_type &val, const Bucket& b) const {
-        // Silence a warning from MSVC about partial being unused if is_simple.
-        (void)partial;
-        for (size_t i = 0; i < slot_per_bucket; ++i) {
-            if (!b.occupied(i)) {
-                continue;
-            }
-            if (!is_simple && partial != b.partial(i)) {
-                continue;
-            }
-            if (key_eq()(key, b.key(i))) {
-                val = b.val(i);
-                return true;
-            }
+    // b_queue is the queue used to store b_slots for BFS cuckoo hashing.
+    #pragma pack(push, 1)
+    class b_queue {
+    public:
+        b_queue() noexcept : first_(0), last_(0) {}
+
+        void enqueue(b_slot x) {
+            assert(!full());
+            slots_[last_] = x;
+            last_ = increment(last_);
         }
-        return false;
-    }
 
-    // check_in_bucket will search the bucket for the given key and return true
-    // if the key is in the bucket, and false if it isn't.
-    bool check_in_bucket(const partial_t partial, const key_type &key,
-                         const Bucket& b) const {
-        // Silence a warning from MSVC about partial being unused if is_simple.
-        (void)partial;
-        for (size_t i = 0; i < slot_per_bucket; ++i) {
-            if (!b.occupied(i)) {
-                continue;
-            }
-            if (!is_simple && partial != b.partial(i)) {
-                continue;
-            }
-            if (key_eq()(key, b.key(i))) {
-                return true;
-            }
+        b_slot dequeue() {
+            assert(!empty());
+            b_slot& x = slots_[first_];
+            first_ = increment(first_);
+            return x;
         }
-        return false;
-    }
 
-    // add_to_bucket will insert the given key-value pair into the slot. The key
-    // and value will be move-constructed into the table, so they are not valid
-    // for use afterwards.
-    template <typename K, typename... Args>
-    void add_to_bucket(const partial_t partial, Bucket& b,
-                       const size_t slot, K&& key, Args&&... val) {
-        assert(!b.occupied(slot));
-        b.partial(slot) = partial;
-        b.setKV(slot, std::forward<K>(key), std::forward<Args>(val)...);
-        num_inserts_[get_counterid()].num.fetch_add(
-            1, std::memory_order_relaxed);
-    }
-
-    // try_find_insert_bucket will search the bucket and store the index of an
-    // empty slot if it finds one, or -1 if it doesn't. Regardless, it will
-    // search the entire bucket and return false if it finds the key already in
-    // the table (duplicate key error) and true otherwise.
-    bool try_find_insert_bucket(const partial_t partial, const key_type &key,
-                                const Bucket& b, int& slot) const {
-        // Silence a warning from MSVC about partial being unused if is_simple.
-        (void)partial;
-        slot = -1;
-        bool found_empty = false;
-        for (int i = 0; i < static_cast<int>(slot_per_bucket); ++i) {
-            if (b.occupied(i)) {
-                if (!is_simple && partial != b.partial(i)) {
-                    continue;
-                }
-                if (key_eq()(key, b.key(i))) {
-                    return false;
-                }
-            } else {
-                if (!found_empty) {
-                    found_empty = true;
-                    slot = i;
-                }
-            }
-        }
-        return true;
-    }
-
-    // try_del_from_bucket will search the bucket for the given key, and set the
-    // slot of the key to empty if it finds it.
-    bool try_del_from_bucket(const partial_t partial,
-                             const key_type &key, Bucket& b) {
-        for (size_t i = 0; i < slot_per_bucket; ++i) {
-            if (!b.occupied(i)) {
-                continue;
-            }
-            if (!is_simple && b.partial(i) != partial) {
-                continue;
-            }
-            if (key_eq()(b.key(i), key)) {
-                b.eraseKV(i);
-                num_deletes_[get_counterid()].num.fetch_add(
-                    1, std::memory_order_relaxed);
-                return true;
-            }
-        }
-        return false;
-    }
-
-    // try_update_bucket will search the bucket for the given key and change its
-    // associated value if it finds it.
-    template <typename V>
-    bool try_update_bucket(const partial_t partial, Bucket& b,
-                           const key_type &key, V&& val) {
-        for (size_t i = 0; i < slot_per_bucket; ++i) {
-            if (!b.occupied(i)) {
-                continue;
-            }
-            if (!is_simple && b.partial(i) != partial) {
-                continue;
-            }
-            if (key_eq()(b.key(i), key)) {
-                b.val(i) = std::forward<V>(val);
-                return true;
-            }
-        }
-        return false;
-    }
-
-    // try_update_bucket_fn will search the bucket for the given key and change
-    // its associated value with the given function if it finds it.
-    template <typename Updater>
-    bool try_update_bucket_fn(const partial_t partial, const key_type &key,
-                              Updater fn, Bucket& b) {
-        // Silence a warning from MSVC about partial being unused if is_simple.
-        (void)partial;
-        for (size_t i = 0; i < slot_per_bucket; ++i) {
-            if (!b.occupied(i)) {
-                continue;
-            }
-            if (!is_simple && b.partial(i) != partial) {
-                continue;
-            }
-            if (key_eq()(b.key(i), key)) {
-                fn(b.val(i));
-                return true;
-            }
+        bool empty() const {
+            return first_ == last_;
         }
-        return false;
-    }
 
-    // cuckoo_find searches the table for the given key and value, storing the
-    // value in the val if it finds the key. It expects the locks to be taken
-    // and released outside the function.
-    cuckoo_status cuckoo_find(const key_type& key, mapped_type& val,
-                              const size_t hv, const size_t i1,
-                              const size_t i2) const {
-        const partial_t partial = partial_key(hv);
-        if (try_read_from_bucket(partial, key, val, buckets_[i1])) {
-            return ok;
+        bool full() const {
+            return increment(last_) == first_;
         }
-        if (try_read_from_bucket(partial, key, val, buckets_[i2])) {
-            return ok;
-        }
-        return failure_key_not_found;
-    }
 
-    // cuckoo_contains searches the table for the given key, returning true if
-    // it's in the table and false otherwise. It expects the locks to be taken
-    // and released outside the function.
-    bool cuckoo_contains(const key_type& key, const size_t hv, const size_t i1,
-                         const size_t i2) const {
-        const partial_t partial = partial_key(hv);
-        if (check_in_bucket(partial, key, buckets_[i1])) {
-            return true;
-        }
-        if (check_in_bucket(partial, key, buckets_[i2])) {
-            return true;
-        }
-        return false;
-    }
+    private:
+        // The maximum size of the BFS queue. Note that unless it's less than
+        // slot_per_bucket()^MAX_BFS_PATH_LEN, it won't really mean anything.
+        static constexpr size_type MAX_CUCKOO_COUNT = 256;
+        static_assert((MAX_CUCKOO_COUNT & (MAX_CUCKOO_COUNT - 1)) == 0,
+                      "MAX_CUCKOO_COUNT should be a power of 2");
+        // A circular array of b_slots
+        b_slot slots_[MAX_CUCKOO_COUNT];
+        // The index of the head of the queue in the array
+        size_type first_;
+        // One past the index of the last item of the queue in the array.
+        size_type last_;
 
-    // cuckoo_insert tries to insert the given key-value pair into an empty slot
-    // in either of the buckets, performing cuckoo hashing if necessary. It
-    // expects the locks to be taken outside the function, but they are released
-    // here, since different scenarios require different handling of the locks.
-    // Before inserting, it checks that the key isn't already in the table.
-    // cuckoo hashing presents multiple concurrency issues, which are explained
-    // in the function. If the insert fails, the key and value won't be
-    // move-constructed, so they can be retried.
-    template <typename K, typename... Args>
-    cuckoo_status cuckoo_insert(const size_t hv, TwoBuckets b,
-                                K&& key, Args&&... val) {
-        int res1, res2;
-        const partial_t partial = partial_key(hv);
-        Bucket& b0 = buckets_[b.i[0]];
-        if (!try_find_insert_bucket(partial, key, b0, res1)) {
-            return failure_key_duplicated;
-        }
-        Bucket& b1 = buckets_[b.i[1]];
-        if (!try_find_insert_bucket(partial, key, b1, res2)) {
-            return failure_key_duplicated;
-        }
-        if (res1 != -1) {
-            add_to_bucket(partial, b0, res1, std::forward<K>(key),
-                          std::forward<Args>(val)...);
-            return ok;
-        }
-        if (res2 != -1) {
-            add_to_bucket(partial, b1, res2, std::forward<K>(key),
-                          std::forward<Args>(val)...);
-            return ok;
+        // returns the index in the queue after ind, wrapping around if
+        // necessary.
+        size_type increment(size_type ind) const {
+            return (ind + 1) & (MAX_CUCKOO_COUNT - 1);
         }
+    };
+    #pragma pack(pop)
 
-        // we are unlucky, so let's perform cuckoo hashing
-        size_t insert_bucket = 0;
-        size_t insert_slot = 0;
-        cuckoo_status st = run_cuckoo(b, insert_bucket, insert_slot);
-        if (st == failure_under_expansion) {
-            // The run_cuckoo operation operated on an old version of the table,
-            // so we have to try again. We signal to the calling insert method
-            // to try again by returning failure_under_expansion.
-            return failure_under_expansion;
-        } else if (st == ok) {
-            assert(!locks_[lock_ind(b.i[0])].try_lock());
-            assert(!locks_[lock_ind(b.i[1])].try_lock());
-            assert(!buckets_[insert_bucket].occupied(insert_slot));
-            assert(insert_bucket == index_hash(get_hashpower(), hv) ||
-                   insert_bucket == alt_index(get_hashpower(), partial,
-                                              index_hash(get_hashpower(), hv)));
-            // Since we unlocked the buckets during run_cuckoo, another insert
-            // could have inserted the same key into either b.i[0] or b.i[1], so
-            // we check for that before doing the insert.
-            if (cuckoo_contains(key, hv, b.i[0], b.i[1])) {
-                return failure_key_duplicated;
-            }
-            add_to_bucket(partial, buckets_[insert_bucket], insert_slot,
-                          std::forward<K>(key), std::forward<Args>(val)...);
-            return ok;
-        }
-        assert(st == failure);
-        LIBCUCKOO_DBG("hash table is full (hashpower = %zu, hash_items = %zu,"
-                      "load factor = %.2f), need to increase hashpower\n",
-                      get_hashpower(), cuckoo_size(),
-                      cuckoo_loadfactor(get_hashpower()));
-        return failure_table_full;
-    }
+    // slot_search searches for a cuckoo path using breadth-first search. It
+    // starts with the i1 and i2 buckets, and, until it finds a bucket with an
+    // empty slot, adds each slot of the visited buckets to the queue as a
+    // b_slot. If the queue runs out of space, it fails.
+    //
+    // throws hashpower_changed if the hashpower changed during the search
+    template <typename LOCK_T>
+    b_slot slot_search(const size_type hp, const size_type i1,
+                       const size_type i2) {
+        b_queue q;
+        // The initial pathcode informs cuckoopath_search which bucket the path
+        // starts on
+        q.enqueue(b_slot(i1, 0, 0));
+        q.enqueue(b_slot(i2, 1, 0));
+        while (!q.full() && !q.empty()) {
+            b_slot x = q.dequeue();
+            // Picks a (sort-of) random slot to start from
+            size_type starting_slot = x.pathcode % slot_per_bucket();
+            for (size_type i = 0; i < slot_per_bucket() && !q.full();
+                 ++i) {
+                size_type slot = (starting_slot + i) % slot_per_bucket();
+                auto ob = lock_one<LOCK_T>(hp, x.bucket);
+                Bucket& b = buckets_[x.bucket];
+                if (!b.occupied(slot)) {
+                    // We can terminate the search here
+                    x.pathcode = x.pathcode * slot_per_bucket() + slot;
+                    return x;
+                }
 
-    /**
-     * We run cuckoo_insert in a loop until it succeeds in insert and upsert, so
-     * we pulled out the loop to avoid duplicating logic
-     *
-     * @param key the key to insert
-     * @param val the value to insert
-     * @param hv the hash value of the key
-     * @return true if the insert succeeded, false if there was a duplicate key
-     * @throw libcuckoo_load_factor_too_low if expansion is necessary, but the
-     * load factor of the table is below the threshold
-     */
-    template <typename K, typename... Args>
-    bool cuckoo_insert_loop(size_t hv, K&& key, Args&&... val) {
-        cuckoo_status st;
-        do {
-            auto b = snapshot_and_lock_two(hv);
-            size_t hp = get_hashpower();
-            st = cuckoo_insert(hv, std::move(b), std::forward<K>(key),
-                               std::forward<Args>(val)...);
-            if (st == failure_key_duplicated) {
-                return false;
-            } else if (st == failure_table_full) {
-                if (cuckoo_loadfactor(hp) < minimum_load_factor()) {
-                    throw libcuckoo_load_factor_too_low(minimum_load_factor());
+                // If x has less than the maximum number of path components,
+                // create a new b_slot item that represents the bucket we would
+                // have come from if we kicked out the item at this slot.
+                const partial_t partial = b.partial(slot);
+                if (x.depth < MAX_BFS_PATH_LEN - 1) {
+                    b_slot y(alt_index(hp, partial, x.bucket),
+                             x.pathcode * slot_per_bucket() + slot, x.depth+1);
+                    q.enqueue(y);
                 }
-                // Expand the table and try again
-                cuckoo_fast_double(hp);
             }
-        } while (st != ok);
-        return true;
-    }
-
-    // cuckoo_delete searches the table for the given key and sets the slot with
-    // that key to empty if it finds it. It expects the locks to be taken and
-    // released outside the function.
-    cuckoo_status cuckoo_delete(const key_type &key, const size_t hv,
-                                const size_t i1, const size_t i2) {
-        const partial_t partial = partial_key(hv);
-        if (try_del_from_bucket(partial, key, buckets_[i1])) {
-            return ok;
-        }
-        if (try_del_from_bucket(partial, key, buckets_[i2])) {
-            return ok;
         }
-        return failure_key_not_found;
+        // We didn't find a short-enough cuckoo path, so the queue ran out of
+        // space. Return a failure value.
+        return b_slot(0, 0, -1);
     }
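
A standalone sketch (not part of the patch) of how a pathcode packs a BFS
path: each step appends a slot index in base slot_per_bucket(), and decoding
peels the slots off in reverse, leaving 0 or 1 to identify which of the two
starting buckets the path began from. The values below are illustrative.

    #include <cstddef>
    #include <cstdio>

    int main() {
        constexpr std::size_t S = 4;        // slots per bucket (assumed)
        // Encode a depth-3 path that starts from the second bucket (code 1).
        const std::size_t slots[] = {2, 0, 3};
        std::size_t pathcode = 1;
        for (std::size_t s : slots) pathcode = pathcode * S + s;

        // Decode: slots come out last-to-first; the leftover value is the
        // starting-bucket flag.
        std::size_t decoded[3];
        for (int d = 2; d >= 0; --d) { decoded[d] = pathcode % S; pathcode /= S; }
        std::printf("start = %zu, slots = %zu %zu %zu\n",
                    pathcode, decoded[0], decoded[1], decoded[2]);
        return 0;
    }
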
 
-    // cuckoo_update searches the table for the given key and updates its value
-    // if it finds it. It expects the locks to be taken and released outside the
-    // function.
-    template <typename V>
-    cuckoo_status cuckoo_update(const size_t hv, const size_t i1,
-                                const size_t i2, const key_type &key, V&& val) {
-        const partial_t partial = partial_key(hv);
-        if (try_update_bucket(partial, buckets_[i1], key,
-                              std::forward<V>(val))) {
-            return ok;
-        }
-        if (try_update_bucket(partial, buckets_[i2], key,
-                              std::forward<V>(val))) {
-            return ok;
+    // cuckoo_fast_double will double the size of the table by taking advantage
+    // of the properties of index_hash and alt_index. If the key-value pair's
+    // move constructor is not noexcept, we use cuckoo_expand_simple, since that
+    // provides a strong exception guarantee.
+    template <typename LOCK_T, typename AUTO_RESIZE>
+    cuckoo_status cuckoo_fast_double(size_type current_hp) {
+        if (!std::is_nothrow_move_constructible<storage_value_type>::value) {
+            LIBCUCKOO_DBG("%s", "cannot run cuckoo_fast_double because kv-pair "
+                          "is not nothrow move constructible");
+            return cuckoo_expand_simple<LOCK_T, AUTO_RESIZE>(current_hp + 1);
+        }
+        const size_type new_hp = current_hp + 1;
+        std::lock_guard<expansion_lock_t> l(expansion_lock_);
+        cuckoo_status st = check_resize_validity<AUTO_RESIZE>(current_hp, new_hp);
+        if (st != ok) {
+            return st;
         }
-        return failure_key_not_found;
-    }
 
-    // cuckoo_update_fn searches the table for the given key and runs the given
-    // function on its value if it finds it, assigning the result of the
-    // function to the value. It expects the locks to be taken and released
-    // outside the function.
-    template <typename Updater>
-    cuckoo_status cuckoo_update_fn(const key_type &key, Updater fn,
-                                   const size_t hv, const size_t i1,
-                                   const size_t i2) {
-        const partial_t partial = partial_key(hv);
-        if (try_update_bucket_fn(partial, key, fn, buckets_[i1])) {
-            return ok;
-        }
-        if (try_update_bucket_fn(partial, key, fn, buckets_[i2])) {
-            return ok;
+        locks_.resize(hashsize(new_hp));
+        auto unlocker = snapshot_and_lock_all<LOCK_T>();
+        // We can't just resize, since the Bucket is non-copyable and
+        // non-movable. Instead, we allocate a new array of buckets, and move
+        // the contents of each bucket manually.
+        {
+            buckets_t new_buckets(buckets_.size() * 2, get_allocator());
+            for (size_type i = 0; i < buckets_.size(); ++i) {
+                Bucket::move_bucket(allocator_, buckets_[i], new_buckets[i]);
+            }
+            buckets_.swap(new_buckets);
         }
-        return failure_key_not_found;
-    }
+        set_hashpower(new_hp);
 
-    // cuckoo_clear empties the table, calling the destructors of all the
-    // elements it removes from the table. It assumes the locks are taken as
-    // necessary.
-    cuckoo_status cuckoo_clear() noexcept {
-        for (Bucket& b : buckets_) {
-            b.clear();
-        }
-        for (size_t i = 0; i < num_inserts_.size(); ++i) {
-            num_inserts_[i].num.store(0);
-            num_deletes_[i].num.store(0);
-        }
+        // We gradually unlock the new table, by processing each of the buckets
+        // corresponding to each lock we took. For each slot in an old bucket,
+        // we either leave it in the old bucket, or move it to the corresponding
+        // new bucket. After we're done with the bucket, we release the lock on
+        // it and the new bucket, letting other threads start using the new
+        // map gradually. We only unlock the locks being used by the old table,
+        // because unlocking new locks would enable operations on the table
+        // before we want them. We also re-evaluate the partial key stored at
+        // each slot, since it depends on the hashpower.
+        const size_type locks_to_move = std::min(
+            locks_.size(), hashsize(current_hp));
+        parallel_exec(0, locks_to_move,
+                      [this, current_hp, new_hp]
+                      (size_type start, size_type end, std::exception_ptr& eptr) {
+                          try {
+                              move_buckets<LOCK_T>(current_hp, new_hp, start, end);
+                          } catch (...) {
+                              eptr = std::current_exception();
+                          }
+                      });
+        parallel_exec(locks_to_move, locks_.size(),
+                      [this](size_type i, size_type end, std::exception_ptr&) {
+                          for (; i < end; ++i) {
+                              locks_[i].unlock(LOCK_T());
+                          }
+                      });
+        // Since we've unlocked the buckets ourselves, we don't need the
+        // unlocker to do it for us.
+        unlocker.release();
         return ok;
     }
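
The nothrow check at the top of cuckoo_fast_double can be exercised on its
own; a standalone sketch, with a deliberately throwing move constructor
(illustrative type names):

    #include <type_traits>
    #include <utility>

    struct throwing_move {
        throwing_move() = default;
        throwing_move(throwing_move&&) {}   // deliberately not noexcept
    };

    // A pair's move constructor is only noexcept if both members' are, so a
    // table storing throwing_move values must take the slow, copying path.
    static_assert(!std::is_nothrow_move_constructible<
                      std::pair<int, throwing_move>>::value,
                  "falls back to cuckoo_expand_simple");
    static_assert(std::is_nothrow_move_constructible<
                      std::pair<int, int>>::value,
                  "eligible for cuckoo_fast_double");

    int main() { return 0; }
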
 
-    // cuckoo_size returns the number of elements in the given table.
-    size_t cuckoo_size() const noexcept {
-        size_t inserts = 0;
-        size_t deletes = 0;
-        for (size_t i = 0; i < num_inserts_.size(); ++i) {
-            inserts += num_inserts_[i].num.load();
-            deletes += num_deletes_[i].num.load();
-        }
-        return inserts-deletes;
-    }
-
-    // cuckoo_loadfactor returns the load factor of the given table.
-    double cuckoo_loadfactor(const size_t hp) const noexcept {
-        return (static_cast<double>(cuckoo_size()) / slot_per_bucket /
-                hashsize(hp));
-    }
-
-    void move_buckets(size_t current_hp, size_t new_hp,
-                      size_t start_lock_ind, size_t end_lock_ind) {
+    template <typename LOCK_T>
+    void move_buckets(size_type current_hp, size_type new_hp,
+                      size_type start_lock_ind, size_type end_lock_ind) {
         for (; start_lock_ind < end_lock_ind; ++start_lock_ind) {
-            for (size_t bucket_i = start_lock_ind;
+            for (size_type bucket_i = start_lock_ind;
                  bucket_i < hashsize(current_hp);
-                 bucket_i += locks_t::size()) {
+                 bucket_i += locks_t::max_size()) {
                 // By doubling the table size, the index_hash and alt_index of
                 // each key got one bit added to the top, at position
                 // current_hp, which means anything we have to move will either
                 // be at the same bucket position, or exactly
                 // hashsize(current_hp) later than the current bucket
                 Bucket& old_bucket = buckets_[bucket_i];
-                const size_t new_bucket_i = bucket_i + hashsize(current_hp);
+                const size_type new_bucket_i = bucket_i + hashsize(current_hp);
                 Bucket& new_bucket = buckets_[new_bucket_i];
-                size_t new_bucket_slot = 0;
+                size_type new_bucket_slot = 0;
 
                 // Move each item from the old bucket that needs moving into the
                 // new bucket
-                for (size_t slot = 0; slot < slot_per_bucket; ++slot) {
+                for (size_type slot = 0; slot < slot_per_bucket(); ++slot) {
                     if (!old_bucket.occupied(slot)) {
                         continue;
                     }
-                    const size_t hv = hashed_key(old_bucket.key(slot));
-                    const size_t old_ihash = index_hash(current_hp, hv);
-                    const size_t old_ahash = alt_index(
-                        current_hp, old_bucket.partial(slot), old_ihash);
-                    const size_t new_ihash = index_hash(new_hp, hv);
-                    const size_t new_ahash = alt_index(
-                        new_hp, old_bucket.partial(slot), new_ihash);
+                    const hash_value hv = hashed_key(old_bucket.key(slot));
+                    const size_type old_ihash = index_hash(current_hp, hv.hash);
+                    const size_type old_ahash = alt_index(
+                        current_hp, hv.partial, old_ihash);
+                    const size_type new_ihash = index_hash(new_hp, hv.hash);
+                    const size_type new_ahash = alt_index(
+                        new_hp, hv.partial, new_ihash);
                     if ((bucket_i == old_ihash && new_ihash == new_bucket_i) ||
                         (bucket_i == old_ahash && new_ahash == new_bucket_i)) {
                         // We're moving the key from the old bucket to the new
                         // one
                         Bucket::move_to_bucket(
+                            allocator_,
                             old_bucket, slot, new_bucket, new_bucket_slot++);
+                        // Also update the lock counts, in case we're moving to
+                        // a different lock.
+                        --locks_[lock_ind(bucket_i)].elem_counter();
+                        ++locks_[lock_ind(new_bucket_i)].elem_counter();
                     } else {
                         // Check that we don't want to move the new key
                         assert(
@@ -1670,484 +1706,832 @@ private:
             }
             // Now we can unlock the lock, because all the buckets corresponding
             // to it have been unlocked
-            locks_[start_lock_ind].unlock();
+            locks_[start_lock_ind].unlock(LOCK_T());
         }
     }
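
A standalone check (not part of the patch) of the bit-doubling property the
comment inside move_buckets relies on. Assuming index_hash(hp, h) keeps the
low hp bits of the hash, raising the hashpower by one exposes exactly one new
bit, so each index either stays put or moves up by hashsize(hp) = 1 << hp:

    #include <cassert>
    #include <cstdint>

    int main() {
        const std::uint64_t hp = 10;
        const std::uint64_t old_mask = (1ULL << hp) - 1;
        const std::uint64_t new_mask = (1ULL << (hp + 1)) - 1;
        for (std::uint64_t h = 0; h < (1ULL << 20); h += 4097) {
            const std::uint64_t old_i = h & old_mask;
            const std::uint64_t new_i = h & new_mask;
            assert(new_i == old_i || new_i == old_i + (1ULL << hp));
        }
        return 0;
    }
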
 
-    // cuckoo_fast_double will double the size of the table by taking advantage
-    // of the properties of index_hash and alt_index.
-    cuckoo_status cuckoo_fast_double(size_t current_hp) {
-        size_t new_hp = current_hp + 1;
-        size_t mhp = maximum_hashpower();
-        if (mhp != NO_MAXIMUM_HASHPOWER && new_hp > mhp) {
+    // Checks whether the resize is okay to proceed. Returns a status code, or
+    // throws an exception, depending on the error type.
+    using automatic_resize = std::integral_constant<bool, true>;
+    using manual_resize = std::integral_constant<bool, false>;
+
+    template <typename AUTO_RESIZE>
+    cuckoo_status check_resize_validity(const size_type orig_hp,
+                                        const size_type new_hp) {
+        const size_type mhp = maximum_hashpower();
+        if (mhp != LIBCUCKOO_NO_MAXIMUM_HASHPOWER && new_hp > mhp) {
             throw libcuckoo_maximum_hashpower_exceeded(new_hp);
         }
-
-        std::lock_guard<expansion_lock_t> l(expansion_lock_);
-        if (get_hashpower() != current_hp) {
+        if (AUTO_RESIZE::value && load_factor() < minimum_load_factor()) {
+            throw libcuckoo_load_factor_too_low(minimum_load_factor());
+        }
+        if (hashpower() != orig_hp) {
             // Most likely another expansion ran before this one could grab the
             // locks
             LIBCUCKOO_DBG("%s", "another expansion is on-going\n");
             return failure_under_expansion;
         }
-
-        locks_.allocate(std::min(locks_t::size(), hashsize(new_hp)));
-        auto unlocker = snapshot_and_lock_all();
-        buckets_.resize(buckets_.size() * 2);
-        set_hashpower(new_hp);
-
-        // We gradually unlock the new table, by processing each of the buckets
-        // corresponding to each lock we took. For each slot in an old bucket,
-        // we either leave it in the old bucket, or move it to the corresponding
-        // new bucket. After we're done with the bucket, we release the lock on
-        // it and the new bucket, letting other threads using the new map
-        // gradually. We only unlock the locks being used by the old table,
-        // because unlocking new locks would enable operations on the table
-        // before we want them.
-        const size_t locks_to_move = std::min(locks_t::size(),
-                                              hashsize(current_hp));
-        parallel_exec(0, locks_to_move, kNumCores(),
-                      [this, current_hp, new_hp](size_t start, size_t end) {
-                          move_buckets(current_hp, new_hp, start, end);
-                      });
-        parallel_exec(locks_to_move, locks_.allocated_size(), kNumCores(),
-                      [this](size_t i, size_t end) {
-                          for (; i < end; ++i) {
-                              locks_[i].unlock();
-                          }
-                      });
-        // Since we've unlocked the buckets ourselves, we don't need the
-        // unlocker to do it for us.
-        unlocker.deactivate();
         return ok;
     }
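
The automatic_resize/manual_resize pair above is the classic integral_constant
tag-dispatch idiom; a minimal standalone sketch (illustrative names):

    #include <cstdio>
    #include <type_traits>

    using automatic_resize = std::integral_constant<bool, true>;
    using manual_resize = std::integral_constant<bool, false>;

    template <typename AUTO_RESIZE>
    void check_resize() {
        // The branch is resolved at compile time from the tag's value.
        if (AUTO_RESIZE::value) {
            std::puts("automatic: also enforce the minimum load factor");
        } else {
            std::puts("manual: skip the load-factor check");
        }
    }

    int main() {
        check_resize<automatic_resize>();
        check_resize<manual_resize>();
        return 0;
    }
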
 
-    // insert_into_table is a helper function used by cuckoo_expand_simple to
-    // fill up the new table. It moves data out of the original table into the
-    // new one.
-    static void insert_into_table(
-        cuckoohash_map<Key, T, Hash, Pred, Alloc, slot_per_bucket>& new_map,
-        buckets_t& buckets, size_t i, size_t end) {
-        for (; i < end; ++i) {
-            for (size_t j = 0; j < slot_per_bucket; ++j) {
-                if (buckets[i].occupied(j)) {
-                    storage_value_type& kvpair = buckets[i].storage_kvpair(j);
-                    new_map.insert(
-                        std::move(kvpair.first),
-                        std::move(kvpair.second));
-                }
-            }
-        }
-    }
-
     // cuckoo_expand_simple will resize the table to at least the given
-    // new_hashpower. If is_expansion is true, new_hashpower must be greater
-    // than the current size of the table. If it's false, then new_hashpower
-    // must be less. When we're shrinking the table, if the current table
+    // new_hashpower. When we're shrinking the table, if the current table
     // contains more elements than can be held by new_hashpower, the resulting
     // hashpower will be greater than new_hashpower. It needs to take all the
     // bucket locks, since no other operations can change the table during
     // expansion. Throws libcuckoo_maximum_hashpower_exceeded if we're expanding
     // beyond the maximum hashpower, and we have an actual limit.
-    cuckoo_status cuckoo_expand_simple(size_t new_hp,
-                                       bool is_expansion) {
-        size_t mhp = maximum_hashpower();
-        if (mhp != NO_MAXIMUM_HASHPOWER && new_hp > mhp) {
-            throw libcuckoo_maximum_hashpower_exceeded(new_hp);
+    template <typename LOCK_T, typename AUTO_RESIZE>
+    cuckoo_status cuckoo_expand_simple(size_type new_hp) {
+        const auto unlocker = snapshot_and_lock_all<LOCK_T>();
+        const size_type hp = hashpower();
+        cuckoo_status st = check_resize_validity<AUTO_RESIZE>(hp, new_hp);
+        if (st != ok) {
+            return st;
         }
-        auto unlocker = snapshot_and_lock_all();
-        const size_t hp = get_hashpower();
-        if ((is_expansion && new_hp <= hp) ||
-            (!is_expansion && new_hp >= hp)) {
-            // Most likely another expansion ran before this one could grab the
-            // locks
-            LIBCUCKOO_DBG("%s", "another expansion is on-going\n");
-            return failure_under_expansion;
-        }
-
         // Creates a new hash table with hashpower new_hp and adds all
-        // the elements from the old buckets
-        cuckoohash_map<Key, T, Hash, Pred, Alloc, slot_per_bucket> new_map(
-            hashsize(new_hp) * slot_per_bucket);
-        const size_t threadnum = kNumCores();
-        const size_t buckets_per_thread = (
-            (hashsize(hp) + threadnum - 1) / threadnum);
-        std::vector<std::thread> insertion_threads(threadnum);
-        for (size_t i = 0; i < threadnum; ++i) {
-            insertion_threads[i] = std::thread(
-                insert_into_table, std::ref(new_map), std::ref(buckets_),
-                i*buckets_per_thread, std::min((i+1)*buckets_per_thread,
-                                               hashsize(hp)));
-        }
-        for (size_t i = 0; i < threadnum; ++i) {
-            insertion_threads[i].join();
-        }
+        // the elements from the old buckets.
+        cuckoohash_map new_map(
+            hashsize(new_hp) * slot_per_bucket(),
+            hash_function(),
+            key_eq(),
+            get_allocator());
+
+        parallel_exec(
+            0, hashsize(hp),
+            [this, &new_map]
+            (size_type i, size_type end, std::exception_ptr& eptr) {
+                try {
+                    for (; i < end; ++i) {
+                        for (size_type j = 0; j < slot_per_bucket(); ++j) {
+                            if (buckets_[i].occupied(j)) {
+                                storage_value_type& kvpair = (
+                                    buckets_[i].storage_kvpair(j));
+                                new_map.insert(kvpair.first,
+                                               std::move(kvpair.second));
+                            }
+                        }
+                    }
+                } catch (...) {
+                    eptr = std::current_exception();
+                }
+            });
 
         // Swap the current buckets vector with new_map's and set the hashpower.
         // This is okay, because we have all the locks, so nobody else should be
         // reading from the buckets array. Then the old buckets array will be
         // deleted when new_map is deleted. All the locks should be released by
         // the unlocker as well.
-        std::swap(buckets_, new_map.buckets_);
+        buckets_.swap(new_map.buckets_);
         set_hashpower(new_map.hashpower_);
         return ok;
     }
 
-public:
-    //! A locked_table is an ownership wrapper around a \ref cuckoohash_map
-    //! table instance. When given a table instance, it takes all the locks on
-    //! the table, blocking all outside operations on the table. Because the
-    //! locked_table has unique ownership of the table, it can provide a set of
-    //! operations on the table that aren't possible in a concurrent context.
-    //! Right now, this includes the ability to construct STL-compatible
-    //! iterators on the table. When the locked_table is destroyed (or the \ref
-    //! release method is called), it will release all locks on the table. This
-    //! will invalidate all existing iterators.
-    class locked_table {
-        // A manager for all the locks we took on the table.
-        AllUnlocker unlocker_;
-        // A reference to the buckets owned by the table
-        std::reference_wrapper<buckets_t> buckets_;
-        // A boolean shared to all iterators, indicating whether the
-        // locked_table has ownership of the hashtable or not.
-        std::shared_ptr<bool> has_table_lock_;
-
-        // The constructor locks the entire table, retrying until
-        // snapshot_and_lock_all succeeds. We keep this constructor private (but
-        // expose it to the cuckoohash_map class), since we don't want users
-        // calling it.
-        locked_table(cuckoohash_map<Key, T, Hash, Pred, Alloc,
-                     SLOT_PER_BUCKET>& hm)
-            : unlocker_(std::move(hm.snapshot_and_lock_all())),
-              buckets_(hm.buckets_),
-              has_table_lock_(new bool(true)) {}
+    // Executes the function over the given range, split across a number of
+    // threads determined by the hardware concurrency. Any exception thrown by
+    // a worker is rethrown after all the threads have been joined.
+    template <typename F>
+    static void parallel_exec(size_type start, size_type end, F func) {
+        static const size_type num_threads = (
+            std::thread::hardware_concurrency() == 0 ?
+            1 : std::thread::hardware_concurrency());
+        size_type work_per_thread = (end - start) / num_threads;
+        std::vector<std::thread, typename allocator_traits_::
+        template rebind_alloc<std::thread> > threads(num_threads);
+        std::vector<std::exception_ptr, typename allocator_traits_::
+        template rebind_alloc<std::exception_ptr>> eptrs(num_threads, nullptr);
+        for (size_type i = 0; i < num_threads - 1; ++i) {
+            threads[i] = std::thread(func, start, start + work_per_thread,
+                                     std::ref(eptrs[i]));
+            start += work_per_thread;
+        }
+        threads.back() = std::thread(func, start, end, std::ref(eptrs.back()));
+        for (std::thread& t : threads) {
+            t.join();
+        }
+        for (std::exception_ptr& eptr : eptrs) {
+            if (eptr) {
+                std::rethrow_exception(eptr);
+            }
+        }
+    }
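
A simplified standalone version of the same pattern, with a default allocator
and a small usage example (names illustrative); exceptions from workers are
captured as std::exception_ptr and rethrown after the join:

    #include <algorithm>
    #include <atomic>
    #include <cstddef>
    #include <cstdio>
    #include <exception>
    #include <functional>
    #include <thread>
    #include <vector>

    template <typename F>
    void parallel_exec(std::size_t start, std::size_t end, F func) {
        const std::size_t n =
            std::max(1u, std::thread::hardware_concurrency());
        const std::size_t per = (end - start) / n;
        std::vector<std::thread> threads;
        std::vector<std::exception_ptr> eptrs(n, nullptr);
        for (std::size_t i = 0; i + 1 < n; ++i, start += per) {
            threads.emplace_back(func, start, start + per, std::ref(eptrs[i]));
        }
        threads.emplace_back(func, start, end, std::ref(eptrs[n - 1]));
        for (auto& t : threads) t.join();
        for (auto& e : eptrs) if (e) std::rethrow_exception(e);
    }

    int main() {
        std::vector<int> v(100000, 1);
        std::atomic<long> total(0);
        parallel_exec(0, v.size(),
                      [&](std::size_t b, std::size_t e, std::exception_ptr&) {
                          long local = 0;
                          for (; b < e; ++b) local += v[b];
                          total += local;
                      });
        std::printf("%ld\n", total.load());   // prints 100000
        return 0;
    }
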
 
-    public:
-        locked_table(locked_table&& lt)
-            : unlocker_(std::move(lt.unlocker_)),
-              buckets_(std::move(lt.buckets_)),
-              has_table_lock_(std::move(lt.has_table_lock_)) {}
+    // Deletion functions
 
-        locked_table& operator=(locked_table&& lt) {
-            release();
-            unlocker_ = std::move(lt.unlocker_);
-            buckets_ = std::move(lt.buckets_);
-            has_table_lock_ = std::move(lt.has_table_lock_);
-            return *this;
-        }
+    // Removes an item from a bucket, decrementing the associated counter as
+    // well.
+    void del_from_bucket(Bucket& b, const size_type bucket_ind,
+                         const size_type slot) {
+        b.eraseKV(allocator_, slot);
+        --locks_[lock_ind(bucket_ind)].elem_counter();
+    }
 
-        //! Returns true if the locked table still has ownership of the
-        //! hashtable, false otherwise.
-        bool has_table_lock() const noexcept {
-            return has_table_lock_ && *has_table_lock_;
+    // Empties the table, calling the destructors of all the elements it removes
+    // from the table. It assumes the locks are taken as necessary.
+    cuckoo_status cuckoo_clear() {
+        for (size_type i = 0; i < buckets_.size(); ++i) {
+            buckets_[i].clear(allocator_);
+        }
+        for (size_type i = 0; i < locks_.size(); ++i) {
+            locks_[i].elem_counter() = 0;
         }
+        return ok;
+    }
 
-        //! release unlocks the table, thereby freeing it up for other
-        //! operations, but also invalidating all iterators and future
-        //! operations with this table. It is idempotent.
-        void release() noexcept {
-            if (has_table_lock()) {
-                unlocker_.release();
-                *has_table_lock_ = false;
-            }
+    // Rehashing functions
+
+    template <typename LOCK_T>
+    bool cuckoo_rehash(size_type n) {
+        const size_type hp = hashpower();
+        if (n == hp) {
+            return false;
         }
+        return cuckoo_expand_simple<LOCK_T, manual_resize>(n) == ok;
+    }
 
-        ~locked_table() {
-            release();
+    template <typename LOCK_T>
+    bool cuckoo_reserve(size_type n) {
+        const size_type hp = hashpower();
+        const size_type new_hp = reserve_calc(n);
+        if (new_hp == hp) {
+            return false;
         }
+        return cuckoo_expand_simple<LOCK_T, manual_resize>(new_hp) == ok;
+    }
 
-    private:
-        //! A templated iterator whose implementation works for both const and
-        //! non_const iterators. It is an STL-style BidirectionalIterator that
-        //! can be used to iterate over a locked table.
-        template <bool IS_CONST>
-        class templated_iterator :
-            public std::iterator<std::bidirectional_iterator_tag, value_type> {
+    // Miscellaneous functions
 
-            typedef typename std::conditional<
-                IS_CONST, const buckets_t, buckets_t>::type
-            maybe_const_buckets_t;
+    void set_hashpower(size_type val) {
+        hashpower_.store(val, std::memory_order_release);
+    }
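
The store above uses memory_order_release; a standalone sketch of the pairing
assumed on the reader side (a hashpower() getter, defined elsewhere in this
header, presumably loading with memory_order_acquire):

    #include <atomic>
    #include <cstddef>

    std::atomic<std::size_t> hashpower_{16};

    std::size_t hashpower() {
        return hashpower_.load(std::memory_order_acquire);
    }

    void set_hashpower(std::size_t val) {
        hashpower_.store(val, std::memory_order_release);
    }

    int main() {
        set_hashpower(17);
        return hashpower() == 17 ? 0 : 1;
    }
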
 
-            // The buckets locked and owned by the locked table being iterated
-            // over.
-            std::reference_wrapper<maybe_const_buckets_t> buckets_;
+    // reserve_calc takes a parameter n specifying a number of slots for the
+    // table, and returns the smallest hashpower whose table can hold n elements.
+    static size_type reserve_calc(const size_type n) {
+        const size_type buckets = (n + slot_per_bucket() - 1) / slot_per_bucket();
+        size_type blog2;
+        for (blog2 = 1; (1UL << blog2) < buckets; ++blog2);
+        assert(n <= hashsize(blog2) * slot_per_bucket());
+        return blog2;
+    }
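
A worked example of reserve_calc's arithmetic (sketch, with slot_per_bucket
passed as a parameter): n = 1000 with 4 slots per bucket needs
ceil(1000/4) = 250 buckets, and the smallest power of two at least that large
is 2^8 = 256 buckets, so the returned hashpower is 8 (1024 slots of capacity).

    #include <cassert>
    #include <cstddef>

    std::size_t reserve_calc(std::size_t n, std::size_t spb) {
        const std::size_t buckets = (n + spb - 1) / spb;
        std::size_t blog2;
        for (blog2 = 1; (1UL << blog2) < buckets; ++blog2) {
        }
        return blog2;
    }

    int main() {
        assert(reserve_calc(1000, 4) == 8);
        assert(reserve_calc(4, 4) == 1);   // the loop never returns less than 1
        return 0;
    }
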
 
-            // The shared boolean indicating whether the iterator points to a
-            // still-locked table or not. It should never be nullptr.
-            std::shared_ptr<bool> has_table_lock_;
+    // This class is a friend for unit testing
+    friend class UnitTestInternalAccess;
 
-            // The bucket index of the item being pointed to. For implementation
-            // convenience, we let it take on negative values.
-            intmax_t index_;
-            // The slot in the bucket of the item being pointed to. For
-            // implementation convenience, we let it take on negative values.
-            intmax_t slot_;
+    // Member variables
 
-        public:
-            //! Return true if the iterators are from the same locked table and
-            //! location, false otherwise. This will return false if either of
-            //! the iterators has lost ownership of its table.
-            template <bool OTHER_CONST>
-            bool operator==(const templated_iterator<OTHER_CONST>&
-                            it) const noexcept {
-                return (*has_table_lock_ && *it.has_table_lock_
-                        && &buckets_.get() == &it.buckets_.get()
-                        && index_ == it.index_ && slot_ == it.slot_);
-            }
+    // 2**hashpower is the number of buckets. This cannot be changed unless all
+    // the locks are taken on the table. Since it is still read and written by
+    // multiple threads not necessarily synchronized by a lock, we keep it
+    // atomic
+    std::atomic<size_type> hashpower_;
 
-            //! Equivalent to !operator==(it)
-            template <bool OTHER_CONST>
-            bool operator!=(const templated_iterator<OTHER_CONST>&
-                            it) const noexcept {
-                return !(operator==(it));
-            }
+    // The hash function
+    hasher hash_fn_;
 
-            //! Return the key-value pair pointed to by the iterator. Behavior
-            //! is undefined if the iterator is at the end.
-            const value_type& operator*() const {
-                check_iterator();
-                return buckets_.get()[index_].kvpair(slot_);
-            }
+    // The equality function
+    key_equal eq_fn_;
+
+    // The allocator
+    allocator_type allocator_;
+
+    // vector of buckets. The size or memory location of the buckets cannot be
+    // changed unless all the locks are taken on the table. Thus, it is only safe
+    // to access the buckets_ vector when you have at least one lock held.
+    buckets_t buckets_;
+
+    // array of locks. marked mutable, so that const methods can take locks.
+    // Even though it's a vector, it should not ever change in size after the
+    // initial allocation.
+    mutable locks_t locks_;
+
+    // a lock to synchronize expansions
+    expansion_lock_t expansion_lock_;
+
+    // stores the minimum load factor allowed for automatic expansions. Whenever
+    // an automatic expansion is triggered (during an insertion where cuckoo
+    // hashing fails, for example), we check the load factor against this
+    // threshold and throw an exception if it's lower. A persistently low load
+    // factor can signal a bad hash function or adversarial input.
 
-            //! Returns a mutable reference to the current key-value pair
-            //! pointed to by the iterator. Behavior is undefined if the
-            //! iterator is at the end.
-            ENABLE_IF(, !IS_CONST, value_type&) operator*() {
-                check_iterator();
-                return buckets_.get()[static_cast<size_t>(index_)].
-                    kvpair(static_cast<size_t>(slot_));
+    // stores the maximum hashpower allowed for any expansions. If set to
+    // LIBCUCKOO_NO_MAXIMUM_HASHPOWER, this limit will be disregarded.
+    std::atomic<size_type> maximum_hashpower_;
+
+public:
+    /**
+     * An ownership wrapper around a @ref cuckoohash_map table instance. When
+     * given a table instance, it takes all the locks on the table, blocking all
+     * outside operations on the table. Because the locked_table has unique
+     * ownership of the table, it can provide a set of operations on the table
+     * that aren't possible in a concurrent context.
+     *
+     * The locked_table interface is very similar to the STL unordered_map
+     * interface, and for functions whose signatures correspond to unordered_map
+     * methods, the behavior should be mostly the same.
+     */
+    class locked_table {
+    public:
+        /** @name Type Declarations */
+        /**@{*/
+
+        using key_type = cuckoohash_map::key_type;
+        using mapped_type = cuckoohash_map::mapped_type;
+        using value_type = cuckoohash_map::value_type;
+        using size_type = cuckoohash_map::size_type;
+        using difference_type = cuckoohash_map::difference_type;
+        using hasher = cuckoohash_map::hasher;
+        using key_equal = cuckoohash_map::key_equal;
+        using allocator_type = cuckoohash_map::allocator_type;
+        using reference = cuckoohash_map::reference;
+        using const_reference = cuckoohash_map::const_reference;
+        using pointer = cuckoohash_map::pointer;
+        using const_pointer = cuckoohash_map::const_pointer;
+
+        /**
+         * A constant iterator over a @ref locked_table, which allows read-only
+         * access to the elements of the table. It fulfills the
+         * BidirectionalIterator concept.
+         */
+        class const_iterator {
+        public:
+            using difference_type = locked_table::difference_type;
+            using value_type = locked_table::value_type;
+            using pointer = locked_table::const_pointer;
+            using reference = locked_table::const_reference;
+            using iterator_category = std::bidirectional_iterator_tag;
+
+            const_iterator() {}
+
+            // Return true if the iterators are from the same locked table and
+            // location, false otherwise.
+            bool operator==(const const_iterator& it) const {
+                return buckets_ == it.buckets_ &&
+                    index_ == it.index_ && slot_ == it.slot_;
             }
 
-            //! Return a pointer to the immutable key-value pair pointed to by
-            //! the iterator. Behavior is undefined if the iterator is at the
-            //! end.
-            const value_type* operator->() const {
-                check_iterator();
-                return &buckets_.get()[index_].kvpair(slot_);
+            bool operator!=(const const_iterator& it) const {
+                return !(operator==(it));
             }
 
-            //! Returns a mutable pointer to the current key-value pair pointed
-            //! to by the iterator. Behavior is undefined if the iterator is at
-            //! the end.
-            ENABLE_IF(, !IS_CONST, value_type*) operator->() {
-                check_iterator();
-                return &buckets_.get()[index_].kvpair(slot_);
+            reference operator*() const {
+                return (*buckets_)[index_].kvpair(slot_);
             }
 
+            pointer operator->() const {
+                return &(*buckets_)[index_].kvpair(slot_);
+            }
 
-            //! Advance the iterator to the next item in the table, or to the
-            //! end of the table. Returns the iterator at its new position.
-            //! Behavior is undefined if the iterator is at the end.
-            templated_iterator& operator++() {
+            // Advance the iterator to the next item in the table, or to the end
+            // of the table. Returns the iterator at its new position.
+            const_iterator& operator++() {
                 // Move forward until we get to a slot that is occupied, or we
                 // get to the end
-                check_iterator();
-                for (; static_cast<size_t>(index_) < buckets_.get().size();
-                     ++index_) {
-                    while (static_cast<size_t>(++slot_) < SLOT_PER_BUCKET) {
-                        if (buckets_.get()[static_cast<size_t>(index_)].
-                            occupied(static_cast<size_t>(slot_))) {
+                ++slot_;
+                for (; index_ < buckets_->size(); ++index_) {
+                    for (; slot_ < slot_per_bucket(); ++slot_) {
+                        if ((*buckets_)[index_].occupied(slot_)) {
                             return *this;
                         }
                     }
-                    slot_ = -1;
+                    slot_ = 0;
                 }
-                // We're at the end, so set index_ and slot_ to the end position
-                std::tie(index_, slot_) = end_pos(buckets_.get());
+                assert(std::make_pair(index_, slot_) == end_pos(*buckets_));
                 return *this;
             }
 
-            //! Advance the iterator to the next item in the table, or to the
-            //! end of the table. Returns the iterator at its old position.
-            //! Behavior is undefined if the iterator is at the end.
-            templated_iterator operator++(int) {
-                templated_iterator old(*this);
+            // Advance the iterator to the next item in the table, or to the end
+            // of the table. Returns the iterator at its old position.
+            const_iterator operator++(int) {
+                const_iterator old(*this);
                 ++(*this);
                 return old;
             }
 
-            //! Move the iterator back to the previous item in the table.
-            //! Returns the iterator at its new position. Behavior is undefined
-            //! if the iterator is at the beginning.
-            templated_iterator& operator--() {
-                // Move backward until we get to the beginning. If we try to
-                // move before that, we stop.
-                check_iterator();
-                for (; index_ >= 0; --index_) {
-                    while (--slot_ >= 0) {
-                        if (buckets_.get()[static_cast<size_t>(index_)]
-                            .occupied(static_cast<size_t>(slot_))) {
-                            return *this;
-                        }
+            // Move the iterator back to the previous item in the table. Returns
+            // the iterator at its new position.
+            const_iterator& operator--() {
+                // Move backward until we get to the beginning. Behavior is
+                // undefined if the iterator is already at the first element, so
+                // we can assume we'll find an occupied slot before underflowing
+                // past index_ == 0 and slot_ == 0.
+                if (slot_ == 0) {
+                    --index_;
+                    slot_ = slot_per_bucket() - 1;
+                } else {
+                    --slot_;
+                }
+                while (!(*buckets_)[index_].occupied(slot_)) {
+                    if (slot_ == 0) {
+                        --index_;
+                        slot_ = slot_per_bucket() - 1;
+                    } else {
+                        --slot_;
                     }
-                    slot_ = SLOT_PER_BUCKET;
                 }
-                // Either we iterated before begin(), which means we're in
-                // undefined territory, or we iterated from the end of the table
-                // back, which means the table is empty. Either way, setting the
-                // index_ and slot_ to end_pos() is okay.
-                std::tie(index_, slot_) = end_pos(buckets_.get());
                 return *this;
             }
 
             //! Move the iterator back to the previous item in the table.
             //! Returns the iterator at its old position. Behavior is undefined
             //! if the iterator is at the beginning.
-            templated_iterator operator--(int) {
-                templated_iterator old(*this);
+            const_iterator operator--(int) {
+                const_iterator old(*this);
                 --(*this);
                 return old;
             }
 
-        private:
-            static const std::pair<intmax_t, intmax_t> end_pos(
-                const buckets_t& buckets) {
-                // When index_ == buckets.size() and slot_ == 0, we're at the
-                // end of the table. When index_ and slot_ point to the data
-                // with the lowest bucket and slot, we're at the beginning of
-                // the table. If there is nothing in the table, index_ ==
-                // buckets.size() and slot_ == 0 also means we're at the
-                // beginning of the table (so begin() == end()).
-                return {buckets.size(), 0};
+        protected:
+            // The buckets owned by the locked table being iterated over. Even
+            // though const_iterator cannot modify the buckets, we don't mark
+            // them const so that the mutable iterator can derive from this
+            // class. Also, since iterators should be default constructible,
+            // copyable, and movable, we have to make this a raw pointer type.
+            buckets_t* buckets_;
+
+            // The bucket index of the item being pointed to.
+            size_type index_;
+
+            // The slot in the bucket of the item being pointed to.
+            size_type slot_;
+
+            // Returns the position signifying the end of the table
+            static std::pair<size_type, size_type>
+            end_pos(const buckets_t& buckets) {
+                return std::make_pair(buckets.size(), 0);
             }
 
             // The private constructor is used by locked_table to create
             // iterators from scratch. If the given index_-slot_ pair is at the
-            // end of the table, or that spot is occupied, stay. Otherwise, step
-            // forward to the next data item, or to the end of the table.
-            templated_iterator(
-                maybe_const_buckets_t& buckets,
-                std::shared_ptr<bool> has_table_lock, size_t index, size_t slot)
-                : buckets_(buckets), has_table_lock_(has_table_lock),
-                  index_(static_cast<intmax_t>(index)),
-                  slot_(static_cast<intmax_t>(slot)) {
-                if (std::make_pair(index_, slot_) != end_pos(buckets) &&
-                    !buckets[static_cast<size_t>(index_)]
-                    .occupied(static_cast<size_t>(slot_))) {
+            // end of the table, or the given spot is occupied, stay. Otherwise,
+            // step forward to the next data item, or to the end of the table.
+            const_iterator(buckets_t& buckets, size_type index,
+                           size_type slot) noexcept
+                : buckets_(std::addressof(buckets)), index_(index), slot_(slot) {
+                if (std::make_pair(index_, slot_) != end_pos(*buckets_) &&
+                    !(*buckets_)[index_].occupied(slot_)) {
                     operator++();
                 }
             }
 
-            // Throws an exception if the iterator has been invalidated because
-            // the locked_table lost ownership of the table info.
-            void check_iterator() const {
-                if (!(*has_table_lock_)) {
-                    throw std::runtime_error("Iterator has been invalidated");
-                }
+            friend class locked_table;
+        };
+
+        /**
+         * An iterator over a @ref locked_table, which allows read-write access
+         * to elements of the table. It fulfills the BidirectionalIterator
+         * concept.
+         */
+        class iterator : public const_iterator {
+        public:
+            using pointer = cuckoohash_map::pointer;
+            using reference = cuckoohash_map::reference;
+
+            iterator() {}
+
+            bool operator==(const iterator& it) const {
+                return const_iterator::operator==(it);
+            }
+
+            bool operator!=(const iterator& it) const {
+                return const_iterator::operator!=(it);
+            }
+
+            using const_iterator::operator*;
+            reference operator*() {
+                return (*const_iterator::buckets_)[
+                    const_iterator::index_].kvpair(const_iterator::slot_);
+            }
+
+            using const_iterator::operator->;
+            pointer operator->() {
+                return &(*const_iterator::buckets_)[
+                    const_iterator::index_].kvpair(const_iterator::slot_);
+            }
+
+            iterator& operator++() {
+                const_iterator::operator++();
+                return *this;
+            }
+
+            iterator operator++(int) {
+                iterator old(*this);
+                const_iterator::operator++();
+                return old;
+            }
+
+            iterator& operator--() {
+                const_iterator::operator--();
+                return *this;
             }
 
-            friend class cuckoohash_map<Key, T, Hash, Pred,
-                                        Alloc, SLOT_PER_BUCKET>;
+            iterator operator--(int) {
+                iterator old(*this);
+                const_iterator::operator--();
+                return old;
+            }
+
+        private:
+            iterator(buckets_t& buckets, size_type index, size_type slot) noexcept
+                : const_iterator(buckets, index, slot) {}
+
+            friend class locked_table;
         };
 
-    public:
-        typedef templated_iterator<true> const_iterator;
-        typedef templated_iterator<false> iterator;
+        /**@}*/
+
+        /** @name Table Parameters */
+        /**@{*/
+
+        static constexpr size_type slot_per_bucket() {
+            return cuckoohash_map::slot_per_bucket();
+        }
+
+        /**@}*/
+
+        /** @name Constructors, Destructors, and Assignment */
+        /**@{*/
+
+        locked_table() = delete;
+        locked_table(const locked_table&) = delete;
+        locked_table& operator=(const locked_table&) = delete;
+
+        locked_table(locked_table&& lt) noexcept
+            : map_(std::move(lt.map_)),
+              unlocker_(std::move(lt.unlocker_))
+            {}
+
+        locked_table& operator=(locked_table&& lt) noexcept {
+            unlock();
+            map_ = std::move(lt.map_);
+            unlocker_ = std::move(lt.unlocker_);
+            return *this;
+        }
+
+        /**
+         * Unlocks the table, thereby freeing its locks, but also invalidating
+         * all iterators and further table operations through this object. It
+         * is idempotent.
+         */
+        void unlock() {
+            unlocker_.unlock();
+        }
+
+        /**@}*/
+
+        /** @name Table Details
+         *
+         * Methods for getting information about the table. Many are identical
+         * to their @ref cuckoohash_map counterparts. Only new functions or
+         * those with different behavior are documented.
+         *
+         */
+        /**@{*/
+
+        /**
+         * Returns whether the locked table has ownership of the table
+         *
+         * @return true if it still has ownership, false otherwise
+         */
+        bool is_active() const {
+            return unlocker_.is_active();
+        }
+
+        hasher hash_function() const {
+            return map_.get().hash_function();
+        }
+
+        key_equal key_eq() const {
+            return map_.get().key_eq();
+        }
+
+        allocator_type get_allocator() const {
+            return map_.get().get_allocator();
+        }
+
+        size_type hashpower() const {
+            return map_.get().hashpower();
+        }
+
+        size_type bucket_count() const {
+            return map_.get().bucket_count();
+        }
+
+        bool empty() const {
+            return map_.get().empty();
+        }
+
+        size_type size() const {
+            return map_.get().size();
+        }
+
+        size_type capacity() const {
+            return map_.get().capacity();
+        }
+
+        double load_factor() const {
+            return map_.get().load_factor();
+        }
+
+        void minimum_load_factor(const double mlf) {
+            map_.get().minimum_load_factor(mlf);
+        }
+
+        double minimum_load_factor() {
+            return map_.get().minimum_load_factor();
+        }
+
+        void maximum_hashpower(size_type mhp) {
+            map_.get().maximum_hashpower(mhp);
+        }
+
+        size_type maximum_hashpower() {
+            return map_.get().maximum_hashpower();
+        }
+
+        /**@}*/
+
+        /** @name Iterators */
+        /**@{*/
+
+        /**@{*/
+        /**
+         * Returns an iterator to the beginning of the table. If the table is
+         * empty, it will point past the end of the table.
+         *
+         * @return an iterator to the beginning of the table
+         */
 
-        //! begin returns an iterator to the beginning of the table
         iterator begin() {
-            check_table();
-            return iterator(buckets_.get(), has_table_lock_, 0, 0);
+            return iterator(map_.get().buckets_, 0, 0);
         }
 
-        //! begin returns a const_iterator to the beginning of the table
         const_iterator begin() const {
-            check_table();
-            return const_iterator(buckets_.get(), has_table_lock_, 0, 0);
+            return const_iterator(map_.get().buckets_, 0, 0);
         }
 
-        //! cbegin returns a const_iterator to the beginning of the table
         const_iterator cbegin() const {
             return begin();
         }
 
-        //! end returns an iterator to the end of the table
+        /**@}*/
+
+        /**@{*/
+        /**
+         * Returns an iterator past the end of the table.
+         *
+         * @return an iterator past the end of the table
+         */
+
         iterator end() {
-            check_table();
-            const auto end_pos = const_iterator::end_pos(buckets_.get());
-            return iterator(buckets_.get(), has_table_lock_,
-                            static_cast<size_t>(end_pos.first),
-                            static_cast<size_t>(end_pos.second));
+            const auto end_pos = const_iterator::end_pos(map_.get().buckets_);
+            return iterator(map_.get().buckets_,
+                            static_cast<size_type>(end_pos.first),
+                            static_cast<size_type>(end_pos.second));
         }
 
-        //! end returns a const_iterator to the end of the table
         const_iterator end() const {
-            check_table();
-            const auto end_pos = const_iterator::end_pos(buckets_.get());
-            return const_iterator(buckets_.get(), has_table_lock_,
-                                  static_cast<size_t>(end_pos.first),
-                                  static_cast<size_t>(end_pos.second));
+            const auto end_pos = const_iterator::end_pos(map_.get().buckets_);
+            return const_iterator(map_.get().buckets_,
+                                  static_cast<size_type>(end_pos.first),
+                                  static_cast<size_type>(end_pos.second));
         }
 
-        //! cend returns a const_iterator to the end of the table
         const_iterator cend() const {
             return end();
         }
 
-    private:
-        // Throws an exception if the locked_table has been invalidated because
-        // it lost ownership of the table info.
-        void check_table() const {
-            if (!has_table_lock()) {
-                throw std::runtime_error(
-                    "locked_table lost ownership of table");
+        /**@}*/
+
+        /**@}*/
+
+        /** @name Modifiers */
+        /**@{*/
+
+        void clear() {
+            map_.get().cuckoo_clear();
+        }
+
+        /**
+         * This behaves like the @c unordered_map::try_emplace method, but with
+         * the same argument lifetime properties as @ref cuckoohash_map::insert.
+         * It will always invalidate all iterators, due to the possibilities of
+         * cuckoo hashing and expansion.
+         */
+        template <typename K, typename... Args>
+        std::pair<iterator, bool> insert(K&& key, Args&&... val) {
+            K k(std::forward<K>(key));
+            hash_value hv = map_.get().hashed_key(k);
+            auto b = map_.get().template snapshot_and_lock_two<locking_inactive>(hv);
+            table_position pos = map_.get().cuckoo_insert_loop(hv, b, k);
+            if (pos.status == ok) {
+                map_.get().add_to_bucket(
+                    pos.index, pos.slot, hv.partial, k,
+                    std::forward<Args>(val)...);
+            } else {
+                assert(pos.status == failure_key_duplicated);
+            }
+            return std::make_pair(
+                iterator(map_.get().buckets_, pos.index, pos.slot),
+                pos.status == ok);
+        }
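
A standalone usage sketch for locked_table, assuming the map still exposes a
lock_table() factory (defined elsewhere in this header) and that the bundled
header is included directly:

    #include <cstdio>
    #include <string>
    #include "cuckoohash_map.hh"

    int main() {
        cuckoohash_map<int, std::string> map;
        map.insert(1, "one");
        {
            auto lt = map.lock_table();   // takes every lock on the table
            lt.insert(2, "two");          // unordered_map-style modifiers
            lt[3] = "three";              // insert-or-default via operator[]
            for (const auto& kv : lt) {
                std::printf("%d -> %s\n", kv.first, kv.second.c_str());
            }
        }                                 // locks released when lt is destroyed
        return 0;
    }
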
+
+        iterator erase(const_iterator pos) {
+            map_.get().del_from_bucket(map_.get().buckets_[pos.index_],
+                                       pos.index_,
+                                       pos.slot_);
+            return iterator(map_.get().buckets_, pos.index_, pos.slot_);
+        }
+
+        iterator erase(iterator pos) {
+            map_.get().del_from_bucket(map_.get().buckets_[pos.index_],
+                                       pos.index_,
+                                       pos.slot_);
+            return iterator(map_.get().buckets_, pos.index_, pos.slot_);
+        }
+
+        template <typename K>
+        size_type erase(const K& key) {
+            const hash_value hv = map_.get().hashed_key(key);
+            const auto b = map_.get().
+                template snapshot_and_lock_two<locking_inactive>(hv);
+            const table_position pos = map_.get().cuckoo_find(
+                key, hv.partial, b.first(), b.second());
+            if (pos.status == ok) {
+                map_.get().del_from_bucket(map_.get().buckets_[pos.index],
+                                           pos.index, pos.slot);
+                return 1;
+            } else {
+                return 0;
             }
         }
 
-        friend class cuckoohash_map<Key, T, Hash, Pred, Alloc, SLOT_PER_BUCKET>;
-    };
+        /**@}*/
 
-    //! lock_table construct a \ref locked_table object that owns all the locks
-    //! in the table. This can be used to iterate through the table.
-    locked_table lock_table() {
-        return locked_table(*this);
-    }
+        /** @name Lookup */
+        /**@{*/
 
-    // This class is a friend for unit testing
-    friend class UnitTestInternalAccess;
+        template <typename K>
+        iterator find(const K& key) {
+            const hash_value hv = map_.get().hashed_key(key);
+            const auto b = map_.get().
+                template snapshot_and_lock_two<locking_inactive>(hv);
+            const table_position pos = map_.get().cuckoo_find(
+                key, hv.partial, b.first(), b.second());
+            if (pos.status == ok) {
+                return iterator(map_.get().buckets_, pos.index, pos.slot);
+            } else {
+                return end();
+            }
+        }
 
-    // Member variables
-private:
-    // 2**hashpower is the number of buckets. This cannot be changed unless all
-    // the locks are taken on the table. Since it is still read and written by
-    // multiple threads not necessarily synchronized by a lock, we keep it
-    // atomic
-    std::atomic<size_t> hashpower_;
+        template <typename K>
+        const_iterator find(const K& key) const {
+            const hash_value hv = map_.get().hashed_key(key);
+            const auto b = map_.get().
+                template snapshot_and_lock_two<locking_inactive>(hv);
+            const table_position pos = map_.get().cuckoo_find(
+                key, hv.partial, b.first(), b.second());
+            if (pos.status == ok) {
+                return const_iterator(map_.get().buckets_, pos.index, pos.slot);
+            } else {
+                return end();
+            }
+        }
 
-    // vector of buckets. The size or memory location of the buckets cannot be
-    // changed unless al the locks are taken on the table. Thus, it is only safe
-    // to access the buckets_ vector when you have at least one lock held.
-    buckets_t buckets_;
+        template <typename K>
+        mapped_type& at(const K& key) {
+            auto it = find(key);
+            if (it == end()) {
+                throw std::out_of_range("key not found in table");
+            } else {
+                return it->second;
+            }
+        }
 
-    // array of locks. marked mutable, so that const methods can take locks.
-    // Even though it's a vector, it should not ever change in size after the
-    // initial allocation.
-    mutable locks_t locks_;
+        template <typename K>
+        const mapped_type& at(const K& key) const {
+            auto it = find(key);
+            if (it == end()) {
+                throw std::out_of_range("key not found in table");
+            } else {
+                return it->second;
+            }
+        }
 
-    // a lock to synchronize expansions
-    expansion_lock_t expansion_lock_;
+        /**
+         * This function has the same lifetime properties as @ref
+         * cuckoohash_map::insert, except that the value is default-constructed,
+         * with no parameters, if it is not already in the table.
+         */
+        template <typename K>
+        T& operator[](K&& key) {
+            auto result = insert(std::forward<K>(key));
+            return result.first->second;
+        }
 
-    // per-core counters for the number of inserts and deletes
-    std::vector<
-        cacheint, typename allocator_type::template rebind<cacheint>::other>
-    num_inserts_, num_deletes_;
+        template <typename K>
+        size_type count(const K& key) const {
+            const hash_value hv = map_.get().hashed_key(key);
+            const auto b = map_.get().
+                template snapshot_and_lock_two<locking_inactive>(hv);
+            return map_.get().cuckoo_find(
+                key, hv.partial, b.first(), b.second()).status == ok ? 1 : 0;
+        }
 
-    // stores the minimum load factor allowed for automatic expansions. Whenever
-    // an automatic expansion is triggered (during an insertion where cuckoo
-    // hashing fails, for example), we check the load factor against this
-    // double, and throw an exception if it's lower than this value. It can be
-    // used to signal when the hash function is bad or the input adversarial.
-    std::atomic<double> minimum_load_factor_;
+        template <typename K>
+        std::pair<iterator, iterator> equal_range(const K& key) {
+            auto it = find(key);
+            if (it == end()) {
+                return std::make_pair(it, it);
+            } else {
+                auto start_it = it++;
+                return std::make_pair(start_it, it);
+            }
+        }
 
-    // stores the maximum hashpower allowed for any expansions. If set to
-    // NO_MAXIMUM_HASHPOWER, this limit will be disregarded.
-    std::atomic<size_t> maximum_hashpower_;
+        template <typename K>
+        std::pair<const_iterator, const_iterator> equal_range(const K& key) const {
+            auto it = find(key);
+            if (it == end()) {
+                return std::make_pair(it, it);
+            } else {
+                auto start_it = it++;
+                return std::make_pair(start_it, it);
+            }
+        }
 
-    // The hash function
-    hasher hash_fn;
+        /**@}*/
 
-    // The equality function
-    key_equal eq_fn;
+        /** @name Re-sizing */
+        /**@{*/
+
+        /**
+         * This has the same behavior as @ref cuckoohash_map::rehash, except
+         * that we don't return anything.
+         */
+        void rehash(size_type n) {
+            map_.get().template cuckoo_rehash<locking_inactive>(n);
+        }
+
+        /**
+         * This has the same behavior as @ref cuckoohash_map::reserve, except
+         * that we don't return anything.
+         */
+        void reserve(size_type n) {
+            map_.get().template cuckoo_reserve<locking_inactive>(n);
+        }
+
+        /**@}*/
+
+        /** @name Comparison  */
+        /**@{*/
+
+        bool operator==(const locked_table& lt) const {
+            if (size() != lt.size()) {
+                return false;
+            }
+            for (const auto& elem : lt) {
+                auto it = find(elem.first);
+                if (it == end() || it->second != elem.second) {
+                    return false;
+                }
+            }
+            return true;
+        }
+
+        bool operator!=(const locked_table& lt) const {
+            if (size() != lt.size()) {
+                return true;
+            }
+            for (const auto& elem : lt) {
+                auto it = find(elem.first);
+                if (it == end() || it->second != elem.second) {
+                    return true;
+                }
+            }
+            return false;
+        }
+
+        /**@}*/
+
+    private:
+        // The constructor locks the entire table. We keep this constructor
+        // private (but expose it to the cuckoohash_map class), since we don't
+        // want users calling it.
+        locked_table(cuckoohash_map& map) noexcept
+            : map_(map), unlocker_(
+                map_.get().template snapshot_and_lock_all<locking_active>())
+            {}
+
+        // A reference to the map owned by the table
+        std::reference_wrapper<cuckoohash_map> map_;
+        // A manager for all the locks we took on the table.
+        AllBuckets<locking_active> unlocker_;
+
+        friend class cuckoohash_map;
+    };
 };
 
 #endif // _CUCKOOHASH_MAP_HH
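
A minimal usage sketch for the reworked locked_table (not part of the diff;
it assumes the enclosing cuckoohash_map still exposes the lock_table()
factory documented in the removed code above):

    #include <iostream>
    #include <string>
    #include "cuckoohash_map.hh"

    int main() {
        cuckoohash_map<std::string, int> map;
        // Take ownership of every lock; no other thread can touch the
        // table until lt goes out of scope.
        auto lt = map.lock_table();
        lt.insert(std::string("a"), 1);      // invalidates all iterators
        lt[std::string("b")] = 2;            // default-constructs, then assigns
        auto it = lt.find(std::string("a"));
        if (it != lt.end()) {
            std::cout << it->first << " -> " << it->second << '\n';
        }
        lt.erase(std::string("b"));          // returns the number of keys erased
        return 0;
    }
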
diff --git a/include/cuckoohash_util.hh b/include/cuckoohash_util.hh
index 38521af..cdb31c6 100644
--- a/include/cuckoohash_util.hh
+++ b/include/cuckoohash_util.hh
@@ -5,15 +5,19 @@
 
 #include <exception>
 #include <thread>
+#include <utility>
 #include <vector>
 #include "cuckoohash_config.hh" // for LIBCUCKOO_DEBUG
 
 #if LIBCUCKOO_DEBUG
+//! When \ref LIBCUCKOO_DEBUG is nonzero, LIBCUCKOO_DBG prints out status
+//! messages in various situations
 #  define LIBCUCKOO_DBG(fmt, ...)                                          \
      fprintf(stderr, "\x1b[32m""[libcuckoo:%s:%d:%lu] " fmt"" "\x1b[0m",   \
-             __FILE__,__LINE__, (unsigned long)std::this_thread::get_id(), \
+             __FILE__,__LINE__, std::hash<std::thread::id>()(std::this_thread::get_id()), \
              __VA_ARGS__)
 #else
+//! When \ref LIBCUCKOO_DEBUG is 0, LIBCUCKOO_DBG does nothing
 #  define LIBCUCKOO_DBG(fmt, ...)  do {} while (0)
 #endif
 
@@ -54,14 +58,13 @@
 #  endif
 #endif
 
-// For enabling certain methods based on a condition. Here's an example.
-// ENABLE_IF(some_cond, type, static, inline) method() {
-//     ...
-// }
-#define ENABLE_IF(preamble, condition, return_type)                     \
-    template <class Bogus=void*>                                        \
-    preamble typename std::enable_if<sizeof(Bogus) &&                   \
-        condition, return_type>::type
+//! For enabling certain methods based on a condition. Here's an example.
+//! LIBCUCKOO_ENABLE_IF(sizeof(int) == 4, int) method() {
+//!     ...
+//! }
+#define LIBCUCKOO_ENABLE_IF(condition, return_type)                       \
+    template <class Bogus = void*>                                        \
+    typename std::enable_if<sizeof(Bogus) && condition, return_type>::type
 
 /**
  * Thrown when an automatic expansion is triggered, but the load factor of the
@@ -80,6 +83,9 @@ public:
     libcuckoo_load_factor_too_low(const double lf)
         : load_factor_(lf) {}
 
+    /**
+     * @return a descriptive error message
+     */
     virtual const char* what() const noexcept override {
         return "Automatic expansion triggered when load factor was below "
             "minimum threshold";
@@ -88,7 +94,7 @@ public:
     /**
      * @return the load factor of the table when the exception was thrown
      */
-    double load_factor() {
+    double load_factor() const {
         return load_factor_;
     }
 private:
@@ -110,6 +116,9 @@ public:
     libcuckoo_maximum_hashpower_exceeded(const size_t hp)
         : hashpower_(hp) {}
 
+    /**
+     * @return a descriptive error message
+     */
     virtual const char* what() const noexcept override {
         return "Expansion beyond maximum hashpower";
     }
@@ -117,61 +126,11 @@ public:
     /**
      * @return the hashpower we were trying to expand to
      */
-    size_t hashpower() {
+    size_t hashpower() const {
         return hashpower_;
     }
 private:
     const size_t hashpower_;
 };
 
-// Allocates an array of the given size and value-initializes each element with
-// the 0-argument constructor
-template <class T, class Alloc>
-T* create_array(const size_t size) {
-    Alloc allocator;
-    T* arr = allocator.allocate(size);
-    // Initialize all the elements, safely deallocating and destroying
-    // everything in case of error.
-    size_t i;
-    try {
-        for (i = 0; i < size; ++i) {
-            allocator.construct(&arr[i]);
-        }
-    } catch (...) {
-        for (size_t j = 0; j < i; ++j) {
-            allocator.destroy(&arr[j]);
-        }
-        allocator.deallocate(arr, size);
-        throw;
-    }
-    return arr;
-}
-
-// Destroys every element of an array of the given size and then deallocates the
-// memory.
-template <class T, class Alloc>
-void destroy_array(T* arr, const size_t size) {
-    Alloc allocator;
-    for (size_t i = 0; i < size; ++i) {
-        allocator.destroy(&arr[i]);
-    }
-    allocator.deallocate(arr, size);
-}
-
-// executes the function over the given range split over num_threads threads
-template <class F>
-static void parallel_exec(size_t start, size_t end,
-                          size_t num_threads, F func) {
-    size_t work_per_thread = (end - start) / num_threads;
-    std::vector<std::thread> threads(num_threads);
-    for (size_t i = 0; i < num_threads - 1; ++i) {
-        threads[i] = std::thread(func, start, start + work_per_thread);
-        start += work_per_thread;
-    }
-    threads[num_threads - 1] = std::thread(func, start, end);
-    for (std::thread& t : threads) {
-        t.join();
-    }
-}
-
 #endif // _CUCKOOHASH_UTIL_HH
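
Note that both accessors are now const, so they can be called on a caught
const reference. A minimal handling sketch (not part of the header; the
cuckoohash_map instance and the insert call are assumptions):

    #include <cstdio>
    #include "cuckoohash_map.hh"

    void insert_or_report(cuckoohash_map<int, int>& table, int k, int v) {
        try {
            table.insert(k, v);
        } catch (const libcuckoo_load_factor_too_low& e) {
            // Signals a bad hash function or adversarial input.
            std::fprintf(stderr, "expansion at load factor %f\n",
                         e.load_factor());
        } catch (const libcuckoo_maximum_hashpower_exceeded& e) {
            std::fprintf(stderr, "would exceed hashpower %zu\n",
                         e.hashpower());
        }
    }
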
diff --git a/include/lazy_array.hh b/include/lazy_array.hh
deleted file mode 100644
index 80191e7..0000000
--- a/include/lazy_array.hh
+++ /dev/null
@@ -1,119 +0,0 @@
-/** \file */
-
-#ifndef _LAZY_ARRAY_HH
-#define _LAZY_ARRAY_HH
-
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <memory>
-
-#include "cuckoohash_util.hh"
-
-// lazy array. A fixed-size array, broken up into segments that are dynamically
-// allocated, only when requested. The array size and segment size are
-// pre-defined, and are powers of two. The user must make sure the necessary
-// segments are allocated before accessing the array.
-template <uint8_t OFFSET_BITS, uint8_t SEGMENT_BITS,
-          class T, class Alloc = std::allocator<T>
-          >
-class lazy_array {
-    static_assert(SEGMENT_BITS + OFFSET_BITS <= sizeof(size_t)*8,
-                  "The number of segment and offset bits cannot exceed "
-                  " the number of bits in a size_t");
-private:
-    static const size_t SEGMENT_SIZE = 1UL << OFFSET_BITS;
-    static const size_t NUM_SEGMENTS = 1UL << SEGMENT_BITS;
-    // The segments array itself is mutable, so that the const subscript
-    // operator can still add segments
-    mutable std::array<T*, NUM_SEGMENTS> segments_;
-
-    void move_other_array(lazy_array&& arr) {
-        clear();
-        std::copy(arr.segments_.begin(), arr.segments_.end(),
-                  segments_.begin());
-        std::fill(arr.segments_.begin(), arr.segments_.end(), nullptr);
-    }
-
-    inline size_t get_segment(size_t i) {
-        return i >> OFFSET_BITS;
-    }
-
-    static const size_t OFFSET_MASK = ((1UL << OFFSET_BITS) - 1);
-    inline size_t get_offset(size_t i) {
-        return i & OFFSET_MASK;
-    }
-
-public:
-    lazy_array(): segments_{{nullptr}} {}
-
-    // No copying
-    lazy_array(const lazy_array&) = delete;
-    lazy_array& operator=(const lazy_array&) = delete;
-
-    // Moving is allowed
-    lazy_array(lazy_array&& arr) : segments_{{nullptr}} {
-        move_other_array(std::move(arr));
-    }
-    lazy_array& operator=(lazy_array&& arr) {
-        move_other_vector(std::move(arr));
-        return *this;
-    }
-
-    ~lazy_array() {
-        clear();
-    }
-
-    void clear() {
-        for (size_t i = 0; i < segments_.size(); ++i) {
-            if (segments_[i] != nullptr) {
-                destroy_array<T, Alloc>(segments_[i], SEGMENT_SIZE);
-                segments_[i] = nullptr;
-            }
-        }
-    }
-
-    T& operator[](size_t i) {
-        assert(segments_[get_segment(i)] != nullptr);
-        return segments_[get_segment(i)][get_offset(i)];
-    }
-
-    const T& operator[](size_t i) const {
-        assert(segments_[get_segment(i)] != nullptr);
-        return segments_[get_segment(i)][get_offset(i)];
-    }
-
-    // Ensures that the array has enough segments to index target elements, not
-    // exceeding the total size. The user must ensure that the array is properly
-    // allocated before accessing a certain index. This saves having to check
-    // every index operation.
-    void allocate(size_t target) {
-        assert(target <= size());
-        if (target == 0) {
-            return;
-        }
-        const size_t last_segment = get_segment(target - 1);
-        for (size_t i = 0; i <= last_segment; ++i) {
-            if (segments_[i] == nullptr) {
-                segments_[i] = create_array<T, Alloc>(SEGMENT_SIZE);
-            }
-        }
-    }
-
-    // Returns the number of elements in the array that can be indexed, starting
-    // contiguously from the beginning.
-    size_t allocated_size() const {
-        size_t num_allocated_segments = 0;
-        for (;
-             (num_allocated_segments < NUM_SEGMENTS &&
-              segments_[num_allocated_segments] != nullptr);
-             ++num_allocated_segments) {}
-        return num_allocated_segments * SEGMENT_SIZE;
-    }
-
-    static constexpr size_t size() {
-        return 1UL << (OFFSET_BITS + SEGMENT_BITS);
-    }
-};
-
-#endif // _LAZY_ARRAY_HH
diff --git a/include/libcuckoo_lazy_array.hh b/include/libcuckoo_lazy_array.hh
new file mode 100644
index 0000000..99c4b5b
--- /dev/null
+++ b/include/libcuckoo_lazy_array.hh
@@ -0,0 +1,202 @@
+/** \file */
+
+#ifndef _LIBCUCKOO_LAZY_ARRAY_HH
+#define _LIBCUCKOO_LAZY_ARRAY_HH
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <memory>
+
+#include "cuckoohash_util.hh"
+
+/**
+ * A fixed-size array, broken up into segments that are dynamically allocated
+ * upon request. It is the user's responsibility to make sure they only access
+ * allocated parts of the array.
+ *
+ * @tparam OFFSET_BITS the number of bits of the index used as the offset within
+ * a segment
+ * @tparam SEGMENT_BITS the number of bits of the index used as the segment
+ * index
+ * @tparam T the type of stored in the container
+ * @tparam Alloc the allocator used to allocate data
+ */
+template <uint8_t OFFSET_BITS, uint8_t SEGMENT_BITS,
+          class T, class Alloc = std::allocator<T>
+          >
+class libcuckoo_lazy_array {
+public:
+    using value_type = T;
+    using allocator_type = Alloc;
+private:
+    using traits_ = std::allocator_traits<allocator_type>;
+public:
+    using size_type = std::size_t;
+    using reference = value_type&;
+    using const_reference = const value_type&;
+
+    static_assert(SEGMENT_BITS + OFFSET_BITS <= sizeof(size_type)*8,
+                  "The number of segment and offset bits cannot exceed "
+                  " the number of bits in a size_type");
+
+    /**
+     * Default constructor. Creates an empty array with no allocated segments.
+     */
+    libcuckoo_lazy_array(const allocator_type& allocator = Alloc())
+        noexcept(noexcept(Alloc()))
+        : segments_{{nullptr}}, allocated_segments_(0), allocator_(allocator) {}
+
+    /**
+     * Constructs an array with enough segments allocated to fit @p target
+     * elements. Each allocated element is default-constructed.
+     *
+     * @param target the number of elements to allocate space for
+     */
+    libcuckoo_lazy_array(size_type target,
+                         const allocator_type& allocator = Alloc())
+        noexcept(noexcept(Alloc()))
+        : libcuckoo_lazy_array(allocator) {
+        segments_.fill(nullptr);
+        resize(target);
+    }
+
+    libcuckoo_lazy_array(const libcuckoo_lazy_array&) = delete;
+    libcuckoo_lazy_array& operator=(const libcuckoo_lazy_array&) = delete;
+
+    /**
+     * Move constructor
+     *
+     * @param arr the array being moved
+     */
+    libcuckoo_lazy_array(libcuckoo_lazy_array&& arr) noexcept
+        : segments_(arr.segments_),
+          allocated_segments_(arr.allocated_segments_),
+          allocator_(std::move(arr.allocator_)) {
+        // Deactivate the array by setting its allocated segment count to 0
+        arr.allocated_segments_ = 0;
+    }
+
+    /**
+     * Destructor. Destroys all elements allocated in the array.
+     */
+    ~libcuckoo_lazy_array()
+        noexcept(std::is_nothrow_destructible<T>::value) {
+        clear();
+    }
+
+    /**
+     * Destroys all elements allocated in the array.
+     */
+    void clear() {
+        for (size_type i = 0; i < allocated_segments_; ++i) {
+            destroy_array(segments_[i]);
+            segments_[i] = nullptr;
+        }
+    }
+
+    /**
+     * Index operator
+     *
+     * @return a reference to the data at the given index
+     */
+    reference operator[](size_type i) {
+        assert(get_segment(i) < allocated_segments_);
+        return segments_[get_segment(i)][get_offset(i)];
+    }
+
+    /**
+     * Const index operator
+     *
+     * @return a const reference to the data at the given index
+     */
+    const_reference operator[](size_type i) const {
+        assert(get_segment(i) < allocated_segments_);
+        return segments_[get_segment(i)][get_offset(i)];
+    }
+
+    /**
+     * Returns the number of elements the array has allocated space for
+     *
+     * @return current size of the array
+     */
+    size_type size() const {
+        return allocated_segments_ * SEGMENT_SIZE;
+    }
+
+    /**
+     * Returns the maximum number of elements the array can hold
+     *
+     * @return maximum size of the array
+     */
+    static constexpr size_type max_size() {
+        return 1UL << (OFFSET_BITS + SEGMENT_BITS);
+    }
+
+    /**
+     * Allocate enough space for @p target elements, not exceeding the capacity
+     * of the array. Under no circumstance will the array be shrunk.
+     *
+     * @param target the number of elements to ensure space is allocated for
+     */
+    void resize(size_type target) {
+        target = std::min(target, max_size());
+        if (target == 0) {
+            return;
+        }
+        const size_type last_segment = get_segment(target - 1);
+        for (size_type i = allocated_segments_; i <= last_segment; ++i) {
+            segments_[i] = create_array();
+        }
+        allocated_segments_ = last_segment + 1;
+    }
+
+private:
+    static constexpr size_type SEGMENT_SIZE = 1UL << OFFSET_BITS;
+    static constexpr size_type NUM_SEGMENTS = 1UL << SEGMENT_BITS;
+    static constexpr size_type OFFSET_MASK = SEGMENT_SIZE - 1;
+
+    std::array<T*, NUM_SEGMENTS> segments_;
+    size_type allocated_segments_;
+    allocator_type allocator_;
+
+    static size_type get_segment(size_type i) {
+        return i >> OFFSET_BITS;
+    }
+
+    static size_type get_offset(size_type i) {
+        return i & OFFSET_MASK;
+    }
+
+    // Allocates a SEGMENT_SIZE-sized array and default-initializes each element
+    typename traits_::pointer create_array() {
+        typename traits_::pointer arr = traits_::allocate(
+            allocator_, SEGMENT_SIZE);
+        // Initialize all the elements, safely deallocating and destroying
+        // everything in case of error.
+        size_type i;
+        try {
+            for (i = 0; i < SEGMENT_SIZE; ++i) {
+                traits_::construct(allocator_, &arr[i]);
+            }
+        } catch (...) {
+            for (size_type j = 0; j < i; ++j) {
+                traits_::destroy(allocator_, &arr[j]);
+            }
+            traits_::deallocate(allocator_, arr, SEGMENT_SIZE);
+            throw;
+        }
+        return arr;
+    }
+
+    // Destroys every element of a SEGMENT_SIZE-sized array and then deallocates
+    // the memory.
+    void destroy_array(typename traits_::pointer arr) {
+        for (size_type i = 0; i < SEGMENT_SIZE; ++i) {
+            traits_::destroy(allocator_, &arr[i]);
+        }
+        traits_::deallocate(allocator_, arr, SEGMENT_SIZE);
+    }
+};
+
+#endif // _LIBCUCKOO_LAZY_ARRAY_HH
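
A minimal usage sketch for the renamed lazy array (not part of the diff;
the template parameters are arbitrary):

    #include "libcuckoo_lazy_array.hh"

    int main() {
        // 2^4 elements per segment, up to 2^6 segments: max_size() == 1024.
        libcuckoo_lazy_array<4, 6, int> arr;
        arr.resize(100);   // whole segments are allocated; size() becomes 112
        arr[42] = 7;       // only indices below size() may be touched
        return arr[42];
    }
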
diff --git a/include/merge_files.hpp b/include/merge_files.hpp
deleted file mode 100644
index d717975..0000000
--- a/include/merge_files.hpp
+++ /dev/null
@@ -1,30 +0,0 @@
-/*  This file is part of Jellyfish.
-
-    Jellyfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    Jellyfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with Jellyfish.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef __JELLYFISH_MERGE_FILES_HPP__
-#define __JELLYFISH_MERGE_FILES_HPP__
-
-#include <vector>
-#include <jellyfish/err.hpp>
-#include <jellyfish/file_header.hpp>
-
-define_error_class(MergeError);
-
-/// Merge files. Throw a MergeError in case of error.
-void merge_files(std::vector<const char*> input_files, const char* out_file,
-                 jellyfish::file_header& h, uint64_t min, uint64_t max);
-
-#endif /* __JELLYFISH_MERGE_FILES_HPP__ */
diff --git a/include/pcg_extras.hpp b/include/pcg_extras.hpp
new file mode 100644
index 0000000..1e545a1
--- /dev/null
+++ b/include/pcg_extras.hpp
@@ -0,0 +1,638 @@
+/*
+ * PCG Random Number Generation for C++
+ *
+ * Copyright 2014 Melissa O'Neill <oneill at pcg-random.org>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * For additional information about the PCG random number generation scheme,
+ * including its license and other licensing options, visit
+ *
+ *     http://www.pcg-random.org
+ */
+
+/*
+ * This file provides support code that is useful for random-number generation
+ * but not specific to the PCG generation scheme, including:
+ *      - 128-bit int support for platforms where it isn't available natively
+ *      - bit twiddling operations
+ *      - I/O of 128-bit and 8-bit integers
+ *      - Handling the evilness of SeedSeq
+ *      - Support for efficiently producing random numbers less than a given
+ *        bound
+ */
+
+#ifndef PCG_EXTRAS_HPP_INCLUDED
+#define PCG_EXTRAS_HPP_INCLUDED 1
+
+#include <cinttypes>
+#include <cstddef>
+#include <cstdlib>
+#include <cstring>
+#include <cassert>
+#include <limits>
+#include <iostream>
+#include <type_traits>
+#include <utility>
+#include <locale>
+#include <iterator>
+#include <utility>
+
+#ifdef __GNUC__
+    #include <cxxabi.h>
+#endif
+
+/*
+ * Abstractions for compiler-specific directives
+ */
+
+#ifdef __GNUC__
+    #define PCG_NOINLINE __attribute__((noinline))
+#else
+    #define PCG_NOINLINE
+#endif
+
+/*
+ * Some members of the PCG library use 128-bit math.  When compiling on 64-bit
+ * platforms, both GCC and Clang provide 128-bit integer types that are ideal
+ * for the job.
+ *
+ * On 32-bit platforms (or with other compilers), we fall back to a C++
+ * class that provides 128-bit unsigned integers instead.  It may seem
+ * like we're reinventing the wheel here, because libraries already exist
+ * that support large integers, but most existing libraries provide very
+ * generic multiprecision code, whereas here we're operating at a fixed size.
+ * Also, most other libraries are fairly heavyweight.  So we use a direct
+ * implementation.  Sadly, it's much slower than hand-coded assembly or
+ * direct CPU support.
+ *
+ */
+#if __SIZEOF_INT128__
+    namespace pcg_extras {
+        typedef __uint128_t pcg128_t;
+    }
+    #define PCG_128BIT_CONSTANT(high,low) \
+            ((pcg128_t(high) << 64) + low)
+#else
+    #include "pcg_uint128.hpp"
+    namespace pcg_extras {
+        typedef pcg_extras::uint_x4<uint32_t,uint64_t> pcg128_t;
+    }
+    #define PCG_128BIT_CONSTANT(high,low) \
+            pcg128_t(high,low)
+    #define PCG_EMULATED_128BIT_MATH 1
+#endif
+
+
+namespace pcg_extras {
+
+/*
+ * We often need to represent a "number of bits".  When used normally, these
+ * numbers are never greater than 128, so an unsigned char is plenty.
+ * If you're using a nonstandard generator of a larger size, you can set
+ * PCG_BITCOUNT_T to have it define it as a larger size.  (Some compilers
+ * might produce faster code if you set it to an unsigned int.)
+ */
+
+#ifndef PCG_BITCOUNT_T
+    typedef uint8_t bitcount_t;
+#else
+    typedef PCG_BITCOUNT_T bitcount_t;
+#endif
+
+/*
+ * C++ requires us to be able to serialize RNG state by printing or reading
+ * it from a stream.  Because we use 128-bit ints, we also need to be able
+ * to print them, so here is code to do so.
+ *
+ * This code provides enough functionality to print 128-bit ints in decimal
+ * and zero-padded in hex.  It's not a full-featured implementation.
+ */
+
+template <typename CharT, typename Traits>
+std::basic_ostream<CharT,Traits>&
+operator<<(std::basic_ostream<CharT,Traits>& out, pcg128_t value)
+{
+    auto desired_base = out.flags() & out.basefield;
+    bool want_hex = desired_base == out.hex;
+
+    if (want_hex) {
+        uint64_t highpart = uint64_t(value >> 64);
+        uint64_t lowpart  = uint64_t(value);
+        auto desired_width = out.width();
+        if (desired_width > 16) {
+            out.width(desired_width - 16);
+        }
+        if (highpart != 0 || desired_width > 16)
+            out << highpart;
+        CharT oldfill = '\0';
+        if (highpart != 0) {
+            out.width(16);
+            oldfill = out.fill('0');
+        }
+        auto oldflags = out.setf(decltype(desired_base){}, out.showbase);
+        out << lowpart;
+        out.setf(oldflags);
+        if (highpart != 0) {
+            out.fill(oldfill);
+        }
+        return out;
+    }
+    constexpr size_t MAX_CHARS_128BIT = 40;
+
+    char buffer[MAX_CHARS_128BIT];
+    char* pos = buffer+sizeof(buffer);
+    *(--pos) = '\0';
+    constexpr auto BASE = pcg128_t(10ULL);
+    do {
+        auto div = value / BASE;
+        auto mod = uint32_t(value - (div * BASE));
+        *(--pos) = '0' + char(mod);
+        value = div;
+    } while(value != pcg128_t(0ULL));
+    return out << pos;
+}
+
+template <typename CharT, typename Traits>
+std::basic_istream<CharT,Traits>&
+operator>>(std::basic_istream<CharT,Traits>& in, pcg128_t& value)
+{
+    typename std::basic_istream<CharT,Traits>::sentry s(in);
+
+    if (!s)
+         return in;
+
+    constexpr auto BASE = pcg128_t(10ULL);
+    pcg128_t current(0ULL);
+    bool did_nothing = true;
+    bool overflow = false;
+    for(;;) {
+        CharT wide_ch = in.get();
+        if (!in.good())
+            break;
+        auto ch = in.narrow(wide_ch, '\0');
+        if (ch < '0' || ch > '9') {
+            in.unget();
+            break;
+        }
+        did_nothing = false;
+        pcg128_t digit(uint32_t(ch - '0'));
+        pcg128_t timesbase = current*BASE;
+        overflow = overflow || timesbase < current;
+        current = timesbase + digit;
+        overflow = overflow || current < digit;
+    }
+
+    if (did_nothing || overflow) {
+        in.setstate(std::ios::failbit);
+        if (overflow)
+            current = ~pcg128_t(0ULL);
+    }
+
+    value = current;
+
+    return in;
+}
+
+/*
+ * Likewise, if people use tiny rngs, we'll be serializing uint8_t.
+ * If we just used the provided IO operators, they'd read/write chars,
+ * not ints, so we need to define our own.  We *can* redefine this operator
+ * here because we're in our own namespace.
+ */
+
+template <typename CharT, typename Traits>
+std::basic_ostream<CharT,Traits>&
+operator<<(std::basic_ostream<CharT,Traits>&out, uint8_t value)
+{
+    return out << uint32_t(value);
+}
+
+template <typename CharT, typename Traits>
+std::basic_istream<CharT,Traits>&
+operator>>(std::basic_istream<CharT,Traits>& in, uint8_t& target)
+{
+    uint32_t value = 0xdecea5edU;
+    in >> value;
+    if (!in && value == 0xdecea5edU)
+        return in;
+    if (value > uint8_t(~0)) {
+        in.setstate(std::ios::failbit);
+        value = ~0U;
+    }
+    target = uint8_t(value);
+    return in;
+}
+
+/* Unfortunately, the above functions don't get found in preference to the
+ * built-in ones, so we create some more specific overloads that will.
+ * Ugh.
+ */
+
+inline std::ostream& operator<<(std::ostream& out, uint8_t value)
+{
+    return pcg_extras::operator<< <char>(out, value);
+}
+
+inline std::istream& operator>>(std::istream& in, uint8_t& value)
+{
+    return pcg_extras::operator>> <char>(in, value);
+}
+
+
+
+/*
+ * Useful bitwise operations.
+ */
+
+/*
+ * XorShifts are invertible, but they are something of a pain to invert.
+ * This function backs them out.  It's used by the whacky "inside out"
+ * generator defined later.
+ */
+
+template <typename itype>
+inline itype unxorshift(itype x, bitcount_t bits, bitcount_t shift)
+{
+    if (2*shift >= bits) {
+        return x ^ (x >> shift);
+    }
+    itype lowmask1 = (itype(1U) << (bits - shift*2)) - 1;
+    itype highmask1 = ~lowmask1;
+    itype top1 = x;
+    itype bottom1 = x & lowmask1;
+    top1 ^= top1 >> shift;
+    top1 &= highmask1;
+    x = top1 | bottom1;
+    itype lowmask2 = (itype(1U) << (bits - shift)) - 1;
+    itype bottom2 = x & lowmask2;
+    bottom2 = unxorshift(bottom2, bits - shift, shift);
+    bottom2 &= lowmask1;
+    return top1 | bottom2;
+}
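
As a sanity sketch (not part of the header): for y = x ^ (x >> s), the call
unxorshift(y, bits, s) recovers x, e.g.

    #include <cassert>
    #include <cstdint>
    #include "pcg_extras.hpp"

    void unxorshift_roundtrip(uint32_t x) {
        const pcg_extras::bitcount_t shift = 13;
        uint32_t y = x ^ (x >> shift);                     // forward xorshift
        assert(pcg_extras::unxorshift(y, 32, shift) == x);
    }
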
+
+/*
+ * Rotate left and right.
+ *
+ * In an ideal world, compilers would spot idiomatic rotate code and convert it
+ * to a rotate instruction.  Of course, opinions vary on what the correct
+ * idiom is and how to spot it.  For clang, sometimes it generates better
+ * (but still crappy) code if you define PCG_USE_ZEROCHECK_ROTATE_IDIOM.
+ */
+
+template <typename itype>
+inline itype rotl(itype value, bitcount_t rot)
+{
+    constexpr bitcount_t bits = sizeof(itype) * 8;
+    constexpr bitcount_t mask = bits - 1;
+#if PCG_USE_ZEROCHECK_ROTATE_IDIOM
+    return rot ? (value << rot) | (value >> (bits - rot)) : value;
+#else
+    return (value << rot) | (value >> ((- rot) & mask));
+#endif
+}
+
+template <typename itype>
+inline itype rotr(itype value, bitcount_t rot)
+{
+    constexpr bitcount_t bits = sizeof(itype) * 8;
+    constexpr bitcount_t mask = bits - 1;
+#if PCG_USE_ZEROCHECK_ROTATE_IDIOM
+    return rot ? (value >> rot) | (value << (bits - rot)) : value;
+#else
+    return (value >> rot) | (value << ((- rot) & mask));
+#endif
+}
+
+/* Unfortunately, both Clang and GCC sometimes perform poorly when it comes
+ * to properly recognizing idiomatic rotate code, so we also provide
+ * assembler directives (enabled with PCG_USE_INLINE_ASM).  Boo, hiss.
+ * (I hope that these compilers get better so that this code can die.)
+ *
+ * These overloads will be preferred over the general template code above.
+ */
+#if PCG_USE_INLINE_ASM && __GNUC__ && (__x86_64__  || __i386__)
+
+inline uint8_t rotr(uint8_t value, bitcount_t rot)
+{
+    asm ("rorb   %%cl, %0" : "=r" (value) : "0" (value), "c" (rot));
+    return value;
+}
+
+inline uint16_t rotr(uint16_t value, bitcount_t rot)
+{
+    asm ("rorw   %%cl, %0" : "=r" (value) : "0" (value), "c" (rot));
+    return value;
+}
+
+inline uint32_t rotr(uint32_t value, bitcount_t rot)
+{
+    asm ("rorl   %%cl, %0" : "=r" (value) : "0" (value), "c" (rot));
+    return value;
+}
+
+#if __x86_64__
+inline uint64_t rotr(uint64_t value, bitcount_t rot)
+{
+    asm ("rorq   %%cl, %0" : "=r" (value) : "0" (value), "c" (rot));
+    return value;
+}
+#endif // __x86_64__
+
+#endif // PCG_USE_INLINE_ASM
+
+
+/*
+ * The C++ SeedSeq concept (modelled by seed_seq) can fill an array of
+ * 32-bit integers with seed data, but sometimes we want to produce
+ * larger or smaller integers.
+ *
+ * The following code handles this annoyance.
+ *
+ * uneven_copy will copy an array of 32-bit ints to an array of larger or
+ * smaller ints (actually, the code is more general, needing only forward
+ * iterators).  The copy is identical to the one that would be performed if
+ * we just did memcpy on a standard little-endian machine, but works
+ * regardless of the endian of the machine (or the weirdness of the ints
+ * involved).
+ *
+ * generate_to initializes an array of integers using a SeedSeq
+ * object.  It is given the size as a static constant at compile time and
+ * tries to avoid memory allocation.  If we're filling in 32-bit constants
+ * we just do it directly.  If we need a separate buffer and it's small,
+ * we allocate it on the stack.  Otherwise, we fall back to heap allocation.
+ * Ugh.
+ *
+ * generate_one produces a single value of some integral type using a
+ * SeedSeq object.
+ */
+
+ /* uneven_copy helper, case where destination ints are less than 32 bit. */
+
+template<class SrcIter, class DestIter>
+SrcIter uneven_copy_impl(
+    SrcIter src_first, DestIter dest_first, DestIter dest_last,
+    std::true_type)
+{
+    typedef typename std::iterator_traits<SrcIter>::value_type  src_t;
+    typedef typename std::iterator_traits<DestIter>::value_type dest_t;
+
+    constexpr bitcount_t SRC_SIZE  = sizeof(src_t);
+    constexpr bitcount_t DEST_SIZE = sizeof(dest_t);
+    constexpr bitcount_t DEST_BITS = DEST_SIZE * 8;
+    constexpr bitcount_t SCALE     = SRC_SIZE / DEST_SIZE;
+
+    size_t count = 0;
+    src_t value = 0;
+
+    while (dest_first != dest_last) {
+        if ((count++ % SCALE) == 0)
+            value = *src_first++;       // Get more bits
+        else
+            value >>= DEST_BITS;        // Move down bits
+
+        *dest_first++ = dest_t(value);  // Truncates, ignores high bits.
+    }
+    return src_first;
+}
+
+ /* uneven_copy helper, case where destination ints are more than 32 bit. */
+
+template<class SrcIter, class DestIter>
+SrcIter uneven_copy_impl(
+    SrcIter src_first, DestIter dest_first, DestIter dest_last,
+    std::false_type)
+{
+    typedef typename std::iterator_traits<SrcIter>::value_type  src_t;
+    typedef typename std::iterator_traits<DestIter>::value_type dest_t;
+
+    constexpr auto SRC_SIZE  = sizeof(src_t);
+    constexpr auto SRC_BITS  = SRC_SIZE * 8;
+    constexpr auto DEST_SIZE = sizeof(dest_t);
+    constexpr auto SCALE     = (DEST_SIZE+SRC_SIZE-1) / SRC_SIZE;
+
+    while (dest_first != dest_last) {
+        dest_t value(0UL);
+        unsigned int shift = 0;
+
+        for (size_t i = 0; i < SCALE; ++i) {
+            value |= dest_t(*src_first++) << shift;
+            shift += SRC_BITS;
+        }
+
+        *dest_first++ = value;
+    }
+    return src_first;
+}
+
+/* uneven_copy, call the right code for larger vs. smaller */
+
+template<class SrcIter, class DestIter>
+inline SrcIter uneven_copy(SrcIter src_first,
+                           DestIter dest_first, DestIter dest_last)
+{
+    typedef typename std::iterator_traits<SrcIter>::value_type  src_t;
+    typedef typename std::iterator_traits<DestIter>::value_type dest_t;
+
+    constexpr bool DEST_IS_SMALLER = sizeof(dest_t) < sizeof(src_t);
+
+    return uneven_copy_impl(src_first, dest_first, dest_last,
+                            std::integral_constant<bool, DEST_IS_SMALLER>{});
+}
+
+/* generate_to, fill in a fixed-size array of integral type using a SeedSeq
+ * (actually works for any random-access iterator)
+ */
+
+template <size_t size, typename SeedSeq, typename DestIter>
+inline void generate_to_impl(SeedSeq&& generator, DestIter dest,
+                             std::true_type)
+{
+    generator.generate(dest, dest+size);
+}
+
+template <size_t size, typename SeedSeq, typename DestIter>
+void generate_to_impl(SeedSeq&& generator, DestIter dest,
+                      std::false_type)
+{
+    typedef typename std::iterator_traits<DestIter>::value_type dest_t;
+    constexpr auto DEST_SIZE = sizeof(dest_t);
+    constexpr auto GEN_SIZE  = sizeof(uint32_t);
+
+    constexpr bool GEN_IS_SMALLER = GEN_SIZE < DEST_SIZE;
+    constexpr size_t FROM_ELEMS =
+        GEN_IS_SMALLER
+            ? size * ((DEST_SIZE+GEN_SIZE-1) / GEN_SIZE)
+            : (size + (GEN_SIZE / DEST_SIZE) - 1)
+                / ((GEN_SIZE / DEST_SIZE) + GEN_IS_SMALLER);
+                        //  this odd code ^^^^^^^^^^^^^^^^^ is a work-around for
+                        //  a bug: http://llvm.org/bugs/show_bug.cgi?id=21287
+
+    if (FROM_ELEMS <= 1024) {
+        uint32_t buffer[FROM_ELEMS];
+        generator.generate(buffer, buffer+FROM_ELEMS);
+        uneven_copy(buffer, dest, dest+size);
+    } else {
+        uint32_t* buffer = static_cast<uint32_t*>(malloc(GEN_SIZE * FROM_ELEMS));
+        generator.generate(buffer, buffer+FROM_ELEMS);
+        uneven_copy(buffer, dest, dest+size);
+        free(static_cast<void*>(buffer));
+    }
+}
+
+template <size_t size, typename SeedSeq, typename DestIter>
+inline void generate_to(SeedSeq&& generator, DestIter dest)
+{
+    typedef typename std::iterator_traits<DestIter>::value_type dest_t;
+    constexpr bool IS_32BIT = sizeof(dest_t) == sizeof(uint32_t);
+
+    generate_to_impl<size>(std::forward<SeedSeq>(generator), dest,
+                           std::integral_constant<bool, IS_32BIT>{});
+}
+
+/* generate_one, produce a value of integral type using a SeedSeq
+ * (optionally, we can have it produce more than one and pick which one
+ * we want)
+ */
+
+template <typename UInt, size_t i = 0UL, size_t N = i+1UL, typename SeedSeq>
+inline UInt generate_one(SeedSeq&& generator)
+{
+    UInt result[N];
+    generate_to<N>(std::forward<SeedSeq>(generator), result);
+    return result[i];
+}
+
+template <typename RngType>
+auto bounded_rand(RngType& rng, typename RngType::result_type upper_bound)
+        -> typename RngType::result_type
+{
+    typedef typename RngType::result_type rtype;
+    rtype threshold = (RngType::max() - RngType::min() + rtype(1) - upper_bound)
+                    % upper_bound;
+    for (;;) {
+        rtype r = rng() - RngType::min();
+        if (r >= threshold)
+            return r % upper_bound;
+    }
+}
+
+template <typename Iter, typename RandType>
+void shuffle(Iter from, Iter to, RandType&& rng)
+{
+    typedef typename std::iterator_traits<Iter>::difference_type delta_t;
+    typedef typename std::remove_reference<RandType>::type::result_type result_t;
+    auto count = to - from;
+    while (count > 1) {
+        delta_t chosen = delta_t(bounded_rand(rng, result_t(count)));
+        --count;
+        --to;
+        using std::swap;
+        swap(*(from + chosen), *to);
+    }
+}
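
Both helpers accept any engine that exposes result_type, min() and max(); a
brief sketch (not part of the header) using the standard mt19937, which
works identically with the PCG engines:

    #include <random>
    #include <vector>
    #include "pcg_extras.hpp"

    void demo() {
        std::mt19937 rng(42u);
        // Uniform in [0, 6) without modulo bias.
        auto die = pcg_extras::bounded_rand(rng, 6);
        std::vector<int> v{1, 2, 3, 4, 5};
        pcg_extras::shuffle(v.begin(), v.end(), rng);
        (void)die;
    }
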
+
+/*
+ * Although std::seed_seq is useful, it isn't everything.  Often we want to
+ * initialize a random-number generator some other way, such as from a random
+ * device.
+ *
+ * Technically, it does not meet the requirements of a SeedSequence because
+ * it lacks some of the rarely-used member functions (some of which would
+ * be impossible to provide).  However, the C++ standard is quite specific
+ * that actual engines only call the generate method, so it ought not to be
+ * a problem in practice.
+ */
+
+template <typename RngType>
+class seed_seq_from {
+private:
+    RngType rng_;
+
+    typedef uint_least32_t result_type;
+
+public:
+    template<typename... Args>
+    seed_seq_from(Args&&... args) :
+        rng_(std::forward<Args>(args)...)
+    {
+        // Nothing (else) to do...
+    }
+
+    template<typename Iter>
+    void generate(Iter start, Iter finish)
+    {
+        for (auto i = start; i != finish; ++i)
+            *i = result_type(rng_());
+    }
+
+    constexpr size_t size() const
+    {
+        return (sizeof(typename RngType::result_type) > sizeof(result_type)
+                && RngType::max() > ~size_t(0UL))
+             ? ~size_t(0UL)
+             : size_t(RngType::max());
+    }
+};
+
+/*
+ * Sometimes you might want a distinct seed based on when the program
+ * was compiled.  That way, a particular instance of the program will
+ * behave the same way, but when recompiled it'll produce a different
+ * value.
+ */
+
+template <typename IntType>
+struct static_arbitrary_seed {
+private:
+    static constexpr IntType fnv(IntType hash, const char* pos) {
+        return *pos == '\0'
+             ? hash
+             : fnv((hash * IntType(16777619U)) ^ *pos, (pos+1));
+    }
+
+public:
+    static constexpr IntType value = fnv(IntType(2166136261U ^ sizeof(IntType)),
+                        __DATE__ __TIME__ __FILE__);
+};
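
So, for instance (a sketch, not part of the header), a seed that changes
only when the program is recompiled:

    constexpr uint32_t build_seed =
        pcg_extras::static_arbitrary_seed<uint32_t>::value;
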
+
+// Sometimes, when debugging or testing, it's handy to be able to print the
+// name of a type (in human-readable form).  This code allows the idiom:
+//
+//      cout << printable_typename<my_foo_type_t>()
+//
+// to print out my_foo_type_t (or its concrete type if it is a synonym)
+
+template <typename T>
+struct printable_typename {};
+
+template <typename T>
+std::ostream& operator<<(std::ostream& out, printable_typename<T>) {
+    const char *implementation_typename = typeid(T).name();
+#ifdef __GNUC__
+    int status;
+    char* pretty_name =
+        abi::__cxa_demangle(implementation_typename, NULL, NULL, &status);
+    if (status == 0)
+        out << pretty_name;
+    free(static_cast<void*>(pretty_name));
+    if (status == 0)
+        return out;
+#endif
+    out << implementation_typename;
+    return out;
+}
+
+} // namespace pcg_extras
+
+#endif // PCG_EXTRAS_HPP_INCLUDED
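
Taken together, the intended way to seed an engine from this support code
is via seed_seq_from. A minimal sketch, assuming the pcg32 typedef that
pcg_random.hpp (added below) provides:

    #include <random>
    #include "pcg_random.hpp"

    int main() {
        // Wrap the OS entropy source so it can stand in for a SeedSeq.
        pcg_extras::seed_seq_from<std::random_device> seed_source;
        pcg32 rng(seed_source);
        return int(rng(100));   // bounded draw in [0, 100)
    }
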
diff --git a/include/pcg_random.hpp b/include/pcg_random.hpp
new file mode 100644
index 0000000..81c14b6
--- /dev/null
+++ b/include/pcg_random.hpp
@@ -0,0 +1,1756 @@
+/*
+ * PCG Random Number Generation for C++
+ *
+ * Copyright 2014 Melissa O'Neill <oneill at pcg-random.org>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * For additional information about the PCG random number generation scheme,
+ * including its license and other licensing options, visit
+ *
+ *     http://www.pcg-random.org
+ */
+
+/*
+ * This code provides the reference implementation of the PCG family of
+ * random number generators.  The code is complex because it implements
+ *
+ *      - several members of the PCG family, specifically members corresponding
+ *        to the output functions:
+ *             - XSH RR         (good for 64-bit state, 32-bit output)
+ *             - XSH RS         (good for 64-bit state, 32-bit output)
+ *             - XSL RR         (good for 128-bit state, 64-bit output)
+ *             - RXS M XS       (statistically most powerful generator)
+ *             - XSL RR RR      (good for 128-bit state, 128-bit output)
+ *             - and RXS, RXS M, XSH, XSL       (mostly for testing)
+ *      - at potentially *arbitrary* bit sizes
+ *      - with four different techniques for random streams (MCG, one-stream
+ *        LCG, settable-stream LCG, unique-stream LCG)
+ *      - and the extended generation schemes allowing arbitrary periods
+ *      - with all features of C++11 random number generation (and more),
+ *        some of which are somewhat painful, including
+ *            - initializing with a SeedSequence which writes 32-bit values
+ *              to memory, even though the state of the generator may not
+ *              use 32-bit values (it might use smaller or larger integers)
+ *            - I/O for RNGs and a prescribed format, which needs to handle
+ *              the issue that 8-bit and 128-bit integers don't have working
+ *              I/O routines (e.g., normally 8-bit = char, not integer)
+ *            - equality and inequality for RNGs
+ *      - and a number of convenience typedefs to mask all the complexity
+ *
+ * The code employs a fairly heavy level of abstraction, and has to deal
+ * with various C++ minutiae.  If you're looking to learn about how the PCG
+ * scheme works, you're probably best off starting with one of the other
+ * codebases (see www.pcg-random.org).  But if you're curious about the
+ * constants for the various output functions used in those other, simpler,
+ * codebases, this code shows how they are calculated.
+ *
+ * On the positive side, at least there are convenience typedefs so that you
+ * can say
+ *
+ *      pcg32 myRNG;
+ *
+ * rather than:
+ *
+ *      pcg_detail::engine<
+ *          uint32_t,                                           // Output Type
+ *          uint64_t,                                           // State Type
+ *          pcg_detail::xsh_rr_mixin<uint32_t, uint64_t>, true, // Output Func
+ *          pcg_detail::specific_stream<uint64_t>,              // Stream Kind
+ *          pcg_detail::default_multiplier<uint64_t>            // LCG Mult
+ *      > myRNG;
+ *
+ */
+
+#ifndef PCG_RAND_HPP_INCLUDED
+#define PCG_RAND_HPP_INCLUDED 1
+
+#include <algorithm>
+#include <cinttypes>
+#include <cstddef>
+#include <cstdlib>
+#include <cstring>
+#include <cassert>
+#include <limits>
+#include <iostream>
+#include <iterator>
+#include <type_traits>
+#include <utility>
+#include <locale>
+#include <new>
+#include <stdexcept>
+
+/*
+ * The pcg_extras namespace contains some support code that is likely to
+ * be useful for a variety of RNGs, including:
+ *      - 128-bit int support for platforms where it isn't available natively
+ *      - bit twiddling operations
+ *      - I/O of 128-bit and 8-bit integers
+ *      - Handling the evilness of SeedSeq
+ *      - Support for efficiently producing random numbers less than a given
+ *        bound
+ */
+
+#include "pcg_extras.hpp"
+
+namespace pcg_detail {
+
+using namespace pcg_extras;
+
+/*
+ * The LCG generators need some constants to function.  This code lets you
+ * look up the constant by *type*.  For example
+ *
+ *      default_multiplier<uint32_t>::multiplier()
+ *
+ * gives you the default multiplier for 32-bit integers.  We use the name
+ * of the constant and not a generic word like value to allow these classes
+ * to be used as mixins.
+ */
+
+template <typename T>
+struct default_multiplier {
+    // Not defined for an arbitrary type
+};
+
+template <typename T>
+struct default_increment {
+    // Not defined for an arbitrary type
+};
+
+#define PCG_DEFINE_CONSTANT(type, what, kind, constant) \
+        template <>                                     \
+        struct what ## _ ## kind<type> {                \
+            static constexpr type kind() {              \
+                return constant;                        \
+            }                                           \
+        };
+
+PCG_DEFINE_CONSTANT(uint8_t,  default, multiplier, 141U)
+PCG_DEFINE_CONSTANT(uint8_t,  default, increment,  77U)
+
+PCG_DEFINE_CONSTANT(uint16_t, default, multiplier, 12829U)
+PCG_DEFINE_CONSTANT(uint16_t, default, increment,  47989U)
+
+PCG_DEFINE_CONSTANT(uint32_t, default, multiplier, 747796405U)
+PCG_DEFINE_CONSTANT(uint32_t, default, increment,  2891336453U)
+
+PCG_DEFINE_CONSTANT(uint64_t, default, multiplier, 6364136223846793005ULL)
+PCG_DEFINE_CONSTANT(uint64_t, default, increment,  1442695040888963407ULL)
+
+PCG_DEFINE_CONSTANT(pcg128_t, default, multiplier,
+        PCG_128BIT_CONSTANT(2549297995355413924ULL,4865540595714422341ULL))
+PCG_DEFINE_CONSTANT(pcg128_t, default, increment,
+        PCG_128BIT_CONSTANT(6364136223846793005ULL,1442695040888963407ULL))
+
+
+/*
+ * Each PCG generator is available in four variants, based on how it applies
+ * the additive constant for its underlying LCG; the variations are:
+ *
+ *     single stream   - all instances use the same fixed constant, thus
+ *                       the RNG is always somewhere in the same sequence
+ *     mcg             - adds zero, resulting in a single stream and reduced
+ *                       period
+ *     specific stream - the constant can be changed at any time, selecting
+ *                       a different random sequence
+ *     unique stream   - the constant is based on the memory address of the
+ *                       object, thus every RNG has its own unique sequence
+ *
+ * This variation is provided through mixin classes which define a member
+ * function increment() that returns the necessary additive constant.
+ */
+
+
+
+/*
+ * unique stream
+ */
+
+
+template <typename itype>
+class unique_stream {
+protected:
+    static constexpr bool is_mcg = false;
+
+    // Is never called, but is provided for symmetry with specific_stream
+    void set_stream(...)
+    {
+        abort();
+    }
+
+public:
+    typedef itype state_type;
+
+    constexpr itype increment() const {
+        return itype(reinterpret_cast<unsigned long>(this) | 1);
+    }
+
+    constexpr itype stream() const
+    {
+         return increment() >> 1;
+    }
+
+    static constexpr bool can_specify_stream = false;
+
+    static constexpr size_t streams_pow2()
+    {
+        return (sizeof(itype) < sizeof(size_t) ? sizeof(itype)
+                                               : sizeof(size_t))*8 - 1u;
+    }
+
+protected:
+    constexpr unique_stream() = default;
+};
+
+
+/*
+ * no stream (mcg)
+ */
+
+template <typename itype>
+class no_stream {
+protected:
+    static constexpr bool is_mcg = true;
+
+    // Is never called, but is provided for symmetry with specific_stream
+    void set_stream(...)
+    {
+        abort();
+    }
+
+public:
+    typedef itype state_type;
+
+    static constexpr itype increment() {
+        return 0;
+    }
+
+    static constexpr bool can_specify_stream = false;
+
+    static constexpr size_t streams_pow2()
+    {
+        return 0u;
+    }
+
+protected:
+    constexpr no_stream() = default;
+};
+
+
+/*
+ * single stream/sequence (oneseq)
+ */
+
+template <typename itype>
+class oneseq_stream : public default_increment<itype> {
+protected:
+    static constexpr bool is_mcg = false;
+
+    // Is never called, but is provided for symmetry with specific_stream
+    void set_stream(...)
+    {
+        abort();
+    }
+
+public:
+    typedef itype state_type;
+
+    static constexpr itype stream()
+    {
+         return default_increment<itype>::increment() >> 1;
+    }
+
+    static constexpr bool can_specify_stream = false;
+
+    static constexpr size_t streams_pow2()
+    {
+        return 0u;
+    }
+
+protected:
+    constexpr oneseq_stream() = default;
+};
+
+
+/*
+ * specific stream
+ */
+
+template <typename itype>
+class specific_stream {
+protected:
+    static constexpr bool is_mcg = false;
+
+    itype inc_ = default_increment<itype>::increment();
+
+public:
+    typedef itype state_type;
+    typedef itype stream_state;
+
+    constexpr itype increment() const {
+        return inc_;
+    }
+
+    itype stream()
+    {
+         return inc_ >> 1;
+    }
+
+    void set_stream(itype specific_seq)
+    {
+         inc_ = (specific_seq << 1) | 1;
+    }
+
+    static constexpr bool can_specify_stream = true;
+
+    static constexpr size_t streams_pow2()
+    {
+        return (sizeof(itype)*8) - 1u;
+    }
+
+protected:
+    specific_stream() = default;
+
+    specific_stream(itype specific_seq)
+        : inc_(itype(specific_seq << 1) | itype(1U))
+    {
+        // Nothing (else) to do.
+    }
+};
+
+
+/*
+ * This is where it all comes together.  This function joins together three
+ * mixin classes which define
+ *    - the LCG additive constant (the stream)
+ *    - the LCG multiplier
+ *    - the output function
+ * in addition, we specify the type of the LCG state, and the result type,
+ * and whether to use the pre-advance version of the state for the output
+ * (increasing instruction-level parallelism) or the post-advance version
+ * (reducing register pressure).
+ *
+ * Given the high level of parameterization, the code has to use some
+ * template-metaprogramming tricks to handle some of the subtle variations
+ * involved.
+ */
+
+template <typename xtype, typename itype,
+          typename output_mixin,
+          bool output_previous = true,
+          typename stream_mixin = oneseq_stream<itype>,
+          typename multiplier_mixin = default_multiplier<itype> >
+class engine : protected output_mixin,
+               public stream_mixin,
+               protected multiplier_mixin {
+protected:
+    itype state_;
+
+    struct can_specify_stream_tag {};
+    struct no_specifiable_stream_tag {};
+
+    using stream_mixin::increment;
+    using multiplier_mixin::multiplier;
+
+public:
+    typedef xtype result_type;
+    typedef itype state_type;
+
+    static constexpr size_t period_pow2()
+    {
+        return sizeof(state_type)*8 - 2*stream_mixin::is_mcg;
+    }
+
+    // It would be nice to use std::numeric_limits for these, but
+    // we can't be sure that it'd be defined for the 128-bit types.
+
+    static constexpr result_type min()
+    {
+        return result_type(0UL);
+    }
+
+    static constexpr result_type max()
+    {
+        return result_type(~result_type(0UL));
+    }
+
+protected:
+    itype bump(itype state)
+    {
+        return state * multiplier() + increment();
+    }
+
+    itype base_generate()
+    {
+        return state_ = bump(state_);
+    }
+
+    itype base_generate0()
+    {
+        itype old_state = state_;
+        state_ = bump(state_);
+        return old_state;
+    }
+
+public:
+    result_type operator()()
+    {
+        if (output_previous)
+            return this->output(base_generate0());
+        else
+            return this->output(base_generate());
+    }
+
+    result_type operator()(result_type upper_bound)
+    {
+        return bounded_rand(*this, upper_bound);
+    }
+
+protected:
+    static itype advance(itype state, itype delta,
+                         itype cur_mult, itype cur_plus);
+
+    static itype distance(itype cur_state, itype newstate, itype cur_mult,
+                          itype cur_plus, itype mask = ~itype(0U));
+
+    itype distance(itype newstate, itype mask = itype(~itype(0U))) const
+    {
+        return distance(state_, newstate, multiplier(), increment(), mask);
+    }
+
+public:
+    void advance(itype delta)
+    {
+        state_ = advance(state_, delta, this->multiplier(), this->increment());
+    }
+
+    void backstep(itype delta)
+    {
+        advance(-delta);
+    }
+
+    void discard(itype delta)
+    {
+        advance(delta);
+    }
+
+    bool wrapped()
+    {
+        if (stream_mixin::is_mcg) {
+            // For MCGs, the low order two bits never change. In this
+            // implementation, we keep them fixed at 3 to make this test
+            // easier.
+            return state_ == 3;
+        } else {
+            return state_ == 0;
+        }
+    }
+
+    engine(itype state = itype(0xcafef00dd15ea5e5ULL))
+        : state_(this->is_mcg ? state|state_type(3U)
+                              : bump(state + this->increment()))
+    {
+        // Nothing else to do.
+    }
+
+    // This function may or may not exist.  It thus has to be a template
+    // to use SFINAE; users don't have to worry about its template-ness.
+
+    template <typename sm = stream_mixin>
+    engine(itype state, typename sm::stream_state stream_seed)
+        : stream_mixin(stream_seed),
+          state_(this->is_mcg ? state|state_type(3U)
+                              : bump(state + this->increment()))
+    {
+        // Nothing else to do.
+    }
+
+    template<typename SeedSeq>
+    engine(SeedSeq&& seedSeq, typename std::enable_if<
+                  !stream_mixin::can_specify_stream
+               && !std::is_convertible<SeedSeq, itype>::value
+               && !std::is_convertible<SeedSeq, engine>::value,
+               no_specifiable_stream_tag>::type = {})
+        : engine(generate_one<itype>(std::forward<SeedSeq>(seedSeq)))
+    {
+        // Nothing else to do.
+    }
+
+    template<typename SeedSeq>
+    engine(SeedSeq&& seedSeq, typename std::enable_if<
+                   stream_mixin::can_specify_stream
+               && !std::is_convertible<SeedSeq, itype>::value
+               && !std::is_convertible<SeedSeq, engine>::value,
+               can_specify_stream_tag>::type = {})
+        : engine(generate_one<itype,1,2>(seedSeq),
+                 generate_one<itype,0,2>(seedSeq))
+    {
+        // Nothing else to do.
+    }
+
+
+    template<typename... Args>
+    void seed(Args&&... args)
+    {
+        new (this) engine(std::forward<Args>(args)...);
+    }
+
+    template <typename xtype1, typename itype1,
+              typename output_mixin1, bool output_previous1,
+              typename stream_mixin_lhs, typename multiplier_mixin_lhs,
+              typename stream_mixin_rhs, typename multiplier_mixin_rhs>
+    friend bool operator==(const engine<xtype1,itype1,
+                                     output_mixin1,output_previous1,
+                                     stream_mixin_lhs, multiplier_mixin_lhs>&,
+                           const engine<xtype1,itype1,
+                                     output_mixin1,output_previous1,
+                                     stream_mixin_rhs, multiplier_mixin_rhs>&);
+
+    template <typename xtype1, typename itype1,
+              typename output_mixin1, bool output_previous1,
+              typename stream_mixin_lhs, typename multiplier_mixin_lhs,
+              typename stream_mixin_rhs, typename multiplier_mixin_rhs>
+    friend itype1 operator-(const engine<xtype1,itype1,
+                                     output_mixin1,output_previous1,
+                                     stream_mixin_lhs, multiplier_mixin_lhs>&,
+                            const engine<xtype1,itype1,
+                                     output_mixin1,output_previous1,
+                                     stream_mixin_rhs, multiplier_mixin_rhs>&);
+
+    template <typename CharT, typename Traits,
+              typename xtype1, typename itype1,
+              typename output_mixin1, bool output_previous1,
+              typename stream_mixin1, typename multiplier_mixin1>
+    friend std::basic_ostream<CharT,Traits>&
+    operator<<(std::basic_ostream<CharT,Traits>& out,
+               const engine<xtype1,itype1,
+                              output_mixin1,output_previous1,
+                              stream_mixin1, multiplier_mixin1>&);
+
+    template <typename CharT, typename Traits,
+              typename xtype1, typename itype1,
+              typename output_mixin1, bool output_previous1,
+              typename stream_mixin1, typename multiplier_mixin1>
+    friend std::basic_istream<CharT,Traits>&
+    operator>>(std::basic_istream<CharT,Traits>& in,
+               engine<xtype1, itype1,
+                        output_mixin1, output_previous1,
+                        stream_mixin1, multiplier_mixin1>& rng);
+};
+
+template <typename CharT, typename Traits,
+          typename xtype, typename itype,
+          typename output_mixin, bool output_previous,
+          typename stream_mixin, typename multiplier_mixin>
+std::basic_ostream<CharT,Traits>&
+operator<<(std::basic_ostream<CharT,Traits>& out,
+           const engine<xtype,itype,
+                          output_mixin,output_previous,
+                          stream_mixin, multiplier_mixin>& rng)
+{
+    auto orig_flags = out.flags(std::ios_base::dec | std::ios_base::left);
+    auto space = out.widen(' ');
+    auto orig_fill = out.fill();
+
+    out << rng.multiplier() << space
+        << rng.increment() << space
+        << rng.state_;
+
+    out.flags(orig_flags);
+    out.fill(orig_fill);
+    return out;
+}
+
+
+template <typename CharT, typename Traits,
+          typename xtype, typename itype,
+          typename output_mixin, bool output_previous,
+          typename stream_mixin, typename multiplier_mixin>
+std::basic_istream<CharT,Traits>&
+operator>>(std::basic_istream<CharT,Traits>& in,
+           engine<xtype,itype,
+                    output_mixin,output_previous,
+                    stream_mixin, multiplier_mixin>& rng)
+{
+    auto orig_flags = in.flags(std::ios_base::dec | std::ios_base::skipws);
+
+    itype multiplier, increment, state;
+    in >> multiplier >> increment >> state;
+
+    if (!in.fail()) {
+        bool good = true;
+        if (multiplier != rng.multiplier()) {
+           good = false;
+        } else if (rng.can_specify_stream) {
+           rng.set_stream(increment >> 1);
+        } else if (increment != rng.increment()) {
+           good = false;
+        }
+        if (good) {
+            rng.state_ = state;
+        } else {
+            in.clear(std::ios::failbit);
+        }
+    }
+
+    in.flags(orig_flags);
+    return in;
+}
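+
+/*
+ * A round-trip sketch (editorial, assuming the pcg32 typedef defined near
+ * the bottom of this file): the operators above serialize a generator as
+ * "multiplier increment state", so any iostream restores it exactly.
+ *
+ *     std::stringstream ss;
+ *     pcg32 rng(42u, 54u);       // seed, stream selector
+ *     ss << rng;                 // save
+ *     pcg32 restored;
+ *     ss >> restored;            // load; sets failbit on multiplier mismatch
+ *     assert(restored == rng);
+ */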
+
+
+template <typename xtype, typename itype,
+          typename output_mixin, bool output_previous,
+          typename stream_mixin, typename multiplier_mixin>
+itype engine<xtype,itype,output_mixin,output_previous,stream_mixin,
+             multiplier_mixin>::advance(
+    itype state, itype delta, itype cur_mult, itype cur_plus)
+{
+    // The method used here is based on Brown, "Random Number Generation
+    // with Arbitrary Stride," Transactions of the American Nuclear
+    // Society (Nov. 1994).  The algorithm is very similar to fast
+    // exponentiation.
+    //
+    // Even though delta is an unsigned integer, we can pass a
+    // signed integer to go backwards; it just goes "the long way round".
+
+    constexpr itype ZERO = 0u;  // itype may be a non-trivial type, so
+    constexpr itype ONE  = 1u;  // we define some ugly constants.
+    itype acc_mult = 1;
+    itype acc_plus = 0;
+    while (delta > ZERO) {
+       if (delta & ONE) {
+          acc_mult *= cur_mult;
+          acc_plus = acc_plus*cur_mult + cur_plus;
+       }
+       cur_plus = (cur_mult+ONE)*cur_plus;
+       cur_mult *= cur_mult;
+       delta >>= 1;
+    }
+    return acc_mult * state + acc_plus;
+}
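+
+/*
+ * Worked note (editorial): the loop above runs in O(log delta) steps because
+ * each iteration composes the LCG transition with itself.  Composing
+ * x -> a*x + c with itself gives
+ *
+ *     a*(a*x + c) + c  ==  (a*a)*x + (a + 1)*c
+ *
+ * which is exactly the per-iteration update of cur_mult and cur_plus, while
+ * acc_mult and acc_plus accumulate the partial transitions selected by the
+ * set bits of delta -- the same shape as fast exponentiation.
+ */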
+
+template <typename xtype, typename itype,
+          typename output_mixin, bool output_previous,
+          typename stream_mixin, typename multiplier_mixin>
+itype engine<xtype,itype,output_mixin,output_previous,stream_mixin,
+               multiplier_mixin>::distance(
+    itype cur_state, itype newstate, itype cur_mult, itype cur_plus, itype mask)
+{
+    constexpr itype ONE  = 1u;  // itype could be weird, so use constant
+    itype the_bit = stream_mixin::is_mcg ? itype(4u) : itype(1u);
+    itype distance = 0u;
+    while ((cur_state & mask) != (newstate & mask)) {
+       if ((cur_state & the_bit) != (newstate & the_bit)) {
+           cur_state = cur_state * cur_mult + cur_plus;
+           distance |= the_bit;
+       }
+       assert((cur_state & the_bit) == (newstate & the_bit));
+       the_bit <<= 1;
+       cur_plus = (cur_mult+ONE)*cur_plus;
+       cur_mult *= cur_mult;
+    }
+    return stream_mixin::is_mcg ? distance >> 2 : distance;
+}
+
+template <typename xtype, typename itype,
+          typename output_mixin, bool output_previous,
+          typename stream_mixin_lhs, typename multiplier_mixin_lhs,
+          typename stream_mixin_rhs, typename multiplier_mixin_rhs>
+itype operator-(const engine<xtype,itype,
+                               output_mixin,output_previous,
+                               stream_mixin_lhs, multiplier_mixin_lhs>& lhs,
+               const engine<xtype,itype,
+                               output_mixin,output_previous,
+                               stream_mixin_rhs, multiplier_mixin_rhs>& rhs)
+{
+    if (lhs.multiplier() != rhs.multiplier()
+        || lhs.increment() != rhs.increment())
+        throw std::logic_error("incomparable generators");
+    return rhs.distance(lhs.state_);
+}
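+
+/*
+ * A usage sketch (editorial, assuming the pcg32 typedef defined near the
+ * bottom of this file): operator-() measures how many steps apart two
+ * generators on the same stream are.
+ *
+ *     pcg32 a(17u, 3u);
+ *     pcg32 b(17u, 3u);          // same seed and stream as a
+ *     b.advance(1000u);
+ *     assert(b - a == 1000u);    // distance along the shared sequence
+ *
+ * Generators with different multipliers or increments are incomparable,
+ * and operator-() throws std::logic_error for them, as defined above.
+ */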
+
+
+template <typename xtype, typename itype,
+          typename output_mixin, bool output_previous,
+          typename stream_mixin_lhs, typename multiplier_mixin_lhs,
+          typename stream_mixin_rhs, typename multiplier_mixin_rhs>
+bool operator==(const engine<xtype,itype,
+                               output_mixin,output_previous,
+                               stream_mixin_lhs, multiplier_mixin_lhs>& lhs,
+                const engine<xtype,itype,
+                               output_mixin,output_previous,
+                               stream_mixin_rhs, multiplier_mixin_rhs>& rhs)
+{
+    return    (lhs.multiplier() == rhs.multiplier())
+           && (lhs.increment()  == rhs.increment())
+           && (lhs.state_       == rhs.state_);
+}
+
+template <typename xtype, typename itype,
+          typename output_mixin, bool output_previous,
+          typename stream_mixin_lhs, typename multiplier_mixin_lhs,
+          typename stream_mixin_rhs, typename multiplier_mixin_rhs>
+inline bool operator!=(const engine<xtype,itype,
+                               output_mixin,output_previous,
+                               stream_mixin_lhs, multiplier_mixin_lhs>& lhs,
+                       const engine<xtype,itype,
+                               output_mixin,output_previous,
+                               stream_mixin_rhs, multiplier_mixin_rhs>& rhs)
+{
+    return !operator==(lhs,rhs);
+}
+
+
+template <typename xtype, typename itype,
+         template<typename XT,typename IT> class output_mixin,
+         bool output_previous = (sizeof(itype) <= 8)>
+using oneseq_base  = engine<xtype, itype,
+                        output_mixin<xtype, itype>, output_previous,
+                        oneseq_stream<itype> >;
+
+template <typename xtype, typename itype,
+         template<typename XT,typename IT> class output_mixin,
+         bool output_previous = (sizeof(itype) <= 8)>
+using unique_base = engine<xtype, itype,
+                         output_mixin<xtype, itype>, output_previous,
+                         unique_stream<itype> >;
+
+template <typename xtype, typename itype,
+         template<typename XT,typename IT> class output_mixin,
+         bool output_previous = (sizeof(itype) <= 8)>
+using setseq_base = engine<xtype, itype,
+                         output_mixin<xtype, itype>, output_previous,
+                         specific_stream<itype> >;
+
+template <typename xtype, typename itype,
+         template<typename XT,typename IT> class output_mixin,
+         bool output_previous = (sizeof(itype) <= 8)>
+using mcg_base = engine<xtype, itype,
+                      output_mixin<xtype, itype>, output_previous,
+                      no_stream<itype> >;
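+
+/*
+ * A decomposition sketch (editorial): these aliases just pin the stream
+ * mixin.  For example, the pcg32 typedef defined near the bottom of this
+ * file is setseq_base<uint32_t, uint64_t, xsh_rr_mixin>, i.e.
+ *
+ *     engine<uint32_t, uint64_t,
+ *            xsh_rr_mixin<uint32_t, uint64_t>,   // output function
+ *            true,                               // output pre-advance state
+ *            specific_stream<uint64_t>,          // user-selectable stream
+ *            default_multiplier<uint64_t>>;      // LCG multiplier
+ *
+ * (output_previous defaults to true here because sizeof(uint64_t) <= 8.)
+ */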
+
+/*
+ * OUTPUT FUNCTIONS.
+ *
+ * These are the core of the PCG generation scheme.  They specify how to
+ * turn the base LCG's internal state into the output value of the final
+ * generator.
+ *
+ * They're implemented as mixin classes.
+ *
+ * All of the classes have code that is written to allow them to be applied
+ * at *arbitrary* bit sizes, although in practice they'll only be used at
+ * standard sizes supported by C++.
+ */
+
+/*
+ * XSH RS -- high xorshift, followed by a random shift
+ *
+ * Fast.  A good performer.
+ */
+
+template <typename xtype, typename itype>
+struct xsh_rs_mixin {
+    static xtype output(itype internal)
+    {
+        constexpr bitcount_t bits        = bitcount_t(sizeof(itype) * 8);
+        constexpr bitcount_t xtypebits   = bitcount_t(sizeof(xtype) * 8);
+        constexpr bitcount_t sparebits   = bits - xtypebits;
+        constexpr bitcount_t opbits =
+                              sparebits-5 >= 64 ? 5
+                            : sparebits-4 >= 32 ? 4
+                            : sparebits-3 >= 16 ? 3
+                            : sparebits-2 >= 4  ? 2
+                            : sparebits-1 >= 1  ? 1
+                            :                     0;
+        constexpr bitcount_t mask = (1 << opbits) - 1;
+        constexpr bitcount_t maxrandshift  = mask;
+        constexpr bitcount_t topspare     = opbits;
+        constexpr bitcount_t bottomspare = sparebits - topspare;
+        constexpr bitcount_t xshift     = topspare + (xtypebits+maxrandshift)/2;
+        bitcount_t rshift =
+            opbits ? bitcount_t(internal >> (bits - opbits)) & mask : 0;
+        internal ^= internal >> xshift;
+        xtype result = xtype(internal >> (bottomspare - maxrandshift + rshift));
+        return result;
+    }
+};
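+
+/*
+ * Worked constants (editorial) for the common 64 -> 32 case (xtype =
+ * uint32_t, itype = uint64_t): sparebits = 32, opbits = 3, xshift = 22 and
+ * bottomspare - maxrandshift = 22, so the function above reduces to
+ *
+ *     bitcount_t rshift = (internal >> 61) & 7;  // top 3 bits pick the shift
+ *     internal ^= internal >> 22;
+ *     return xtype(internal >> (22 + rshift));
+ */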
+
+/*
+ * XSH RR -- high xorshift, followed by a random rotate
+ *
+ * Fast.  A good performer.  Slightly better statistically than XSH RS.
+ */
+
+template <typename xtype, typename itype>
+struct xsh_rr_mixin {
+    static xtype output(itype internal)
+    {
+        constexpr bitcount_t bits        = bitcount_t(sizeof(itype) * 8);
+        constexpr bitcount_t xtypebits   = bitcount_t(sizeof(xtype)*8);
+        constexpr bitcount_t sparebits   = bits - xtypebits;
+        constexpr bitcount_t wantedopbits =
+                              xtypebits >= 128 ? 7
+                            : xtypebits >=  64 ? 6
+                            : xtypebits >=  32 ? 5
+                            : xtypebits >=  16 ? 4
+                            :                    3;
+        constexpr bitcount_t opbits =
+                              sparebits >= wantedopbits ? wantedopbits
+                                                        : sparebits;
+        constexpr bitcount_t amplifier = wantedopbits - opbits;
+        constexpr bitcount_t mask = (1 << opbits) - 1;
+        constexpr bitcount_t topspare    = opbits;
+        constexpr bitcount_t bottomspare = sparebits - topspare;
+        constexpr bitcount_t xshift      = (topspare + xtypebits)/2;
+        bitcount_t rot = opbits ? bitcount_t(internal >> (bits - opbits)) & mask
+                                : 0;
+        bitcount_t amprot = (rot << amplifier) & mask;
+        internal ^= internal >> xshift;
+        xtype result = xtype(internal >> bottomspare);
+        result = rotr(result, amprot);
+        return result;
+    }
+};
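+
+/*
+ * Worked constants (editorial) for the 64 -> 32 case used by pcg32:
+ * sparebits = 32, opbits = 5, amplifier = 0, xshift = 18 and
+ * bottomspare = 27, so the function above reduces to the familiar
+ * PCG-XSH-RR step:
+ *
+ *     bitcount_t rot = (internal >> 59) & 31;  // top 5 bits pick the rotate
+ *     internal ^= internal >> 18;
+ *     return rotr(xtype(internal >> 27), rot);
+ */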
+
+/*
+ * RXS -- random xorshift
+ */
+
+template <typename xtype, typename itype>
+struct rxs_mixin {
+static xtype output_rxs(itype internal)
+    {
+        constexpr bitcount_t bits        = bitcount_t(sizeof(itype) * 8);
+        constexpr bitcount_t xtypebits   = bitcount_t(sizeof(xtype)*8);
+        constexpr bitcount_t shift       = bits - xtypebits;
+        constexpr bitcount_t extrashift  = (xtypebits - shift)/2;
+        bitcount_t rshift = shift > 64+8 ? (internal >> (bits - 6)) & 63
+                       : shift > 32+4 ? (internal >> (bits - 5)) & 31
+                       : shift > 16+2 ? (internal >> (bits - 4)) & 15
+                       : shift >  8+1 ? (internal >> (bits - 3)) & 7
+                       : shift >  4+1 ? (internal >> (bits - 2)) & 3
+                       : shift >  2+1 ? (internal >> (bits - 1)) & 1
+                       :              0;
+        internal ^= internal >> (shift + extrashift - rshift);
+        xtype result = internal >> rshift;
+        return result;
+    }
+};
+
+/*
+ * RXS M XS -- random xorshift, mcg multiply, fixed xorshift
+ *
+ * The most statistically powerful generator, but all those steps
+ * make it slower than some of the others.  We give it the rottenest jobs.
+ *
+ * Because it's usually used in contexts where the state type and the
+ * result type are the same, it is a permutation and is thus invertible.
+ * We thus provide a function to invert it.  This function is used for
+ * the "inside out" generator used by the extended generators.
+ */
+
+/* Type-based constants for the multiplication step.  They're actually
+ * all derived by truncating the 128-bit constant, which was computed to be
+ * a good "universal" constant.
+ */
+
+template <typename T>
+struct mcg_multiplier {
+    // Not defined for an arbitrary type
+};
+
+template <typename T>
+struct mcg_unmultiplier {
+    // Not defined for an arbitrary type
+};
+
+PCG_DEFINE_CONSTANT(uint8_t,  mcg, multiplier,   217U)
+PCG_DEFINE_CONSTANT(uint8_t,  mcg, unmultiplier, 105U)
+
+PCG_DEFINE_CONSTANT(uint16_t, mcg, multiplier,   62169U)
+PCG_DEFINE_CONSTANT(uint16_t, mcg, unmultiplier, 28009U)
+
+PCG_DEFINE_CONSTANT(uint32_t, mcg, multiplier,   277803737U)
+PCG_DEFINE_CONSTANT(uint32_t, mcg, unmultiplier, 2897767785U)
+
+PCG_DEFINE_CONSTANT(uint64_t, mcg, multiplier,   12605985483714917081ULL)
+PCG_DEFINE_CONSTANT(uint64_t, mcg, unmultiplier, 15009553638781119849ULL)
+
+PCG_DEFINE_CONSTANT(pcg128_t, mcg, multiplier,
+        PCG_128BIT_CONSTANT(17766728186571221404ULL, 12605985483714917081ULL))
+PCG_DEFINE_CONSTANT(pcg128_t, mcg, unmultiplier,
+        PCG_128BIT_CONSTANT(14422606686972528997ULL, 15009553638781119849ULL))
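+
+/* Editorial check: each unmultiplier is the modular inverse of its
+ * multiplier, i.e. multiplier * unmultiplier == 1 (mod 2^bits).  For the
+ * 8-bit pair above: 217 * 105 == 22785 == 89 * 256 + 1.
+ */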
+
+
+template <typename xtype, typename itype>
+struct rxs_m_xs_mixin {
+    static xtype output(itype internal)
+    {
+        constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype) * 8);
+        constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8);
+        constexpr bitcount_t opbits = xtypebits >= 128 ? 6
+                                 : xtypebits >=  64 ? 5
+                                 : xtypebits >=  32 ? 4
+                                 : xtypebits >=  16 ? 3
+                                 :                    2;
+        constexpr bitcount_t shift = bits - xtypebits;
+        constexpr bitcount_t mask = (1 << opbits) - 1;
+        bitcount_t rshift =
+            opbits ? bitcount_t(internal >> (bits - opbits)) & mask : 0;
+        internal ^= internal >> (opbits + rshift);
+        internal *= mcg_multiplier<itype>::multiplier();
+        xtype result = internal >> shift;
+        result ^= result >> ((2U*xtypebits+2U)/3U);
+        return result;
+    }
+
+    static itype unoutput(itype internal)
+    {
+        constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8);
+        constexpr bitcount_t opbits = bits >= 128 ? 6
+                                 : bits >=  64 ? 5
+                                 : bits >=  32 ? 4
+                                 : bits >=  16 ? 3
+                                 :               2;
+        constexpr bitcount_t mask = (1 << opbits) - 1;
+
+        internal = unxorshift(internal, bits, (2U*bits+2U)/3U);
+
+        internal *= mcg_unmultiplier<itype>::unmultiplier();
+
+        bitcount_t rshift = opbits ? (internal >> (bits - opbits)) & mask : 0;
+        internal = unxorshift(internal, bits, opbits + rshift);
+
+        return internal;
+    }
+};
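+
+/*
+ * An invertibility sketch (editorial): when xtype == itype, output() is a
+ * permutation and unoutput() is its inverse, so for any uint32_t x
+ *
+ *     using mix = rxs_m_xs_mixin<uint32_t, uint32_t>;
+ *     assert(mix::unoutput(mix::output(x)) == x);
+ *
+ * This is the property that inside_out<> (below) relies on to step the
+ * extension table values directly.
+ */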
+
+
+/*
+ * RXS M -- random xorshift, mcg multiply
+ */
+
+template <typename xtype, typename itype>
+struct rxs_m_mixin {
+    static xtype output(itype internal)
+    {
+        constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype) * 8);
+        constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8);
+        constexpr bitcount_t opbits = xtypebits >= 128 ? 6
+                                 : xtypebits >=  64 ? 5
+                                 : xtypebits >=  32 ? 4
+                                 : xtypebits >=  16 ? 3
+                                 :                    2;
+        constexpr bitcount_t shift = bits - xtypebits;
+        constexpr bitcount_t mask = (1 << opbits) - 1;
+        bitcount_t rshift = opbits ? (internal >> (bits - opbits)) & mask : 0;
+        internal ^= internal >> (opbits + rshift);
+        internal *= mcg_multiplier<itype>::multiplier();
+        xtype result = internal >> shift;
+        return result;
+    }
+};
+
+/*
+ * XSL RR -- fixed xorshift (to low bits), random rotate
+ *
+ * Useful for 128-bit types that are split across two CPU registers.
+ */
+
+template <typename xtype, typename itype>
+struct xsl_rr_mixin {
+    static xtype output(itype internal)
+    {
+        constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype) * 8);
+        constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8);
+        constexpr bitcount_t sparebits = bits - xtypebits;
+        constexpr bitcount_t wantedopbits = xtypebits >= 128 ? 7
+                                       : xtypebits >=  64 ? 6
+                                       : xtypebits >=  32 ? 5
+                                       : xtypebits >=  16 ? 4
+                                       :                    3;
+        constexpr bitcount_t opbits = sparebits >= wantedopbits ? wantedopbits
+                                                             : sparebits;
+        constexpr bitcount_t amplifier = wantedopbits - opbits;
+        constexpr bitcount_t mask = (1 << opbits) - 1;
+        constexpr bitcount_t topspare = sparebits;
+        constexpr bitcount_t bottomspare = sparebits - topspare;
+        constexpr bitcount_t xshift = (topspare + xtypebits) / 2;
+
+        bitcount_t rot =
+            opbits ? bitcount_t(internal >> (bits - opbits)) & mask : 0;
+        bitcount_t amprot = (rot << amplifier) & mask;
+        internal ^= internal >> xshift;
+        xtype result = xtype(internal >> bottomspare);
+        result = rotr(result, amprot);
+        return result;
+    }
+};
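+
+/*
+ * Worked constants (editorial) for the 128 -> 64 case used by pcg64:
+ * sparebits = topspare = 64, opbits = 6, bottomspare = 0 and xshift = 64,
+ * so the function above folds the two halves together and rotates:
+ *
+ *     bitcount_t rot = bitcount_t(internal >> 122) & 63;
+ *     return rotr(xtype(internal ^ (internal >> 64)), rot);
+ */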
+
+
+/*
+ * XSL RR RR -- fixed xorshift (to low bits), random rotate (both parts)
+ *
+ * Useful for 128-bit types that are split across two CPU registers.
+ * If you really want an invertible 128-bit RNG, I guess this is the one.
+ */
+
+template <typename T> struct halfsize_trait {};
+template <> struct halfsize_trait<pcg128_t>  { typedef uint64_t type; };
+template <> struct halfsize_trait<uint64_t>  { typedef uint32_t type; };
+template <> struct halfsize_trait<uint32_t>  { typedef uint16_t type; };
+template <> struct halfsize_trait<uint16_t>  { typedef uint8_t type;  };
+
+template <typename xtype, typename itype>
+struct xsl_rr_rr_mixin {
+    typedef typename halfsize_trait<itype>::type htype;
+
+    static itype output(itype internal)
+    {
+        constexpr bitcount_t htypebits = bitcount_t(sizeof(htype) * 8);
+        constexpr bitcount_t bits      = bitcount_t(sizeof(itype) * 8);
+        constexpr bitcount_t sparebits = bits - htypebits;
+        constexpr bitcount_t wantedopbits = htypebits >= 128 ? 7
+                                       : htypebits >=  64 ? 6
+                                       : htypebits >=  32 ? 5
+                                       : htypebits >=  16 ? 4
+                                       :                    3;
+        constexpr bitcount_t opbits = sparebits >= wantedopbits ? wantedopbits
+                                                                : sparebits;
+        constexpr bitcount_t amplifier = wantedopbits - opbits;
+        constexpr bitcount_t mask = (1 << opbits) - 1;
+        constexpr bitcount_t topspare = sparebits;
+        constexpr bitcount_t xshift = (topspare + htypebits) / 2;
+
+        bitcount_t rot =
+            opbits ? bitcount_t(internal >> (bits - opbits)) & mask : 0;
+        bitcount_t amprot = (rot << amplifier) & mask;
+        internal ^= internal >> xshift;
+        htype lowbits = htype(internal);
+        lowbits = rotr(lowbits, amprot);
+        htype highbits = htype(internal >> topspare);
+        bitcount_t rot2 = lowbits & mask;
+        bitcount_t amprot2 = (rot2 << amplifier) & mask;
+        highbits = rotr(highbits, amprot2);
+        return (itype(highbits) << topspare) ^ itype(lowbits);
+    }
+};
+
+
+/*
+ * XSH -- fixed xorshift (to high bits)
+ *
+ * You shouldn't use this at 64-bits or less.
+ */
+
+template <typename xtype, typename itype>
+struct xsh_mixin {
+    static xtype output(itype internal)
+    {
+        constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype) * 8);
+        constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8);
+        constexpr bitcount_t sparebits = bits - xtypebits;
+        constexpr bitcount_t topspare = 0;
+        constexpr bitcount_t bottomspare = sparebits - topspare;
+        constexpr bitcount_t xshift = (topspare + xtypebits) / 2;
+
+        internal ^= internal >> xshift;
+        xtype result = internal >> bottomspare;
+        return result;
+    }
+};
+
+/*
+ * XSL -- fixed xorshift (to low bits)
+ *
+ * You shouldn't use this at 64-bits or less.
+ */
+
+template <typename xtype, typename itype>
+struct xsl_mixin {
+    static xtype output(itype internal)
+    {
+        constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype) * 8);
+        constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8);
+        constexpr bitcount_t sparebits = bits - xtypebits;
+        constexpr bitcount_t topspare = sparebits;
+        constexpr bitcount_t bottomspare = sparebits - topspare;
+        constexpr bitcount_t xshift = (topspare + xtypebits) / 2;
+
+        internal ^= internal >> xshift;
+        xtype result = internal >> bottomspare;
+        return result;
+    }
+};
+
+/* ---- End of Output Functions ---- */
+
+
+template <typename baseclass>
+struct inside_out : private baseclass {
+    inside_out() = delete;
+
+    typedef typename baseclass::result_type result_type;
+    typedef typename baseclass::state_type  state_type;
+    static_assert(sizeof(result_type) == sizeof(state_type),
+                  "Require a RNG whose output function is a permutation");
+
+    static bool external_step(result_type& randval, size_t i)
+    {
+        state_type state = baseclass::unoutput(randval);
+        state = state * baseclass::multiplier() + baseclass::increment()
+                + state_type(i*2);
+        result_type result = baseclass::output(state);
+        randval = result;
+        state_type zero =
+            baseclass::is_mcg ? state & state_type(3U) : state_type(0U);
+        return result == zero;
+    }
+
+    static bool external_advance(result_type& randval, size_t i,
+                                 result_type delta, bool forwards = true)
+    {
+        state_type state = baseclass::unoutput(randval);
+        state_type mult  = baseclass::multiplier();
+        state_type inc   = baseclass::increment() + state_type(i*2);
+        state_type zero =
+            baseclass::is_mcg ? state & state_type(3U) : state_type(0U);
+        state_type dist_to_zero = baseclass::distance(state, zero, mult, inc);
+        bool crosses_zero =
+            forwards ? dist_to_zero <= delta
+                     : (-dist_to_zero) <= delta;
+        if (!forwards)
+            delta = -delta;
+        state = baseclass::advance(state, delta, mult, inc);
+        randval = baseclass::output(state);
+        return crosses_zero;
+    }
+};
+
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2, typename baseclass, typename extvalclass, bool kdd = true>
+class extended : public baseclass {
+public:
+    typedef typename baseclass::state_type  state_type;
+    typedef typename baseclass::result_type result_type;
+    typedef inside_out<extvalclass> insideout;
+
+private:
+    static constexpr bitcount_t rtypebits = sizeof(result_type)*8;
+    static constexpr bitcount_t stypebits = sizeof(state_type)*8;
+
+    static constexpr bitcount_t tick_limit_pow2 = 64U;
+
+    static constexpr size_t table_size  = 1UL << table_pow2;
+    static constexpr size_t table_shift = stypebits - table_pow2;
+    static constexpr state_type table_mask =
+        (state_type(1U) << table_pow2) - state_type(1U);
+
+    static constexpr bool   may_tick  =
+        (advance_pow2 < stypebits) && (advance_pow2 < tick_limit_pow2);
+    static constexpr size_t tick_shift = stypebits - advance_pow2;
+    static constexpr state_type tick_mask  =
+        may_tick ? state_type(
+                       (uint64_t(1) << (advance_pow2*may_tick)) - 1)
+                                        // ^-- stupidity to appease GCC warnings
+                 : ~state_type(0U);
+
+    static constexpr bool may_tock = stypebits < tick_limit_pow2;
+
+    result_type data_[table_size];
+
+    PCG_NOINLINE void advance_table();
+
+    PCG_NOINLINE void advance_table(state_type delta, bool isForwards = true);
+
+    result_type& get_extended_value()
+    {
+        state_type state = this->state_;
+        if (kdd && baseclass::is_mcg) {
+            // The low order bits of an MCG are constant, so drop them.
+            state >>= 2;
+        }
+        size_t index       = kdd ? state &  table_mask
+                                 : state >> table_shift;
+
+        if (may_tick) {
+            bool tick = kdd ? (state & tick_mask) == state_type(0u)
+                            : (state >> tick_shift) == state_type(0u);
+            if (tick)
+                advance_table();
+        }
+        if (may_tock) {
+            bool tock = state == state_type(0u);
+            if (tock)
+                advance_table();
+        }
+        return data_[index];
+    }
+
+public:
+    static constexpr size_t period_pow2()
+    {
+        return baseclass::period_pow2() + table_size*extvalclass::period_pow2();
+    }
+
+    __attribute__((always_inline)) result_type operator()()
+    {
+        result_type rhs = get_extended_value();
+        result_type lhs = this->baseclass::operator()();
+        return lhs ^ rhs;
+    }
+
+    result_type operator()(result_type upper_bound)
+    {
+        return bounded_rand(*this, upper_bound);
+    }
+
+    void set(result_type wanted)
+    {
+        result_type& rhs = get_extended_value();
+        result_type lhs = this->baseclass::operator()();
+        rhs = lhs ^ wanted;
+    }
+
+    void advance(state_type distance, bool forwards = true);
+
+    void backstep(state_type distance)
+    {
+        advance(distance, false);
+    }
+
+    extended(const result_type* data)
+        : baseclass()
+    {
+        datainit(data);
+    }
+
+    extended(const result_type* data, state_type seed)
+        : baseclass(seed)
+    {
+        datainit(data);
+    }
+
+    // This function may or may not exist.  It thus has to be a template
+    // to use SFINAE; users don't have to worry about its template-ness.
+
+    template <typename bc = baseclass>
+    extended(const result_type* data, state_type seed,
+            typename bc::stream_state stream_seed)
+        : baseclass(seed, stream_seed)
+    {
+        datainit(data);
+    }
+
+    extended()
+        : baseclass()
+    {
+        selfinit();
+    }
+
+    extended(state_type seed)
+        : baseclass(seed)
+    {
+        selfinit();
+    }
+
+    // This function may or may not exist.  It thus has to be a template
+    // to use SFINAE; users don't have to worry about its template-ness.
+
+    template <typename bc = baseclass>
+    extended(state_type seed, typename bc::stream_state stream_seed)
+        : baseclass(seed, stream_seed)
+    {
+        selfinit();
+    }
+
+private:
+    void selfinit();
+    void datainit(const result_type* data);
+
+public:
+
+    template<typename SeedSeq, typename = typename std::enable_if<
+           !std::is_convertible<SeedSeq, result_type>::value
+        && !std::is_convertible<SeedSeq, extended>::value>::type>
+    extended(SeedSeq&& seedSeq)
+        : baseclass(seedSeq)
+    {
+        generate_to<table_size>(seedSeq, data_);
+    }
+
+    template<typename... Args>
+    void seed(Args&&... args)
+    {
+        new (this) extended(std::forward<Args>(args)...);
+    }
+
+    template <bitcount_t table_pow2_, bitcount_t advance_pow2_,
+              typename baseclass_, typename extvalclass_, bool kdd_>
+    friend bool operator==(const extended<table_pow2_, advance_pow2_,
+                                              baseclass_, extvalclass_, kdd_>&,
+                           const extended<table_pow2_, advance_pow2_,
+                                              baseclass_, extvalclass_, kdd_>&);
+
+    template <typename CharT, typename Traits,
+              bitcount_t table_pow2_, bitcount_t advance_pow2_,
+              typename baseclass_, typename extvalclass_, bool kdd_>
+    friend std::basic_ostream<CharT,Traits>&
+    operator<<(std::basic_ostream<CharT,Traits>& out,
+               const extended<table_pow2_, advance_pow2_,
+                              baseclass_, extvalclass_, kdd_>&);
+
+    template <typename CharT, typename Traits,
+              bitcount_t table_pow2_, bitcount_t advance_pow2_,
+              typename baseclass_, typename extvalclass_, bool kdd_>
+    friend std::basic_istream<CharT,Traits>&
+    operator>>(std::basic_istream<CharT,Traits>& in,
+               extended<table_pow2_, advance_pow2_,
+                        baseclass_, extvalclass_, kdd_>&);
+
+};
+
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2,
+          typename baseclass, typename extvalclass, bool kdd>
+void extended<table_pow2,advance_pow2,baseclass,extvalclass,kdd>::datainit(
+         const result_type* data)
+{
+    for (size_t i = 0; i < table_size; ++i)
+        data_[i] = data[i];
+}
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2,
+          typename baseclass, typename extvalclass, bool kdd>
+void extended<table_pow2,advance_pow2,baseclass,extvalclass,kdd>::selfinit()
+{
+    // We need to fill the extended table with something, and we have
+    // very little provided data, so we use the base generator to
+    // produce values.  Although not ideal (use a seed sequence, folks!),
+    // unexpected correlations are mitigated by
+    //      - using XOR differences rather than the number directly
+    //      - the way the table is accessed, its values *won't* be accessed
+    //        in the same order they were written.
+    //      - any strange correlations would only be apparent if we
+    //        were to backstep the generator so that the base generator
+    //        was generating the same values again
+    result_type xdiff = baseclass::operator()() - baseclass::operator()();
+    for (size_t i = 0; i < table_size; ++i) {
+        data_[i] = baseclass::operator()() ^ xdiff;
+    }
+}
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2,
+          typename baseclass, typename extvalclass, bool kdd>
+bool operator==(const extended<table_pow2, advance_pow2,
+                               baseclass, extvalclass, kdd>& lhs,
+                const extended<table_pow2, advance_pow2,
+                               baseclass, extvalclass, kdd>& rhs)
+{
+    auto& base_lhs = static_cast<const baseclass&>(lhs);
+    auto& base_rhs = static_cast<const baseclass&>(rhs);
+    return base_lhs == base_rhs
+        && std::equal(
+                std::begin(lhs.data_), std::end(lhs.data_),
+                std::begin(rhs.data_)
+            );
+}
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2,
+          typename baseclass, typename extvalclass, bool kdd>
+inline bool operator!=(const extended<table_pow2, advance_pow2,
+                                      baseclass, extvalclass, kdd>& lhs,
+                       const extended<table_pow2, advance_pow2,
+                                      baseclass, extvalclass, kdd>& rhs)
+{
+    return !operator==(lhs, rhs);
+}
+
+template <typename CharT, typename Traits,
+          bitcount_t table_pow2, bitcount_t advance_pow2,
+          typename baseclass, typename extvalclass, bool kdd>
+std::basic_ostream<CharT,Traits>&
+operator<<(std::basic_ostream<CharT,Traits>& out,
+           const extended<table_pow2, advance_pow2,
+                          baseclass, extvalclass, kdd>& rng)
+{
+    auto orig_flags = out.flags(std::ios_base::dec | std::ios_base::left);
+    auto space = out.widen(' ');
+    auto orig_fill = out.fill();
+
+    out << rng.multiplier() << space
+        << rng.increment() << space
+        << rng.state_;
+
+    for (const auto& datum : rng.data_)
+        out << space << datum;
+
+    out.flags(orig_flags);
+    out.fill(orig_fill);
+    return out;
+}
+
+template <typename CharT, typename Traits,
+          bitcount_t table_pow2, bitcount_t advance_pow2,
+          typename baseclass, typename extvalclass, bool kdd>
+std::basic_istream<CharT,Traits>&
+operator>>(std::basic_istream<CharT,Traits>& in,
+           extended<table_pow2, advance_pow2,
+                    baseclass, extvalclass, kdd>& rng)
+{
+    extended<table_pow2, advance_pow2, baseclass, extvalclass, kdd> new_rng;
+    auto& base_rng = static_cast<baseclass&>(new_rng);
+    in >> base_rng;
+
+    if (in.fail())
+        return in;
+
+    auto orig_flags = in.flags(std::ios_base::dec | std::ios_base::skipws);
+
+    for (auto& datum : new_rng.data_) {
+        in >> datum;
+        if (in.fail())
+            goto bail;
+    }
+
+    rng = new_rng;
+
+bail:
+    in.flags(orig_flags);
+    return in;
+}
+
+
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2,
+          typename baseclass, typename extvalclass, bool kdd>
+void
+extended<table_pow2,advance_pow2,baseclass,extvalclass,kdd>::advance_table()
+{
+    bool carry = false;
+    for (size_t i = 0; i < table_size; ++i) {
+        if (carry) {
+            carry = insideout::external_step(data_[i],i+1);
+        }
+        bool carry2 = insideout::external_step(data_[i],i+1);
+        carry = carry || carry2;
+    }
+}
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2,
+          typename baseclass, typename extvalclass, bool kdd>
+void
+extended<table_pow2,advance_pow2,baseclass,extvalclass,kdd>::advance_table(
+        state_type delta, bool isForwards)
+{
+    typedef typename baseclass::state_type   base_state_t;
+    typedef typename extvalclass::state_type ext_state_t;
+    constexpr bitcount_t basebits = sizeof(base_state_t)*8;
+    constexpr bitcount_t extbits  = sizeof(ext_state_t)*8;
+    static_assert(basebits <= extbits || advance_pow2 > 0,
+                  "Current implementation might overflow its carry");
+
+    base_state_t carry = 0;
+    for (size_t i = 0; i < table_size; ++i) {
+        base_state_t total_delta = carry + delta;
+        ext_state_t  trunc_delta = ext_state_t(total_delta);
+        if (basebits > extbits) {
+            carry = total_delta >> extbits;
+        } else {
+            carry = 0;
+        }
+        carry +=
+            insideout::external_advance(data_[i],i+1, trunc_delta, isForwards);
+    }
+}
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2,
+          typename baseclass, typename extvalclass, bool kdd>
+void extended<table_pow2,advance_pow2,baseclass,extvalclass,kdd>::advance(
+    state_type distance, bool forwards)
+{
+    static_assert(kdd,
+        "Efficient advance is too hard for non-kdd extension. "
+        "For a weak advance, cast to base class");
+    state_type zero =
+        baseclass::is_mcg ? this->state_ & state_type(3U) : state_type(0U);
+    if (may_tick) {
+        state_type ticks = distance >> (advance_pow2*may_tick);
+                                        // ^-- stupidity to appease GCC
+                                        // warnings
+        state_type adv_mask =
+            baseclass::is_mcg ? tick_mask << 2 : tick_mask;
+        state_type next_advance_distance = this->distance(zero, adv_mask);
+        if (!forwards)
+            next_advance_distance = (-next_advance_distance) & tick_mask;
+        if (next_advance_distance < (distance & tick_mask)) {
+            ++ticks;
+        }
+        if (ticks)
+            advance_table(ticks, forwards);
+    }
+    if (forwards) {
+        if (may_tock && this->distance(zero) <= distance)
+            advance_table();
+        baseclass::advance(distance);
+    } else {
+        if (may_tock && -(this->distance(zero)) <= distance)
+            advance_table(state_type(1U), false);
+        baseclass::advance(-distance);
+    }
+}
+
+} // namespace pcg_detail
+
+namespace pcg_engines {
+
+using namespace pcg_detail;
+
+/* Predefined types for XSH RS */
+
+typedef oneseq_base<uint8_t,  uint16_t, xsh_rs_mixin>  oneseq_xsh_rs_16_8;
+typedef oneseq_base<uint16_t, uint32_t, xsh_rs_mixin>  oneseq_xsh_rs_32_16;
+typedef oneseq_base<uint32_t, uint64_t, xsh_rs_mixin>  oneseq_xsh_rs_64_32;
+typedef oneseq_base<uint64_t, pcg128_t, xsh_rs_mixin>  oneseq_xsh_rs_128_64;
+
+typedef unique_base<uint8_t,  uint16_t, xsh_rs_mixin>  unique_xsh_rs_16_8;
+typedef unique_base<uint16_t, uint32_t, xsh_rs_mixin>  unique_xsh_rs_32_16;
+typedef unique_base<uint32_t, uint64_t, xsh_rs_mixin>  unique_xsh_rs_64_32;
+typedef unique_base<uint64_t, pcg128_t, xsh_rs_mixin>  unique_xsh_rs_128_64;
+
+typedef setseq_base<uint8_t,  uint16_t, xsh_rs_mixin>  setseq_xsh_rs_16_8;
+typedef setseq_base<uint16_t, uint32_t, xsh_rs_mixin>  setseq_xsh_rs_32_16;
+typedef setseq_base<uint32_t, uint64_t, xsh_rs_mixin>  setseq_xsh_rs_64_32;
+typedef setseq_base<uint64_t, pcg128_t, xsh_rs_mixin>  setseq_xsh_rs_128_64;
+
+typedef mcg_base<uint8_t,  uint16_t, xsh_rs_mixin>  mcg_xsh_rs_16_8;
+typedef mcg_base<uint16_t, uint32_t, xsh_rs_mixin>  mcg_xsh_rs_32_16;
+typedef mcg_base<uint32_t, uint64_t, xsh_rs_mixin>  mcg_xsh_rs_64_32;
+typedef mcg_base<uint64_t, pcg128_t, xsh_rs_mixin>  mcg_xsh_rs_128_64;
+
+/* Predefined types for XSH RR */
+
+typedef oneseq_base<uint8_t,  uint16_t, xsh_rr_mixin>  oneseq_xsh_rr_16_8;
+typedef oneseq_base<uint16_t, uint32_t, xsh_rr_mixin>  oneseq_xsh_rr_32_16;
+typedef oneseq_base<uint32_t, uint64_t, xsh_rr_mixin>  oneseq_xsh_rr_64_32;
+typedef oneseq_base<uint64_t, pcg128_t, xsh_rr_mixin>  oneseq_xsh_rr_128_64;
+
+typedef unique_base<uint8_t,  uint16_t, xsh_rr_mixin>  unique_xsh_rr_16_8;
+typedef unique_base<uint16_t, uint32_t, xsh_rr_mixin>  unique_xsh_rr_32_16;
+typedef unique_base<uint32_t, uint64_t, xsh_rr_mixin>  unique_xsh_rr_64_32;
+typedef unique_base<uint64_t, pcg128_t, xsh_rr_mixin>  unique_xsh_rr_128_64;
+
+typedef setseq_base<uint8_t,  uint16_t, xsh_rr_mixin>  setseq_xsh_rr_16_8;
+typedef setseq_base<uint16_t, uint32_t, xsh_rr_mixin>  setseq_xsh_rr_32_16;
+typedef setseq_base<uint32_t, uint64_t, xsh_rr_mixin>  setseq_xsh_rr_64_32;
+typedef setseq_base<uint64_t, pcg128_t, xsh_rr_mixin>  setseq_xsh_rr_128_64;
+
+typedef mcg_base<uint8_t,  uint16_t, xsh_rr_mixin>  mcg_xsh_rr_16_8;
+typedef mcg_base<uint16_t, uint32_t, xsh_rr_mixin>  mcg_xsh_rr_32_16;
+typedef mcg_base<uint32_t, uint64_t, xsh_rr_mixin>  mcg_xsh_rr_64_32;
+typedef mcg_base<uint64_t, pcg128_t, xsh_rr_mixin>  mcg_xsh_rr_128_64;
+
+
+/* Predefined types for RXS M XS */
+
+typedef oneseq_base<uint8_t,  uint8_t, rxs_m_xs_mixin>   oneseq_rxs_m_xs_8_8;
+typedef oneseq_base<uint16_t, uint16_t, rxs_m_xs_mixin>  oneseq_rxs_m_xs_16_16;
+typedef oneseq_base<uint32_t, uint32_t, rxs_m_xs_mixin>  oneseq_rxs_m_xs_32_32;
+typedef oneseq_base<uint64_t, uint64_t, rxs_m_xs_mixin>  oneseq_rxs_m_xs_64_64;
+typedef oneseq_base<pcg128_t, pcg128_t, rxs_m_xs_mixin>  oneseq_rxs_m_xs_128_128;
+
+typedef unique_base<uint8_t,  uint8_t, rxs_m_xs_mixin>  unique_rxs_m_xs_8_8;
+typedef unique_base<uint16_t, uint16_t, rxs_m_xs_mixin> unique_rxs_m_xs_16_16;
+typedef unique_base<uint32_t, uint32_t, rxs_m_xs_mixin> unique_rxs_m_xs_32_32;
+typedef unique_base<uint64_t, uint64_t, rxs_m_xs_mixin> unique_rxs_m_xs_64_64;
+typedef unique_base<pcg128_t, pcg128_t, rxs_m_xs_mixin> unique_rxs_m_xs_128_128;
+
+typedef setseq_base<uint8_t,  uint8_t, rxs_m_xs_mixin>  setseq_rxs_m_xs_8_8;
+typedef setseq_base<uint16_t, uint16_t, rxs_m_xs_mixin> setseq_rxs_m_xs_16_16;
+typedef setseq_base<uint32_t, uint32_t, rxs_m_xs_mixin> setseq_rxs_m_xs_32_32;
+typedef setseq_base<uint64_t, uint64_t, rxs_m_xs_mixin> setseq_rxs_m_xs_64_64;
+typedef setseq_base<pcg128_t, pcg128_t, rxs_m_xs_mixin> setseq_rxs_m_xs_128_128;
+
+                // MCG versions don't make sense here, so aren't defined.
+
+/* Predefined types for XSL RR (only defined for "large" types) */
+
+typedef oneseq_base<uint32_t, uint64_t, xsl_rr_mixin>  oneseq_xsl_rr_64_32;
+typedef oneseq_base<uint64_t, pcg128_t, xsl_rr_mixin>  oneseq_xsl_rr_128_64;
+
+typedef unique_base<uint32_t, uint64_t, xsl_rr_mixin>  unique_xsl_rr_64_32;
+typedef unique_base<uint64_t, pcg128_t, xsl_rr_mixin>  unique_xsl_rr_128_64;
+
+typedef setseq_base<uint32_t, uint64_t, xsl_rr_mixin>  setseq_xsl_rr_64_32;
+typedef setseq_base<uint64_t, pcg128_t, xsl_rr_mixin>  setseq_xsl_rr_128_64;
+
+typedef mcg_base<uint32_t, uint64_t, xsl_rr_mixin>  mcg_xsl_rr_64_32;
+typedef mcg_base<uint64_t, pcg128_t, xsl_rr_mixin>  mcg_xsl_rr_128_64;
+
+
+/* Predefined types for XSL RR RR (only defined for "large" types) */
+
+typedef oneseq_base<uint64_t, uint64_t, xsl_rr_rr_mixin>
+    oneseq_xsl_rr_rr_64_64;
+typedef oneseq_base<pcg128_t, pcg128_t, xsl_rr_rr_mixin>
+    oneseq_xsl_rr_rr_128_128;
+
+typedef unique_base<uint64_t, uint64_t, xsl_rr_rr_mixin>
+    unique_xsl_rr_rr_64_64;
+typedef unique_base<pcg128_t, pcg128_t, xsl_rr_rr_mixin>
+    unique_xsl_rr_rr_128_128;
+
+typedef setseq_base<uint64_t, uint64_t, xsl_rr_rr_mixin>
+    setseq_xsl_rr_rr_64_64;
+typedef setseq_base<pcg128_t, pcg128_t, xsl_rr_rr_mixin>
+    setseq_xsl_rr_rr_128_128;
+
+                // MCG versions don't make sense here, so aren't defined.
+
+/* Extended generators */
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2,
+          typename BaseRNG, bool kdd = true>
+using ext_std8 = extended<table_pow2, advance_pow2, BaseRNG,
+                          oneseq_rxs_m_xs_8_8, kdd>;
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2,
+          typename BaseRNG, bool kdd = true>
+using ext_std16 = extended<table_pow2, advance_pow2, BaseRNG,
+                           oneseq_rxs_m_xs_16_16, kdd>;
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2,
+          typename BaseRNG, bool kdd = true>
+using ext_std32 = extended<table_pow2, advance_pow2, BaseRNG,
+                           oneseq_rxs_m_xs_32_32, kdd>;
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2,
+          typename BaseRNG, bool kdd = true>
+using ext_std64 = extended<table_pow2, advance_pow2, BaseRNG,
+                           oneseq_rxs_m_xs_64_64, kdd>;
+
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2, bool kdd = true>
+using ext_oneseq_rxs_m_xs_32_32 =
+          ext_std32<table_pow2, advance_pow2, oneseq_rxs_m_xs_32_32, kdd>;
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2, bool kdd = true>
+using ext_mcg_xsh_rs_64_32 =
+          ext_std32<table_pow2, advance_pow2, mcg_xsh_rs_64_32, kdd>;
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2, bool kdd = true>
+using ext_oneseq_xsh_rs_64_32 =
+          ext_std32<table_pow2, advance_pow2, oneseq_xsh_rs_64_32, kdd>;
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2, bool kdd = true>
+using ext_setseq_xsh_rr_64_32 =
+          ext_std32<table_pow2, advance_pow2, setseq_xsh_rr_64_32, kdd>;
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2, bool kdd = true>
+using ext_mcg_xsl_rr_128_64 =
+          ext_std64<table_pow2, advance_pow2, mcg_xsl_rr_128_64, kdd>;
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2, bool kdd = true>
+using ext_oneseq_xsl_rr_128_64 =
+          ext_std64<table_pow2, advance_pow2, oneseq_xsl_rr_128_64, kdd>;
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2, bool kdd = true>
+using ext_setseq_xsl_rr_128_64 =
+          ext_std64<table_pow2, advance_pow2, setseq_xsl_rr_128_64, kdd>;
+
+} // namespace pcg_engines
+
+typedef pcg_engines::setseq_xsh_rr_64_32        pcg32;
+typedef pcg_engines::oneseq_xsh_rr_64_32        pcg32_oneseq;
+typedef pcg_engines::unique_xsh_rr_64_32        pcg32_unique;
+typedef pcg_engines::mcg_xsh_rs_64_32           pcg32_fast;
+
+typedef pcg_engines::setseq_xsl_rr_128_64       pcg64;
+typedef pcg_engines::oneseq_xsl_rr_128_64       pcg64_oneseq;
+typedef pcg_engines::unique_xsl_rr_128_64       pcg64_unique;
+typedef pcg_engines::mcg_xsl_rr_128_64          pcg64_fast;
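+
+// A minimal usage sketch (editorial, not part of the upstream header):
+//
+//     pcg32 rng(42u, 54u);       // seed value, stream (sequence) selector
+//     uint32_t r = rng();        // uniform 32-bit output
+//     uint32_t d = rng(6u);      // bounded output in [0, 6)
+//     rng.advance(1000u);        // O(log n) jump-ahead; backstep() rewinds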
+
+typedef pcg_engines::setseq_rxs_m_xs_8_8        pcg8_once_insecure;
+typedef pcg_engines::setseq_rxs_m_xs_16_16      pcg16_once_insecure;
+typedef pcg_engines::setseq_rxs_m_xs_32_32      pcg32_once_insecure;
+typedef pcg_engines::setseq_rxs_m_xs_64_64      pcg64_once_insecure;
+typedef pcg_engines::setseq_xsl_rr_rr_128_128   pcg128_once_insecure;
+
+typedef pcg_engines::oneseq_rxs_m_xs_8_8        pcg8_oneseq_once_insecure;
+typedef pcg_engines::oneseq_rxs_m_xs_16_16      pcg16_oneseq_once_insecure;
+typedef pcg_engines::oneseq_rxs_m_xs_32_32      pcg32_oneseq_once_insecure;
+typedef pcg_engines::oneseq_rxs_m_xs_64_64      pcg64_oneseq_once_insecure;
+typedef pcg_engines::oneseq_xsl_rr_rr_128_128   pcg128_oneseq_once_insecure;
+
+
+// These two extended RNGs provide two-dimensionally equidistributed
+// 32-bit generators.  pcg32_k2_fast occupies the same space as pcg64,
+// and can be called twice to generate 64 bits, but does not require
+// 128-bit math; on 32-bit systems, it's faster than pcg64 as well.
+
+typedef pcg_engines::ext_setseq_xsh_rr_64_32<1,16,true>     pcg32_k2;
+typedef pcg_engines::ext_oneseq_xsh_rs_64_32<1,32,true>     pcg32_k2_fast;
+
+// These eight extended RNGs have about as much state as arc4random
+//
+//  - the k variants are k-dimensionally equidistributed
+//  - the c variants offer better cryptographic security
+//
+// (just how good the cryptographic security is is an open question)
+
+typedef pcg_engines::ext_setseq_xsh_rr_64_32<6,16,true>     pcg32_k64;
+typedef pcg_engines::ext_mcg_xsh_rs_64_32<6,32,true>        pcg32_k64_oneseq;
+typedef pcg_engines::ext_oneseq_xsh_rs_64_32<6,32,true>     pcg32_k64_fast;
+
+typedef pcg_engines::ext_setseq_xsh_rr_64_32<6,16,false>    pcg32_c64;
+typedef pcg_engines::ext_oneseq_xsh_rs_64_32<6,32,false>    pcg32_c64_oneseq;
+typedef pcg_engines::ext_mcg_xsh_rs_64_32<6,32,false>       pcg32_c64_fast;
+
+typedef pcg_engines::ext_setseq_xsl_rr_128_64<5,16,true>    pcg64_k32;
+typedef pcg_engines::ext_oneseq_xsl_rr_128_64<5,128,true>   pcg64_k32_oneseq;
+typedef pcg_engines::ext_mcg_xsl_rr_128_64<5,128,true>      pcg64_k32_fast;
+
+typedef pcg_engines::ext_setseq_xsl_rr_128_64<5,16,false>   pcg64_c32;
+typedef pcg_engines::ext_oneseq_xsl_rr_128_64<5,128,false>  pcg64_c32_oneseq;
+typedef pcg_engines::ext_mcg_xsl_rr_128_64<5,128,false>     pcg64_c32_fast;
+
+// These eight extended RNGs have more state than the Mersenne twister
+//
+//  - the k variants are k-dimensionally equidistributed
+//  - the c variants offer better cryptographic security
+//
+// (just how good the cryptographic security is is an open question)
+
+typedef pcg_engines::ext_setseq_xsh_rr_64_32<10,16,true>    pcg32_k1024;
+typedef pcg_engines::ext_oneseq_xsh_rs_64_32<10,32,true>    pcg32_k1024_fast;
+
+typedef pcg_engines::ext_setseq_xsh_rr_64_32<10,16,false>   pcg32_c1024;
+typedef pcg_engines::ext_oneseq_xsh_rs_64_32<10,32,false>   pcg32_c1024_fast;
+
+typedef pcg_engines::ext_setseq_xsl_rr_128_64<10,16,true>   pcg64_k1024;
+typedef pcg_engines::ext_oneseq_xsl_rr_128_64<10,128,true>  pcg64_k1024_fast;
+
+typedef pcg_engines::ext_setseq_xsl_rr_128_64<10,16,false>  pcg64_c1024;
+typedef pcg_engines::ext_oneseq_xsl_rr_128_64<10,128,false> pcg64_c1024_fast;
+
+// These generators have an insanely huge period (2^524352), and are suitable
+// for silly party tricks, such as dumping out 64 KB ZIP files at an arbitrary
+// point in the future.   [Actually, over the full period of the generator,
+// they will produce every 64 KB ZIP file 2^64 times!]
+
+typedef pcg_engines::ext_setseq_xsh_rr_64_32<14,16,true>    pcg32_k16384;
+typedef pcg_engines::ext_oneseq_xsh_rs_64_32<14,32,true>    pcg32_k16384_fast;
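+
+// A minimal sketch (editorial): the extended generators are drop-in
+// replacements for the plain ones, just with far more state:
+//
+//     pcg32_k16384 rng(1234u);   // 2^14-entry table plus 64-bit LCG state
+//     uint32_t r = rng();        // same interface as pcg32
+//     rng.advance(1u << 20);     // jump-ahead still works for k variants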
+
+#endif // PCG_RAND_HPP_INCLUDED
diff --git a/include/safe_btree.h b/include/safe_btree.h
deleted file mode 100755
index 2d85c70..0000000
--- a/include/safe_btree.h
+++ /dev/null
@@ -1,395 +0,0 @@
-// Copyright 2013 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-// A safe_btree<> wraps around a btree<> and removes the caveat that insertion
-// and deletion invalidate iterators. A safe_btree<> maintains a generation
-// number that is incremented on every mutation. A safe_btree<>::iterator keeps
-// a pointer to the safe_btree<> it came from, the generation of the tree when
-// it was last validated and the key the underlying btree<>::iterator points
-// to. If an iterator is accessed and its generation differs from the tree
-// generation it is revalidated.
-//
-// References and pointers returned by safe_btree iterators are not safe.
-//
-// See the incorrect usage examples mentioned in safe_btree_set.h and
-// safe_btree_map.h.
-
-#ifndef UTIL_BTREE_SAFE_BTREE_H__
-#define UTIL_BTREE_SAFE_BTREE_H__
-
-#include <stddef.h>
-#include <iosfwd>
-#include <utility>
-
-#include "btree.h"
-
-namespace btree {
-
-template <typename Tree, typename Iterator>
-class safe_btree_iterator {
- public:
-  typedef typename Iterator::key_type key_type;
-  typedef typename Iterator::value_type value_type;
-  typedef typename Iterator::size_type size_type;
-  typedef typename Iterator::difference_type difference_type;
-  typedef typename Iterator::pointer pointer;
-  typedef typename Iterator::reference reference;
-  typedef typename Iterator::const_pointer const_pointer;
-  typedef typename Iterator::const_reference const_reference;
-  typedef typename Iterator::iterator_category iterator_category;
-  typedef typename Tree::iterator iterator;
-  typedef typename Tree::const_iterator const_iterator;
-  typedef safe_btree_iterator<Tree, Iterator> self_type;
-
-  void update() const {
-    if (iter_ != tree_->internal_btree()->end()) {
-      // A positive generation indicates a valid key.
-      generation_ = tree_->generation();
-      key_ = iter_.key();
-    } else {
-      // Use a negative generation to indicate iter_ points to end().
-      generation_ = -tree_->generation();
-    }
-  }
-
- public:
-  safe_btree_iterator()
-      : generation_(0),
-        key_(),
-        iter_(),
-        tree_(NULL) {
-  }
-  safe_btree_iterator(const iterator &x)
-      : generation_(x.generation()),
-        key_(x.key()),
-        iter_(x.iter()),
-        tree_(x.tree()) {
-  }
-  safe_btree_iterator(Tree *tree, const Iterator &iter)
-      : generation_(),
-        key_(),
-        iter_(iter),
-        tree_(tree) {
-    update();
-  }
-
-  Tree* tree() const { return tree_; }
-  int64_t generation() const { return generation_; }
-
-  Iterator* mutable_iter() const {
-    if (generation_ != tree_->generation()) {
-      if (generation_ > 0) {
-        // This does the wrong thing for a multi{set,map}. If my iter was
-        // pointing to the 2nd of 2 values with the same key, then this will
-        // reset it to point to the first. This is why we don't provide a
-        // safe_btree_multi{set,map}.
-        iter_ = tree_->internal_btree()->lower_bound(key_);
-        update();
-      } else if (-generation_ != tree_->generation()) {
-        iter_ = tree_->internal_btree()->end();
-        generation_ = -tree_->generation();
-      }
-    }
-    return &iter_;
-  }
-  const Iterator& iter() const {
-    return *mutable_iter();
-  }
-
-  // Equality/inequality operators.
-  bool operator==(const const_iterator &x) const {
-    return iter() == x.iter();
-  }
-  bool operator!=(const const_iterator &x) const {
-    return iter() != x.iter();
-  }
-
-  // Accessors for the key/value the iterator is pointing at.
-  const key_type& key() const {
-    return key_;
-  }
-  // This reference value is potentially invalidated by any non-const
-  // method on the tree; it is NOT safe.
-  reference operator*() const {
-    assert(generation_ > 0);
-    return iter().operator*();
-  }
-  // This pointer value is potentially invalidated by any non-const
-  // method on the tree; it is NOT safe.
-  pointer operator->() const {
-    assert(generation_ > 0);
-    return iter().operator->();
-  }
-
-  // Increment/decrement operators.
-  self_type& operator++() {
-    ++(*mutable_iter());
-    update();
-    return *this;
-  }
-  self_type& operator--() {
-    --(*mutable_iter());
-    update();
-    return *this;
-  }
-  self_type operator++(int) {
-    self_type tmp = *this;
-    ++*this;
-    return tmp;
-  }
-  self_type operator--(int) {
-    self_type tmp = *this;
-    --*this;
-    return tmp;
-  }
-
- private:
-  // The generation of the tree when "iter" was updated.
-  mutable int64_t generation_;
-  // The key the iterator points to.
-  mutable key_type key_;
-  // The underlying iterator.
-  mutable Iterator iter_;
-  // The tree the iterator is associated with.
-  Tree *tree_;
-};
-
-template <typename Params>
-class safe_btree {
-  typedef safe_btree<Params> self_type;
-
-  typedef btree<Params> btree_type;
-  typedef typename btree_type::iterator tree_iterator;
-  typedef typename btree_type::const_iterator tree_const_iterator;
-
- public:
-  typedef typename btree_type::params_type params_type;
-  typedef typename btree_type::key_type key_type;
-  typedef typename btree_type::data_type data_type;
-  typedef typename btree_type::mapped_type mapped_type;
-  typedef typename btree_type::value_type value_type;
-  typedef typename btree_type::key_compare key_compare;
-  typedef typename btree_type::allocator_type allocator_type;
-  typedef typename btree_type::pointer pointer;
-  typedef typename btree_type::const_pointer const_pointer;
-  typedef typename btree_type::reference reference;
-  typedef typename btree_type::const_reference const_reference;
-  typedef typename btree_type::size_type size_type;
-  typedef typename btree_type::difference_type difference_type;
-  typedef safe_btree_iterator<self_type, tree_iterator> iterator;
-  typedef safe_btree_iterator<
-    const self_type, tree_const_iterator> const_iterator;
-  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
-  typedef std::reverse_iterator<iterator> reverse_iterator;
-
- public:
-  // Default constructor.
-  safe_btree(const key_compare &comp, const allocator_type &alloc)
-      : tree_(comp, alloc),
-        generation_(1) {
-  }
-
-  // Copy constructor.
-  safe_btree(const self_type &x)
-      : tree_(x.tree_),
-        generation_(1) {
-  }
-
-  iterator begin() {
-    return iterator(this, tree_.begin());
-  }
-  const_iterator begin() const {
-    return const_iterator(this, tree_.begin());
-  }
-  iterator end() {
-    return iterator(this, tree_.end());
-  }
-  const_iterator end() const {
-    return const_iterator(this, tree_.end());
-  }
-  reverse_iterator rbegin() {
-    return reverse_iterator(end());
-  }
-  const_reverse_iterator rbegin() const {
-    return const_reverse_iterator(end());
-  }
-  reverse_iterator rend() {
-    return reverse_iterator(begin());
-  }
-  const_reverse_iterator rend() const {
-    return const_reverse_iterator(begin());
-  }
-
-  // Lookup routines.
-  iterator lower_bound(const key_type &key) {
-    return iterator(this, tree_.lower_bound(key));
-  }
-  const_iterator lower_bound(const key_type &key) const {
-    return const_iterator(this, tree_.lower_bound(key));
-  }
-  iterator upper_bound(const key_type &key) {
-    return iterator(this, tree_.upper_bound(key));
-  }
-  const_iterator upper_bound(const key_type &key) const {
-    return const_iterator(this, tree_.upper_bound(key));
-  }
-  std::pair<iterator, iterator> equal_range(const key_type &key) {
-    std::pair<tree_iterator, tree_iterator> p = tree_.equal_range(key);
-    return std::make_pair(iterator(this, p.first),
-                     iterator(this, p.second));
-  }
-  std::pair<const_iterator, const_iterator> equal_range(const key_type &key) const {
-    std::pair<tree_const_iterator, tree_const_iterator> p = tree_.equal_range(key);
-    return std::make_pair(const_iterator(this, p.first),
-                     const_iterator(this, p.second));
-  }
-  iterator find_unique(const key_type &key) {
-    return iterator(this, tree_.find_unique(key));
-  }
-  const_iterator find_unique(const key_type &key) const {
-    return const_iterator(this, tree_.find_unique(key));
-  }
-  iterator find_multi(const key_type &key) {
-    return iterator(this, tree_.find_multi(key));
-  }
-  const_iterator find_multi(const key_type &key) const {
-    return const_iterator(this, tree_.find_multi(key));
-  }
-  size_type count_unique(const key_type &key) const {
-    return tree_.count_unique(key);
-  }
-  size_type count_multi(const key_type &key) const {
-    return tree_.count_multi(key);
-  }
-
-  // Insertion routines.
-  template <typename ValuePointer>
-  std::pair<iterator, bool> insert_unique(const key_type &key, ValuePointer value) {
-    std::pair<tree_iterator, bool> p = tree_.insert_unique(key, value);
-    generation_ += p.second;
-    return std::make_pair(iterator(this, p.first), p.second);
-  }
-  std::pair<iterator, bool> insert_unique(const value_type &v) {
-    std::pair<tree_iterator, bool> p = tree_.insert_unique(v);
-    generation_ += p.second;
-    return std::make_pair(iterator(this, p.first), p.second);
-  }
-  iterator insert_unique(iterator position, const value_type &v) {
-    tree_iterator tree_pos = position.iter();
-    ++generation_;
-    return iterator(this, tree_.insert_unique(tree_pos, v));
-  }
-  template <typename InputIterator>
-  void insert_unique(InputIterator b, InputIterator e) {
-    for (; b != e; ++b) {
-      insert_unique(*b);
-    }
-  }
-  iterator insert_multi(const value_type &v) {
-    ++generation_;
-    return iterator(this, tree_.insert_multi(v));
-  }
-  iterator insert_multi(iterator position, const value_type &v) {
-    tree_iterator tree_pos = position.iter();
-    ++generation_;
-    return iterator(this, tree_.insert_multi(tree_pos, v));
-  }
-  template <typename InputIterator>
-  void insert_multi(InputIterator b, InputIterator e) {
-    for (; b != e; ++b) {
-      insert_multi(*b);
-    }
-  }
-  self_type& operator=(const self_type &x) {
-    if (&x == this) {
-      // Don't copy onto ourselves.
-      return *this;
-    }
-    ++generation_;
-    tree_ = x.tree_;
-    return *this;
-  }
-
-  // Deletion routines.
-  void erase(const iterator &begin, const iterator &end) {
-    tree_.erase(begin.iter(), end.iter());
-    ++generation_;
-  }
-  // Erase the specified iterator from the btree. The iterator must be valid
-  // (i.e. not equal to end()).  Return an iterator pointing to the node after
-  // the one that was erased (or end() if none exists).
-  iterator erase(iterator iter) {
-    tree_iterator res = tree_.erase(iter.iter());
-    ++generation_;
-    return iterator(this, res);
-  }
-  int erase_unique(const key_type &key) {
-    int res = tree_.erase_unique(key);
-    generation_ += res;
-    return res;
-  }
-  int erase_multi(const key_type &key) {
-    int res = tree_.erase_multi(key);
-    generation_ += res;
-    return res;
-  }
-
-  // Access to the underlying btree.
-  btree_type* internal_btree() { return &tree_; }
-  const btree_type* internal_btree() const { return &tree_; }
-
-  // Utility routines.
-  void clear() {
-    ++generation_;
-    tree_.clear();
-  }
-  void swap(self_type &x) {
-    ++generation_;
-    ++x.generation_;
-    tree_.swap(x.tree_);
-  }
-  void dump(std::ostream &os) const {
-    tree_.dump(os);
-  }
-  void verify() const {
-    tree_.verify();
-  }
-  int64_t generation() const {
-    return generation_;
-  }
-  key_compare key_comp() const { return tree_.key_comp(); }
-
-  // Size routines.
-  size_type size() const { return tree_.size(); }
-  size_type max_size() const { return tree_.max_size(); }
-  bool empty() const { return tree_.empty(); }
-  size_type height() const { return tree_.height(); }
-  size_type internal_nodes() const { return tree_.internal_nodes(); }
-  size_type leaf_nodes() const { return tree_.leaf_nodes(); }
-  size_type nodes() const { return tree_.nodes(); }
-  size_type bytes_used() const { return tree_.bytes_used(); }
-  static double average_bytes_per_value() {
-    return btree_type::average_bytes_per_value();
-  }
-  double fullness() const { return tree_.fullness(); }
-  double overhead() const { return tree_.overhead(); }
-
- private:
-  btree_type tree_;
-  int64_t generation_;
-};
-
-}  // namespace btree
-
-#endif  // UTIL_BTREE_SAFE_BTREE_H__
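
The comments at the top of the header deleted above spell out the mechanism
these wrappers relied on: the tree keeps a generation counter that every
mutation increments, and each iterator records its tree, the generation at
which it was last validated, and the key it points at; an access that sees a
stale generation re-finds the stored key via lower_bound. A minimal sketch of
that idea, using std::map as a stand-in for btree<> (all names below are
illustrative, not Salmon code):

    #include <cstdint>
    #include <iostream>
    #include <map>

    struct GenMap {
      std::map<int, int> data;
      std::int64_t generation = 1;

      void insert(int k, int v) { data[k] = v; ++generation; }

      // Iterator that remembers its key and the generation at which it was
      // last valid; a stale generation triggers revalidation by key.
      struct Iter {
        GenMap* owner;
        std::int64_t seenGen;
        int key;
        std::map<int, int>::iterator it;

        std::map<int, int>::iterator get() {
          if (seenGen != owner->generation) {  // tree mutated since last use
            it = owner->data.lower_bound(key); // re-find the stored key
            seenGen = owner->generation;
          }
          return it;
        }
      };

      Iter find(int k) { return Iter{this, generation, k, data.find(k)}; }
    };

    int main() {
      GenMap m;
      m.insert(1, 10);
      auto it = m.find(1);
      m.insert(2, 20); // bumps the generation; a raw iterator could go stale
      std::cout << it.get()->second << "\n"; // revalidates, prints 10
    }

As the deleted comments note, revalidating through lower_bound is also why no
safe_btree_multi{set,map} was provided: with duplicate keys it always lands on
the first element.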
diff --git a/include/safe_btree_map.h b/include/safe_btree_map.h
deleted file mode 100755
index a0668f1..0000000
--- a/include/safe_btree_map.h
+++ /dev/null
@@ -1,89 +0,0 @@
-// Copyright 2013 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-// The safe_btree_map<> is like btree_map<> except that it removes the caveat
-// about insertion and deletion invalidating existing iterators at a small cost
-// in making iterators larger and slower.
-//
-// Revalidation occurs whenever an iterator is accessed.  References
-// and pointers returned by safe_btree_map<> iterators are not stable,
-// they are potentially invalidated by any non-const method on the map.
-//
-// BEGIN INCORRECT EXAMPLE
-//   for (auto i = safe_map->begin(); i != safe_map->end(); ++i) {
-//     const T *value = &i->second;  // DO NOT DO THIS
-//     [code that modifies safe_map and uses value];
-//   }
-// END INCORRECT EXAMPLE
-#ifndef UTIL_BTREE_SAFE_BTREE_MAP_H__
-#define UTIL_BTREE_SAFE_BTREE_MAP_H__
-
-#include <functional>
-#include <memory>
-#include <utility>
-
-#include "btree_container.h"
-#include "btree_map.h"
-#include "safe_btree.h"
-
-namespace btree {
-
-// The safe_btree_map class is needed mainly for its constructors.
-template <typename Key, typename Value,
-          typename Compare = std::less<Key>,
-          typename Alloc = std::allocator<std::pair<const Key, Value> >,
-          int TargetNodeSize = 256>
-class safe_btree_map : public btree_map_container<
-  safe_btree<btree_map_params<Key, Value, Compare, Alloc, TargetNodeSize> > > {
-
-  typedef safe_btree_map<Key, Value, Compare, Alloc, TargetNodeSize> self_type;
-  typedef btree_map_params<
-    Key, Value, Compare, Alloc, TargetNodeSize> params_type;
-  typedef safe_btree<params_type> btree_type;
-  typedef btree_map_container<btree_type> super_type;
-
- public:
-  typedef typename btree_type::key_compare key_compare;
-  typedef typename btree_type::allocator_type allocator_type;
-
- public:
-  // Default constructor.
-  safe_btree_map(const key_compare &comp = key_compare(),
-                 const allocator_type &alloc = allocator_type())
-      : super_type(comp, alloc) {
-  }
-
-  // Copy constructor.
-  safe_btree_map(const self_type &x)
-      : super_type(x) {
-  }
-
-  // Range constructor.
-  template <class InputIterator>
-  safe_btree_map(InputIterator b, InputIterator e,
-                 const key_compare &comp = key_compare(),
-                 const allocator_type &alloc = allocator_type())
-      : super_type(b, e, comp, alloc) {
-  }
-};
-
-template <typename K, typename V, typename C, typename A, int N>
-inline void swap(safe_btree_map<K, V, C, A, N> &x,
-                 safe_btree_map<K, V, C, A, N> &y) {
-  x.swap(y);
-}
-
-} // namespace btree
-
-#endif  // UTIL_BTREE_SAFE_BTREE_MAP_H__
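
The BEGIN/END INCORRECT EXAMPLE block above is the caveat that survives even
with revalidating iterators: the iterator itself stays usable across
mutations, but references and pointers obtained through it do not. A minimal
sketch of the safe pattern (illustrative only; std::map stands in for the
container):

    #include <map>
    #include <string>

    void process(std::map<int, std::string>& safe_map) {
      for (auto i = safe_map.begin(); i != safe_map.end(); ++i) {
        std::string value = i->second; // copy; do NOT keep &i->second
        // [code that modifies safe_map may use `value` safely here]
        (void)value; // silences the unused-variable warning in this sketch
      }
    }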
diff --git a/include/safe_btree_set.h b/include/safe_btree_set.h
deleted file mode 100755
index a6cd541..0000000
--- a/include/safe_btree_set.h
+++ /dev/null
@@ -1,88 +0,0 @@
-// Copyright 2013 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-// The safe_btree_set<> is like btree_set<> except that it removes the caveat
-// about insertion and deletion invalidating existing iterators at a small cost
-// in making iterators larger and slower.
-//
-// Revalidation occurs whenever an iterator is accessed.  References
-// and pointers returned by safe_btree_map<> iterators are not stable,
-// they are potentially invalidated by any non-const method on the set.
-//
-// BEGIN INCORRECT EXAMPLE
-//   for (auto i = safe_set->begin(); i != safe_set->end(); ++i) {
-//     const T &value = *i;  // DO NOT DO THIS
-//     [code that modifies safe_set and uses value];
-//   }
-// END INCORRECT EXAMPLE
-
-#ifndef UTIL_BTREE_SAFE_BTREE_SET_H__
-#define UTIL_BTREE_SAFE_BTREE_SET_H__
-
-#include <functional>
-#include <memory>
-
-#include "btree_container.h"
-#include "btree_set.h"
-#include "safe_btree.h"
-
-namespace btree {
-
-// The safe_btree_set class is needed mainly for its constructors.
-template <typename Key,
-          typename Compare = std::less<Key>,
-          typename Alloc = std::allocator<Key>,
-          int TargetNodeSize = 256>
-class safe_btree_set : public btree_unique_container<
-  safe_btree<btree_set_params<Key, Compare, Alloc, TargetNodeSize> > > {
-
-  typedef safe_btree_set<Key, Compare, Alloc, TargetNodeSize> self_type;
-  typedef btree_set_params<Key, Compare, Alloc, TargetNodeSize> params_type;
-  typedef safe_btree<params_type> btree_type;
-  typedef btree_unique_container<btree_type> super_type;
-
- public:
-  typedef typename btree_type::key_compare key_compare;
-  typedef typename btree_type::allocator_type allocator_type;
-
- public:
-  // Default constructor.
-  safe_btree_set(const key_compare &comp = key_compare(),
-                 const allocator_type &alloc = allocator_type())
-      : super_type(comp, alloc) {
-  }
-
-  // Copy constructor.
-  safe_btree_set(const self_type &x)
-      : super_type(x) {
-  }
-
-  // Range constructor.
-  template <class InputIterator>
-  safe_btree_set(InputIterator b, InputIterator e,
-                 const key_compare &comp = key_compare(),
-                 const allocator_type &alloc = allocator_type())
-      : super_type(b, e, comp, alloc) {
-  }
-};
-
-template <typename K, typename C, typename A, int N>
-inline void swap(safe_btree_set<K, C, A, N> &x,
-                 safe_btree_set<K, C, A, N> &y) {
-  x.swap(y);
-}
-
-} // namespace btree
-
-#endif  // UTIL_BTREE_SAFE_BTREE_SET_H__
diff --git a/scripts/ConvertBootstrapsToTSV.py b/scripts/ConvertBootstrapsToTSV.py
index 12bb878..ce986f6 100644
--- a/scripts/ConvertBootstrapsToTSV.py
+++ b/scripts/ConvertBootstrapsToTSV.py
@@ -29,8 +29,8 @@ def main(args):
         if 'auxDir' in dat:
             auxDir = dat['auxDir']
 
-    bootstrapFile = os.path.sep.join([quantDir, "aux", "bootstrap", "bootstraps.gz"])
-    nameFile = os.path.sep.join([quantDir, "aux", "bootstrap", "names.tsv.gz"])
+    bootstrapFile = os.path.sep.join([quantDir, auxDir, "bootstrap", "bootstraps.gz"])
+    nameFile = os.path.sep.join([quantDir, auxDir, "bootstrap", "names.tsv.gz"])
     if not os.path.isfile(bootstrapFile):
        logging.error("The required bootstrap file {} doesn't appear to exist".format(bootstrapFile)) 
        sys.exit(1)
@@ -45,11 +45,12 @@ def main(args):
     ntxp = len(txpNames)
     logging.info("Expecting bootstrap info for {} transcripts".format(ntxp))
     
-    with open(os.path.sep.join([quantDir, "aux", "meta_info.json"])) as fh:
+    with open(os.path.sep.join([quantDir, auxDir, "meta_info.json"])) as fh:
         meta_info = json.load(fh)
         
     if meta_info['samp_type'] == 'gibbs':
-        s = struct.Struct('<' + 'i' * ntxp)
+        #s = struct.Struct('<' + 'i' * ntxp)
+        s = struct.Struct('@' + 'd' * ntxp)
     elif meta_info['samp_type'] == 'bootstrap':
         s = struct.Struct('@' + 'd' * ntxp)
     else:
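
Two fixes land in this script: the hard-coded "aux" path is replaced by the
auxDir recorded in the quant run's JSON metadata, and the 'gibbs' branch now
unpacks each sample with the same native-endian double layout
('@' + 'd' * ntxp) already used for 'bootstrap' samples, instead of
little-endian int32s. A minimal sketch of parsing one such record after
decompression (illustrative only; the function name is invented):

    #include <cstring>
    #include <vector>

    // One sample record: ntxp native-endian doubles, one per transcript.
    std::vector<double> parseSampleRecord(const char* buf, std::size_t ntxp) {
      std::vector<double> sample(ntxp);
      std::memcpy(sample.data(), buf, ntxp * sizeof(double));
      return sample;
    }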
diff --git a/scripts/Dockerfile b/scripts/Dockerfile
new file mode 100644
index 0000000..c6f61d9
--- /dev/null
+++ b/scripts/Dockerfile
@@ -0,0 +1,34 @@
+FROM combinelab/holy-build-box-64:latest_working
+
+RUN set -x
+
+# Install things we need
+RUN yum install -y --quiet wget
+RUN wget http://download.fedoraproject.org/pub/epel/5/x86_64/epel-release-5-4.noarch.rpm
+RUN rpm -i --quiet epel-release-5-4.noarch.rpm
+#yum install -y --quiet git
+RUN yum install -y --quiet unzip
+RUN yum install -y --quiet bzip2-devel.x86_64
+RUN yum install -y --quiet xz-devel.x86_64
+RUN yum install -y --quiet git
+
+RUN wget http://downloads.sourceforge.net/project/boost/boost/1.59.0/boost_1_59_0.tar.gz 
+RUN tar xzf boost_1_59_0.tar.gz
+WORKDIR "/boost_1_59_0"
+RUN source /hbb_exe/activate && ./bootstrap.sh --prefix=/usr --with-libraries=iostreams,atomic,chrono,container,date_time,exception,filesystem,graph,graph_parallel,math,program_options,system,thread,timer,serialization
+RUN source /hbb_exe/activate && ./b2 -d0 -j10 cxxflags=-std=c++11 link=static install
+WORKDIR "/"
+RUN rm boost_1_59_0.tar.gz
+RUN rm -fr "/boost_1_59_0"
+RUN wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh
+RUN bash Miniconda2-latest-Linux-x86_64.sh -b
+RUN PYTHONPATH=/root/miniconda2/lib/python2.7/site-packages PATH=/root/miniconda2/bin:$PATH pip install pandas scipy numpy matplotlib seaborn
+
+# java
+RUN wget --no-cookies --no-check-certificate --header "Cookie: gpw_e24=http%3A%2F%2Fwww.oracle.com%2F; oraclelicense=accept-securebackup-cookie" "http://download.oracle.com/otn-pub/java/jdk/8u60-b27/jre-8u60-linux-x64.rpm" -O jre-8u60-linux-x64.rpm
+RUN yum localinstall --nogpgcheck -y --quiet jre-8u60-linux-x64.rpm
+RUN rm jre-8u60-linux-x64.rpm
+
+# and nextflow
+RUN curl -fsSL get.nextflow.io | bash
+RUN mv nextflow /usr/local/bin/
diff --git a/scripts/fetchRapMap.sh b/scripts/fetchRapMap.sh
index febdbf1..d30c3aa 100755
--- a/scripts/fetchRapMap.sh
+++ b/scripts/fetchRapMap.sh
@@ -17,12 +17,12 @@ if [ -d ${INSTALL_DIR}/src/rapmap ] ; then
 fi
 
 mkdir -p ${EXTERNAL_DIR}
-curl -k -L https://github.com/COMBINE-lab/RapMap/archive/salmon-v0.7.2.zip -o ${EXTERNAL_DIR}/rapmap.zip
+curl -k -L https://github.com/COMBINE-lab/RapMap/archive/salmon-v0.8.0.zip -o ${EXTERNAL_DIR}/rapmap.zip
 #curl -k -L https://github.com/COMBINE-lab/RapMap/archive/develop-salmon.zip -o ${EXTERNAL_DIR}/rapmap.zip
 rm -fr ${EXTERNAL_DIR}/RapMap
 unzip ${EXTERNAL_DIR}/rapmap.zip -d ${EXTERNAL_DIR}
 #mv ${EXTERNAL_DIR}/RapMap-develop-salmon ${EXTERNAL_DIR}/RapMap
-mv ${EXTERNAL_DIR}/RapMap-salmon-v0.7.2 ${EXTERNAL_DIR}/RapMap
+mv ${EXTERNAL_DIR}/RapMap-salmon-v0.8.0 ${EXTERNAL_DIR}/RapMap
 
 mkdir -p ${INSTALL_DIR}/include/rapmap
 mkdir -p ${INSTALL_DIR}/src/rapmap
diff --git a/scripts/make-release.sh b/scripts/make-release.sh
index 77fa60c..c2769af 100755
--- a/scripts/make-release.sh
+++ b/scripts/make-release.sh
@@ -44,6 +44,9 @@ echo -e "Copying over the binary and Intel TBB libraries\n"
 cp ${DIR}/../bin/salmon ${DIR}/../RELEASES/${betaname}/bin/
 cp ${DIR}/../lib/libtbb* ${DIR}/../RELEASES/${betaname}/lib/
 
+# copy over the test data
+cp ${DIR}/../sample_data.tgz ${DIR}/../RELEASES/${betaname}/
+
 # copy other dependencies (shared libraries)
 echo -e "Copying over other shared library dependencies\n"
 bash ${DIR}/../scripts/cpld.bash ${DIR}/../bin/salmon ${DIR}/../RELEASES/${betaname}/lib/
diff --git a/scripts/test_sim_corr.py b/scripts/test_sim_corr.py
new file mode 100644
index 0000000..82da053
--- /dev/null
+++ b/scripts/test_sim_corr.py
@@ -0,0 +1,48 @@
+import argparse
+import errno    
+import os
+
+
+def mkdir_p(path):
+    """
+    http://stackoverflow.com/questions/600268/mkdir-p-functionality-in-python
+    """
+    try:
+        os.makedirs(path)
+    except OSError as exc:  # Python >2.5
+        if exc.errno == errno.EEXIST and os.path.isdir(path):
+            pass
+        else:
+            raise
+
+def computeSimMetrics(args):
+    import pandas as pd
+    import json
+    simFile = args.sim
+    estFile = args.est
+    sd = pd.read_table(simFile).set_index('Name')
+    ed = pd.read_table(estFile).set_index('Name')
+    d = sd.join(ed, rsuffix='_est')
+
+    res = {}
+    res['pearson'] = d['TPM'].corr(d['TPM_est'])
+    res['spearman'] = d['TPM'].corr(d['TPM_est'], method='spearman')
+
+    import os.path
+    pdir = os.path.dirname(args.out)
+    if not os.path.exists(pdir):
+        mkdir_p(pdir)
+
+    with open(args.out, 'w') as outfile:
+        json.dump(res, outfile)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Test simulated accuracy')
+    parser.add_argument('--sim', type=str, help='path to simulated tpm')
+    parser.add_argument('--est', type=str, help='path to estimated tpm')
+    parser.add_argument('--out', type=str, help='where to write the output')
+    args = parser.parse_args()
+    computeSimMetrics(args)
+
+
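The new script joins the simulated and estimated quantification tables on
their Name column and reports the Pearson and Spearman correlations of the
two TPM columns via pandas. For reference, a minimal sketch of the Pearson
statistic it computes (Spearman is the same statistic applied to the ranks of
each vector); illustrative only, assuming non-empty, equal-length inputs:

    #include <cmath>
    #include <cstddef>
    #include <vector>

    double pearson(const std::vector<double>& x, const std::vector<double>& y) {
      const std::size_t n = x.size();
      double mx = 0.0, my = 0.0;
      for (std::size_t i = 0; i < n; ++i) { mx += x[i]; my += y[i]; }
      mx /= n; my /= n;
      // covariance and variances, all unnormalized (the factors cancel)
      double sxy = 0.0, sxx = 0.0, syy = 0.0;
      for (std::size_t i = 0; i < n; ++i) {
        sxy += (x[i] - mx) * (y[i] - my);
        sxx += (x[i] - mx) * (x[i] - mx);
        syy += (y[i] - my) * (y[i] - my);
      }
      return sxy / std::sqrt(sxx * syy);
    }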
diff --git a/src/BuildSalmonIndex.cpp b/src/BuildSalmonIndex.cpp
index 098f246..7063aa0 100644
--- a/src/BuildSalmonIndex.cpp
+++ b/src/BuildSalmonIndex.cpp
@@ -104,7 +104,7 @@ int salmonIndex(int argc, char* argv[]) {
                             "The default should be OK, unless your transcriptome is huge. "
 			    "This value should be a power of 2.")
     ;
-
+    
     po::variables_map vm;
     int ret = 0;
     try {
@@ -143,6 +143,18 @@ Creates a salmon index.
         string transcriptFile = vm["transcripts"].as<string>();
         bfs::path indexDirectory(vm["index"].as<string>());
 
+        // Check that the transcriptome file exists
+        if (!bfs::exists(transcriptFile)) {
+          std::cerr << "The file [" << transcriptFile << "] provided for the transcriptome "
+                    << "does not appear to exist.";
+          std::exit(1);
+        }
+        // and is not a directory
+        if (bfs::is_directory(transcriptFile)) {
+          std::cerr << "The provided transcriptome argument [" << transcriptFile << "] appears to be a directory; "
+                    << "please provide a file.";
+          std::exit(1);
+        }
 
         if (!bfs::exists(indexDirectory)) {
             std::cerr << "index [" << indexDirectory << "] did not previously exist "
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index c851492..12ee140 100755
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -27,7 +27,6 @@ xxhash.c
 ${GAT_SOURCE_DIR}/external/install/src/rapmap/RapMapFileSystem.cpp
 ${GAT_SOURCE_DIR}/external/install/src/rapmap/RapMapSAIndexer.cpp
 ${GAT_SOURCE_DIR}/external/install/src/rapmap/RapMapSAIndex.cpp
-${GAT_SOURCE_DIR}/external/install/src/rapmap/RapMapSAMapper.cpp
 ${GAT_SOURCE_DIR}/external/install/src/rapmap/RapMapUtils.cpp
 ${GAT_SOURCE_DIR}/external/install/src/rapmap/HitManager.cpp
 ${GAT_SOURCE_DIR}/external/install/src/rapmap/rank9b.cpp
@@ -109,6 +108,7 @@ endif()
 # Build the Salmon library
 add_library(salmon_core STATIC ${SALMON_LIB_SRCS} )
 
+
 # Build the salmon executable
 add_executable(salmon ${SALMON_MAIN_SRCS} ${SALMON_ALIGN_SRCS})
 
@@ -144,9 +144,44 @@ target_link_libraries(salmon
     ${LIBSALMON_LINKER_FLAGS}
     ${NON_APPLECLANG_LIBS}
     ${FAST_MALLOC_LIB}
+    ${LIBRT}
 )
 
+##
+# External dependencies of salmon_core and salmon
+##
+if (${FETCHED_JEMALLOC})
+   add_dependencies(salmon_core libjemalloc)
+   add_dependencies(salmon libjemalloc)
+endif()
+
+if (${FETCHED_BOOST})
+   add_dependencies(salmon_core libboost)
+   add_dependencies(salmon libboost)
+endif()
+
+if (${FETCHED_JELLYFISH})
+   add_dependencies(salmon_core libjellyfish)
+   add_dependencies(salmon libjellyfish)
+endif()
+
+if (${FETCHED_TBB})
+   add_dependencies(salmon_core libtbb)
+   add_dependencies(salmon libtbb)
+endif()
+
+add_dependencies(salmon_core libcereal)
+add_dependencies(salmon libcereal)
+add_dependencies(salmon_core libstadenio)
+add_dependencies(salmon libstadenio)
+add_dependencies(salmon_core libspdlog)
+add_dependencies(salmon libspdlog)
+add_dependencies(salmon_core libbwa)
 add_dependencies(salmon libbwa)
+add_dependencies(salmon_core libgff)
+add_dependencies(salmon libgff)
+add_dependencies(salmon_core libdivsufsort)
+add_dependencies(salmon libdivsufsort)
 
 # Link the executable
 target_link_libraries(unitTests
@@ -167,8 +202,11 @@ target_link_libraries(unitTests
     ${LIBSALMON_LINKER_FLAGS}
     ${NON_APPLECLANG_LIBS}
     ${FAST_MALLOC_LIB}
+    ${LIBRT}
     )
 
+add_dependencies(salmon unitTests)
+
 ### No need for this, I think
 ##  This ensures that the salmon executable should work with or without `make install`
 ###
diff --git a/src/CollapsedEMOptimizer.cpp b/src/CollapsedEMOptimizer.cpp
index 7ce4ed7..731cfe8 100644
--- a/src/CollapsedEMOptimizer.cpp
+++ b/src/CollapsedEMOptimizer.cpp
@@ -86,6 +86,30 @@ double truncateCountVector(VecT& alphas, std::vector<double>& cutoff) {
 }
 
 /**
+ *  Populate the prior parameters for the VBEM
+ *  Note: effLens *must* be valid before calling this function.
+ */
+std::vector<double> populatePriorAlphas_(
+                                         std::vector<Transcript>& transcripts, // transcripts
+                                         Eigen::VectorXd& effLens, // current effective length estimate
+                                         double priorValue,        // the per-nucleotide prior value to use
+                                         bool perTranscriptPrior   // true if prior is per-txp, else per-nucleotide
+                                         ) {
+    // start out with the per-txp prior
+    std::vector<double> priorAlphas(transcripts.size(), priorValue);
+
+    // If the prior is per-nucleotide (the default), then we need a potentially
+    // different value for each transcript based on its length.
+    if (!perTranscriptPrior) {
+        for (size_t i = 0; i < transcripts.size(); ++i) {
+            priorAlphas[i] = priorValue * effLens(i); 
+        }
+    }
+    return priorAlphas;
+}
+
+
+/**
  * Single-threaded EM-update routine for use in bootstrapping
  */
 template <typename VecT>
@@ -148,24 +172,25 @@ void VBEMUpdate_(std::vector<std::vector<uint32_t>>& txpGroupLabels,
                  VecT& expTheta) {
 
   assert(alphaIn.size() == alphaOut.size());
-
+  size_t M = alphaIn.size();
   size_t numEQClasses = txpGroupLabels.size();
   double alphaSum = {0.0};
-  for (auto& e : alphaIn) {
-    alphaSum += e;
+  for (size_t i = 0; i < M; ++i) {
+    alphaSum +=  alphaIn[i] + priorAlphas[i];
   }
 
   double logNorm = boost::math::digamma(alphaSum);
 
   //double prior = priorAlpha;
 
-  for (size_t i = 0; i < transcripts.size(); ++i) {
-    if (alphaIn[i] > ::digammaMin) {
-      expTheta[i] = std::exp(boost::math::digamma(alphaIn[i]) - logNorm);
+  for (size_t i = 0; i < M; ++i) {
+      auto ap = alphaIn[i] + priorAlphas[i];
+    if (ap > ::digammaMin) {
+      expTheta[i] = std::exp(boost::math::digamma(ap) - logNorm);
     } else {
       expTheta[i] = 0.0;
     }
-    alphaOut[i] = priorAlphas[i];
+    alphaOut[i] = 0.0;//priorAlphas[i];
   }
 
   for (size_t eqID = 0; eqID < numEQClasses; ++eqID) {
@@ -278,10 +303,10 @@ void VBEMUpdate_(std::vector<std::pair<const TranscriptGroup, TGValue>>& eqVec,
                  CollapsedEMOptimizer::VecType& expTheta) {
 
   assert(alphaIn.size() == alphaOut.size());
-
+  size_t M = alphaIn.size();
   double alphaSum = {0.0};
-  for (auto& e : alphaIn) {
-    alphaSum += e;
+  for (size_t i = 0; i < M; ++i) {
+      alphaSum +=  alphaIn[i] + priorAlphas[i];
   }
 
   double logNorm = boost::math::digamma(alphaSum);
@@ -293,15 +318,14 @@ void VBEMUpdate_(std::vector<std::pair<const TranscriptGroup, TGValue>>& eqVec,
                       //double prior = priorAlpha;
 
                       for (auto i : boost::irange(range.begin(), range.end())) {
-                        if (alphaIn[i] > ::digammaMin) {
-                          expTheta[i] =
-                              std::exp(boost::math::digamma(alphaIn[i].load()) -
-                                       logNorm);
-                        } else {
-                          expTheta[i] = 0.0;
-                        }
-                        //alphaOut[i] = prior * transcripts[i].RefLength;
-                        alphaOut[i] = priorAlphas[i];
+                          auto ap = alphaIn[i].load() + priorAlphas[i];
+                          if (ap > ::digammaMin) {
+                              expTheta[i] = std::exp(boost::math::digamma(ap) - logNorm);
+                          } else {
+                              expTheta[i] = 0.0;
+                          }
+                          //alphaOut[i] = prior * transcripts[i].RefLength;
+                          alphaOut[i] = 0.0;
                       }
                     });
 
@@ -358,7 +382,7 @@ void VBEMUpdate_(std::vector<std::pair<const TranscriptGroup, TGValue>>& eqVec,
 template <typename VecT>
 size_t markDegenerateClasses(
     std::vector<std::pair<const TranscriptGroup, TGValue>>& eqVec,
-    VecT& alphaIn, Eigen::VectorXd& effLens,
+    VecT& alphaIn, Eigen::VectorXd& effLens, std::vector<bool>& available, 
     std::shared_ptr<spdlog::logger> jointLog, bool verbose = false) {
 
   size_t numDropped{0};
@@ -412,6 +436,11 @@ size_t markDegenerateClasses(
       }
       ++numDropped;
       kv.first.setValid(false);
+    } else {
+      for (size_t i = 0; i < txps.size(); ++i) {
+        auto tid = txps[i];
+        available[tid] = true;
+      }
     }
   }
   return numDropped;
@@ -423,7 +452,7 @@ bool doBootstrap(
     std::vector<std::vector<uint32_t>>& txpGroups,
     std::vector<std::vector<double>>& txpGroupCombinedWeights,
     std::vector<Transcript>& transcripts, Eigen::VectorXd& effLens,
-    std::vector<double>& sampleWeights, uint64_t totalNumFrags,
+    const std::vector<double>& sampleWeights, uint64_t totalNumFrags,
     uint64_t numMappedFrags, double uniformTxpWeight,
     std::atomic<uint32_t>& bsNum, SalmonOpts& sopt,
     std::vector<double>& priorAlphas,
@@ -447,11 +476,20 @@ bool doBootstrap(
   auto& jointLog = sopt.jointLog;
 
   std::random_device rd;
-  MultinomialSampler msamp(rd);
-
+  std::mt19937 gen(rd());
+  //MultinomialSampler msamp(rd);
+  std::discrete_distribution<uint64_t> csamp(sampleWeights.begin(), sampleWeights.end());
   while (bsNum++ < numBootstraps) {
+    csamp.reset();
+
+    for (size_t sc = 0; sc < sampCounts.size(); ++sc) {
+      sampCounts[sc] = 0;
+    }
+    for (size_t fn = 0; fn < totalNumFrags; ++fn) {
+      ++sampCounts[csamp(gen)];
+    }
     // Do a new bootstrap
-    msamp(sampCounts.begin(), totalNumFrags, numClasses, sampleWeights.begin());
+    //msamp(sampCounts.begin(), totalNumFrags, numClasses, sampleWeights.begin());
 
     double totalLen{0.0};
     for (size_t i = 0; i < transcripts.size(); ++i) {
@@ -503,7 +541,7 @@ bool doBootstrap(
     if (useVBEM and !perTranscriptPrior) {
         std::vector<double> cutoffs(transcripts.size(), 0.0);
         for (size_t i = 0; i < transcripts.size(); ++i) {
-            cutoffs[i] = priorAlphas[i] + minAlpha;
+            cutoffs[i] = minAlpha;
         }
         //alphaSum = truncateCountVector(alphas, cutoffs);
         alphaSum = truncateCountVector(alphas, cutoffs);
@@ -552,6 +590,7 @@ bool CollapsedEMOptimizer::gatherBootstraps(
     double relDiffTolerance, uint32_t maxIter) {
 
   std::vector<Transcript>& transcripts = readExp.transcripts();
+  std::vector<bool> available(transcripts.size(), false);
   using VecT = CollapsedEMOptimizer::SerialVecType;
   // With atomics
   VecT alphas(transcripts.size(), 0.0);
@@ -561,7 +600,6 @@ bool CollapsedEMOptimizer::gatherBootstraps(
 
   bool scaleCounts = (!sopt.useQuasi and !sopt.allowOrphans);
 
-  auto& fragStartDists = readExp.fragmentStartPositionDistributions();
   uint64_t numMappedFrags =
       scaleCounts ? readExp.upperBoundHits() : readExp.numMappedFragments();
 
@@ -582,24 +620,13 @@ bool CollapsedEMOptimizer::gatherBootstraps(
   bool useVBEM{sopt.useVBOpt};
   bool perTranscriptPrior{sopt.perTranscriptPrior};
   double priorValue{sopt.vbPrior};
-  
-  // If we use VBEM, we'll need the prior parameters
-  std::vector<double> priorAlphas(transcripts.size(), priorValue);
-  // If the prior is per-nucleotide (default, then we need a potentially different
-  // value for each transcript based on its length).
-  if (!perTranscriptPrior) {
-    for (size_t i = 0; i < transcripts.size(); ++i) {
-      priorAlphas[i] = priorValue * transcripts[i].RefLength;
-    }
-  }
-  //double priorAlpha = 1e-3;//1.00;
 
   auto jointLog = sopt.jointLog;
 
   jointLog->info("Will draw {} bootstrap samples", numBootstraps);
   jointLog->info("Optimizing over {} equivalence classes", eqVec.size());
 
-  double totalNumFrags{static_cast<double>(readExp.numMappedFragments())};
+  double totalNumFrags{static_cast<double>(numMappedFrags)};
   double totalLen{0.0};
 
   if (activeTranscriptIDs.size() == 0) {
@@ -614,12 +641,16 @@ bool CollapsedEMOptimizer::gatherBootstraps(
     alphas[i] = transcripts[i].getActive() ? scale * totalNumFrags : 0.0;
     effLens(i) = (sopt.noEffectiveLengthCorrection)
                      ? transcripts[i].RefLength
-                     : std::exp(transcripts[i].getCachedLogEffectiveLength());
+                     : transcripts[i].EffectiveLength;
     totalLen += effLens(i);
   }
 
+  
+  // If we use VBEM, we'll need the prior parameters
+  std::vector<double> priorAlphas = populatePriorAlphas_(transcripts, effLens, priorValue, perTranscriptPrior);
+
   auto numRemoved =
-      markDegenerateClasses(eqVec, alphas, effLens, sopt.jointLog);
+    markDegenerateClasses(eqVec, alphas, effLens, available, sopt.jointLog);
   sopt.jointLog->info("Marked {} weighted equivalence classes as degenerate",
                       numRemoved);
 
@@ -681,11 +712,10 @@ bool CollapsedEMOptimizer::gatherBootstraps(
 
 void updateEqClassWeights(
     std::vector<std::pair<const TranscriptGroup, TGValue>>& eqVec,
-    Eigen::VectorXd& posWeightInvDenoms, Eigen::VectorXd& effLens) {
+    Eigen::VectorXd& effLens) {
   tbb::parallel_for(
       BlockedIndexRange(size_t(0), size_t(eqVec.size())),
-      [&eqVec, &effLens,
-       &posWeightInvDenoms](const BlockedIndexRange& range) -> void {
+      [&eqVec, &effLens](const BlockedIndexRange& range) -> void {
         // For each index in the equivalence class vector
         for (auto eqID : boost::irange(range.begin(), range.end())) {
           // The vector entry
@@ -704,8 +734,7 @@ void updateEqClassWeights(
             auto tid = k.txps[i];
             v.posWeights[i] = 1.0 / effLens(tid);
             v.combinedWeights[i] =
-                kv.second.count *
-                (v.weights[i] * v.posWeights[i] * posWeightInvDenoms[tid]);
+                kv.second.count * (v.weights[i] * v.posWeights[i]);
             wsum += v.combinedWeights[i];
           }
           double wnorm = 1.0 / wsum;
@@ -722,11 +751,15 @@ bool CollapsedEMOptimizer::optimize(ExpT& readExp, SalmonOpts& sopt,
 
   tbb::task_scheduler_init tbbScheduler(sopt.numThreads);
   std::vector<Transcript>& transcripts = readExp.transcripts();
+  std::vector<bool> available(transcripts.size(), false);
 
   uint32_t minIter = 50;
   bool seqBiasCorrect = sopt.biasCorrect;
   bool gcBiasCorrect = sopt.gcBiasCorrect;
-  bool doBiasCorrect = seqBiasCorrect or gcBiasCorrect;
+  bool posBiasCorrect = sopt.posBiasCorrect;
+  bool doBiasCorrect = seqBiasCorrect or gcBiasCorrect or posBiasCorrect;
+  bool metaGenomeMode = sopt.meta;
+  bool altInitMode = sopt.alternativeInitMode;
 
   using VecT = CollapsedEMOptimizer::VecType;
   // With atomics
@@ -735,7 +768,6 @@ bool CollapsedEMOptimizer::optimize(ExpT& readExp, SalmonOpts& sopt,
   VecType expTheta(transcripts.size());
 
   Eigen::VectorXd effLens(transcripts.size());
-  Eigen::VectorXd posWeightInvDenoms(transcripts.size());
 
   std::vector<std::pair<const TranscriptGroup, TGValue>>& eqVec =
       readExp.equivalenceClassBuilder().eqVec();
@@ -747,20 +779,6 @@ bool CollapsedEMOptimizer::optimize(ExpT& readExp, SalmonOpts& sopt,
   bool perTranscriptPrior{sopt.perTranscriptPrior};
   double priorValue{sopt.vbPrior};
   
-  // If we use VBEM, we'll need the prior parameters
-  std::vector<double> priorAlphas(transcripts.size(), priorValue);
-  // If the prior is per-nucleotide (default, then we need a potentially different
-  // value for each transcript based on its length).
-  if (!perTranscriptPrior) {
-    for (size_t i = 0; i < transcripts.size(); ++i) {
-      priorAlphas[i] = priorValue * transcripts[i].RefLength;
-    }
-  }
-
-  // If we use VBEM, we'll need the prior parameters
-  //double priorAlpha = 1e-3;//0.01;
-  //double priorAlpha = 1.0;
-
   auto jointLog = sopt.jointLog;
 
   auto& fragStartDists = readExp.fragmentStartPositionDistributions();
@@ -782,44 +800,39 @@ bool CollapsedEMOptimizer::optimize(ExpT& readExp, SalmonOpts& sopt,
     effLens(i) = useEffectiveLengths
                      ? std::exp(txp.getCachedLogEffectiveLength())
                      : txp.RefLength;
+    if (sopt.noLengthCorrection) { effLens(i) = 100.0; }
     txp.EffectiveLength = effLens(i);
 
-    if (txp.uniqueCount() > 0) {
-      totalWeight += txp.uniqueCount();
-      alphasPrime[i] = 1.0;
-      ++numActive;
-    } else {
-      totalWeight += 1e-3 * effLens(i);
-      alphasPrime[i] = 1.0;
-      ++numActive;
-    }
-
-    if (noRichEq or !useFSPD) {
-      posWeightInvDenoms(i) = 1.0;
-    } else {
-      auto& fragStartDist = fragStartDists[txp.lengthClassIndex()];
-      double denomFactor = fragStartDist.evalCDF(
-          static_cast<int32_t>(txp.EffectiveLength), txp.RefLength);
-      posWeightInvDenoms(i) = (denomFactor >= salmon::math::LOG_EPSILON)
-                                  ? std::exp(-denomFactor)
-                                  : (1e-5);
-    }
-
+    double uniqueCount = static_cast<double>(txp.uniqueCount() + 0.5);
+    auto wi = uniqueCount * 1e-3 * effLens(i);
+    alphasPrime[i] = wi;
+    totalWeight += wi; 
+    ++numActive;
     totalLen += effLens(i);
   }
 
+  // If we use VBEM, we'll need the prior parameters
+  std::vector<double> priorAlphas = populatePriorAlphas_(transcripts, effLens, priorValue, perTranscriptPrior);
+
   // Based on the number of observed reads, use
   // a linear combination of the online estimates
   // and the uniform distribution.
   double uniformPrior = totalWeight / static_cast<double>(numActive);
   // double fracObserved = 1.0;
   double fracObserved = std::min(1.0, totalWeight / sopt.numRequiredFragments);
-  if (sopt.initUniform) { fracObserved = 0.0; }
-  for (size_t i = 0; i < alphas.size(); ++i) {
-    alphas[i] = (alphasPrime[i] == 1.0)
-                    ? ((alphas[i] * fracObserved) +
-                       (uniformPrior * (1.0 - fracObserved)))
-                    : 0.0;
+  // Above, we placed the uninformative (uniform) initialization into the alphasPrime
+  // variables.  If that's what the user requested, then copy those over to the alphas.
+  if (sopt.initUniform) { 
+    for (size_t i = 0; i < alphas.size(); ++i) {
+        alphas[i] = alphasPrime[i];
+        alphasPrime[i] = 1.0;
+    } 
+  } else { // otherwise, initialize with a linear combination of the true and uniform alphas
+      for (size_t i = 0; i < alphas.size(); ++i) {
+        auto uniAbund = (metaGenomeMode or altInitMode) ? alphasPrime[i].load() : uniformPrior;
+        alphas[i] = (alphas[i] * fracObserved) + (uniAbund * (1.0 - fracObserved));
+        alphasPrime[i] = 1.0;
+      }
   }
 
   // If the user requested *not* to use "rich" equivalence classes,
@@ -829,8 +842,7 @@ bool CollapsedEMOptimizer::optimize(ExpT& readExp, SalmonOpts& sopt,
   // by the effective length term.
   tbb::parallel_for(
       BlockedIndexRange(size_t(0), size_t(eqVec.size())),
-      [&eqVec, &effLens, &posWeightInvDenoms, useFSPD,
-       noRichEq](const BlockedIndexRange& range) -> void {
+      [&eqVec, &effLens, noRichEq](const BlockedIndexRange& range) -> void {
         // For each index in the equivalence class vector
         for (auto eqID : boost::irange(range.begin(), range.end())) {
           // The vector entry
@@ -862,18 +874,13 @@ bool CollapsedEMOptimizer::optimize(ExpT& readExp, SalmonOpts& sopt,
             if (noRichEq) {
               // Keep length factor separate for the time being
               v.weights[i] = 1.0;
-              // Pos weight
-              v.posWeights[i] = 1.0 / el;
-            } else if (createdPosWeights or !useFSPD) {
-              // If the positional weights are new, then give them
-              // meaningful values.
-              v.posWeights[i] = 1.0 / el;
             }
+            // Give the positional weights their (effective-length based) values.
+            v.posWeights[i] = 1.0 / el;
 
             // combined weight
             v.combinedWeights.push_back(
-                v.weights[i].load() *
-                (v.posWeights[i].load() * posWeightInvDenoms[tid]));
+                v.count * v.weights[i].load() * v.posWeights[i].load());
             wsum += v.combinedWeights.back();
           }
 
@@ -885,7 +892,7 @@ bool CollapsedEMOptimizer::optimize(ExpT& readExp, SalmonOpts& sopt,
       });
 
   auto numRemoved =
-      markDegenerateClasses(eqVec, alphas, effLens, sopt.jointLog);
+    markDegenerateClasses(eqVec, alphas, effLens, available, sopt.jointLog);
   sopt.jointLog->info("Marked {} weighted equivalence classes as degenerate",
                       numRemoved);
 
@@ -911,27 +918,19 @@ bool CollapsedEMOptimizer::optimize(ExpT& readExp, SalmonOpts& sopt,
 
       jointLog->info("iteration {}, adjusting effective lengths to account for biases", itNum);
       effLens = salmon::utils::updateEffectiveLengths(sopt, readExp, effLens,
-                                                      alphas, true);
-      //(itNum == recomputeIt.front()));
+                                                      alphas, available, true);
+      // if we're doing the VB optimization, update the priors
+      if (useVBEM) {
+          priorAlphas = populatePriorAlphas_(transcripts, effLens, priorValue, perTranscriptPrior);
+      }
 
       // Check for strangeness with the lengths.
       for (size_t i = 0; i < effLens.size(); ++i) {
         if (effLens(i) <= 0.0) {
           jointLog->warn("Transcript {} had length {}", i, effLens(i));
         }
-        if (noRichEq or !useFSPD) {
-          posWeightInvDenoms(i) = 1.0;
-        } else {
-          auto& txp = transcripts[i];
-          auto& fragStartDist = fragStartDists[txp.lengthClassIndex()];
-          double denomFactor = fragStartDist.evalCDF(
-              static_cast<int32_t>(effLens(i)), txp.RefLength);
-          posWeightInvDenoms(i) = (denomFactor >= salmon::math::LOG_EPSILON)
-                                      ? std::exp(-denomFactor)
-                                      : 1e-5;
-        }
       }
-      updateEqClassWeights(eqVec, posWeightInvDenoms, effLens);
+      updateEqClassWeights(eqVec, effLens);
       needBias = false;
     }
 
@@ -973,7 +972,7 @@ bool CollapsedEMOptimizer::optimize(ExpT& readExp, SalmonOpts& sopt,
   if (useVBEM and !perTranscriptPrior) {
       std::vector<double> cutoffs(transcripts.size(), 0.0);
       for (size_t i = 0; i < transcripts.size(); ++i) {
-	cutoffs[i] = priorAlphas[i] + minAlpha;
+          cutoffs[i] = minAlpha;
       }
       //alphaSum = truncateCountVector(alphas, cutoffs);
       alphaSum = truncateCountVector(alphas, cutoffs);
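
Two changes in this file are worth unpacking. First, the VBEM updates now
fold the prior into the digamma terms (alphaIn[i] + priorAlphas[i]) and start
alphaOut at zero rather than seeding it with the prior, and the priors are
rebuilt from the current effective lengths via populatePriorAlphas_ whenever
bias correction changes those lengths. Second, doBootstrap swaps the
MultinomialSampler for a std::discrete_distribution over the per-class sample
weights, drawing one equivalence class per fragment. A minimal,
self-contained sketch of that resampling pattern (the names mirror the diff,
but the surrounding setup here is invented):

    #include <cstdint>
    #include <random>
    #include <vector>

    std::vector<std::uint64_t>
    resampleClassCounts(const std::vector<double>& sampleWeights,
                        std::uint64_t totalNumFrags, std::mt19937& gen) {
      // One categorical draw per fragment, tallied into per-class counts.
      std::discrete_distribution<std::uint64_t> csamp(sampleWeights.begin(),
                                                      sampleWeights.end());
      std::vector<std::uint64_t> sampCounts(sampleWeights.size(), 0);
      for (std::uint64_t fn = 0; fn < totalNumFrags; ++fn) {
        ++sampCounts[csamp(gen)];
      }
      return sampCounts;
    }

Each bootstrap replicate then re-runs the (VB)EM on the resampled counts, as
the while loop in the hunk above does.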
diff --git a/src/CollapsedGibbsSampler.cpp b/src/CollapsedGibbsSampler.cpp
index 96b10c8..8780f0b 100644
--- a/src/CollapsedGibbsSampler.cpp
+++ b/src/CollapsedGibbsSampler.cpp
@@ -1,111 +1,607 @@
-#include <vector>
-#include <unordered_map>
 #include <atomic>
 #include <random>
+#include <unordered_map>
+#include <vector>
+#include <mutex>
+#include <thread>
 
-#include "tbb/task_scheduler_init.h"
+#include "tbb/blocked_range.h"
+#include "tbb/combinable.h"
+#include "tbb/enumerable_thread_specific.h"
 #include "tbb/parallel_for.h"
 #include "tbb/parallel_for_each.h"
 #include "tbb/parallel_reduce.h"
-#include "tbb/blocked_range.h"
 #include "tbb/partitioner.h"
+#include "tbb/task_scheduler_init.h"
 
 //#include "fastapprox.h"
-#include <boost/math/special_functions/digamma.hpp>
 #include <boost/filesystem.hpp>
+#include <boost/math/special_functions/digamma.hpp>
+#include <boost/math/distributions/gamma.hpp>
+// PCG Random number generator
+#include "pcg_random.hpp"
 
 // C++ string formatting library
 #include "spdlog/fmt/fmt.h"
 
-#include "cuckoohash_map.hh"
 #include "Eigen/Dense"
+#include "cuckoohash_map.hh"
 
+#include "AlignmentLibrary.hpp"
+#include "BootstrapWriter.hpp"
 #include "CollapsedGibbsSampler.hpp"
+#include "MultinomialSampler.hpp"
+#include "ReadExperiment.hpp"
+#include "ReadPair.hpp"
+#include "SalmonMath.hpp"
 #include "Transcript.hpp"
 #include "TranscriptGroup.hpp"
-#include "SalmonMath.hpp"
-#include "AlignmentLibrary.hpp"
-#include "ReadPair.hpp"
 #include "UnpairedRead.hpp"
-#include "ReadExperiment.hpp"
-#include "MultinomialSampler.hpp"
-#include "BootstrapWriter.hpp"
+#include "ezETAProgressBar.hpp"
 
-using BlockedIndexRange =  tbb::blocked_range<size_t>;
+using BlockedIndexRange = tbb::blocked_range<size_t>;
 
 // intelligently chosen value adopted from
 // https://github.com/pachterlab/kallisto/blob/master/src/EMAlgorithm.h#L18
 constexpr double minEQClassWeight = std::numeric_limits<double>::denorm_min();
 constexpr double minWeight = std::numeric_limits<double>::denorm_min();
 
-void initCountMap_(
-        std::vector<std::pair<const TranscriptGroup, TGValue>>& eqVec,
-        std::vector<Transcript>& transcriptsIn,
-        double priorAlpha,
-        MultinomialSampler& msamp,
-        std::vector<uint64_t>& countMap,
-        std::vector<double>& probMap,
-        Eigen::VectorXd& effLens,
-        std::vector<int>& txpCounts) {
-
-    size_t offset{0};
-    for (auto& eqClass : eqVec) {
-        uint64_t classCount = eqClass.second.count;
+/** http://codereview.stackexchange.com/questions/106773/dividing-a-range-into-n-sub-ranges */
+template <typename Iterator>
+std::vector<std::pair<Iterator, Iterator>>
+  divide_work( Iterator begin, Iterator end, std::size_t n )
+{
+  std::vector<std::pair<Iterator, Iterator>> ranges;
+  if (n == 0) return ranges;
+  ranges.reserve(n);
+
+  auto dist = std::distance(begin, end);
+  n = std::min<size_t>(n, dist);
+  auto chunk = dist / n;
+  auto remainder = dist % n;
+
+  for (size_t i = 0; i < n-1; ++i) {
+    auto next_end = std::next(begin, chunk + (remainder ? 1 : 0));
+    ranges.emplace_back(begin, next_end);
+
+    begin = next_end;
+    if (remainder) remainder -= 1;
+  }
+
+  // last chunk
+  ranges.emplace_back(begin, end);
+  return ranges;
+}
 
-        // for each transcript in this class
-        const TranscriptGroup& tgroup = eqClass.first;
-        const size_t groupSize = tgroup.txps.size();
-        if (tgroup.valid) {
+/**
+ * This non-collapsed Gibbs step is largely inspired by the method first
+ * introduced by Turro et al. [1].  Given the current estimates `txpCount` of the read
+ * count for each transcript, the mean transcript fractions are sampled from a Gamma
+ * distribution ~ Gam(prior[i] + txpCount[i], beta + effLens[i]) (shape / rate form).
+ * Then, given these transcript fractions, the reads are re-assigned within each
+ * equivalence class by sampling from a multinomial distributed according to these means.
+ *
+ * [1] Haplotype and isoform specific expression estimation using multi-mapping RNA-seq reads.
+ * Turro E, Su S-Y, Goncalves A, Coin L, Richardson S and Lewin A.
+ * Genome Biology, 2011 Feb; 12:R13.  doi: 10.1186/gb-2011-12-2-r13.
+ **/
+void sampleRoundNonCollapsedMultithreaded_(
+    std::vector<std::pair<const TranscriptGroup, TGValue>>& eqVec,
+    std::vector<bool>& active,
+    std::vector<uint32_t>& activeList,
+    std::vector<uint64_t>& countMap, std::vector<double>& probMap,
+    std::vector<double>& muGlobal, Eigen::VectorXd& effLens,
+    const std::vector<double>& priorAlphas, std::vector<double>& txpCount,
+    std::vector<uint32_t>& offsetMap) {
+
+  // generate coeff for \mu from \alpha and \effLens
+  double beta = 0.1;
+  double norm = 0.0; 
+  
+  // Sample the transcript fractions \mu from a gamma distribution, and
+  // reset txpCounts to zero for each transcript.
+  typedef tbb::enumerable_thread_specific<pcg32_unique> GeneratorType;
+  auto getGenerator = []() -> pcg32_unique {
+    return pcg32_unique(pcg_extras::seed_seq_from<std::random_device>());
+  };
+  GeneratorType localGenerator(getGenerator);
+
+  // Compute the mu to be used in the equiv class resampling
+  tbb::parallel_for(
+                    BlockedIndexRange(size_t(0), size_t(activeList.size())), // a grainsize (e.g. 1024) would apply only with a simple_partitioner
+      [&, beta](const BlockedIndexRange& range) -> void {
+        GeneratorType::reference gen = localGenerator.local();
+        for (auto activeIdx : boost::irange(range.begin(), range.end())) {
+          auto i = activeList[activeIdx];
+          double ci = static_cast<double>(txpCount[i] + priorAlphas[i]);
+          std::gamma_distribution<double> d(ci, 1.0 / (beta + effLens(i)));
+          muGlobal[i] = d(gen);
+          txpCount[i] = 0.0;
+          /** DEBUG
+          if (std::isnan(muGlobal[i]) or std::isinf(muGlobal[i])) {
+            std::cerr << "txpCount = " << txpCount[i] << ", prior = " << priorAlphas[i] << ", alpha = " << ci << ", beta = " << (1.0 / (beta + effLens(i))) << ", mu = " << muGlobal[i] << "\n";
+            std::exit(1);
+          } 
+          **/
+        }
+      });
+
+  /**
+   * These will store "thread local" parameters
+   * for the threads doing local sampling of equivalence class counts.
+   */
+  class CombineableTxpCounts {
+  public:
+    CombineableTxpCounts(uint32_t numTxp) : txpCount(numTxp, 0) {
+      gen.reset(new pcg32_unique(pcg_extras::seed_seq_from<std::random_device>()));
+    }
+    std::vector<int> txpCount;
+    std::unique_ptr<pcg32_unique> gen{nullptr};
+  };
+  tbb::combinable<CombineableTxpCounts> combineableCounts(txpCount.size());
+
+  std::mutex writeMut;
+  // resample within each equivalence class
+  tbb::parallel_for(
+                    BlockedIndexRange(size_t(0), size_t(eqVec.size())),
+      [&](const BlockedIndexRange& range) -> void {
+
+        auto& txpCountLoc = combineableCounts.local().txpCount;
+        auto& gen = *(combineableCounts.local().gen.get());
+        for (auto eqid : boost::irange(range.begin(), range.end())) {
+          auto& eqClass = eqVec[eqid];
+          size_t offset = offsetMap[eqid];
+
+          // get total number of reads for an equivalence class
+          uint64_t classCount = eqClass.second.count;
+
+          // for each transcript in this class
+          const TranscriptGroup& tgroup = eqClass.first;
+          const size_t groupSize = tgroup.txps.size();
+          if (tgroup.valid) {
             const std::vector<uint32_t>& txps = tgroup.txps;
             const auto& auxs = eqClass.second.combinedWeights;
+            const auto& weights = eqClass.second.weights;
 
             double denom = 0.0;
+            // If this is a single-transcript group,
+            // then it gets the full count --- otherwise,
+            // sample!
             if (BOOST_LIKELY(groupSize > 1)) {
+              // For each transcript in the group
+              double muSum = 0.0;
+              for (size_t i = 0; i < groupSize; ++i) {
+                auto tid = txps[i];
+                size_t gi = offset + i;
+                probMap[gi] = (1000.0 * muGlobal[tid]) * weights[i];
+                muSum += probMap[gi];
+                denom += probMap[gi];
+              }
+
+              if (denom <= ::minEQClassWeight) {
+                {
+                  std::lock_guard<std::mutex> lg(writeMut);
+                  std::cerr << "[WARNING] eq class denom was too small : denom = " << denom << ", numReads = "
+                            << classCount << ". Distributing reads evenly for this class\n";
+                }
 
+                denom = 0.0;
+                muSum = 0.0;
                 for (size_t i = 0; i < groupSize; ++i) {
-                    auto tid = txps[i];
-                    auto aux = auxs[i];
-                    denom += (priorAlpha + transcriptsIn[tid].mass(false)) * aux;
-                    countMap[offset + i] = 0;
+                  auto tid = txps[i];
+                  size_t gi = offset + i;
+                  probMap[gi] = 1.0 / effLens(tid);
+                  muSum += probMap[gi];
+                  denom += probMap[gi];
                 }
 
-		if (denom > ::minEQClassWeight) {
-	   	   // Get the multinomial probabilities
-		   double norm = 1.0 / denom;
-		   for (size_t i = 0; i < groupSize; ++i) {
-		     auto tid = txps[i];
-		     auto aux = auxs[i];
-		     probMap[offset + i] = norm *
-                        ((priorAlpha + transcriptsIn[tid].mass(false)) * aux);
-		    }
-
-	   	    // re-sample
-	            msamp(countMap.begin() + offset,
-                      classCount,
-                      groupSize,
-                      probMap.begin() + offset);
-		}
-            } else {
-                countMap[offset] = classCount;
+                // If it's still too small --- divide evenly
+                if (denom <= ::minEQClassWeight) {
+                  for (size_t i = 0; i < groupSize; ++i) {
+                    auto tid = txps[i];
+                    size_t gi = offset + i;
+                    probMap[gi] = 1.0;
+                  }
+                  denom = groupSize;
+                  muSum = groupSize;
+                }
+              }
+
+              if (denom > ::minEQClassWeight) {
+                // Local multinomial
+                std::discrete_distribution<int> dist(probMap.begin() + offset,
+                                                     probMap.begin() + offset +
+                                                         groupSize);
+                for (size_t s = 0; s < classCount; ++s) {
+                  auto ind = dist(gen);
+                  ++txpCountLoc[txps[ind]];
+                }
+              }
+            } // a single-transcript group needs no sampling
+            else {
+              auto tid = txps[0];
+              txpCountLoc[tid] += static_cast<int>(classCount);
             }
+          } // valid group
+        }   // loop over all eq classes
+      });
 
+  auto combineCounts = [&txpCount](const CombineableTxpCounts& p) -> void {
+    for (size_t i = 0; i < txpCount.size(); ++i) {
+      txpCount[i] += static_cast<double>(p.txpCount[i]);
+    }
+  };
+  combineableCounts.combine_each(combineCounts);
+}
 
-            for (size_t i = 0; i < groupSize; ++i) {
-                auto tid = txps[i];
-                txpCounts[tid] += countMap[offset + i];
-            }
+CollapsedGibbsSampler::CollapsedGibbsSampler() {}
 
-            offset += groupSize;
-       } // valid group
-    } // loop over all eq classes
+class DistStats {
+public:
+  DistStats()
+      : meanVal(0.0), minVal(std::numeric_limits<double>::max()), maxVal(0.0) {}
+  double meanVal;
+  double minVal;
+  double maxVal;
+};
+
+/**
+ *  Populate the prior parameters for the Gibbs sampler (using the same
+ *  scheme as the VBEM).
+ *  Note: effLens *must* be valid before calling this function.
+ */
+// TODO: factor out the redundancy between this function and its VBEM counterpart
+std::vector<double> populatePriorAlphasGibbs_(
+    std::vector<Transcript>& transcripts, // transcripts
+    Eigen::VectorXd& effLens,             // current effective length estimate
+    double priorValue,      // the per-nucleotide prior value to use
+    bool perTranscriptPrior // true if prior is per-txp, else per-nucleotide
+    ) {
+  // start out with the per-txp prior
+  std::vector<double> priorAlphas(transcripts.size(), priorValue);
+
+  // If the prior is per-nucleotide (the default), then we need a potentially
+  // different value for each transcript, scaled by its effective length
+  // (e.g., priorValue = 1e-4 on a transcript of effective length 1,000 gives
+  // a prior of 0.1).
+  if (!perTranscriptPrior) {
+    for (size_t i = 0; i < transcripts.size(); ++i) {
+      double ml = std::max(1.0, effLens(i));
+      priorAlphas[i] = priorValue * ml;
+    }
+  }
+  return priorAlphas;
+}
+
+template <typename ExpT>
+bool CollapsedGibbsSampler::sample(
+    ExpT& readExp, SalmonOpts& sopt,
+    std::function<bool(const std::vector<double>&)>& writeBootstrap,
+    uint32_t numSamples) {
+
+  namespace bfs = boost::filesystem;
+  auto& jointLog = sopt.jointLog;
+  tbb::task_scheduler_init tbbScheduler(sopt.numThreads);
+  std::vector<Transcript>& transcripts = readExp.transcripts();
+
+  // Fill in the effective length vector
+  Eigen::VectorXd effLens(transcripts.size());
+
+  std::vector<std::pair<const TranscriptGroup, TGValue>>& eqVec =
+      readExp.equivalenceClassBuilder().eqVec();
+
+  using VecT = CollapsedGibbsSampler::VecType;
+
+  std::vector<std::vector<int>> allSamples(
+      numSamples, std::vector<int>(transcripts.size(), 0));
+
+  std::vector<double> alphasIn(transcripts.size(), 0.0);
+  std::vector<double> alphasInit(transcripts.size(), 0.0);
+
+  bool useScaledCounts = (!sopt.useQuasi and !sopt.allowOrphans);
+  auto numMappedFragments = (useScaledCounts) ? readExp.upperBoundHits()
+                                              : readExp.numMappedFragments();
+  uint32_t numInternalRounds = sopt.thinningFactor;
+  size_t numTranscripts{transcripts.size()};
+
+  for (size_t i = 0; i < transcripts.size(); ++i) {
+    auto& txp = transcripts[i];
+    alphasIn[i] = txp.projectedCounts;
+    alphasInit[i] = txp.projectedCounts;
+    effLens(i) = txp.EffectiveLength;
+  }
+
+  bool perTranscriptPrior = (sopt.useVBOpt) ? sopt.perTranscriptPrior : true;
+  double priorValue = (sopt.useVBOpt) ? sopt.vbPrior : 1e-4;
+  std::vector<double> priorAlphas = populatePriorAlphasGibbs_(
+      transcripts, effLens, priorValue, perTranscriptPrior);
+  /** DEBUG 
+  for (size_t i = 0; i < priorAlphas.size(); ++i) {
+    auto& v = priorAlphas[i];
+    if (!std::isfinite(v)) {
+      std::cerr << "prior for transcript " << i << " is " << v << ", eff length = " << effLens(i) << "\n";
+    }
+  }
+  **/
+
+  std::vector<bool> active(numTranscripts, false);
+  size_t countMapSize{0};
+  std::vector<uint32_t> offsetMap(eqVec.size(), 0);
+  for (size_t i = 0; i < eqVec.size(); ++i) {
+    if (eqVec[i].first.valid) {
+      countMapSize += eqVec[i].first.txps.size();
+      for (auto t : eqVec[i].first.txps) { active[t] = true; }
+      if (i < eqVec.size() - 1) {
+        offsetMap[i + 1] = countMapSize;
+      }
+    }
+  }
+
+  std::vector<uint32_t> activeList; activeList.reserve(numTranscripts);
+  for (size_t i = 0; i < numTranscripts; ++i) {
+    if (active[i]) {
+      activeList.push_back(i);
+    } else {
+      alphasIn[i] = 0.0;
+      alphasInit[i] = 0.0;
+    }
+  }
+
+  // will hold estimated counts
+  std::vector<double> alphas(numTranscripts, 0.0);
+  std::vector<double> mu(numTranscripts, 0.0);
+  std::vector<uint64_t> countMap(countMapSize, 0);
+  std::vector<double> probMap(countMapSize, 0.0);
+
+  /*
+  std::random_device rd;
+  MultinomialSampler ms(rd);
+  initCountMap_(eqVec, transcripts, alphasIn, priorAlphas, ms, countMap,
+  probMap, effLens, allSamples[0]);
+  */
+
+  uint32_t nchains{1};
+  if (numSamples >= 50) {
+    nchains = 2;
+  }
+  if (numSamples >= 100) {
+    nchains = 4;
+  }
+  if (numSamples >= 200) {
+    nchains = 8;
+  }
+
+  std::vector<uint32_t> newChainIter{0};
+  if (nchains > 1) {
+    auto step = numSamples / nchains;
+    for (size_t i = 1; i < nchains; ++i) {
+      newChainIter.push_back(i * step);
+    }
+  }
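+  // e.g., numSamples = 100 gives nchains = 4 and step = 25, so fresh chains
+  // are started at samples 0, 25, 50, and 75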
+
+  auto nextChainStart = newChainIter.begin();
+
+  // Generate each of the requested posterior samples
+  std::unique_ptr<ez::ezETAProgressBar> pbar{nullptr};
+  if (!sopt.quiet) {
+    pbar.reset(new ez::ezETAProgressBar(numSamples));
+    pbar->start();
+  }
+  bool isFirstSample{true};
+  for (size_t sampleID = 0; sampleID < numSamples; ++sampleID) {
+    if (pbar) {
+      ++(*pbar);
+    }
+    // If we should start a new chain here, then do it!
+    if (nextChainStart < newChainIter.end() and sampleID == *nextChainStart) {
+      alphasIn = alphasInit;
+      ++nextChainStart;
+    }
+    /*
+      if (!isFirstSample) {
+          // the counts start at what they were last round.
+        allSamples[sampleID] = allSamples[sampleID-1];
+      }
+      */
+
+    // Thin the chain by a factor of (numInternalRounds)
+    for (size_t i = 0; i < numInternalRounds; ++i) {
+      sampleRoundNonCollapsedMultithreaded_(eqVec, active, activeList, countMap, probMap, mu,
+                                            effLens, priorAlphas, alphasIn,
+                                            offsetMap);
+    }
+
+    double denom{0.0};
+    for (size_t tn = 0; tn < numTranscripts; ++tn) {
+      denom += mu[tn] * effLens[tn];
+    }
+    double scale = numMappedFragments / denom;
+    double asum{0.0};
+    double minAlpha = 1e-8;
+    for (size_t tn = 0; tn < numTranscripts; ++tn) {
+      alphas[tn] = (mu[tn] * effLens[tn]) * scale;
+      alphas[tn] = (alphas[tn] > minAlpha) ? alphas[tn] : 0.0;
+      asum += alphas[tn];
+    }
+
+    writeBootstrap(alphas);
+    isFirstSample = false;
+  }
+  return true;
+}
+
+/*
+void initCountMap_(
+    std::vector<std::pair<const TranscriptGroup, TGValue>>& eqVec,
+    std::vector<Transcript>& transcriptsIn, const std::vector<double>& alphasIn,
+    const std::vector<double>& priorAlphas, MultinomialSampler& msamp,
+    std::vector<uint64_t>& countMap, std::vector<double>& probMap,
+    Eigen::VectorXd& effLens, std::vector<int>& txpCounts) {
+
+  size_t offset{0};
+  for (auto& eqClass : eqVec) {
+    uint64_t classCount = eqClass.second.count;
+
+    // for each transcript in this class
+    const TranscriptGroup& tgroup = eqClass.first;
+    const size_t groupSize = tgroup.txps.size();
+    if (tgroup.valid) {
+      const std::vector<uint32_t>& txps = tgroup.txps;
+      const auto& auxs = eqClass.second.combinedWeights;
+
+      double denom = 0.0;
+      if (BOOST_LIKELY(groupSize > 1)) {
+
+        for (size_t i = 0; i < groupSize; ++i) {
+          auto tid = txps[i];
+          auto aux = auxs[i];
+          denom += (priorAlphas[tid] + alphasIn[tid]) * aux;
+          countMap[offset + i] = 0;
+        }
+
+        if (denom > ::minEQClassWeight) {
+          // Get the multinomial probabilities
+          double norm = 1.0 / denom;
+          for (size_t i = 0; i < groupSize; ++i) {
+            auto tid = txps[i];
+            auto aux = auxs[i];
+            probMap[offset + i] =
+                norm * ((priorAlphas[tid] + alphasIn[tid]) * aux);
+          }
+
+          // re-sample
+          msamp(countMap.begin() + offset, classCount, groupSize,
+                probMap.begin() + offset);
+        }
+      } else {
+        countMap[offset] = classCount;
+      }
+
+      for (size_t i = 0; i < groupSize; ++i) {
+        auto tid = txps[i];
+        txpCounts[tid] += countMap[offset + i];
+      }
+
+      offset += groupSize;
+    } // valid group
+  }   // loop over all eq classes
 }
 
+//
+// This non-collapsed Gibbs step is largely inspired by the method first
+// introduced by Turro et al. [1].  Given the current estimates `txpCount`
+// of the read count for each transcript, the mean transcript fractions are
+// sampled from a Gamma distribution ~ Gam(prior[i] + txpCount[i],
+// \beta + effLens[i]).  Then, given these transcript fractions, the reads
+// are re-assigned within each equivalence class by sampling from a
+// multinomial distributed according to these means.  (A standalone sketch
+// of this two-step update appears after this function.)
+//
+// [1] Haplotype and isoform specific expression estimation using
+// multi-mapping RNA-seq reads.  Turro E, Su S-Y, Goncalves A, Coin L,
+// Richardson S and Lewin A.  Genome Biology, 2011 Feb; 12:R13.
+// doi: 10.1186/gb-2011-12-2-r13.
+//
+void sampleRoundNonCollapsed_(
+    std::vector<std::pair<const TranscriptGroup, TGValue>>& eqVec,
+    std::vector<uint64_t>& countMap, std::vector<double>& probMap,
+    Eigen::VectorXd& effLens, const std::vector<double>& priorAlphas,
+    std::vector<int>& txpCount, MultinomialSampler& msamp) {
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  // offset for 2d to 1d count map
+  size_t offset{0};
+
+  // retain original txp count
+  std::vector<int> origTxpCount = txpCount;
+
+  // reset txpCounts to zero
+  std::fill(txpCount.begin(), txpCount.end(), 0);
+
+  // generate norm. coeff for \mu from \alpha (countMap)
+  std::vector<double> muGlobal(txpCount.size(), 0.0);
+  double beta = 0.1;
+  for (size_t i = 0; i < origTxpCount.size(); ++i) {
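+    // std::gamma_distribution is (shape, scale)-parameterized; the scale
+    // 1.0 / (beta + effLens(i)) below realizes the rate (beta + effLens(i))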
+    std::gamma_distribution<double> d(origTxpCount[i] + priorAlphas[i],
+                                      1.0 / (beta + effLens(i)));
+    muGlobal[i] = d(gen);
+  }
+
+  for (auto& eqClass : eqVec) {
+    // get total number of reads for an equivalence class
+    uint64_t classCount = eqClass.second.count;
+
+    // for each transcript in this class
+    const TranscriptGroup& tgroup = eqClass.first;
+    const size_t groupSize = tgroup.txps.size();
+    if (tgroup.valid) {
+      const std::vector<uint32_t>& txps = tgroup.txps;
+      const auto& auxs = eqClass.second.combinedWeights;
+
+      double denom = 0.0;
+      // If this is a single-transcript group,
+      // then it gets the full count --- otherwise,
+      // sample!
+      if (BOOST_LIKELY(groupSize > 1)) {
+
+        std::vector<uint64_t> txpResamp(groupSize);
+        std::vector<double> mu(groupSize);
+
+        // For each transcript in the group
+        double muSum = 0.0;
+        for (size_t i = 0; i < groupSize; ++i) {
+          auto tid = txps[i];
+          auto aux = auxs[i];
+          // mu[i] = (origTxpCount[tid]+priorAlpha) * aux;
+          mu[i] = muGlobal[tid];
+          muSum += mu[i];
+          denom += (priorAlphas[tid] + origTxpCount[tid]) * aux;
+        }
+
+        // calculate prob vector
+        for (size_t i = 0; i < groupSize; ++i) {
+          probMap[offset + i] = mu[i] / muSum;
+          txpResamp[i] = 0;
+        }
+
+        if (denom > ::minEQClassWeight) {
+          // re-sample
+          msamp(txpResamp.begin(),       // count array to fill in
+                classCount,              // multinomial n
+                groupSize,               // multinomial k
+                probMap.begin() + offset // where to find multinomial probs
+                );
+
+          for (size_t i = 0; i < groupSize; ++i) {
+            auto tid = txps.at(i);
+            txpCount.at(tid) += txpResamp.at(i);
+            // txpCount.at(tid) -= countMap.at(offset + i);
+            // countMap.at(offset + i) = txpResamp.at(i);
+          }
+        } else {
+          // denom fell below minEQClassWeight; report it rather than resampling
+          std::cerr << "minEQClassWeight error\n";
+        }
+      } else {
+        // a single-transcript group keeps the full class count
+        auto tid = txps.at(0);
+        txpCount.at(tid) += countMap.at(offset);
+      }
+      offset += groupSize;
+    } // valid group
+  }   // loop over all eq classes
+}
+
+
+
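For reference, the two-step update documented above can be read in isolation
from the surrounding machinery.  The following is a minimal standalone sketch,
not salmon's implementation: gibbsStep and eqTxps are illustrative names, and
restricting the bookkeeping to a single equivalence class is a simplifying
assumption (a real sweep visits every class and accumulates counts).

    #include <cstdint>
    #include <random>
    #include <vector>

    // One non-collapsed Gibbs step restricted to a single equivalence class.
    void gibbsStep(std::vector<double>& counts,         // per-transcript read counts
                   const std::vector<double>& prior,    // per-transcript prior
                   const std::vector<double>& effLen,   // effective lengths
                   const std::vector<uint32_t>& eqTxps, // transcripts in the class
                   uint64_t classCount,                 // reads carried by the class
                   std::mt19937& gen) {
      const double beta = 0.1;
      // (1) sample means: mu[t] ~ Gam(prior[t] + counts[t], beta + effLen[t])
      std::vector<double> mu(counts.size(), 0.0);
      for (std::size_t t = 0; t < counts.size(); ++t) {
        std::gamma_distribution<double> g(prior[t] + counts[t],
                                          1.0 / (beta + effLen[t]));
        mu[t] = g(gen);
      }
      // (2) re-assign the class's reads multinomially, proportional to mu
      std::vector<double> probs;
      for (auto t : eqTxps) { probs.push_back(mu[t]); }
      std::discrete_distribution<int> assign(probs.begin(), probs.end());
      for (auto t : eqTxps) { counts[t] = 0.0; } // this class's reads only
      for (uint64_t r = 0; r < classCount; ++r) {
        counts[eqTxps[assign(gen)]] += 1.0;
      }
    }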
 void sampleRound_(
         std::vector<std::pair<const TranscriptGroup, TGValue>>& eqVec,
         std::vector<uint64_t>& countMap,
         std::vector<double>& probMap,
         Eigen::VectorXd& effLens,
-        double priorAlpha,
+        const std::vector<double>& priorAlphas,
         std::vector<int>& txpCount,
         MultinomialSampler& msamp) {
 
@@ -152,7 +648,7 @@ void sampleRound_(
                     txpResamp[i] = currResamp;
                     txpCount[tid] -= currResamp;
                     countMap[offset + i] -= currResamp;
-                    denom += (priorAlpha + txpCount[tid]) * aux;
+                    denom += (priorAlphas[tid] + txpCount[tid]) * aux;
                 }
 
                 if (denom > ::minEQClassWeight) {
@@ -161,14 +657,16 @@ void sampleRound_(
                     for (size_t i = 0; i < groupSize; ++i) {
                         auto tid = txps[i];
                         auto aux = auxs[i];
-                        probMap[offset + i] = norm * ((priorAlpha + txpCount[tid]) * aux);
+                        probMap[offset + i] = norm * ((priorAlphas[tid] + txpCount[tid]) * aux);
                     }
 
                     // re-sample
                     msamp(txpResamp.begin(),        // count array to fill in
                             numResampled,		// multinomial n
                             groupSize,		// multinomial k
-                            probMap.begin() + offset  // where to find multinomial probs
+                            probMap.begin() + offset  // where to find multinomial probs
                          );
 
                     for (size_t i = 0; i < groupSize; ++i) {
@@ -193,20 +691,11 @@ void sampleRound_(
 
 }
 
-CollapsedGibbsSampler::CollapsedGibbsSampler() {}
-
-class DistStats {
-	public:
-	DistStats() : meanVal(0.0), minVal(std::numeric_limits<double>::max()), maxVal(0.0) {}
-	double meanVal;
-	double minVal;
-	double maxVal;
-};
-
+// The original sampler!
 template <typename ExpT>
-bool CollapsedGibbsSampler::sample(ExpT& readExp,
+bool CollapsedGibbsSampler::sampleMultipleChains(ExpT& readExp,
         SalmonOpts& sopt,
-        std::function<bool(const std::vector<int>&)>& writeBootstrap,
+        std::function<bool(const std::vector<double>&)>& writeBootstrap,
         uint32_t numSamples) {
 
     namespace bfs = boost::filesystem;
@@ -224,21 +713,31 @@ bool CollapsedGibbsSampler::sample(ExpT& readExp,
 
     std::vector<std::vector<int>> allSamples(numSamples,
                                         std::vector<int>(transcripts.size(),0));
-    double priorAlpha = 1e-8;
-    bool useScaledCounts = (!sopt.useQuasi and !sopt.allowOrphans);
-    auto numMappedFragments = (useScaledCounts) ? readExp.upperBoundHits() : readExp.numMappedFragments();
 
+    bool perTranscriptPrior = (sopt.useVBOpt) ? sopt.perTranscriptPrior : true;
+    double priorValue = (sopt.useVBOpt) ? sopt.vbPrior : 1e-8;
+    std::vector<double> priorAlphas = populatePriorAlphasGibbs_(transcripts, effLens, priorValue, perTranscriptPrior);
+    std::vector<double> alphasIn(priorAlphas.size(), 0.0);
+
+    bool useScaledCounts = (!sopt.useQuasi and !sopt.allowOrphans);
+    auto numMappedFragments = (useScaledCounts) ? readExp.upperBoundHits() : readExp.numMappedFragments();
+    uint32_t numInternalRounds = sopt.thinningFactor;
 
     for (size_t i = 0; i < transcripts.size(); ++i) {
         auto& txp = transcripts[i];
-        txp.setMass(priorAlpha + (txp.mass(false) * numMappedFragments));
+        //txp.setMass(priorAlphas[i] + (txp.mass(false) * numMappedFragments));
+        alphasIn[i] = txp.mass(false) * numMappedFragments;
         effLens(i) = txp.EffectiveLength;
     }
 
     tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(numSamples)),
-                [&eqVec, &transcripts, priorAlpha, &effLens,
-                 &allSamples, &writeBootstrap, useScaledCounts,
-                 &jointLog, numMappedFragments]( const BlockedIndexRange& range) -> void {
+                 [&eqVec, &transcripts, &alphasIn, &priorAlphas, &effLens,
+                  &allSamples, &writeBootstrap, useScaledCounts, numInternalRounds,
+                 &jointLog, numMappedFragments]( const BlockedIndexRange& range) -> void {
 
 
                 std::random_device rd;
@@ -258,12 +757,13 @@ bool CollapsedGibbsSampler::sample(ExpT& readExp,
                 std::vector<uint64_t> countMap(countMapSize, 0);
                 std::vector<double> probMap(countMapSize, 0.0);
 
-                initCountMap_(eqVec, transcripts, priorAlpha, ms, countMap, probMap, effLens, allSamples[range.begin()]);
+                initCountMap_(eqVec, transcripts, alphasIn, priorAlphas, ms, countMap, probMap, effLens, allSamples[range.begin()]);
 
                 // For each sample this thread should generate
                 bool isFirstSample{true};
-                bool numInternalRounds = 10;
-                for (auto sampleID : boost::irange(range.begin(), range.end())) {
+                for (auto sampleID : boost::irange(range.begin(), range.end())) {
                     if (sampleID % 100 == 0) {
                         std::cerr << "gibbs sampling " << sampleID << "\n";
                     }
@@ -275,34 +775,45 @@ bool CollapsedGibbsSampler::sample(ExpT& readExp,
 
                     // Thin the chain by a factor of (numInternalRounds)
                     for (size_t i = 0; i < numInternalRounds; ++i){
-                        sampleRound_(eqVec, countMap, probMap, effLens, priorAlpha,
+                      sampleRoundNonCollapsed_(eqVec, countMap, probMap, effLens, priorAlphas,
                                 allSamples[sampleID], ms);
                     }
 
                     // If we're scaling the counts, do it here.
                     if (useScaledCounts) {
-                        double numMappedFrags = static_cast<double>(numMappedFragments);
+                        double numMappedFrags = static_cast<double>(numMappedFragments);
                         double alphaSum = 0.0;
-                        for (auto c : allSamples[sampleID]) { alphaSum += static_cast<double>(c); }
+                        for (auto c : allSamples[sampleID]) { alphaSum += static_cast<double>(c); }
                         if (alphaSum > ::minWeight) {
                             double scaleFrac = 1.0 / alphaSum;
                             // scaleFrac converts alpha to nucleotide fraction,
-                            // and multiplying by numMappedFrags scales by the total
-                            // number of mapped fragments to provide an estimated count.
+                            // and multiplying by numMappedFrags scales by the total
+                            // number of mapped fragments to provide an estimated count.
                             for (size_t tn = 0; tn < numTranscripts; ++tn) {
                                 alphas[tn] = static_cast<int>(
                                         std::round(
                                             numMappedFrags *
-                                            (static_cast<double>(allSamples[sampleID][tn]) * scaleFrac)));
+                                            (static_cast<double>(allSamples[sampleID][tn]) * scaleFrac)));
                             }
                         } else { // This shouldn't happen!
-                            jointLog->error("Gibbs sampler had insufficient number of fragments!"
-                                    "Something is probably wrong; please check that you "
-                                    "have run salmon correctly and report this to GitHub.");
+                            jointLog->error("Gibbs sampler had insufficient number of fragments! "
+                                    "Something is probably wrong; please check that you "
+                                    "have run salmon correctly and report this to GitHub.");
                         }
-                    } else { // otherwise, just copy over from the sampled counts
+                    } else { // otherwise, just copy over from the sampled counts
                         for (size_t tn = 0; tn < numTranscripts; ++tn) {
-                            alphas[tn] = static_cast<int>(allSamples[sampleID][tn]);
+                            alphas[tn] = static_cast<int>(allSamples[sampleID][tn]);
                         }
                     }
 
@@ -313,29 +824,54 @@ bool CollapsedGibbsSampler::sample(ExpT& readExp,
     });
     return true;
 }
+*/
 
-template
-bool CollapsedGibbsSampler::sample<ReadExperiment>(ReadExperiment& readExp,
-        SalmonOpts& sopt,
-        std::function<bool(const std::vector<int>&)>& writeBootstrap,
-        uint32_t maxIter);
+template bool CollapsedGibbsSampler::sample<ReadExperiment>(
+    ReadExperiment& readExp, SalmonOpts& sopt,
+    std::function<bool(const std::vector<double>&)>& writeBootstrap,
+    uint32_t maxIter);
 
-template
-bool CollapsedGibbsSampler::sample<AlignmentLibrary<UnpairedRead>>(
-        AlignmentLibrary<UnpairedRead>& readExp,
-        SalmonOpts& sopt,
-        std::function<bool(const std::vector<int>&)>& writeBootstrap,
-        uint32_t maxIter);
+template bool CollapsedGibbsSampler::sample<AlignmentLibrary<UnpairedRead>>(
+    AlignmentLibrary<UnpairedRead>& readExp, SalmonOpts& sopt,
+    std::function<bool(const std::vector<double>&)>& writeBootstrap,
+    uint32_t maxIter);
 
+template bool CollapsedGibbsSampler::sample<AlignmentLibrary<ReadPair>>(
+    AlignmentLibrary<ReadPair>& readExp, SalmonOpts& sopt,
+    std::function<bool(const std::vector<double>&)>& writeBootstrap,
+    uint32_t maxIter);
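For orientation, a hedged usage sketch matching these instantiations: readExp
and sopt are assumed to be a fully initialized ReadExperiment and SalmonOpts
(set up elsewhere), and collecting the samples into a local matrix stands in
for the GZipWriter-backed callback used in practice.

    std::vector<std::vector<double>> posteriorSamples;
    std::function<bool(const std::vector<double>&)> writeBootstrap =
        [&posteriorSamples](const std::vector<double>& alphas) -> bool {
          posteriorSamples.push_back(alphas); // one thinned Gibbs sample
          return true;
        };
    CollapsedGibbsSampler sampler;
    bool ok = sampler.sample(readExp, sopt, writeBootstrap, 100 /*numSamples*/);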
+/*
+template
+bool CollapsedGibbsSampler::sampleMultipleChains<ReadExperiment>(
+    ReadExperiment& readExp, SalmonOpts& sopt,
+    std::function<bool(const std::vector<double>&)>& writeBootstrap,
+    uint32_t maxIter);
 
 template
-bool CollapsedGibbsSampler::sample<AlignmentLibrary<ReadPair>>(
-        AlignmentLibrary<ReadPair>& readExp,
-        SalmonOpts& sopt,
-        std::function<bool(const std::vector<int>&)>& writeBootstrap,
-        uint32_t maxIter);
+bool CollapsedGibbsSampler::sampleMultipleChains<AlignmentLibrary<UnpairedRead>>(
+    AlignmentLibrary<UnpairedRead>& readExp, SalmonOpts& sopt,
+    std::function<bool(const std::vector<double>&)>& writeBootstrap,
+    uint32_t maxIter);
 
 
+template
+bool CollapsedGibbsSampler::sampleMultipleChains<AlignmentLibrary<ReadPair>>(
+    AlignmentLibrary<ReadPair>& readExp, SalmonOpts& sopt,
+    std::function<bool(const std::vector<double>&)>& writeBootstrap,
+    uint32_t maxIter);
+*/
 
 /*
     // Deprecated Gibbs output code
@@ -356,19 +892,20 @@ bool CollapsedGibbsSampler::sample<AlignmentLibrary<ReadPair>>(
                       if (val > ds[tid].maxVal) { ds[tid].maxVal = val; }
                       meanNumReads += (1.0 / numSamples) * val;
                     }
-        		    ds[tid].meanVal = meanNumReads;
+                    ds[tid].meanVal = meanNumReads;
                     transcripts[tid].setMass(ds[tid].meanVal);
                 }
     });
 
     bfs::path gibbsSampleFile = sopt.outputDirectory / "samples.txt";
-    sopt.jointLog->info("Writing posterior samples to {}", gibbsSampleFile.string());
+    sopt.jointLog->info("Writing posterior samples to {}", gibbsSampleFile.string());
 
     std::ofstream statStream(gibbsSampleFile.string());
     statStream << "# txpName\tsample_1\tsample_2\t...\tsample_n\n";
 
     for (size_t i = 0; i < numTranscripts; ++i) {
-	    statStream << transcripts[i].RefName;
+        statStream << transcripts[i].RefName;
         for (size_t s = 0; s < allSamples.size(); ++s) {
             statStream << '\t' << allSamples[s][i];
         }
@@ -381,9 +918,10 @@ bool CollapsedGibbsSampler::sample<AlignmentLibrary<ReadPair>>(
     // Truncate tiny expression values
     double txpSumTrunc = 0.0;
     for (size_t i = 0; i < transcripts.size(); ++i) {
-	// maybe use the first decile instead of the mean for the cutoff;
-	// this could let too much through
-        if (transcripts[i].mass(false) <= cutoff) { transcripts[i].setMass(0.0); }
+    // maybe use the first decile instead of the mean for the cutoff;
+    // this could let too much through
+        if (transcripts[i].mass(false) <= cutoff) { transcripts[i].setMass(0.0); }
         txpSumTrunc += transcripts[i].mass(false);
     }
 
diff --git a/src/FASTAParser.cpp b/src/FASTAParser.cpp
index bfb5236..5c6accd 100644
--- a/src/FASTAParser.cpp
+++ b/src/FASTAParser.cpp
@@ -27,6 +27,13 @@ void FASTAParser::populateTargets(std::vector<Transcript>& refs, SalmonOpts& sop
 	nameToID[ref.RefName] = ref.id;
     }
 
+    // Separators for the header (default ' ' and '\t')
+    // If we have the gencode flag, then add '|'.
+    std::string sepStr = " \t";
+    if (sopt.gencodeRef) {
+        sepStr += '|';
+    }
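+    // e.g., a GENCODE header ">ENST00000456328.2|ENSG00000223972.5|..." is
+    // then truncated at the first '|' to yield "ENST00000456328.2"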
+
     std::vector<std::string> readFiles{fname_};
     size_t maxReadGroup{1000}; // Number of files to read simultaneously
     size_t concurrentFile{1}; // Number of reads in each "job"
@@ -40,17 +47,21 @@ void FASTAParser::populateTargets(std::vector<Transcript>& refs, SalmonOpts& sop
     std::uniform_int_distribution<> dis(0, 3);
     uint64_t numNucleotidesReplaced{0};
 
+    // All header names we encounter in the fasta file
+    std::unordered_set<std::string> fastaNames;
+
     while(true) {
         typename single_parser::job j(parser); // Get a job from the parser: a bunch of read (at most max_read_group)
         if(j.is_empty()) break;           // If got nothing, quit
 
         for(size_t i = 0; i < j->nb_filled; ++i) { // For all the read we got
             std::string& header = j->data[i].header;
-            std::string name = header.substr(0, header.find(' '));
+            std::string name = header.substr(0, header.find_first_of(sepStr));
+            fastaNames.insert(name);
 
             auto it = nameToID.find(name);
             if (it == nameToID.end()) {
-                std::cerr << "WARNING: Transcript " << name << " appears in the reference but did not appear in the BAM\n";
+              sopt.jointLog->warn("Transcript {} appears in the reference but did not appear in the BAM", name);
             } else {
 
 	      std::string& seq = j->data[i].seq;
@@ -80,7 +91,28 @@ void FASTAParser::populateTargets(std::vector<Transcript>& refs, SalmonOpts& sop
         }
     }
 
-    std::cerr << "replaced " << numNucleotidesReplaced << " non-ACGT nucleotides with random nucleotides\n";
+    // Check that every sequence present in the BAM header was also present in the
+    // transcriptome fasta.
+    bool missingTxpError{false};
+    for (auto& kv : nameToID) {
+      auto& name = kv.first;
+      if (fastaNames.find(name) == fastaNames.end()) {
+        sopt.jointLog->critical("Transcript {} appeared in the BAM header, but was not in the provided FASTA file", name);
+        missingTxpError = true;
+      }
+    }
+
+    if (missingTxpError) {
+      sopt.jointLog->critical("Please provide a reference FASTA file that includes all targets present in the BAM header.\n"
+      "If you have access to the genome FASTA and GTF used for alignment,\n"
+      "consider generating a transcriptome FASTA using a command like:\n"
+      "gffread -w output.fa -g genome.fa genome.gtf\n"
+      "You can find the gffread utility at http://ccb.jhu.edu/software/stringtie/gff.shtml");
+      sopt.jointLog->flush();
+      std::exit(1);
+    }
+
+    sopt.jointLog->info("Replaced {} non-ACGT nucleotides with random nucleotides", numNucleotidesReplaced);
 
 }
 
diff --git a/src/FragmentLengthDistribution.cpp b/src/FragmentLengthDistribution.cpp
index 53e486b..41ae6bc 100644
--- a/src/FragmentLengthDistribution.cpp
+++ b/src/FragmentLengthDistribution.cpp
@@ -4,6 +4,7 @@
  *
  *  Created by Adam Roberts on 1/30/13.
  *  Copyright 2013 Adam Roberts. All rights reserved.
+ *  Modified 2014, 2015, 2016, 2017 by Rob Patro.
  */
 
 #include "FragmentLengthDistribution.hpp"
@@ -130,11 +131,15 @@ void FragmentLengthDistribution::addVal(size_t len, double mass) {
  * Returns the *LOG* probability of observing a fragment of length *len*.
  */
 double FragmentLengthDistribution::pmf(size_t len) const {
+  if (haveCachedCMF_) {
+    return (len < cachedPMF_.size()) ? cachedPMF_[len] : cachedPMF_.back();
+  } else {
     len /= binSize_;
     if (len > maxVal()) {
         len = maxVal();
     }
     return hist_[len]-totMass_;
+  }
 }
 
 /**
@@ -154,10 +159,9 @@ void FragmentLengthDistribution::dumpPMF(
     }
 }
 
-
 double FragmentLengthDistribution::cmf(size_t len) const {
     if(haveCachedCMF_) {
-        return cachedCMF_[len];
+      return (len < cachedCMF_.size()) ? cachedCMF_[len] : cachedCMF_.back();
     } else {
         double cum = salmon::math::LOG_0;
         len /= binSize_;
@@ -172,14 +176,51 @@ double FragmentLengthDistribution::cmf(size_t len) const {
     }
 }
 
+std::vector<double> getLockedPMF(FragmentLengthDistribution* fld) {
+  std::vector<double> pmfOut;
+  auto maxV = fld->maxVal();
+  pmfOut.reserve(maxV + 1);
+  double totMass = salmon::math::LOG_0;
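+  // pmf entries are log-probabilities, so totMass accumulates their
+  // logSumExp and the subtraction below renormalizes in log space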
+  for (size_t i = 0; i <= maxV; ++i) {
+    pmfOut.push_back(fld->pmf(i));
+    totMass = salmon::math::logAdd(totMass, pmfOut.back());
+  }
+  for (size_t i = 0; i <= maxV; ++i) {
+    pmfOut[i] -= totMass;
+  }
+  return pmfOut;
+}
+
 void FragmentLengthDistribution::cacheCMF() {
-    std::lock_guard<std::mutex> lg(fldMut_);
-    if (!haveCachedCMF_) {
-        cachedCMF_ = cmf();
+  //std::lock_guard<std::mutex> lg(fldMut_);
+    if (sl_.try_lock()) {
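+      // non-blocking: if another thread is already filling the cache we just
+      // return, and callers use the uncached path until haveCachedCMF_ flips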
+      if (!haveCachedCMF_) {
+        cachedPMF_ = getLockedPMF(this);
+        cachedCMF_ = cmf(cachedPMF_);
         haveCachedCMF_ = true;
+      }
+
+      sl_.unlock();
     }
 }
 
+/**
+ * NOTE: It is *assumed* that pmf is properly normalized!
+ **/
+vector<double> FragmentLengthDistribution::cmf(const std::vector<double>& pmf) const {
+  double cum = salmon::math::LOG_0;
+  vector<double> cdf(pmf.size());
+  for (size_t i = 0; i < pmf.size(); ++i) {
+    cum = salmon::math::logAdd(cum, pmf[i]);
+    cdf[i] = cum;
+  }
+  //assert(approxEq(cum, totMass_));
+
+  return cdf;
+}
+
 vector<double> FragmentLengthDistribution::cmf() const {
   double cum = salmon::math::LOG_0;
   vector<double> cdf(hist_.size());
diff --git a/src/GZipWriter.cpp b/src/GZipWriter.cpp
index c165c52..dd56574 100644
--- a/src/GZipWriter.cpp
+++ b/src/GZipWriter.cpp
@@ -69,6 +69,7 @@ bool GZipWriter::writeEquivCounts(
   auto& transcripts = experiment.transcripts();
   std::vector<std::pair<const TranscriptGroup, TGValue>>& eqVec =
         experiment.equivalenceClassBuilder().eqVec();
+  bool dumpRichWeights = opts.dumpEqWeights;
 
   // Number of transcripts
   equivFile << transcripts.size() << '\n';
@@ -89,6 +90,10 @@ bool GZipWriter::writeEquivCounts(
     equivFile << txps.size() << '\t';
     // each group member
     for (auto tid : txps) { equivFile << tid << '\t'; }
+    if (dumpRichWeights) {
+      const auto& auxs = eq.second.combinedWeights;
+      for (auto aux : auxs) { equivFile << aux << '\t'; }
+    }
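+    // each record is thus: <groupSize> <tid_1> ... <tid_k> [<w_1> ... <w_k>] <count>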
     // count for this class
     equivFile << count << '\n';
   }
@@ -123,9 +128,7 @@ std::vector<std::string> getLibTypeStrings(const AlignmentLibrary<AlnT>& experim
 template <typename ExpT>
 bool GZipWriter::writeMeta(
     const SalmonOpts& opts,
-    const ExpT& experiment,
-    const std::string& tstring // the start time of the run
-    ) {
+    const ExpT& experiment) {
 
   namespace bfs = boost::filesystem;
 
@@ -248,6 +251,64 @@ bool GZipWriter::writeMeta(
           expgc.writeBinary(out);
       }
   }
+  
+  if (opts.posBiasCorrect) {
+    // the length classes
+    const auto& lenBounds = experiment.getLengthQuantiles();
+    
+    // lambda to write out a vector of SimplePosBias models (along with the length bounds) to file.
+    auto writePosModel = [&lenBounds, this](bfs::path fpath, const std::vector<SimplePosBias>& model) -> bool {
+      auto flags = std::ios_base::out | std::ios_base::binary;
+      boost::iostreams::filtering_ostream out;
+      out.push(boost::iostreams::gzip_compressor(6));
+      out.push(boost::iostreams::file_sink(fpath.string(), flags));
+      // Write out the number of different models
+      uint32_t numModels = static_cast<uint32_t>(lenBounds.size());
+      out.write(reinterpret_cast<char*>(&numModels), sizeof(numModels));
+      // Write out the length class for each model
+      for (const auto& b : lenBounds) {
+        out.write(reinterpret_cast<char*>(const_cast<uint32_t*>(&b)), sizeof(b));
+      }
+      // write out each
+      for (auto& pb : model) {
+        bool success = pb.writeBinary(out);
+        if (!success) {
+          this->logger_->error("Could not write out positional bias model to {}!", fpath.string());
+        }
+      }
+      return true;
+    };
+
+    // 5' observed 
+    {
+      bfs::path obsPosPath = auxDir / "obs5_pos.gz";
+      // Get the pos bias vector
+      auto& posBiases = experiment.posBias(salmon::utils::Direction::FORWARD);
+      writePosModel(obsPosPath, posBiases);
+    }
+    // 3' observed
+    {
+      bfs::path obsPosPath = auxDir / "obs3_pos.gz";
+      // Get the pos bias vector
+      auto& posBiases = experiment.posBias(salmon::utils::Direction::REVERSE_COMPLEMENT);
+      writePosModel(obsPosPath, posBiases);
+    }
+    // 5' expected
+    {
+      bfs::path expPosPath = auxDir / "exp5_pos.gz";
+      // Get the pos bias vector
+      auto& posBiases = experiment.posBiasExpected(salmon::utils::Direction::FORWARD);
+      writePosModel(expPosPath, posBiases);
+    }
+    // 3' expected
+    {
+      bfs::path expPosPath = auxDir / "exp3_pos.gz";
+      // Get the pos bias vector
+      auto& posBiases = experiment.posBiasExpected(salmon::utils::Direction::REVERSE_COMPLEMENT);
+      writePosModel(expPosPath, posBiases);
+    }
+  }
+
   /*
   bfs::path normGCPath = auxDir / "expected_gc.gz";
   writeVectorToFile(normGCPath, experiment.expectedGCBias());
@@ -290,13 +351,59 @@ bool GZipWriter::writeMeta(
       oa(cereal::make_nvp("mapping_type", mapTypeStr));
 
       oa(cereal::make_nvp("num_targets", transcripts.size()));
+
+      // True if we dumped the equivalence classes, false otherwise
+      oa(cereal::make_nvp("serialized_eq_classes", opts.dumpEq));
+      
+      // For now, this vector is empty unless we dumped the equivalence classes
+      // with weights, in which case it contains the string "scalar_weights".
+      std::vector<std::string> props;
+      if (opts.dumpEqWeights) { props.push_back("scalar_weights"); }
+      oa(cereal::make_nvp("eq_class_properties", props));
+
+      oa(cereal::make_nvp("length_classes", experiment.getLengthQuantiles()));
       oa(cereal::make_nvp("num_bootstraps", numSamples));
       oa(cereal::make_nvp("num_processed", experiment.numObservedFragments()));
       oa(cereal::make_nvp("num_mapped", experiment.numMappedFragments()));
       oa(cereal::make_nvp("percent_mapped", experiment.effectiveMappingRate() * 100.0));
       oa(cereal::make_nvp("call", std::string("quant")));
-      oa(cereal::make_nvp("start_time", tstring));
+      oa(cereal::make_nvp("start_time", opts.runStartTime));
+      oa(cereal::make_nvp("end_time", opts.runStopTime));
   }
+
+  {
+    bfs::path ambigInfo = auxDir / "ambig_info.tsv";
+    std::ofstream os(ambigInfo.string());
+    os << "UniqueCount\tAmbigCount\n";
+
+    auto& transcripts = experiment.transcripts();
+    std::vector<std::pair<const TranscriptGroup, TGValue>>& eqVec =
+      const_cast<ExpT&>(experiment).equivalenceClassBuilder().eqVec();
+
+    class CountPair {
+    public:
+      uint32_t unique=0;
+      uint32_t potential=0;
+    };
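+    // unique: reads from single-transcript classes; potential: the total count
+    // of every multi-transcript class containing the transcript (an upper
+    // bound on its ambiguous reads)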
+
+    std::vector<CountPair> counts(transcripts.size());
+    for (auto& eq : eqVec) {
+      uint64_t count = eq.second.count;
+      const TranscriptGroup& tgroup = eq.first;
+      const std::vector<uint32_t>& txps = tgroup.txps;
+      if (txps.size() > 1) {
+        for (auto tid : txps) {
+          counts[tid].potential += count;
+        }
+      } else {
+        counts[txps.front()].unique += count;
+      }
+    }
+    for (size_t i = 0; i < transcripts.size(); ++i) {
+      os << counts[i].unique << '\t' << counts[i].potential << '\n';
+    }
+  }
+
   return true;
 }
 
@@ -342,14 +449,36 @@ bool GZipWriter::writeAbundances(
       double tfrac = (npm / effLength) / tfracDenom;
       double tpm = tfrac * million;
       fmt::print(output.get(), "{}\t{}\t{}\t{}\t{}\n",
-              transcript.RefName, transcript.RefLength, effLength,
+              transcript.RefName, transcript.CompleteLength, effLength,
               tpm, count);
   }
   return true;
 }
 
+bool GZipWriter::setSamplingPath(const SalmonOpts& sopt) {
+  namespace bfs = boost::filesystem;
+
+  bfs::path auxDir = path_ / sopt.auxDir;
+  if (!bfs::exists(auxDir)) {
+    bool auxSuccess = boost::filesystem::create_directories(auxDir);
+    if (!auxSuccess) {
+      sopt.jointLog->critical("Could not create auxiliary directory {}", auxDir.string());
+      return false;
+    }
+  }
+  bsPath_ = auxDir / "bootstrap";
+  if (!bfs::exists(bsPath_)) {
+    bool bsSuccess = boost::filesystem::create_directories(bsPath_);
+    if (!bsSuccess) {
+      sopt.jointLog->critical("Could not create sampling directory {}", bsPath_.string());
+      return false;
+    }
+  }
+  return true;
+}
+
 template <typename T>
-bool GZipWriter::writeBootstrap(const std::vector<T>& abund) {
+bool GZipWriter::writeBootstrap(const std::vector<T>& abund, bool quiet) {
 #if defined __APPLE__
             spin_lock::scoped_lock sl(writeMutex_);
 #else
@@ -369,16 +498,18 @@ bool GZipWriter::writeBootstrap(const std::vector<T>& abund) {
         size_t elSize = sizeof(typename std::vector<T>::value_type);
         ofile.write(reinterpret_cast<char*>(const_cast<T*>(abund.data())),
                     elSize * num);
-        logger_->info("wrote {} bootstraps", numBootstrapsWritten_.load()+1);
+        if (!quiet) {
+          logger_->info("wrote {} bootstraps", numBootstrapsWritten_.load()+1);
+        }
         ++numBootstrapsWritten_;
         return true;
 }
 
 template
-bool GZipWriter::writeBootstrap<double>(const std::vector<double>& abund);
+bool GZipWriter::writeBootstrap<double>(const std::vector<double>& abund, bool quiet);
 
 template
-bool GZipWriter::writeBootstrap<int>(const std::vector<int>& abund);
+bool GZipWriter::writeBootstrap<int>(const std::vector<int>& abund, bool quiet);
 
 template
 bool GZipWriter::writeEquivCounts<ReadExperiment>(const SalmonOpts& sopt,
@@ -402,20 +533,17 @@ bool GZipWriter::writeAbundances<AlignmentLibrary<ReadPair>>(const SalmonOpts& s
 template
 bool GZipWriter::writeMeta<ReadExperiment>(
     const SalmonOpts& opts,
-    const ReadExperiment& experiment,
-    const std::string& tstring);
+    const ReadExperiment& experiment);
 
 template
 bool GZipWriter::writeMeta<AlignmentLibrary<UnpairedRead>>(
     const SalmonOpts& opts,
-    const AlignmentLibrary<UnpairedRead>& experiment,
-    const std::string& tstring);
+    const AlignmentLibrary<UnpairedRead>& experiment);
 
 template
 bool GZipWriter::writeMeta<AlignmentLibrary<ReadPair>>(
     const SalmonOpts& opts,
-    const AlignmentLibrary<ReadPair>& experiment,
-    const std::string& tstring);
+    const AlignmentLibrary<ReadPair>& experiment);
 
 template
 std::vector<std::string> getLibTypeStrings(const AlignmentLibrary<UnpairedRead>& experiment);
diff --git a/src/JellyfishMerCounter.cpp b/src/JellyfishMerCounter.cpp
deleted file mode 100644
index 5d60418..0000000
--- a/src/JellyfishMerCounter.cpp
+++ /dev/null
@@ -1,379 +0,0 @@
-/**
->HEADER
-    Copyright (c) 2013 Rob Patro robp at cs.cmu.edu
-
-    This file is part of Salmon.
-
-    Salmon is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    Salmon is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with Salmon.  If not, see <http://www.gnu.org/licenses/>.
-<HEADER
-**/
-
-
-/*  This file is part of Jellyfish.
-
-    Jellyfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    Jellyfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with Jellyfish.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-/*  This file was reproduced directly from Jellyfish's mer_counter.cc file.  It is
-    meant to act as a pseudo-independent driver that builds Jellyfish hash for a
-    transcriptome, but which is callable directly (as a function) from Sailfish.
- */
-
-#include <cstdlib>
-#include <unistd.h>
-#include <assert.h>
-#include <signal.h>
-
-#include <iostream>
-#include <fstream>
-#include <string>
-#include <vector>
-#include <map>
-#include <sstream>
-#include <memory>
-#include <chrono>
-
-#include <jellyfish/err.hpp>
-#include <jellyfish/thread_exec.hpp>
-#include <jellyfish/hash_counter.hpp>
-#include <jellyfish/locks_pthread.hpp>
-#include <jellyfish/stream_manager.hpp>
-#include <jellyfish/mer_overlap_sequence_parser.hpp>
-#include <jellyfish/whole_sequence_parser.hpp>
-#include <jellyfish/mer_iterator.hpp>
-#include <jellyfish/mer_qual_iterator.hpp>
-#include <jellyfish/jellyfish.hpp>
-#include <jellyfish/mer_dna_bloom_counter.hpp>
-#include <jellyfish/generator_manager.hpp>
-#include "merge_files.hpp"
-#include "count_main_cmdline.hpp"
-
-static count_main_cmdline args; // Command line switches and arguments
-
-using std::chrono::system_clock;
-using std::chrono::duration;
-using std::chrono::duration_cast;
-
-template<typename DtnType>
-inline double as_seconds(DtnType dtn) { return duration_cast<duration<double>>(dtn).count(); }
-
-using jellyfish::mer_dna;
-using jellyfish::mer_dna_bloom_counter;
-using jellyfish::mer_dna_bloom_filter;
-typedef std::vector<const char*> file_vector;
-
-// Types for parsing arbitrary sequence ignoring quality scores
-typedef jellyfish::mer_overlap_sequence_parser<jellyfish::stream_manager<file_vector::const_iterator> > sequence_parser;
-typedef jellyfish::mer_iterator<sequence_parser, mer_dna> mer_iterator;
-
-// Types for parsing reads with quality score. Interface match type
-// above.
-class sequence_qual_parser :
-  public jellyfish::whole_sequence_parser<jellyfish::stream_manager<file_vector::const_iterator> >
-{
-  typedef jellyfish::stream_manager<file_vector::const_iterator> StreamIterator;
-  typedef jellyfish::whole_sequence_parser<StreamIterator> super;
-public:
-  sequence_qual_parser(uint16_t mer_len, uint32_t max_producers, uint32_t size, size_t buf_size,
-                       StreamIterator& streams) :
-    super(size, 100, max_producers, streams)
-  { }
-};
-
-class mer_qual_iterator : public jellyfish::mer_qual_iterator<sequence_qual_parser, mer_dna> {
-  typedef jellyfish::mer_qual_iterator<sequence_qual_parser, mer_dna> super;
-public:
-  mer_qual_iterator(sequence_qual_parser& parser, bool canonical = false) :
-    super(parser, args.min_qual_char_arg[0], canonical)
-  { }
-};
-
-// k-mer filters. Organized in a linked list, interpreted as a &&
-// (logical and). I.e. all filter must return true for the result to
-// be true. By default, filter returns true.
-struct filter {
-  filter* prev_;
-  filter(filter* prev = 0) : prev_(prev) { }
-  virtual ~filter() { }
-  virtual bool operator()(const mer_dna& x) { return and_res(true, x); }
-  bool and_res(bool r, const mer_dna& x) const {
-    return r ? (prev_ ? (*prev_)(x) : true) : false;
-  }
-};
-
-struct filter_bc : public filter {
-  const mer_dna_bloom_counter& counter_;
-  filter_bc(const mer_dna_bloom_counter& counter, filter* prev = 0) :
-    filter(prev),
-    counter_(counter)
-  { }
-  bool operator()(const mer_dna& m) {
-    unsigned int c = counter_.check(m);
-    return and_res(c > 1, m);
-  }
-};
-
-struct filter_bf : public filter {
-  mer_dna_bloom_filter& bf_;
-  filter_bf(mer_dna_bloom_filter& bf, filter* prev = 0) :
-    filter(prev),
-    bf_(bf)
-  { }
-  bool operator()(const mer_dna& m) {
-    unsigned int c = bf_.insert(m);
-    return and_res(c > 0, m);
-  }
-};
-
-enum OPERATION { COUNT, PRIME, UPDATE };
-template<typename PathIterator, typename MerIteratorType, typename ParserType>
-class mer_counter_base : public jellyfish::thread_exec {
-  int                                     nb_threads_;
-  mer_hash&                               ary_;
-  jellyfish::stream_manager<PathIterator> streams_;
-  ParserType                              parser_;
-  filter*                                 filter_;
-  OPERATION                               op_;
-
-public:
-  mer_counter_base(int nb_threads, mer_hash& ary,
-                   PathIterator file_begin, PathIterator file_end,
-                   PathIterator pipe_begin, PathIterator pipe_end,
-                   uint32_t concurent_files,
-                   OPERATION op, filter* filter = new struct filter) :
-    ary_(ary),
-    streams_(file_begin, file_end, pipe_begin, pipe_end, concurent_files),
-    parser_(mer_dna::k(), streams_.nb_streams(), 3 * nb_threads, 4096, streams_),
-    filter_(filter),
-    op_(op)
-  { }
-
-  virtual void start(int thid) {
-    size_t count = 0;
-    MerIteratorType mers(parser_, args.canonical_flag);
-
-    switch(op_) {
-     case COUNT:
-      for( ; mers; ++mers) {
-        if((*filter_)(*mers))
-          ary_.add(*mers, 1);
-        ++count;
-      }
-      break;
-
-    case PRIME:
-      for( ; mers; ++mers) {
-        if((*filter_)(*mers))
-          ary_.set(*mers);
-        ++count;
-      }
-      break;
-
-    case UPDATE:
-      mer_dna tmp;
-      for( ; mers; ++mers) {
-        if((*filter_)(*mers))
-          ary_.update_add(*mers, 1, tmp);
-        ++count;
-      }
-      break;
-    }
-
-    ary_.done();
-  }
-};
-
-// Counter with and without quality value
-typedef mer_counter_base<file_vector::const_iterator, mer_iterator, sequence_parser> mer_counter;
-typedef mer_counter_base<file_vector::const_iterator, mer_qual_iterator, sequence_qual_parser> mer_qual_counter;
-
-mer_dna_bloom_counter* load_bloom_filter(const char* path) {
-  std::ifstream in(path, std::ios::in|std::ios::binary);
-  jellyfish::file_header header(in);
-  if(!in.good())
-    die << "Failed to parse bloom filter file '" << path << "'";
-  if(header.format() != "bloomcounter")
-    die << "Invalid format '" << header.format() << "'. Expected 'bloomcounter'";
-  if(header.key_len() != mer_dna::k() * 2)
-    die << "Invalid mer length in bloom filter";
-  jellyfish::hash_pair<mer_dna> fns(header.matrix(1), header.matrix(2));
-  auto res = new mer_dna_bloom_counter(header.size(), header.nb_hashes(), in, fns);
-  if(!in.good())
-    die << "Bloom filter file is truncated";
-  in.close();
-  return res;
-}
-
-// If get a termination signal, kill the manager and then kill myself.
-static pid_t manager_pid = 0;
-static void signal_handler(int sig) {
-  if(manager_pid)
-    kill(manager_pid, SIGTERM);
-  signal(sig, SIG_DFL);
-  kill(getpid(), sig);
-  _exit(EXIT_FAILURE); // Should not be reached
-}
-
-int jellyfish_count_main(int argc, char *argv[])
-{
-  auto start_time = system_clock::now();
-
-  jellyfish::file_header header;
-  header.fill_standard();
-  header.set_cmdline(argc, argv);
-
-  args.parse(argc, argv);
-
-  if(args.min_qual_char_given && args.min_qual_char_arg.size() != 1)
-    count_main_cmdline::error("[-Q, --min-qual-char] must be one character.");
-
-  mer_dna::k(args.mer_len_arg);
-
-  std::unique_ptr<jellyfish::generator_manager> generator_manager;
-  if(args.generator_given) {
-    auto gm =
-      new jellyfish::generator_manager(args.generator_arg, args.Generators_arg,
-                                       args.shell_given ? args.shell_arg : (const char*)0);
-    generator_manager.reset(gm);
-    generator_manager->start();
-    manager_pid = generator_manager->pid();
-    struct sigaction act;
-    memset(&act, '\0', sizeof(act));
-    act.sa_handler = signal_handler;
-    assert(sigaction(SIGTERM, &act, 0) == 0);
-  }
-
-  header.canonical(args.canonical_flag);
-  mer_hash ary(args.size_arg, args.mer_len_arg * 2, args.counter_len_arg, args.threads_arg, args.reprobes_arg);
-  if(args.disk_flag)
-    ary.do_size_doubling(false);
-
-  std::auto_ptr<jellyfish::dumper_t<mer_array> > dumper;
-  if(args.text_flag)
-    dumper.reset(new text_dumper(args.threads_arg, args.output_arg, &header));
-  else
-    dumper.reset(new binary_dumper(args.out_counter_len_arg, ary.key_len(), args.threads_arg, args.output_arg, &header));
-  ary.dumper(dumper.get());
-
-  auto after_init_time = system_clock::now();
-
-  OPERATION do_op = COUNT;
-  if(args.if_given) {
-    mer_counter counter(args.threads_arg, ary,
-                        args.if_arg.begin(), args.if_arg.end(),
-                        args.if_arg.end(), args.if_arg.end(), // no multi pipes
-                        args.Files_arg, PRIME);
-    counter.exec_join(args.threads_arg);
-    do_op = UPDATE;
-  }
-
-  // Iterators to the multi pipe paths. If no generator manager,
-  // generate an empty range.
-  auto pipes_begin = generator_manager.get() ? generator_manager->pipes().begin() : args.file_arg.end();
-  auto pipes_end = (bool)generator_manager ? generator_manager->pipes().end() : args.file_arg.end();
-
-  // Bloom counter read from file to filter out low frequency
-  // k-mers. Two pass algorithm.
-  std::unique_ptr<filter> mer_filter(new filter);
-  std::unique_ptr<mer_dna_bloom_counter> bc;
-  if(args.bc_given) {
-    bc.reset(load_bloom_filter(args.bc_arg));
-    mer_filter.reset(new filter_bc(*bc));
-  }
-
-  // Bloom filter to filter out low frequency k-mers. One pass
-  // algorithm.
-  std::unique_ptr<mer_dna_bloom_filter> bf;
-  if(args.bf_size_given) {
-    bf.reset(new mer_dna_bloom_filter(args.bf_fp_arg, args.bf_size_arg));
-    mer_filter.reset(new filter_bf(*bf));
-  }
-
-  if(args.min_qual_char_given) {
-    mer_qual_counter counter(args.threads_arg, ary,
-                             args.file_arg.begin(), args.file_arg.end(),
-                             pipes_begin, pipes_end,
-                             args.Files_arg,
-                             do_op, mer_filter.get());
-    counter.exec_join(args.threads_arg);
-  } else {
-    mer_counter counter(args.threads_arg, ary,
-                        args.file_arg.begin(), args.file_arg.end(),
-                        pipes_begin, pipes_end,
-                        args.Files_arg,
-                        do_op, mer_filter.get());
-    counter.exec_join(args.threads_arg);
-  }
-
-  // If we have a manager, wait for it
-  if(generator_manager) {
-    signal(SIGTERM, SIG_DFL);
-    manager_pid = 0;
-    if(!generator_manager->wait())
-      die << "Some generator commands failed";
-    generator_manager.reset();
-  }
-
-  auto after_count_time = system_clock::now();
-
-  // If no intermediate files, dump directly into output file. If not, will do a round of merging
-  if(!args.no_write_flag) {
-    if(dumper->nb_files() == 0) {
-      dumper->one_file(true);
-      if(args.lower_count_given)
-        dumper->min(args.lower_count_arg);
-      if(args.upper_count_given)
-        dumper->max(args.upper_count_arg);
-      dumper->dump(ary.ary());
-    } else {
-      dumper->dump(ary.ary());
-      if(!args.no_merge_flag) {
-        std::vector<const char*> files = dumper->file_names_cstr();
-        uint64_t min = args.lower_count_given ? args.lower_count_arg : 0;
-        uint64_t max = args.upper_count_given ? args.upper_count_arg : std::numeric_limits<uint64_t>::max();
-        try {
-          merge_files(files, args.output_arg, header, min, max);
-        } catch(MergeError e) {
-          die << e.what();
-        }
-        if(!args.no_unlink_flag) {
-          for(int i =0; i < dumper->nb_files(); ++i)
-            unlink(files[i]);
-        }
-      } // if(!args.no_merge_flag
-    } // if(!args.no_merge_flag
-  }
-
-  auto after_dump_time = system_clock::now();
-
-  if(args.timing_given) {
-    std::ofstream timing_file(args.timing_arg);
-    timing_file << "Init     " << as_seconds(after_init_time - start_time) << "\n"
-                << "Counting " << as_seconds(after_count_time - after_init_time) << "\n"
-                << "Writing  " << as_seconds(after_dump_time - after_count_time) << "\n";
-  }
-
-  return 0;
-}
diff --git a/src/LookUpTableUtils.cpp b/src/LookUpTableUtils.cpp
deleted file mode 100644
index 2bf0d27..0000000
--- a/src/LookUpTableUtils.cpp
+++ /dev/null
@@ -1,255 +0,0 @@
-/**
->HEADER
-    Copyright (c) 2013 Rob Patro robp at cs.cmu.edu
-
-    This file is part of Salmon.
-
-    Salmon is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    Salmon is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with Salmon.  If not, see <http://www.gnu.org/licenses/>.
-<HEADER
-**/
-
-
-#include <algorithm>
-#include <iostream>
-#include <fstream>
-#include <vector>
-#include <cstdint>
-#include <cstring>
-#include <cstdio>
-#include <sstream>
-#include <memory>
-#include <functional>
-#include <unordered_map>
-#include <mutex>
-#include <thread>
-#include <chrono>
-#include <iomanip>
-
-#include "tbb/parallel_for.h"
-#include "tbb/parallel_for_each.h"
-
-#include <boost/range/irange.hpp>
-#include "ezETAProgressBar.hpp"
-#include "LookUpTableUtils.hpp"
-
-
-namespace LUTTools {
-
-/**
- *  \brief Dump the k-mer memberships vector to the file fname
- **/
-void dumpKmerEquivClasses(
-                          const std::vector<KmerID>& memberships,
-                          const std::string& fname) {
-
-  std::ofstream ofile(fname, std::ios::binary);
-
-  // Write the length of the vector to the file
-  size_t vecLen{memberships.size()};
-  ofile.write(reinterpret_cast<const char*>(&vecLen), sizeof(vecLen));
-
-  // Write the actual vector to the file
-  ofile.write(reinterpret_cast<const char*>(&memberships.front()), sizeof(memberships.front()) * vecLen);
-
-  // Close the file
-  ofile.close();
-}
-
-
-std::vector<KmerID> readKmerEquivClasses(const std::string& fname) {
-  std::ifstream ifile(fname, std::ios::binary);
-  size_t vecLen{0};
-  ifile.read(reinterpret_cast<char*>(&vecLen), sizeof(vecLen));
-
-  std::vector<KmerID> memberships(vecLen, std::numeric_limits<KmerID>::max());
-  ifile.read(reinterpret_cast<char*>(&memberships.front()), sizeof(memberships.front()) * vecLen);
-
-  ifile.close();
-  return memberships;
-}
-
-void dumpKmerLUT(
-    std::vector<TranscriptList> &transcriptsForKmerClass,
-    const std::string &fname) {
-
-    tbb::parallel_for_each( transcriptsForKmerClass.begin(), transcriptsForKmerClass.end(),
-    [&]( TranscriptList & t ) {
-        std::sort(t.begin(), t.end());
-    });
-
-    std::ofstream ofile(fname, std::ios::binary);
-    // number of kmers
-    auto numk = transcriptsForKmerClass.size();
-    ofile.write(reinterpret_cast<const char *>(&numk), sizeof(numk));
-    for (auto i : boost::irange(size_t(0), numk)) {
-        // write the vector's size
-        auto s = transcriptsForKmerClass[i].size();
-        ofile.write(reinterpret_cast<const char *>(&s), sizeof(s));
-        // write the vector's contents
-        if ( s > 0 ) {
-            ofile.write(reinterpret_cast<const char *>(&transcriptsForKmerClass[i][0]), s * sizeof(transcriptsForKmerClass[i][0]));
-        }
-    }
-    // close the output file
-    ofile.close();
-}
-
-void readKmerLUT(
-    const std::string &fname,
-    std::vector<TranscriptList> &transcriptsForKmer) {
-
-    std::ifstream ifile(fname, std::ios::binary);
-    // get the size of the vector from file
-    size_t numk = 0;
-    ifile.read(reinterpret_cast<char *>(&numk), sizeof(numk));
-    // resize the vector now
-    transcriptsForKmer.resize(numk);
-
-    for (auto i : boost::irange(size_t(0), numk)) {
-        // read the vector's size
-        size_t numTran = 0;
-        ifile.read(reinterpret_cast<char *>(&numTran), sizeof(numTran));
-        // read the vector's contents
-        if ( numTran > 0 ) {
-            transcriptsForKmer[i].resize(numTran);
-            ifile.read(reinterpret_cast<char *>(&transcriptsForKmer[i][0]), numTran * sizeof(TranscriptID));
-        }
-    }
-
-    ifile.close();
-}
-
-
-void writeTranscriptInfo (TranscriptInfo *ti, std::ofstream &ostream) {
-    size_t numKmers = ti->kmers.size();
-    size_t recordSize = sizeof(ti->transcriptID) +
-                        sizeof(ti->geneID) +
-                        sizeof(ti->name.length()) +
-                        ti->name.length() +
-                        sizeof(ti->length) +
-                        sizeof(numKmers) +
-                        sizeof(KmerID) * numKmers;
-
-    ostream.write(reinterpret_cast<const char *>(&recordSize), sizeof(recordSize));
-    ostream.write(reinterpret_cast<const char *>(&ti->transcriptID), sizeof(ti->transcriptID));
-    ostream.write(reinterpret_cast<const char *>(&ti->geneID), sizeof(ti->geneID));
-    auto l = ti->name.length();
-    ostream.write(reinterpret_cast<const char *>(&l), sizeof(l));
-    ostream.write(reinterpret_cast<const char *>(ti->name.c_str()), l);
-    ostream.write(reinterpret_cast<const char *>(&ti->length), sizeof(ti->length));
-    ostream.write(reinterpret_cast<const char *>(&numKmers), sizeof(numKmers));
-    ostream.write(reinterpret_cast<const char *>(&ti->kmers[0]), numKmers * sizeof(KmerID));
-}
-
-std::unique_ptr<TranscriptInfo> readTranscriptInfo(std::ifstream &istream) {
-    std::unique_ptr<TranscriptInfo> ti(new TranscriptInfo);
-    size_t recordSize = 0;
-    istream.read(reinterpret_cast<char *>(&recordSize), sizeof(recordSize));
-    istream.read(reinterpret_cast<char *>(&ti->transcriptID), sizeof(ti->transcriptID));
-    istream.read(reinterpret_cast<char *>(&ti->geneID), sizeof(ti->geneID));
-    size_t slen = 0;
-    istream.read(reinterpret_cast<char *>(&slen), sizeof(slen));
-    std::unique_ptr<char[]> name(new char[slen+1]);
-    name[slen] = '\0';
-    istream.read(name.get(), slen);
-    ti->name = std::string(name.get());
-    // read the transcript's length
-    istream.read(reinterpret_cast<char *>(&ti->length), sizeof(ti->length));
-    size_t numKmers = 0;
-    istream.read(reinterpret_cast<char *>(&numKmers), sizeof(numKmers));
-    ti->kmers = std::vector<KmerID>(numKmers, 0);
-    istream.read(reinterpret_cast<char *>(&ti->kmers[0]), sizeof(KmerID)*numKmers);
-    return ti;
-}
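For reference, the record that writeTranscriptInfo emits and readTranscriptInfo
consumes has the following byte layout (a sketch; field widths follow the sizeof
expressions in the code above):

    [ size_t recordSize ]   // size of everything after this field
    [ transcriptID ][ geneID ]
    [ size_t nameLen ][ nameLen bytes of name, no NUL terminator ]
    [ length ]
    [ size_t numKmers ][ numKmers * sizeof(KmerID) bytes of k-mer ids ]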
-
-/*
-std::vector<std::unique_ptr<TranscriptInfo>> getTranscriptsFromFile(const std::string &tlutfname,
-        const std::vector<Offset> &offsets,
-std::pair<CTVecIt, CTVecIt> be) {
-
-    Offset INVALID = std::numeric_limits<Offset>::max();
-    std::cerr << "before creating transcript vector" << std::endl;
-    auto batchSize = std::distance(be.first, be.second);
-    std::cerr << "batch size = " << batchSize << "\n";
-    std::vector<std::unique_ptr<TranscriptInfo>> transcripts;//(batchSize);
-    //transcripts.reserve();
-    std::cerr << "after creating transcript vector " << std::endl;
-
-    std::cerr << "opening input file" << std::endl;
-    std::ifstream ifile(tlutfname, std::ios::binary);
-    std::cerr << "opened" << std::endl;
-    size_t idx = 0;
-    std::cerr << "iterating through batch" << std::endl;
-    for (auto it = be.first; it != be.second; ++it) {
-        std::cerr << "getting offset for transcript " << it->tid << std::endl;
-        auto offset = offsets[it->tid];
-        if ( offset == INVALID ) {
-            std::cerr << "encountered invalid offset for transcript " << it->tid << std::endl;
-            std::exit(1);
-        }
-        std::cerr << "seeking to " << offset << std::endl;
-        ifile.seekg(offset);
-        std::cerr << "reading in transcript from file" << std::endl;
-        transcripts.emplace_back(readTranscriptInfo(ifile));
-        ++idx;
-        // Check that the transcript we read is the one we expected
-        if (it->tid != transcripts.back()->transcriptID) {
-            std::cerr << "read the wrong transcript!\n";
-            std::exit(1);
-        }
-    }
-
-    ifile.close();
-    std::cerr << "done reading all transcripts from this batch" << std::endl;
-
-    return std::move(transcripts);
-}
-*/
-
-std::vector<Offset> buildTLUTIndex(const std::string &tlutfname, size_t numTranscripts) {
-
-    std::cerr << "creating vector of " << numTranscripts << " offsets \n";
-    Offset INVALID = std::numeric_limits<Offset>::max();
-    std::vector<Offset> offsets(numTranscripts, INVALID);
-    std::cerr << "done\n";
-
-    std::cerr << "opening file\n";
-    std::ifstream ifile(tlutfname, std::ios::binary);
-    std::cerr << "done\n";
-
-    size_t numRecords {0};
-    std::cerr << "reading numRecords\n";
-    ifile.read(reinterpret_cast<char *>(&numRecords), sizeof(numRecords));
-    std::cerr << "numRecords = " << numRecords << std::endl;
-    std::cerr << "numTranscripts = " << numTranscripts << std::endl;
-
-    std::cerr << "building transcript lookup table index" << std::endl;
-    ez::ezETAProgressBar pb(numRecords);
-    pb.start();
-    Offset offset = sizeof(numRecords);
-    for (auto recNum : boost::irange(size_t(0), numRecords)) {
-        size_t recordSize = 0;
-        TranscriptID tid = 0;
-        ifile.read(reinterpret_cast<char *>(&recordSize), sizeof(recordSize));
-        ifile.read(reinterpret_cast<char *>(&tid), sizeof(tid));
-        offsets[tid] = offset;
-        offset += recordSize + sizeof(recordSize);
-        ifile.seekg(offset);
-        ++pb;
-    }
-
-    ifile.close();
-    return offsets;
-}
-}
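Because each record begins with its own size, buildTLUTIndex above indexes the
whole file in a single pass without deserializing record bodies. A self-contained
sketch of the same scan (types narrowed to fixed-width integers for illustration):

    #include <cstdint>
    #include <fstream>
    #include <limits>
    #include <string>
    #include <vector>

    std::vector<std::uint64_t> buildOffsetIndex(const std::string& fname,
                                                std::size_t numIds) {
      const auto INVALID = std::numeric_limits<std::uint64_t>::max();
      std::vector<std::uint64_t> offsets(numIds, INVALID);

      std::ifstream in(fname, std::ios::binary);
      std::size_t numRecords = 0;
      in.read(reinterpret_cast<char*>(&numRecords), sizeof(numRecords));

      std::uint64_t offset = sizeof(numRecords);
      for (std::size_t r = 0; r < numRecords; ++r) {
        std::size_t recordSize = 0;
        std::uint32_t id = 0; // the record id is the first field of the body
        in.read(reinterpret_cast<char*>(&recordSize), sizeof(recordSize));
        in.read(reinterpret_cast<char*>(&id), sizeof(id));
        offsets[id] = offset;                       // record starts at its size field
        offset += recordSize + sizeof(recordSize);  // recordSize excludes its own width
        in.seekg(offset);                           // skip the rest of the body
      }
      return offsets;
    }

A later lookup can then seekg(offsets[id]) and deserialize exactly one record.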
diff --git a/src/SailfishUtils.cpp b/src/SailfishUtils.cpp
deleted file mode 100644
index f127c69..0000000
--- a/src/SailfishUtils.cpp
+++ /dev/null
@@ -1,699 +0,0 @@
-/**
->HEADER
-    Copyright (c) 2015 Rob Patro robp at cs.cmu.edu
-
-    This file is part of Salmon.
-
-    Salmon is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    Salmon is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with Salmon.  If not, see <http://www.gnu.org/licenses/>.
-<HEADER
-**/
-
-
-#include <boost/thread/thread.hpp>
-#include <boost/filesystem.hpp>
-#include <algorithm>
-#include <iostream>
-#include <tuple>
-#include <unordered_set>
-#include <unordered_map>
-#include <vector>
-#include <boost/filesystem.hpp>
-#include <boost/range/join.hpp>
-
-#include "gff.h"
-
-#include "jellyfish/stream_manager.hpp"
-#include "jellyfish/whole_sequence_parser.hpp"
-
-#include "jellyfish/mer_dna.hpp"
-#include "TranscriptGeneMap.hpp"
-#include "GenomicFeature.hpp"
-#include "SalmonUtils.hpp"
-
-namespace salmon{
-namespace utils {
-using std::string;
-using NameVector = std::vector<string>;
-using IndexVector = std::vector<size_t>;
-using KmerVector = std::vector<uint64_t>;
-
-/**
- * This function parses the library format string that specifies the format in which
- * the reads are expected.
- */
-LibraryFormat parseLibraryFormatStringNew(std::string& fmt) {
-	using std::vector;
-	using std::string;
-	using std::map;
-	using std::stringstream;
-
-    map<string, LibraryFormat> formatMap = {
-        {"IU", LibraryFormat(ReadType::PAIRED_END, ReadOrientation::TOWARD, ReadStrandedness::U)},
-        {"ISF", LibraryFormat(ReadType::PAIRED_END, ReadOrientation::TOWARD, ReadStrandedness::SA)},
-        {"ISR", LibraryFormat(ReadType::PAIRED_END, ReadOrientation::TOWARD, ReadStrandedness::AS)},
-        {"OU", LibraryFormat(ReadType::PAIRED_END, ReadOrientation::AWAY, ReadStrandedness::U)},
-        {"OSF", LibraryFormat(ReadType::PAIRED_END, ReadOrientation::AWAY, ReadStrandedness::SA)},
-        {"OSR", LibraryFormat(ReadType::PAIRED_END, ReadOrientation::AWAY, ReadStrandedness::AS)},
-        {"MU", LibraryFormat(ReadType::PAIRED_END, ReadOrientation::SAME, ReadStrandedness::U)},
-        {"MSF", LibraryFormat(ReadType::PAIRED_END, ReadOrientation::SAME, ReadStrandedness::S)},
-        {"MSR", LibraryFormat(ReadType::PAIRED_END, ReadOrientation::SAME, ReadStrandedness::A)},
-        {"U", LibraryFormat(ReadType::SINGLE_END, ReadOrientation::NONE, ReadStrandedness::U)},
-        {"SF", LibraryFormat(ReadType::SINGLE_END, ReadOrientation::NONE, ReadStrandedness::S)},
-        {"SR", LibraryFormat(ReadType::SINGLE_END, ReadOrientation::NONE, ReadStrandedness::A)}};
-
-	// inspired by http://stackoverflow.com/questions/236129/how-to-split-a-string-in-c
-	// first convert the string to upper-case
-	for (auto& c : fmt) { c = std::toupper(c); }
-
-
-    auto libFmtIt = formatMap.find(fmt);
-
-	if (libFmtIt == formatMap.end()) {
-		stringstream errstr;
-		errstr << "unknown library format string : " << fmt;
-		throw std::invalid_argument(errstr.str());
-	}
-
-    return libFmtIt->second;
-}
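A usage sketch for the short codes above (relying on the LibraryFormat and enum
types from this file):

    std::string fmt = "isr";                  // matching is case-insensitive
    LibraryFormat lf = parseLibraryFormatStringNew(fmt);
    // lf.type         == ReadType::PAIRED_END
    // lf.orientation  == ReadOrientation::TOWARD
    // lf.strandedness == ReadStrandedness::AS   (the "ISR" entry in the map)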
-
-/**
- * Parses a set of __ordered__ command line options and extracts the relevant
- * read libraries from them.
- */
-std::vector<ReadLibrary> extractReadLibraries(boost::program_options::parsed_options& orderedOptions) {
-	// The current (default) format for paired end data
-	LibraryFormat peFormat(ReadType::PAIRED_END, ReadOrientation::TOWARD, ReadStrandedness::U);
-	// The current (default) format for single end data
-	LibraryFormat seFormat(ReadType::SINGLE_END, ReadOrientation::NONE, ReadStrandedness::U);
-
-	std::vector<ReadLibrary> peLibs{ReadLibrary(peFormat)};
-	std::vector<ReadLibrary> seLibs{ReadLibrary(seFormat)};
-	for (auto& opt : orderedOptions.options) {
-		// Update the library type
-		if (opt.string_key == "libType") {
-			auto libFmt = parseLibraryFormatStringNew(opt.value[0]);
-			if (libFmt.type == ReadType::PAIRED_END) {
-				peFormat = libFmt;
-				peLibs.emplace_back(libFmt);
-			} else {
-				seFormat = libFmt;
-				seLibs.emplace_back(libFmt);
-			}
-		}
-		if (opt.string_key == "mates1") {
-			peLibs.back().addMates1(opt.value);
-		}
-		if (opt.string_key == "mates2") {
-			peLibs.back().addMates2(opt.value);
-		}
-		if (opt.string_key == "unmatedReads") {
-			seLibs.back().addUnmated(opt.value);
-		}
-	}
-
-	std::vector<ReadLibrary> libs;
-	libs.reserve(peLibs.size() + seLibs.size());
-	for (auto& lib : boost::range::join(seLibs, peLibs)) {
-		if (lib.format().type == ReadType::SINGLE_END) {
-			if (lib.unmated().size() == 0) {
-				// Didn't use default single end library type
-				continue;
-			}
-		} else if (lib.format().type == ReadType::PAIRED_END) {
-			if (lib.mates1().size() == 0 or lib.mates2().size() == 0) {
-                // Didn't use default paired-end library type
-				continue;
-			}
-		}
-		libs.push_back(lib);
-	}
-    size_t numLibs = libs.size();
-	std::cerr << "there " << ((numLibs > 1) ? "are " : "is ") << libs.size() << ((numLibs > 1) ? " libs\n" : " lib\n");
-	return libs;
-}
-
-
-
-/**
- * This function parses the library format string that specifies the format in which
- * the reads are to be expected.
- */
-LibraryFormat parseLibraryFormatString(std::string& fmt) {
-    using std::vector;
-    using std::string;
-    using std::map;
-    using std::stringstream;
-
-    // inspired by http://stackoverflow.com/questions/236129/how-to-split-a-string-in-c
-
-    // first convert the string to upper-case
-    for (auto& c : fmt) { c = std::toupper(c); }
-    // split on the delimiter ':', and put the key, value (k=v) pairs into a map
-    stringstream ss(fmt);
-    string item;
-    map<string, string> kvmap;
-    while (std::getline(ss, item, ':')) {
-        auto splitPos = item.find('=', 0);
-        string key{item.substr(0, splitPos)};
-        string value{item.substr(splitPos+1)};
-        kvmap[key] = value;
-    }
-
-    map<string, ReadType> readType = {{"SE", ReadType::SINGLE_END}, {"PE", ReadType::PAIRED_END}};
-    map<string, ReadOrientation> orientationType = {{">>", ReadOrientation::SAME},
-                                           {"<>", ReadOrientation::AWAY},
-                                           {"><", ReadOrientation::TOWARD},
-                                           {"*", ReadOrientation::NONE}};
-    map<string, ReadStrandedness> strandType = {{"SA", ReadStrandedness::SA},
-                                    {"AS", ReadStrandedness::AS},
-                                    {"A", ReadStrandedness::A},
-                                    {"S", ReadStrandedness::S},
-                                    {"U", ReadStrandedness::U}};
-    auto it = kvmap.find("T");
-    string typeStr = "";
-    if (it != kvmap.end()) {
-        typeStr = it->second;
-    } else {
-        it = kvmap.find("TYPE");
-        if (it != kvmap.end()) {
-            typeStr = it->second;
-        }
-    }
-
-    if (typeStr != "SE" and typeStr != "PE") {
-        string e = typeStr + " is not a valid read type; must be one of {SE, PE}";
-        throw std::invalid_argument(e);
-    }
-
-    ReadType type = (typeStr == "SE") ? ReadType::SINGLE_END : ReadType::PAIRED_END;
-    ReadOrientation orientation = (type == ReadType::SINGLE_END) ? ReadOrientation::NONE : ReadOrientation::TOWARD;
-    ReadStrandedness strandedness{ReadStrandedness::U};
-    // Construct the LibraryFormat class from the key, value map
-    for (auto& kv : kvmap) {
-        auto& k = kv.first; auto& v = kv.second;
-        if (k == "O" or k == "ORIENTATION") {
-            auto it = orientationType.find(v);
-            if (it != orientationType.end()) { orientation = orientationType[it->first]; } else {
-                string e = v + " is not a valid orientation type; must be one of {>>, <>, ><, *}";
-                throw std::invalid_argument(e);
-            }
-
-        }
-        if (k == "S" or k == "STRAND") {
-            auto it = strandType.find(v);
-            if (it != strandType.end()) { strandedness = strandType[it->first]; } else {
-                string e = v + " is not a valid strand type; must be one of {SA, AS, S, A, U}";
-                throw std::invalid_argument(e);
-            }
-        }
-
-    }
-    LibraryFormat lf(type, orientation, strandedness);
-    return lf;
-}
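The same library description in the older key=value syntax parsed above, as a
usage sketch:

    std::string fmt = "T=PE:O=><:S=SA";
    LibraryFormat lf = parseLibraryFormatString(fmt);
    // T=PE -> ReadType::PAIRED_END
    // O=>< -> ReadOrientation::TOWARD
    // S=SA -> ReadStrandedness::SA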
-
-
-
-uint64_t encode(uint64_t tid, uint64_t offset) {
-    uint64_t res = (((tid & 0xFFFFFFFF) << 32) | (offset & 0xFFFFFFFF));
-    return res;
-}
-
-uint32_t transcript(uint64_t enc) {
-    uint32_t t = (enc & 0xFFFFFFFF00000000) >> 32;
-    return t;
-}
-
-uint32_t offset(uint64_t enc) {
-    uint32_t o = enc & 0xFFFFFFFF;
-    return o;
-}
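encode/transcript/offset pack a (transcript id, offset) pair into one 64-bit word:
transcript id in the high 32 bits, offset in the low 32. A self-contained
round-trip check:

    #include <cassert>
    #include <cstdint>

    int main() {
      std::uint64_t tid = 7, off = 1234;
      std::uint64_t enc = ((tid & 0xFFFFFFFF) << 32) | (off & 0xFFFFFFFF);
      assert(std::uint32_t((enc & 0xFFFFFFFF00000000) >> 32) == tid); // transcript(enc)
      assert(std::uint32_t(enc & 0xFFFFFFFF) == off);                 // offset(enc)
      return 0;
    }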
-
-size_t numberOfReadsInFastaFile(const std::string& fname) {
-    constexpr size_t bufferSize = 16184;
-    char buffer[bufferSize];
-    std::ifstream ifile(fname, std::ifstream::in);
-    ifile.rdbuf()->pubsetbuf(buffer, bufferSize);
-
-    size_t numReads = 0;
-    std::string s;
-    while (ifile >> s) { if (s.front() == '>') { ++numReads; } }
-
-    ifile.close();
-
-    return numReads;
-}
-
-bool readKmerOrder( const std::string& fname, std::vector<uint64_t>& kmers ) {
-
-  std::ifstream mlist(fname, std::ios::in | std::ios::binary);
-  // Get the number of kmers from file
-  size_t numKmers{0};
-  mlist.read( reinterpret_cast<char*>( &numKmers ), sizeof( size_t ) );
-
-  // Resize the array that will hold the sorted kmers
-  kmers.resize(numKmers, 0);
-  mlist.read( reinterpret_cast<char*>( &kmers[0] ), sizeof( uint64_t) * kmers.size() );
-
-  mlist.close();
-
-  return true;
-}
-
-template <template<typename> class S, typename T>
-bool overlap( const S<T> &a, const S<T> &b ) {
-    // Query from the smaller set to the larger set
-    if ( a.size() <= b.size() ) {
-        for ( auto & ae : a ) {
-            if (b.find(ae) != b.end()) {
-                return true;
-            }
-        }
-    } else {
-        for ( auto & be : b ) {
-            if (a.find(be) != a.end()) {
-                return true;
-            }
-        }
-    }
-    // If nothing from the smaller set is in the larger set, then they don't overlap
-    return false;
-}
-
-
-TranscriptGeneMap transcriptGeneMapFromGTF(const std::string& fname, std::string key) {
-
-    using std::unordered_set;
-    using std::unordered_map;
-    using std::vector;
-    using std::tuple;
-    using std::string;
-    using std::get;
-
-    // Use GffReader to read the file
-    GffReader reader(const_cast<char*>(fname.c_str()));
-    // Remember the optional attributes
-    reader.readAll(true);
-
-    struct TranscriptKeyPair {
-        const char* transcript_id;
-        const char* key;
-        TranscriptKeyPair(const char* t, const char* k) :
-            transcript_id(t), key(k) {}
-    };
-
-    // The user can group transcripts by gene_id, gene_name, or
-    // an optional attribute that they provide as a string.
-    enum class TranscriptKey { GENE_ID, GENE_NAME, DYNAMIC };
-
-    // Select the proper attribute by which to group
-    TranscriptKey tkey = TranscriptKey::GENE_ID;
-
-    if (key == "gene_id") {
-    } else if (key == "gene_name") {
-        tkey = TranscriptKey::GENE_NAME;
-    } else {
-        tkey = TranscriptKey::DYNAMIC;
-    }
-
-    // Iterate over all transcript features and build the
-    // transcript <-> key vector.
-    auto nfeat = reader.gflst.Count();
-    std::vector<TranscriptKeyPair> feats;
-    for (int i=0; i < nfeat; ++i) {
-        auto f = reader.gflst[i];
-        if (f->isTranscript()) {
-            const char* keyStr;
-            switch (tkey) {
-                case TranscriptKey::GENE_ID:
-                    keyStr = f->getGeneID();
-                    break;
-                case TranscriptKey::GENE_NAME:
-                    keyStr = f->getGeneName();
-                    break;
-                case TranscriptKey::DYNAMIC:
-                    keyStr = f->getAttr(key.c_str());
-                    break;
-            }
-            feats.emplace_back(f->getID(), keyStr);
-        }
-    }
-
-    // Given the transcript <-> key vector, build the
-    // TranscriptGeneMap.
-
-    IndexVector t2g;
-    NameVector transcriptNames;
-    NameVector geneNames;
-
-    // holds the mapping from transcript ID to gene ID
-    IndexVector t2gUnordered;
-    // holds the set of gene IDs
-    unordered_map<string, size_t> geneNameToID;
-
-    // To read the input and assign ids
-    size_t transcriptCounter = 0;
-    size_t geneCounter = 0;
-    string transcript;
-    string gene;
-
-    std::sort( feats.begin(), feats.end(),
-    []( const TranscriptKeyPair & a, const TranscriptKeyPair & b) -> bool {
-        return std::strcmp(a.transcript_id, b.transcript_id) < 0;
-    } );
-
-    std::string currentTranscript = "";
-    for ( auto & feat : feats ) {
-
-        std::string gene(feat.key);
-        std::string transcript(feat.transcript_id);
-
-        if ( transcript != currentTranscript ) {
-            auto geneIt = geneNameToID.find(gene);
-            size_t geneID = 0;
-
-            if ( geneIt == geneNameToID.end() ) {
-                // If we haven't seen this gene yet, give it a new ID
-                geneNameToID[gene] = geneCounter;
-                geneID = geneCounter;
-                geneNames.push_back(gene);
-                ++geneCounter;
-            } else {
-                // Otherwise lookup the ID
-                geneID = geneIt->second;
-            }
-
-            transcriptNames.push_back(transcript);
-            t2g.push_back(geneID);
-
-            //++transcriptID;
-            currentTranscript = transcript;
-        }
-
-    }
-
-    return TranscriptGeneMap(transcriptNames, geneNames, t2g);
-
-}
-
-
-TranscriptGeneMap readTranscriptToGeneMap( std::ifstream &ifile ) {
-
-    using std::unordered_set;
-    using std::unordered_map;
-    using std::vector;
-    using std::tuple;
-    using std::string;
-    using std::get;
-
-    using NameID = tuple<string, size_t>;
-
-    IndexVector t2g;
-    NameVector transcriptNames;
-    NameVector geneNames;
-
-    // holds the transcript name ID mapping
-    vector<NameID> transcripts;
-    // holds the mapping from transcript ID to gene ID
-    IndexVector t2gUnordered;
-    // holds the set of gene IDs
-    unordered_map<string, size_t> geneNameToID;
-
-    // To read the input and assign ids
-    size_t transcriptCounter = 0;
-    size_t geneCounter = 0;
-    string transcript;
-    string gene;
-
-    while ( ifile >> transcript >> gene ) {
-        // The transcript and its ID
-        transcripts.push_back( make_tuple(transcript, transcriptCounter) );
-
-        auto geneIt = geneNameToID.find(gene);
-        size_t geneID = 0;
-
-        if ( geneIt == geneNameToID.end() ) {
-            // If we haven't seen this gene yet, give it a new ID
-            geneNameToID[gene] = geneCounter;
-            geneID = geneCounter;
-            geneNames.push_back(gene);
-            ++geneCounter;
-        } else {
-            // Otherwise lookup the ID
-            geneID = geneIt->second;
-        }
-
-        // Map the transcript to the gene in terms of their IDs
-        t2gUnordered.push_back(geneID);
-
-        ++transcriptCounter;
-    }
-
-    std::sort( transcripts.begin(), transcripts.end(),
-               []( const NameID & a, const NameID & b) -> bool { return get<0>(a) < get<0>(b); } );
-
-    // Resize these vectors for fast access
-    transcriptNames.resize(t2gUnordered.size());
-    t2g.resize(t2gUnordered.size());
-
-    for ( size_t newID = 0; newID < transcripts.size(); ++newID ) {
-        // For each transcript, map it to the appropriate gene
-        string oldName; size_t oldID;
-        std::tie(oldName, oldID) = transcripts[newID];
-        t2g[newID] = t2gUnordered[oldID];
-        transcriptNames[newID] = oldName;
-    }
-
-    return TranscriptGeneMap(transcriptNames, geneNames, t2g);
-}
-
-
-TranscriptGeneMap transcriptToGeneMapFromFasta( const std::string& transcriptsFile ) {
-    using std::vector;
-    using stream_manager = jellyfish::stream_manager<char**>;
-    using sequence_parser = jellyfish::whole_sequence_parser<stream_manager>;
-    namespace bfs = boost::filesystem;
-
-    NameVector transcriptNames;
-    NameVector geneNames {"gene"};
-
-    vector<bfs::path> paths{transcriptsFile};
-
-    // Create a jellyfish parser
-    const int concurrentFile{1};
-    char** fnames = new char*[1];
-    fnames[0] = const_cast<char*>(transcriptsFile.c_str());
-    stream_manager streams(fnames, fnames + 1, concurrentFile);
-
-    size_t maxReadGroupSize{100};
-    sequence_parser parser(4, maxReadGroupSize, concurrentFile, streams);
-
-    // while there are transcripts left to process
-    while (true) {
-        sequence_parser::job j(parser);
-        // If this job is empty, then we're done
-        if (j.is_empty()) { break; }
-
-        for (size_t i=0; i < j->nb_filled; ++i) {
-            // The transcript name
-            std::string fullHeader(j->data[i].header);
-            std::string header = fullHeader.substr(0, fullHeader.find(' '));
-            transcriptNames.emplace_back(header);
-        }
-    }
-
-    // Sort the transcript names
-    std::sort(transcriptNames.begin(), transcriptNames.end());
-
-    // Since we have no real gene groupings, the t2g vector is trivial,
-    // everything maps to gene 0.
-    IndexVector t2g(transcriptNames.size(), 0);
-
-    return TranscriptGeneMap(transcriptNames, geneNames, t2g);
-}
-
-class ExpressionRecord {
-    public:
-        ExpressionRecord(const std::string& targetIn, uint32_t lengthIn,
-                         std::vector<double>& expValsIn) :
-            target(targetIn), length(lengthIn), expVals(expValsIn) {}
-
-        ExpressionRecord( ExpressionRecord&& other ) {
-            std::swap(target, other.target);
-            length = other.length;
-            std::swap(expVals, other.expVals);
-        }
-
-        ExpressionRecord(std::vector<std::string>& inputLine) {
-            if (inputLine.size() < 3) {
-                std::string err ("Any expression line must contain at least 3 tokens");
-                throw std::invalid_argument(err);
-            } else {
-                auto it = inputLine.begin();
-                target = *it; ++it;
-                length = std::stoi(*it); ++it;
-                for (; it != inputLine.end(); ++it) {
-                    expVals.push_back(std::stod(*it));
-                }
-            }
-        }
-
-        std::string target;
-        uint32_t length;
-        std::vector<double> expVals;
-};
-
-// From : http://stackoverflow.com/questions/9435385/split-a-string-using-c11
-std::vector<std::string> split(const std::string& str, int delimiter(int) = ::isspace){
-    using namespace std;
-    vector<string> result;
-    auto e=str.end();
-    auto i=str.begin();
-    while (i != e) {
-        i = find_if_not(i,e, delimiter);
-        if (i == e) break;
-        auto j = find_if(i,e, delimiter);
-        result.push_back(string(i,j));
-        i = j;
-    }
-    return result;
-}
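A usage sketch tying split and ExpressionRecord together (the input line is
hypothetical):

    std::string line = "ENST0000001\t1500\t10.5\t200.0";
    std::vector<std::string> toks = split(line);  // default delimiter: ::isspace
    ExpressionRecord er(toks);
    // er.target == "ENST0000001", er.length == 1500, er.expVals == {10.5, 200.0}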
-
-void aggregateEstimatesToGeneLevel(TranscriptGeneMap& tgm, boost::filesystem::path& inputPath) {
-
-    using std::vector;
-    using std::string;
-    using std::ofstream;
-    using std::unordered_map;
-    using std::move;
-    using std::cerr;
-    using std::max;
-
-    std::ifstream expFile(inputPath.string());
-
-    if (!expFile.is_open()) {
-        perror("Error reading file");
-    }
-
-    //====================== From GeneSum ====================
-    vector<string> comments;
-    unordered_map<string, vector<ExpressionRecord>> geneExps;
-    string l;
-    size_t ln{0};
-
-    while (getline(expFile, l)) {
-        if (++ln % 1000 == 0) {
-            cerr << "\r\rParsed " << ln << " expression lines";
-        }
-        auto it = find_if(l.begin(), l.end(),
-                    [](char c) -> bool {return !isspace(c);});
-        if (it != l.end()) {
-            if (*it == '#') {
-                comments.push_back(l);
-            } else {
-                vector<string> toks = split(l);
-                ExpressionRecord er(toks);
-                auto gn = tgm.geneName(er.target);
-                geneExps[gn].push_back(move(er));
-            }
-        }
-    }
-    cerr << "\ndone\n";
-    expFile.close();
-
-    cerr << "Aggregating expressions to gene level . . .";
-    boost::filesystem::path outputFilePath(inputPath);
-    outputFilePath.replace_extension(".genes.sf");
-    ofstream outFile(outputFilePath.string());
-
-    // preserve any comments in the output
-    for (auto& c : comments) {
-        outFile << c << '\n';
-    }
-
-    for (auto& kv : geneExps) {
-        auto& gn = kv.first;
-
-        uint32_t geneLength{kv.second.front().length};
-        vector<double> expVals(kv.second.front().expVals.size(), 0);
-        const size_t NE{expVals.size()};
-
-        for (auto& tranExp : kv.second) {
-            geneLength = max(geneLength, tranExp.length);
-            for (size_t i = 0; i < NE; ++i) { expVals[i] += tranExp.expVals[i]; }
-        }
-
-        outFile << gn << '\t' << geneLength;
-        for (size_t i = 0; i < NE; ++i) {
-            outFile << '\t' << expVals[i];
-        }
-        outFile << '\n';
-    }
-
-    outFile.close();
-    cerr << " done\n";
-    //====================== From GeneSum =====================
-}
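The aggregation rule above takes, per gene, the maximum transcript length and the
column-wise sum of the expression values. A worked example (values hypothetical):

    // input (transcript level):          output (gene level):
    //   t1 of gene G   1000   3.0          G   1500   10.0
    //   t2 of gene G   1500   7.0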
-
-void generateGeneLevelEstimates(boost::filesystem::path& geneMapPath,
-                                boost::filesystem::path& estDir,
-                                bool haveBiasCorrectedFile) {
-    namespace bfs = boost::filesystem;
-    std::cerr << "Computing gene-level abundance estimates\n";
-    bfs::path gtfExtension(".gtf");
-    auto extension = geneMapPath.extension();
-
-    TranscriptGeneMap tranGeneMap;
-    // parse the map as a GTF file
-    if (extension == gtfExtension) {
-        // Using libgff
-        tranGeneMap = salmon::utils::transcriptGeneMapFromGTF(geneMapPath.string(), "gene_id");
-    } else { // parse the map as a simple format files
-        std::ifstream tgfile(geneMapPath.string());
-        tranGeneMap = salmon::utils::readTranscriptToGeneMap(tgfile);
-        tgfile.close();
-    }
-
-    std::cerr << "There were " << tranGeneMap.numTranscripts() << " transcripts mapping to "
-        << tranGeneMap.numGenes() << " genes\n";
-
-    bfs::path estFilePath = estDir / "quant.sf";
-    if (!bfs::exists(estFilePath)) {
-        std::stringstream errstr;
-        errstr << "Attempting to compute gene-level esimtates, but could not \n"
-            << "find isoform-level file " << estFilePath;
-        throw std::invalid_argument(errstr.str());
-    } else {
-        salmon::utils::aggregateEstimatesToGeneLevel(tranGeneMap, estFilePath);
-    }
-
-    /** Create a gene-level summary of the bias-corrected estimates as well if these exist **/
-    if (haveBiasCorrectedFile) {
-        bfs::path biasCorrectEstFilePath = estDir / "quant_bias_corrected.sf";
-        if (!bfs::exists(biasCorrectEstFilePath)) {
-            std::stringstream errstr;
-            errstr << "Attempting to compute gene-level esimtates, but could not \n"
-                << "find bias-corrected isoform-level file " << biasCorrectEstFilePath;
-            throw std::invalid_argument(errstr.str());
-        } else {
-            salmon::utils::aggregateEstimatesToGeneLevel(tranGeneMap, biasCorrectEstFilePath);
-        }
-    }
-}
-
-
-
-}
-}
diff --git a/src/Salmon.cpp b/src/Salmon.cpp
index e4c7025..763ee8c 100644
--- a/src/Salmon.cpp
+++ b/src/Salmon.cpp
@@ -1,6 +1,6 @@
 /**
 >HEADER
-    Copyright (c) 2013 Rob Patro robp at cs.cmu.edu
+    Copyright (c) 2013 -- 2016 Rob Patro rob.patro at cs.stonybrook.edu
 
     This file is part of Salmon.
 
@@ -39,25 +39,27 @@
 #include <boost/program_options/parsers.hpp>
 #include <boost/range/irange.hpp>
 #include <boost/filesystem.hpp>
+#include <boost/any.hpp>
 
 // C++ string formatting library
 #include "spdlog/fmt/fmt.h"
 
-#include "BiasIndex.hpp"
-#include "SailfishUtils.hpp"
 #include "GenomicFeature.hpp"
 #include "SalmonConfig.hpp"
 #include "VersionChecker.hpp"
 
-int help(int argc, char* argv[]) {
+int help(std::vector<std::string> opts) { //}int argc, char* argv[]) {
     fmt::MemoryWriter helpMsg;
     helpMsg.write("Salmon v{}\n\n", salmon::version);
     helpMsg.write("Usage:  salmon -h|--help or \n"
                   "        salmon -v|--version or \n"
+		  "        salmon -c|--cite or \n"
                   "        salmon [--no-version-check] <COMMAND> [-h | options]\n\n");
     helpMsg.write("Commands:\n");
+    helpMsg.write("     cite  Show salmon citation information\n");
     helpMsg.write("     index Create a salmon index\n");
     helpMsg.write("     quant Quantify a sample\n");
+    //helpMsg.write("     quantmerge Merge multiple quantifications into a single file\n");
     helpMsg.write("     swim  Perform super-secret operation\n");
 
     /*
@@ -75,7 +77,7 @@ int help(int argc, char* argv[]) {
     */
 
     std::cerr << helpMsg.str();
-    return 1;
+    return 0;
 }
 
 
@@ -99,7 +101,7 @@ int dualModeMessage() {
     salmon quant --help-reads
     )";
     std::cerr << "    Salmon v" << salmon::version << helpmsg << "\n";
-    return 1;
+    return 0;
 }
 
 
@@ -122,6 +124,36 @@ int salmonSwim(int argc, char* argv[]) {
 
 }
 
+/**
+ * Bonus!
+ */
+void printCite() {
+
+  std::cerr << R"(
+Reference:
+==========
+
+Salmon provides accurate, fast, and bias-aware transcript expression estimates using dual-phase inference
+Rob Patro, Geet Duggal, Michael I Love, Rafael A Irizarry, Carl Kingsford
+bioRxiv 021592; doi: http://dx.doi.org/10.1101/021592
+
+bibtex:
+=======
+
+@article{Patro:2016,
+  author = {Patro, Rob and Duggal, Geet and Love, Michael I and Irizarry, Rafael A and Kingsford, Carl},
+  title = {Salmon provides accurate, fast, and bias-aware transcript expression estimates using dual-phase inference},
+  year = {2016},
+  doi = {10.1101/021592},
+  publisher = {Cold Spring Harbor Labs Journals},
+  URL = {http://biorxiv.org/content/early/2016/08/30/021592},
+  journal = {bioRxiv}
+}
+)";
+
+}
+
+
 int salmonIndex(int argc, char* argv[]);
 int salmonQuantify(int argc, char* argv[]);
 int salmonAlignmentQuantify(int argc, char* argv[]);
@@ -134,39 +166,32 @@ int main( int argc, char* argv[] ) {
 
   // With no arguments, print help
   if (argc == 1) {
-      help(argc, argv);
+      std::vector<std::string> o;
+      help(o);//argc, argv);
       std::exit(1);
   }
 
   try {
-
-    po::options_description hidden("hidden");
-    hidden.add_options()
-    ("command", po::value<string>(), "command to run {index, quant, sf}");
-
+      
+    // subcommand parsing code inspired by : https://gist.github.com/randomphrase/10801888
     po::options_description sfopts("Allowed Options");
     sfopts.add_options()
-    ("version,v", "print version string")
-    ("no-version-check", "don't check with the server to see if this is the latest version")
-    ("help,h", "produce help message")
+        ("version,v", "print version string")
+        ("no-version-check", "don't check with the server to see if this is the latest version")
+        ("cite,c", "show citation information")
+        ("help,h", "produce help message")
+        ("command", po::value<string>(), "command to run {index, quant, sf}")
+        ("subargs", po::value<std::vector<std::string>>(), "Arguments for command");
     ;
 
     po::options_description all("Allowed Options");
-    all.add(sfopts).add(hidden);
+    all.add(sfopts);
 
     po::positional_options_description pd;
-    pd.add("command", 1);
-
-    size_t topLevelArgc = argc;
-    for (size_t i : boost::irange(size_t{1}, static_cast<size_t>(argc))) {
-      if (argv[i][0] != '-') {
-        topLevelArgc = i+1;
-        break;
-      }
-    }
+    pd.add("command", 1).add("subargs", -1);
 
     po::variables_map vm;
-    po::parsed_options parsed = po::command_line_parser(topLevelArgc, argv).options(all).positional(pd).allow_unregistered().run();
+    po::parsed_options parsed = po::command_line_parser(argc, argv).options(all).positional(pd).allow_unregistered().run();
     po::store(parsed, vm);
 
     if (vm.count("version")) {
@@ -175,33 +200,55 @@ int main( int argc, char* argv[] ) {
     }
 
     if (vm.count("help") and !vm.count("command")) {
-        help(argc, argv);
+        std::vector<std::string> o;
+        help(o);
         std::exit(0);
     }
 
+    if (vm.count("cite") and !vm.count("command")) {
+      printCite();
+      std::exit(0);
+    }
+
     if (!vm.count("no-version-check")){
       std::string versionMessage = getVersionMessage();
       std::cerr << versionMessage;
     }
     
-    po::notify(vm);
+    //po::notify(vm);
+
+    std::string cmd = vm["command"].as<std::string>();
+    std::vector<std::string> opts = po::collect_unrecognized(parsed.options, po::include_positional);
+    opts.erase(opts.begin());
+    // if there was a help and a command, then add the help back since it was parsed
+    if (vm.count("help")) { opts.insert(opts.begin(), "--help"); }
 
     std::unordered_map<string, std::function<int(int, char*[])>> cmds({
       {"index", salmonIndex},
       {"quant", salmonQuantify},
+      //{"quantmerge", salmonQuantMerge},
       {"swim", salmonSwim}
     });
 
-    string cmd = vm["command"].as<string>();
-
+    /*
+    //string cmd = vm["command"].as<string>();
     int subCommandArgc = argc - topLevelArgc + 1;
     char** argv2 = new char*[subCommandArgc];
     argv2[0] = argv[0];
     std::copy_n( &argv[topLevelArgc], argc-topLevelArgc, &argv2[1] );
+    */
 
+    int subCommandArgc = opts.size() + 1;
+    std::unique_ptr<char*[]> argv2(new char*[subCommandArgc]);
+    argv2[0] = argv[0];
+    for (size_t i = 0; i < subCommandArgc - 1; ++i) {
+        argv2[i+1] = &*opts[i].begin();
+    }
+    
     auto cmdMain = cmds.find(cmd);
     if (cmdMain == cmds.end()) {
-      help(subCommandArgc, argv2);
+        //help(subCommandArgc, argv2);
+        return help(opts);
     } else {
      // If the command is quant, determine whether
      // we're quantifying with raw sequences or alignments
@@ -211,18 +258,17 @@ int main( int argc, char* argv[] ) {
         if (strncmp(argv2[1], "--help-alignment", 16) == 0) {
             std::vector<char> helpStr{'-','-','h','e','l','p','\0'};
             char* helpArgv[] = {argv[0], &helpStr[0]};
-            salmonAlignmentQuantify(2, helpArgv);
+            return salmonAlignmentQuantify(2, helpArgv);
         } else if (strncmp(argv2[1], "--help-reads", 12) == 0) {
             std::vector<char> helpStr{'-','-','h','e','l','p','\0'};
             char* helpArgv[] = {argv[0], &helpStr[0]};
-            salmonQuantify(2, helpArgv);
+            return salmonQuantify(2, helpArgv);
         }
 
         // detect general help request
         if (strncmp(argv2[1], "--help", 6) == 0 or
             strncmp(argv2[1], "-h", 2) == 0) {
-            dualModeMessage();
-            std::exit(0);
+            return dualModeMessage();
         }
 
         // otherwise, detect and dispatch the correct mode
@@ -235,15 +281,14 @@ int main( int argc, char* argv[] ) {
             }
         }
         if (useSalmonAlign) {
-            salmonAlignmentQuantify(subCommandArgc, argv2);
+            return salmonAlignmentQuantify(subCommandArgc, argv2.get());
         } else {
-            salmonQuantify(subCommandArgc, argv2);
+            return salmonQuantify(subCommandArgc, argv2.get());
         }
       } else {
-        cmdMain->second(subCommandArgc, argv2);
+        return cmdMain->second(subCommandArgc, argv2.get());
       }
     }
-    delete[] argv2;
 
   } catch (po::error &e) {
     std::cerr << "Program Option Error (main) : [" << e.what() << "].\n Exiting.\n";
diff --git a/src/SalmonQuantify.cpp b/src/SalmonQuantify.cpp
index ab669bc..08aeadd 100644
--- a/src/SalmonQuantify.cpp
+++ b/src/SalmonQuantify.cpp
@@ -19,8 +19,6 @@
 <HEADER
 **/
 
-#include "btree_map.h"
-#include "btree_set.h"
 #include <algorithm>
 #include <atomic>
 #include <cassert>
@@ -117,7 +115,7 @@ extern "C" {
 #include "GZipWriter.hpp"
 #include "HitManager.hpp"
 #include "KmerIntervalMap.hpp"
-#include "PairSequenceParser.hpp"
+//#include "PairSequenceParser.hpp"
 #include "RapMapUtils.hpp"
 #include "ReadExperiment.hpp"
 #include "SACollector.hpp"
@@ -177,13 +175,15 @@ void processMiniBatch(ReadExperiment& readExp, ForgettingMassCalculator& fmCalc,
 
   using salmon::math::LOG_0;
   using salmon::math::LOG_1;
+  using salmon::math::LOG_EPSILON;
   using salmon::math::LOG_ONEHALF;
   using salmon::math::logAdd;
   using salmon::math::logSub;
 
   const uint64_t numBurninFrags = salmonOpts.numBurninFrags;
 
-  auto log = spdlog::get("jointLog");
+  auto& log = salmonOpts.jointLog;
+  //auto log = spdlog::get("jointLog");
   size_t numTranscripts{transcripts.size()};
   size_t localNumAssignedFragments{0};
   size_t priorNumAssignedFragments{numAssignedFragments};
@@ -211,6 +211,9 @@ void processMiniBatch(ReadExperiment& readExp, ForgettingMassCalculator& fmCalc,
   bool useFragLengthDist{!salmonOpts.noFragLengthDist};
   bool noFragLenFactor{salmonOpts.noFragLenFactor};
   bool useRankEqClasses{salmonOpts.rankEqClasses};
+  bool noLengthCorrection{salmonOpts.noLengthCorrection};
+  // JAN 13
+  bool useAuxParams = ((localNumAssignedFragments + numAssignedFragments) >= salmonOpts.numPreBurninFrags);
 
   // If we're auto detecting the library type
   auto* detector = readLib.getDetector();
@@ -235,6 +238,11 @@ void processMiniBatch(ReadExperiment& readExp, ForgettingMassCalculator& fmCalc,
 
   double startingCumulativeMass =
       fmCalc.cumulativeLogMassAt(firstTimestepOfRound);
+
+  auto isUnexpectedOrphan = [expectedLibraryFormat](AlnT& aln) -> bool {
+    return (expectedLibraryFormat.type == ReadType::PAIRED_END and aln.mateStatus != rapmap::utils::MateStatus::PAIRED_END_PAIRED);
+  };
+
   int i{0};
   {
     // Iterate over each group of alignments (a group consists of all alignments
@@ -289,6 +297,10 @@ void processMiniBatch(ReadExperiment& readExp, ForgettingMassCalculator& fmCalc,
       hasCompatibleMapping = false;
       // For each alignment of this read
       for (auto& aln : alnGroup.alignments()) {
+
+        useAuxParams = ((localNumAssignedFragments + numAssignedFragments) >= salmonOpts.numPreBurninFrags);
+        bool considerCondProb{burnedIn or useAuxParams};
+
         auto transcriptID = aln.transcriptID();
         auto& transcript = transcripts[transcriptID];
         transcriptUnique =
@@ -303,22 +315,56 @@ void processMiniBatch(ReadExperiment& readExp, ForgettingMassCalculator& fmCalc,
         // transcript-level term (based on abundance and) an
         // alignment-level term.
         double logRefLength{salmon::math::LOG_0};
-        if (salmonOpts.noEffectiveLengthCorrection or !burnedIn) {
-          logRefLength = std::log(transcript.RefLength);
+
+        if (noLengthCorrection) {
+          logRefLength = 1.0;
+        } else if (salmonOpts.noEffectiveLengthCorrection or !burnedIn) {
+          logRefLength = std::log(static_cast<double>(transcript.RefLength));
         } else {
           logRefLength = transcript.getCachedLogEffectiveLength();
-        }
+        } 
 
         double transcriptLogCount = transcript.mass(initialRound);
+        auto flen = aln.fragLength();
+        // If we have a properly-paired read then use the "pedantic"
+        // definition here.
+        if (aln.mateStatus == rapmap::utils::MateStatus::PAIRED_END_PAIRED and
+            aln.fwd != aln.mateIsFwd) {
+          flen = aln.fragLengthPedantic(transcript.RefLength); 
+        }
+
 
         // If the transcript had a non-zero count (including pseudocount)
         if (std::abs(transcriptLogCount) != LOG_0) {
 
           // The probability of drawing a fragment of this length;
           double logFragProb = LOG_1;
-          if (burnedIn and useFragLengthDist and aln.fragLength() > 0) {
-            logFragProb =
-                fragLengthDist.pmf(static_cast<size_t>(aln.fragLength()));
+          // If we are expecting a paired-end library, and this is an orphan,
+          // then logFragProb should be small
+          if (isUnexpectedOrphan(aln)) { 
+            logFragProb = LOG_EPSILON;
+          }
+          
+          if (flen > 0.0 and useFragLengthDist and considerCondProb) {
+            size_t fl = flen;
+            double lenProb = fragLengthDist.pmf(fl); 
+            if (burnedIn) {
+              /* condition fragment length prob on txp length */
+              double refLengthCM = fragLengthDist.cmf(static_cast<size_t>(refLength)); 
+              bool computeMass = fl < refLength and !salmon::math::isLog0(refLengthCM);
+              logFragProb = (computeMass) ?
+                                      (lenProb - refLengthCM) :
+                salmon::math::LOG_EPSILON;
+              if (computeMass and refLengthCM < lenProb) {
+                // Threading is hard!  It's possible that an update to the PMF snuck in between when we asked to cache the CMF and when the
+                // "burnedIn" variable was last seen as false.
+                log->info("reference length = {}, CMF[refLen] = {}, fragLen = {}, PMF[fragLen] = {}", refLength, std::exp(refLengthCM), aln.fragLength(), std::exp(lenProb));
+              }
+            } else if (useAuxParams) {
+              logFragProb = lenProb;
+            }
+            //logFragProb = lenProb;
+            //logFragProb = fragLengthDist.pmf(static_cast<size_t>(aln.fragLength()));
           }
 
           // TESTING
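The conditioning step above truncates the fragment-length distribution to
fragments that can actually fit on the transcript: after burn-in,
log P(fl | fl <= refLength) = log PMF(fl) - log CMF(refLength). A self-contained
log-space sketch of the idea (this is not the FragmentLengthDistribution API;
LOG_EPSILON here is a tiny stand-in constant):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <limits>
    #include <vector>

    double logAdd(double x, double y) {
      const double NEG_INF = -std::numeric_limits<double>::infinity();
      if (x == NEG_INF) return y;
      if (y == NEG_INF) return x;
      double hi = std::max(x, y);
      return hi + std::log1p(std::exp(std::min(x, y) - hi));
    }

    // log P(fl | fl <= L) = log PMF(fl) - log CMF(L)
    double logConditionalFragProb(const std::vector<double>& logPMF,
                                  std::size_t fl, std::size_t L) {
      const double LOG_EPSILON = -50.0;
      if (fl >= L or fl >= logPMF.size()) { return LOG_EPSILON; } // cannot fit
      double logCMF = -std::numeric_limits<double>::infinity();
      std::size_t upper = std::min(L, logPMF.size() - 1);
      for (std::size_t i = 0; i <= upper; ++i) { logCMF = logAdd(logCMF, logPMF[i]); }
      return logPMF[fl] - logCMF;
    }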
@@ -386,11 +432,18 @@ void processMiniBatch(ReadExperiment& readExp, ForgettingMassCalculator& fmCalc,
           **/
 
           // Allow for a non-uniform fragment start position distribution
+
           double startPosProb{-logRefLength};
+          // DEC 9
+          if (aln.mateStatus == rapmap::utils::MateStatus::PAIRED_END_PAIRED and !noLengthCorrection) {
+            startPosProb = (flen <= refLength) ? -std::log(refLength - flen + 1) : salmon::math::LOG_EPSILON;
+          }
+
           double fragStartLogNumerator{salmon::math::LOG_1};
           double fragStartLogDenominator{salmon::math::LOG_1};
 
           auto hitPos = aln.hitPos();
+          /** NOTE: no more FSPD
           if (useFSPD and burnedIn and hitPos < refLength) {
             auto& fragStartDist = fragStartDists[transcript.lengthClassIndex()];
             // Get the log(numerator) and log(denominator) for the fragment
@@ -404,6 +457,7 @@ void processMiniBatch(ReadExperiment& readExp, ForgettingMassCalculator& fmCalc,
                                ? fragStartLogNumerator - fragStartLogDenominator
                                : salmon::math::LOG_0;
           }
+          **/
 
           // Increment the count of this type of read that we've seen
           ++libTypeCounts[aln.libFormat().formatID()];
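The new startPosProb term replaces the uniform -logRefLength start assumption
with the number of positions a fragment of the observed length can actually
occupy. A worked example:

    // A 200 bp properly-paired fragment on a 1000 bp transcript has
    // 1000 - 200 + 1 = 801 valid start positions, so
    // startPosProb = -log(801) rather than -logRefLength.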
@@ -585,21 +639,33 @@ void processMiniBatch(ReadExperiment& readExp, ForgettingMassCalculator& fmCalc,
           }
         }
 
-        if (gcBiasCorrect and aln.libFormat().type == ReadType::PAIRED_END) {
-          int32_t start = std::min(aln.pos, aln.matePos);
-          int32_t stop = start + aln.fragLen - 1;
-
-          // WITH CONTEXT
-          if (start >= 0 and stop < transcript.RefLength) {
-              auto desc = transcript.gcDesc(start, stop);
-              observedGCMass.inc(desc, aln.logProb);
-            /*
-            int32_t gcFrac = transcript.gcFrac(start, stop);
-            // Add this fragment's contribution
-            observedGCMass[gcFrac] =
-                salmon::math::logAdd(observedGCMass[gcFrac], aln.logProb);
-            */
-          }
+        if (gcBiasCorrect) {
+            if (aln.libFormat().type == ReadType::PAIRED_END) {
+                int32_t start = std::min(aln.pos, aln.matePos);
+                int32_t stop = start + aln.fragLen - 1;
+                // WITH CONTEXT
+                if (start >= 0 and stop < transcript.RefLength) {
+                    bool valid{false};
+                    auto desc = transcript.gcDesc(start, stop, valid);
+                    if (valid) { observedGCMass.inc(desc, aln.logProb); }
+                }
+            } else if(expectedLibraryFormat.type == ReadType::SINGLE_END) { 
+	      // Both expected and observed should be single end here
+                // For single-end reads, simply assume that every fragment
+                // has a length equal to the conditional mean (given the 
+                // current transcript's length).
+                auto cmeans = readExp.condMeans();
+                auto cmean = static_cast<int32_t>((transcript.RefLength >= cmeans.size()) ? cmeans.back() : cmeans[transcript.RefLength]);
+                int32_t start = aln.fwd ? aln.pos : std::max(0, aln.pos - cmean);
+                int32_t stop = start + cmean;
+                // WITH CONTEXT
+                if (start >= 0 and stop < transcript.RefLength) {
+                  bool valid{false};
+                  auto desc = transcript.gcDesc(start, stop, valid);
+                  if (valid) {observedGCMass.inc(desc, aln.logProb);}
+                }
+            } 
+
         }
         double r = uni(randEng);
         if (!burnedIn and r < std::exp(aln.logProb)) {
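In the single-end GC branch above, the far end of the fragment is unknown, so
the conditional mean fragment length (given the transcript length) stands in
for it. A sketch of the window arithmetic (names illustrative):

    // forward read: the fragment is assumed to extend downstream of pos;
    // reverse-complement read: it is assumed to end at pos.
    int32_t start = fwd ? pos : std::max(0, pos - condMeanLen);
    int32_t stop  = start + condMeanLen;
    // the (start, stop) window contributes to the observed GC mass only when
    // it lies fully inside the transcript, mirroring the paired-end branch.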
@@ -654,6 +720,7 @@ void processMiniBatch(ReadExperiment& readExp, ForgettingMassCalculator& fmCalc,
     // NOTE: only one thread should succeed here, and that
     // thread will set burnedIn to true.
     readExp.updateTranscriptLengthsAtomic(burnedIn);
+    fragLengthDist.cacheCMF();
   }
   if (initialRound) {
     readLib.updateLibTypeCounts(libTypeCounts);
@@ -733,8 +800,13 @@ void processReadsQuasi(
 
   // Write unmapped reads
   fmt::MemoryWriter unmappedNames;
-  auto unmappedLogger = spdlog::get("unmappedLog");
-  bool writeUnmapped = (unmappedLogger.get() == nullptr) ? false : true;
+  bool writeUnmapped = salmonOpts.writeUnmappedNames;
+  spdlog::logger* unmappedLogger = (writeUnmapped) ? salmonOpts.unmappedLog.get() : nullptr;
+
+  // Write unmapped reads
+  fmt::MemoryWriter orphanLinks;
+  bool writeOrphanLinks = salmonOpts.writeOrphanLinks;
+  spdlog::logger* orphanLinkLogger = (writeOrphanLinks) ? salmonOpts.orphanLinkLog.get() : nullptr;
 
   auto& readBiasFW =
       observedBiasParams
@@ -764,6 +836,17 @@ void processReadsQuasi(
   size_t readLenLeft{0};
   size_t readLenRight{0};
   SACollector<RapMapIndexT> hitCollector(qidx);
+
+  if (salmonOpts.fasterMapping) {
+      hitCollector.enableNIP();
+  } else {
+      hitCollector.disableNIP();
+  } 
+  hitCollector.setStrictCheck(true);
+  if (salmonOpts.quasiCoverage > 0.0) {
+      hitCollector.setCoverageRequirement(salmonOpts.quasiCoverage);
+  }
+
   SASearcher<RapMapIndexT> saSearcher(qidx);
   std::vector<QuasiAlignment> leftHits;
   std::vector<QuasiAlignment> rightHits;
@@ -806,11 +889,11 @@ void processReadsQuasi(
       bool lh = tooShortLeft ? false : hitCollector(rp.first.seq,
                                                     leftHits, saSearcher,
                                                     MateStatus::PAIRED_END_LEFT,
-                                                    true, consistentHits);
+                                                    consistentHits);
 
       bool rh = tooShortRight ? false : hitCollector(rp.second.seq,
                                    rightHits, saSearcher,
-                                   MateStatus::PAIRED_END_RIGHT, true,
+                                   MateStatus::PAIRED_END_RIGHT, 
                                    consistentHits);
 
       // Consider a read as too short if both ends are too short
@@ -842,6 +925,27 @@ void processReadsQuasi(
         }
       }
 
+      // NOTE: This will currently not work with "strict intersect", i.e.
+      // nothing will be output here with strict intersect.
+      if (writeOrphanLinks) {
+          // If we are not using strict intersection, then joint hits
+          // can only be zero when either:
+          // 1) there are *no* hits or
+          // 2) there are hits for *both* the left and right reads, but not to the same txp
+          if (!strictIntersect and jointHits.size() == 0) {
+              if (leftHits.size() > 0 and rightHits.size() > 0) {
+                  for (auto& h : leftHits) {
+                      orphanLinks << h.transcriptID() << ',' << h.pos << "\t";
+                  }
+                  orphanLinks << ":";
+                  for (auto& h : rightHits) {
+                      orphanLinks << h.transcriptID() << ',' << h.pos << "\t";
+                  }
+                  orphanLinks << "\n";
+              }
+          }
+      }
+
       // If we have mappings, then process them.
       bool isPaired{false};
       if (jointHits.size() > 0) {
@@ -1065,6 +1169,16 @@ void processReadsQuasi(
         sstream.clear();
     } 
 
+    if (writeOrphanLinks) {
+        std::string outStr(orphanLinks.str());
+        // Get rid of last newline
+        if (!outStr.empty()) {
+            outStr.pop_back();
+            orphanLinkLogger->info(std::move(outStr));
+        }
+        orphanLinks.clear();
+    }
+
     prevObservedFrags = numObservedFragments;
     AlnGroupVecRange<QuasiAlignment> hitLists = boost::make_iterator_range(
         structureVec.begin(), structureVec.begin() + rangeSize);
@@ -1116,8 +1230,8 @@ void processReadsQuasi(
 
   // Write unmapped reads
   fmt::MemoryWriter unmappedNames;
-  auto unmappedLogger = spdlog::get("unmappedLog");
-  bool writeUnmapped = (unmappedLogger.get() == nullptr) ? false : true;
+  bool writeUnmapped = salmonOpts.writeUnmappedNames;
+  spdlog::logger* unmappedLogger = (writeUnmapped) ? salmonOpts.unmappedLog.get() : nullptr;
 
   auto& readBiasFW = observedBiasParams.seqBiasModelFW;
   auto& readBiasRC = observedBiasParams.seqBiasModelRC;
@@ -1141,6 +1255,17 @@ void processReadsQuasi(
   bool quiet{salmonOpts.quiet};
 
   SACollector<RapMapIndexT> hitCollector(qidx);
+  if (salmonOpts.fasterMapping) {
+      hitCollector.enableNIP();
+  } else {
+      hitCollector.disableNIP();
+  } 
+
+  hitCollector.setStrictCheck(true);
+  if (salmonOpts.quasiCoverage > 0.0) {
+      hitCollector.setCoverageRequirement(salmonOpts.quasiCoverage);
+  }
+
   SASearcher<RapMapIndexT> saSearcher(qidx);
   rapmap::utils::HitCounters hctr;
   
@@ -1174,7 +1299,7 @@ void processReadsQuasi(
           tooShort ? false
           : hitCollector(rp.seq,
                                   jointHits, saSearcher,
-                                  MateStatus::SINGLE_END, true, consistentHits);
+                                  MateStatus::SINGLE_END, consistentHits);
 
       // If the fragment was too short, record it
       if (tooShort) {
@@ -1629,21 +1754,17 @@ void processReadLibrary(
       threads[i].join();
     }
 
+    /** GC-fragment bias **/
     // Set the global distribution based on the sum of local
     // distributions.
+    double gcFracFwd{0.0};
+    double globalMass{salmon::math::LOG_0};
+    double globalFwdMass{salmon::math::LOG_0};
+    auto& globalGCMass = readExp.observedGC();
     for (auto& gcp : observedBiasParams) {
-      /*
-              auto& fw = readExp.readBias(salmon::utils::Direction::FORWARD);
-              auto& rc =
-         readExp.readBias(salmon::utils::Direction::REVERSE_COMPLEMENT);
+      auto& gcm = gcp.observedGCMass;
+      globalGCMass.combineCounts(gcm);
 
-              auto& fwloc = gcp.seqBiasFW;
-              auto& rcloc = gcp.seqBiasRC;
-              for (size_t i = 0; i < fwloc.counts.size(); ++i) {
-                  fw.counts[i] += fwloc.counts[i];
-                  rc.counts[i] += rcloc.counts[i];
-              }
-      */
       auto& fw = readExp.readBiasModelObserved(salmon::utils::Direction::FORWARD);
       auto& rc =
           readExp.readBiasModelObserved(salmon::utils::Direction::REVERSE_COMPLEMENT);
@@ -1663,6 +1784,54 @@ void processReadLibrary(
         posBiasesFW[i].combine(gcp.posBiasFW[i]);
         posBiasesRC[i].combine(gcp.posBiasRC[i]);
       }
+
+      globalMass = salmon::math::logAdd(globalMass, gcp.massFwd);
+      globalMass = salmon::math::logAdd(globalMass, gcp.massRC);
+      globalFwdMass = salmon::math::logAdd(globalFwdMass, gcp.massFwd);
+    }
+    globalGCMass.normalize();
+
+    if (globalMass != salmon::math::LOG_0) {
+      if (globalFwdMass != salmon::math::LOG_0) {
+        gcFracFwd = std::exp(globalFwdMass - globalMass);
+      }
+      readExp.setGCFracForward(gcFracFwd);
+    }
+
+    // finalize the positional biases
+    if (salmonOpts.posBiasCorrect) {
+      auto& posBiasesFW = readExp.posBias(salmon::utils::Direction::FORWARD);
+      auto& posBiasesRC =
+          readExp.posBias(salmon::utils::Direction::REVERSE_COMPLEMENT);
+      for (size_t i = 0; i < posBiasesFW.size(); ++i) {
+        posBiasesFW[i].finalize();
+        posBiasesRC[i].finalize();
+      }
+    }
+
+    /** END GC-fragment bias **/
+
+    /* OLD SINGLE END BIAS
+    // Set the global distribution based on the sum of local
+    // distributions.
+    for (auto& gcp : observedBiasParams) {
+      auto& fw = readExp.readBiasModelObserved(salmon::utils::Direction::FORWARD);
+      auto& rc =
+          readExp.readBiasModelObserved(salmon::utils::Direction::REVERSE_COMPLEMENT);
+
+      auto& fwloc = gcp.seqBiasModelFW;
+      auto& rcloc = gcp.seqBiasModelRC;
+      fw.combineCounts(fwloc);
+      rc.combineCounts(rcloc);
+
+      // positional biases
+      auto& posBiasesFW = readExp.posBias(salmon::utils::Direction::FORWARD);
+      auto& posBiasesRC =
+          readExp.posBias(salmon::utils::Direction::REVERSE_COMPLEMENT);
+      for (size_t i = 0; i < posBiasesFW.size(); ++i) {
+        posBiasesFW[i].combine(gcp.posBiasFW[i]);
+        posBiasesRC[i].combine(gcp.posBiasRC[i]);
+      }
     }
     // finalize the positional biases
     if (salmonOpts.posBiasCorrect) {
@@ -1674,8 +1843,8 @@ void processReadLibrary(
         posBiasesRC[i].finalize();
       }
     }
+    END OLD SINGLE-END BIAS */
 
-    /** END: bias models **/
 
   } // ------ END Single-end --------
 }
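
A note on the mass bookkeeping in the hunk above: the per-thread forward and reverse-complement masses are accumulated in log space and merged via salmon::math::logAdd (the standard log-sum-exp), and the forward-strand fraction is then recovered as exp(globalFwdMass - globalMass). A minimal, self-contained sketch of that idiom (this logAdd is a local re-implementation for illustration, not salmon's header):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <limits>

    // Stable log(exp(x) + exp(y)); LOG_0 stands in for log(0) = -infinity.
    static const double LOG_0 = -std::numeric_limits<double>::infinity();
    double logAdd(double x, double y) {
        if (x == LOG_0) { return y; }
        if (y == LOG_0) { return x; }
        double big = std::max(x, y);
        return big + std::log1p(std::exp(std::min(x, y) - big));
    }

    int main() {
        // Two worker threads, each with forward / reverse-complement mass.
        double massFwd[2] = {std::log(30.0), std::log(20.0)};
        double massRC[2]  = {std::log(40.0), std::log(10.0)};
        double globalMass = LOG_0, globalFwdMass = LOG_0;
        for (int i = 0; i < 2; ++i) {
            globalMass    = logAdd(globalMass, massFwd[i]);
            globalMass    = logAdd(globalMass, massRC[i]);
            globalFwdMass = logAdd(globalFwdMass, massFwd[i]);
        }
        // Fraction of forward-stranded mass, as in setGCFracForward.
        std::printf("gcFracFwd = %.2f\n", std::exp(globalFwdMass - globalMass)); // 0.50
    }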
@@ -1692,7 +1861,7 @@ void quantifyLibrary(ReadExperiment& experiment, bool greedyChain,
                      mem_opt_t* memOptions, SalmonOpts& salmonOpts,
                      double coverageThresh, uint32_t numQuantThreads) {
 
-  bool burnedIn{false};
+  bool burnedIn = (salmonOpts.numBurninFrags == 0);
   uint64_t numRequiredFragments = salmonOpts.numRequiredFragments;
   std::atomic<uint64_t> upperBoundHits{0};
   // ErrorModel errMod(1.00);
@@ -1707,7 +1876,7 @@ void quantifyLibrary(ReadExperiment& experiment, bool greedyChain,
   std::atomic<uint64_t> totalAssignedFragments{0};
   uint64_t prevNumAssignedFragments{0};
 
-  auto jointLog = spdlog::get("jointLog");
+  auto jointLog = salmonOpts.jointLog;
 
   ForgettingMassCalculator fmCalc(salmonOpts.forgettingFactor);
   size_t prefillSize = 1000000000 / miniBatchSize;
@@ -1958,10 +2127,10 @@ int salmonQuantify(int argc, char* argv[]) {
       "quasi-mapping.")
     (
      "seqBias",
-     po::value(&(sopt.biasCorrect))->zero_tokens(),
+     po::bool_switch(&(sopt.biasCorrect))->default_value(false),
      "Perform sequence-specific bias correction.")
     (
-      "gcBias", po::value(&(sopt.gcBiasCorrect))->zero_tokens(),
+      "gcBias", po::bool_switch(&(sopt.gcBiasCorrect))->default_value(false),
       "[beta] Perform fragment GC bias correction")
     (
       "threads,p",
@@ -1994,15 +2163,22 @@ int salmonQuantify(int argc, char* argv[]) {
       "it belongs "
       "separated by a tab.  The extension of the file is used to determine how "
       "the file "
-      "should be parsed.  Files ending in \'.gtf\' or \'.gff\' are assumed to "
+      "should be parsed.  Files ending in \'.gtf\', \'.gff\' or \'.gff3\' are assumed to "
       "be in GTF "
       "format; files with any other extension are assumed to be in the simple "
-      "format.")
+      "format. In GTF / GFF format, the \"transcript_id\" is assumed to contain the "
+      "transcript identifier and the \"gene_id\" is assumed to contain the corresponding "
+      "gene identifier.")
   (
-   "writeMappings", po::value<string>(&sopt.qmFileName)->default_value("")->implicit_value("-"),
+   "writeMappings,z", po::value<string>(&sopt.qmFileName)->default_value("")->implicit_value("-"),
    "If this option is provided, then the quasi-mapping results will be written out in SAM-compatible "
    "format.  By default, output will be directed to stdout, but an alternative file name can be "
-   "provided instead.");
+   "provided instead.")
+  (
+   "meta", po::bool_switch(&(sopt.meta))->default_value(false),
+   "If you're using Salmon on a metagenomic dataset, consider setting this flag to disable parts of the "
+   "abundance estimation model that make less sense for metagenomic data."
+  );
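
The switch from po::value(...)->zero_tokens() to po::bool_switch(...) in the hunk above is the idiomatic Boost.Program_options way to declare a flag: bool_switch consumes no argument, and stores false when the flag is absent. A minimal standalone illustration (not salmon code, just documented Boost behavior):

    #include <boost/program_options.hpp>
    #include <iostream>
    namespace po = boost::program_options;

    int main(int argc, char* argv[]) {
        bool gcBias{false};
        po::options_description opts("options");
        opts.add_options()
            // bool_switch: flag present -> true; flag absent -> default_value(false)
            ("gcBias", po::bool_switch(&gcBias)->default_value(false),
             "Perform fragment GC bias correction");
        po::variables_map vm;
        po::store(po::parse_command_line(argc, argv, opts), vm);
        po::notify(vm);
        std::cout << "gcBias = " << std::boolalpha << gcBias << "\n";
        return 0;
    }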
 
   sopt.noRichEqClasses = false;
   // mapping cache has been deprecated
@@ -2026,6 +2202,10 @@ int salmonQuantify(int argc, char* argv[]) {
                                           "make use of a very large number of
       threads.")
       */
+    ("alternativeInitMode", po::bool_switch(&(sopt.alternativeInitMode))->default_value(false),
+     "[Experimental]: Use an alternative strategy (rather than simple interpolation between) the "
+     "online and uniform abundance estimates to initalize the EM / VBEM algorithm."
+     )
     (
      "auxDir", po::value<std::string>(&(sopt.auxDir))->default_value("aux_info"),
      "The sub-directory of the quantification directory where auxiliary "
@@ -2040,6 +2220,15 @@ int salmonQuantify(int argc, char* argv[]) {
 						 "dumpEq", po::bool_switch(&(sopt.dumpEq))->default_value(false),
 						 "Dump the equivalence class counts "
 						 "that were computed during quasi-mapping")
+    ("dumpEqWeights,d",
+     po::bool_switch(&(sopt.dumpEqWeights))->default_value(false),
+     "Includes \"rich\" equivlance class weights in the output when equivalence "
+     "class information is being dumped to file.")
+    ("fasterMapping",
+     po::bool_switch(&(sopt.fasterMapping))->default_value(false),
+     "[Developer]: Disables some extra checks during quasi-mapping. This may make mapping a "
+     "little bit faster at the potential cost of returning too many mappings (i.e. some sub-optimal mappings) "
+     "for certain reads. Only use this option if you know what it does (enables NIP-skipping)")
     (
      "gcSizeSamp",
      po::value<std::uint32_t>(&(sopt.gcSampFactor))->default_value(1),
@@ -2069,11 +2258,11 @@ int salmonQuantify(int argc, char* argv[]) {
      "distribution")
     (
      "fldMean",
-     po::value<size_t>(&(sopt.fragLenDistPriorMean))->default_value(200),
+     po::value<size_t>(&(sopt.fragLenDistPriorMean))->default_value(250),
      "The mean used in the fragment length distribution prior")
     (
      "fldSD",
-     po::value<size_t>(&(sopt.fragLenDistPriorSD))->default_value(80),
+     po::value<size_t>(&(sopt.fragLenDistPriorSD))->default_value(25),
      "The standard deviation used in the fragment length distribution "
      "prior")
     (
@@ -2097,6 +2286,13 @@ int salmonQuantify(int argc, char* argv[]) {
      po::value<uint32_t>(&(sopt.maxReadOccs))->default_value(100),
      "Reads \"mapping\" to more than this many places won't be "
      "considered.")
+    ("noLengthCorrection",
+     po::bool_switch(&(sopt.noLengthCorrection))->default_value(false),
+     "[experimental] : Entirely disables length correction when estimating "
+     "the abundance of transcripts.  This option can be used with protocols "
+     "where one expects that fragments derive from their underlying targets "
+     "without regard to that target's length (e.g. QuantSeq)"
+     )
     (
      "noEffectiveLengthCorrection",
      po::bool_switch(&(sopt.noEffectiveLengthCorrection))
@@ -2167,6 +2363,13 @@ int salmonQuantify(int argc, char* argv[]) {
      "Number of bootstrap samples to generate. Note: "
      "This is mutually exclusive with Gibbs sampling.")
     (
+     "thinningFactor",
+     po::value<uint32_t>(&(sopt.thinningFactor))->default_value(16),
+     "Number of steps to discard for every sample kept from the Gibbs chain. "
+     "The larger this number, the less chance that subsequent samples are "
+     "auto-correlated, but the slower sampling becomes."
+    )
+    (
      "quiet,q", po::bool_switch(&(sopt.quiet))->default_value(false),
      "Be quiet while doing quantification (don't write informative "
      "output to the console unless something goes wrong).")
@@ -2181,9 +2384,20 @@ int salmonQuantify(int argc, char* argv[]) {
      "as a per-nucleotide prior, unless the --perTranscriptPrior flag "
      "is also given, in which case this is used as a transcript-level prior")
     (
+     "writeOrphanLinks",
+     po::bool_switch(&(sopt.writeOrphanLinks))->default_value(false),
+     "Write the transcripts that are linked by orphaned reads.")
+    (
      "writeUnmappedNames",
      po::bool_switch(&(sopt.writeUnmappedNames))->default_value(false),
-     "Write the names of un-mapped reads to the file unmapped_names.txt in the auxiliary directory.");
+     "Write the names of un-mapped reads to the file unmapped_names.txt in the auxiliary directory.")
+    ("quasiCoverage,x",
+     po::value<double>(&(sopt.quasiCoverage))->default_value(0.0),
+     "[Experimental]: The fraction of the read that must be covered by MMPs (of length >= 31) if "
+     "this read is to be considered as \"mapped\".  This may help to avoid \"spurious\" mappings. "
+     "A value of 0 (the default) denotes no coverage threshold (a single 31-mer can yield a mapping).  "
+     "Since coverage by exact matching, large, MMPs is a rather strict condition, this value should likely "
+     "be set to something low, if used.");
 
 
   po::options_description fmd("\noptions that apply to the old FMD index");
@@ -2328,6 +2542,13 @@ transcript abundance from RNA-seq reads
     if (!optionsOK) {
       std::exit(1);
     }
+    bool optionsValidate =
+      salmon::utils::validateOptions(sopt);
+    if (!optionsValidate) {
+      sopt.jointLog->flush();
+      spdlog::drop_all();
+      std::exit(1);
+    }
  
     auto fileLog = sopt.fileLog;
     auto jointLog = sopt.jointLog;
@@ -2388,10 +2609,10 @@ transcript abundance from RNA-seq reads
       if (sopt.gcBiasCorrect) {
         for (auto& rl : readLibraries) {
           if (rl.format().type != ReadType::PAIRED_END) {
-            jointLog->warn("Fragment GC bias correction is currently only "
-                           "implemented for paired-end libraries.  Disabling "
-                           "fragment GC bias correction for this run");
-            sopt.gcBiasCorrect = false;
+            jointLog->warn("Fragment GC bias correction is currently *experimental* "
+                           "in single-end libraries.  Please use this option "
+                           "with caution.");
+            //sopt.gcBiasCorrect = false;
           }
         }
       }
@@ -2422,12 +2643,6 @@ transcript abundance from RNA-seq reads
 
     GZipWriter gzw(outputDirectory, jointLog);
 
-    // If we are dumping the equivalence classes, then
-    // do it here.
-    if (sopt.dumpEq) {
-      gzw.writeEquivCounts(sopt, experiment);
-    }
-
     // Now that the streaming pass is complete, we have
     // our initial estimates, and our rich equivalence
     // classes.  Perform further optimization until
@@ -2458,21 +2673,27 @@ transcript abundance from RNA-seq reads
 
     // Write the main results
     gzw.writeAbundances(sopt, experiment);
-    // Write meta-information about the run
-    gzw.writeMeta(sopt, experiment, sopt.runStartTime);
+
+    // If we are dumping the equivalence classes, then
+    // do it here.
+    if (sopt.dumpEq) {
+      gzw.writeEquivCounts(sopt, experiment);
+    }
 
     if (sopt.numGibbsSamples > 0) {
 
       jointLog->info("Starting Gibbs Sampler");
       CollapsedGibbsSampler sampler;
+      gzw.setSamplingPath(sopt);
       // The function we'll use as a callback to write samples
-      std::function<bool(const std::vector<int>&)> bsWriter =
-          [&gzw](const std::vector<int>& alphas) -> bool {
-        return gzw.writeBootstrap(alphas);
+      std::function<bool(const std::vector<double>&)> bsWriter =
+          [&gzw](const std::vector<double>& alphas) -> bool {
+        return gzw.writeBootstrap(alphas, true);
       };
 
       bool sampleSuccess =
-          sampler.sample(experiment, sopt, bsWriter, sopt.numGibbsSamples);
+        //sampler.sampleMultipleChains(experiment, sopt, bsWriter, sopt.numGibbsSamples);
+        sampler.sample(experiment, sopt, bsWriter, sopt.numGibbsSamples);
       if (!sampleSuccess) {
         jointLog->error("Encountered error during Gibb sampling .\n"
                         "This should not happen.\n"
@@ -2481,6 +2702,7 @@ transcript abundance from RNA-seq reads
       }
       jointLog->info("Finished Gibbs Sampler");
     } else if (sopt.numBootstraps > 0) {
+      gzw.setSamplingPath(sopt);
       // The function we'll use as a callback to write samples
       std::function<bool(const std::vector<double>&)> bsWriter =
           [&gzw](const std::vector<double>& alphas) -> bool {
@@ -2538,7 +2760,7 @@ transcript abundance from RNA-seq reads
     }
 
     if (sopt.writeUnmappedNames) {
-      auto l = spdlog::get("unmappedLog");
+      auto l = sopt.unmappedLog.get();
       // If the logger was created, then flush it and
       // close the associated file.
       if (l) {
@@ -2547,6 +2769,16 @@ transcript abundance from RNA-seq reads
       }
     }
     
+    if (sopt.writeOrphanLinks) {
+        auto l = sopt.orphanLinkLog.get();
+        // If the logger was created, then flush it and
+        // close the associated file.
+        if (l) {
+            l->flush();
+            if (sopt.orphanLinkFile) { sopt.orphanLinkFile->close(); }
+        }
+    }
+
     // if we wrote quasimappings, flush that buffer
     if (sopt.qmFileName != "" ){
         sopt.qmLog->flush();
@@ -2554,6 +2786,12 @@ transcript abundance from RNA-seq reads
         // the file
         if (sopt.qmFileName != "-") { sopt.qmFile.close(); }
     }
+
+    sopt.runStopTime = salmon::utils::getCurrentTimeAsString();
+
+    // Write meta-information about the run
+    gzw.writeMeta(sopt, experiment);
+
   } catch (po::error& e) {
     std::cerr << "Exception : [" << e.what() << "]. Exiting.\n";
     std::exit(1);
diff --git a/src/SalmonQuantifyAlignments.cpp b/src/SalmonQuantifyAlignments.cpp
index d1de11d..91f88b6 100644
--- a/src/SalmonQuantifyAlignments.cpp
+++ b/src/SalmonQuantifyAlignments.cpp
@@ -162,6 +162,8 @@ void processMiniBatch(AlignmentLibrary<FragT>& alnLib,
     bool gcBiasCorrect = salmonOpts.gcBiasCorrect;
 
     using salmon::math::LOG_0;
+    using salmon::math::LOG_1;
+    using salmon::math::LOG_EPSILON;
     using salmon::math::logAdd;
     using salmon::math::logSub;
 
@@ -188,8 +190,9 @@ void processMiniBatch(AlignmentLibrary<FragT>& alnLib,
     double startingCumulativeMass = fmCalc.cumulativeLogMassAt(firstTimestepOfRound);
     auto expectedLibraryFormat = alnLib.format();
     uint32_t numBurninFrags{salmonOpts.numBurninFrags};
+    bool noLengthCorrection{salmonOpts.noLengthCorrection};
 
-    bool useAuxParams = (processedReads > salmonOpts.numPreBurninFrags);
+    bool useAuxParams = (processedReads >= salmonOpts.numPreBurninFrags);
 
     std::chrono::microseconds sleepTime(1);
     MiniBatchInfo<AlignmentGroup<FragT*>>* miniBatch = nullptr;
@@ -198,6 +201,10 @@ void processMiniBatch(AlignmentLibrary<FragT>& alnLib,
 
     double maxZeroFrac{0.0};
 
+    auto isUnexpectedOrphan = [expectedLibraryFormat](FragT* aln) -> bool {
+      return (expectedLibraryFormat.type == ReadType::PAIRED_END and !aln->isPaired());
+    };
+
     while (!doneParsing or !workQueue.empty()) {
         uint32_t zeroProbFrags{0};
 
@@ -219,7 +226,8 @@ void processMiniBatch(AlignmentLibrary<FragT>& alnLib,
 	    // If we actually got some work
         if (miniBatch != nullptr) {
 
-            useAuxParams = (processedReads > salmonOpts.numPreBurninFrags);
+            useAuxParams = (processedReads >= salmonOpts.numPreBurninFrags);
+            bool considerCondProb = (useAuxParams or burnedIn);
             ++activeBatches;
             batchReads = 0;
             zeroProbFrags = 0;
@@ -265,25 +273,49 @@ void processMiniBatch(AlignmentLibrary<FragT>& alnLib,
                         transcriptUnique = transcriptUnique and (transcriptID == firstTranscriptID);
 
                         double refLength = transcript.RefLength > 0 ? transcript.RefLength : 1.0;
-                        double logFragProb = salmon::math::LOG_1;
+                        auto flen = aln->fragLen();
+                        // If we have a properly-paired read then use the "pedantic"
+                        // definition here.
+                        if (aln->isPaired() and aln->isInward()) { 
+                          flen = aln->fragLengthPedantic(refLength); 
+                        }
 
-                        if (!salmonOpts.noFragLengthDist and useAuxParams) {
-                            /** Forget reads that are not paired **/
-                            /*
-                            if(aln->fragLen() == 0) {
-                                if (aln->isLeft() and transcript.RefLength - aln->left() < fragLengthDist.maxVal()) {
-                                    logFragProb = fragLengthDist.cmf(transcript.RefLength - aln->left());
-                                } else if (aln->isRight() and aln->right() < fragLengthDist.maxVal()) {
-                                    logFragProb = fragLengthDist.cmf(aln->right());
-                                }
-                            } else {
+                        // The probability of drawing a fragment of this length;
+                        double logFragProb = LOG_1;
+                        // If we are expecting a paired-end library, and this is an orphan,
+                        // then logFragProb should be small
+                        if (isUnexpectedOrphan(aln)) {
+                          logFragProb = LOG_EPSILON;
+                        }
+
+                        if (flen > 0.0 and aln->isPaired() and useFragLengthDist and considerCondProb) {
+                          size_t fl = flen;
+                          double lenProb = fragLengthDist.pmf(fl); 
+                          if (burnedIn) {
+                            /* condition fragment length prob on txp length */
+                            double refLengthCM = fragLengthDist.cmf(static_cast<size_t>(refLength)); 
+                            bool computeMass = fl < refLength and !salmon::math::isLog0(refLengthCM);
+                            logFragProb = computeMass ?
+                                (lenProb - refLengthCM) :
+                                salmon::math::LOG_EPSILON;
+                            if (computeMass and refLengthCM < lenProb) {
+                              // Threading is hard!  It's possible that an update to the PMF snuck in between when we asked to cache the CMF and when the
+                              // "burnedIn" variable was last seen as false.
+                              log->info("reference length = {}, CMF[refLen] = {}, fragLen = {}, PMF[fragLen] = {}",
+                                        refLength, std::exp(refLengthCM), aln->fragLen(), std::exp(lenProb));
                             }
-                            */
-                            auto fragLen = aln->fragLengthPedantic(transcript.RefLength);
-                            if(aln->isPaired() and fragLen > 0) {
-                                logFragProb = fragLengthDist.pmf(static_cast<size_t>(fragLen));
+                          } else if (useAuxParams) {
+                            logFragProb = lenProb;
+                          }
+                        }
+
+                        /*
+                        if (!salmonOpts.noFragLengthDist and useAuxParams) {
+                            if(aln->isPaired() and flen > 0) {
+                                logFragProb = fragLengthDist.pmf(static_cast<size_t>(flen));
                             }
                         }
+                        */
 
                         // TESTING
                         if (noFragLenFactor) { logFragProb = LOG_1; }
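
The rewritten logFragProb logic above conditions the fragment-length probability on the transcript: a fragment cannot be longer than the transcript it derives from, so P(len = fl | len <= refLength) = pmf(fl) / cmf(refLength), computed in log space as lenProb - refLengthCM. A toy numeric check of that renormalization (the distribution here is illustrative, not salmon's):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        // Toy fragment-length pmf over lengths 0..4 (plain probabilities).
        std::vector<double> pmf{0.0, 0.1, 0.4, 0.3, 0.2};
        size_t refLength = 3;   // transcript shorter than the longest fragment
        size_t fl = 2;          // observed fragment length

        // cmf(refLength) = P(len <= refLength)
        double cmf = 0.0;
        for (size_t i = 0; i <= refLength; ++i) { cmf += pmf[i]; }

        // The conditional probability, and its log-space form used above:
        double condProb = pmf[fl] / cmf;                        // 0.4 / 0.8 = 0.5
        double logCondProb = std::log(pmf[fl]) - std::log(cmf); // lenProb - refLengthCM
        std::printf("P = %.3f, logP = %.3f\n", condProb, logCondProb);
    }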
@@ -310,7 +342,9 @@ void processMiniBatch(AlignmentLibrary<FragT>& alnLib,
                         // transcript-level term (based on abundance and) an
                         // alignment-level term.
                         double logRefLength{salmon::math::LOG_0};
-                        if (salmonOpts.noEffectiveLengthCorrection or !burnedIn) {
+                        if (noLengthCorrection) {
+                          logRefLength = 1.0;
+                        } else if (salmonOpts.noEffectiveLengthCorrection or !burnedIn) {
                             logRefLength = std::log(transcript.RefLength);
                         } else {
                             logRefLength = transcript.getCachedLogEffectiveLength();
@@ -356,6 +390,11 @@ void processMiniBatch(AlignmentLibrary<FragT>& alnLib,
 
 			// Allow for a non-uniform fragment start position distribution
 			double startPosProb{-logRefLength};
+      if (aln->isPaired() and !noLengthCorrection) {
+        startPosProb = (flen <= refLength) ? -std::log(refLength - flen + 1) : salmon::math::LOG_EPSILON;
+      }
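
The startPosProb assignment just above replaces the uniform 1/effectiveLength start term with a count of feasible starts: a fragment of length flen fits at exactly refLength - flen + 1 positions, so each start receives log-probability -log(refLength - flen + 1). A quick standalone check under that assumption:

    #include <cmath>
    #include <cstdio>

    int main() {
        double refLength = 1000.0; // transcript length
        double flen = 250.0;       // fragment length
        // Number of positions at which the whole fragment fits:
        double numStarts = refLength - flen + 1; // 751
        double startPosLogProb = -std::log(numStarts);
        std::printf("log P(start) = %.4f (uniform over %.0f starts)\n",
                    startPosLogProb, numStarts);
    }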
+
 			double fragStartLogNumerator{salmon::math::LOG_1};
 			double fragStartLogDenominator{salmon::math::LOG_1};
 
@@ -571,29 +610,44 @@ void processMiniBatch(AlignmentLibrary<FragT>& alnLib,
 			}
 
 			// Collect the GC-fragment bias samples
-			if (gcBiasCorrect and aln->isPaired()) {
-			  ReadPair* alnp = reinterpret_cast<ReadPair*>(aln);
-			  bam_seq_t* r1 = alnp->read1; 
-			  bam_seq_t* r2 = alnp->read2; 
-			  if (r1 != nullptr and r2 != nullptr) {
-                  bool fwd1{bam_strand(r1) == 0};
-                  bool fwd2{bam_strand(r2) == 0};
-                  int32_t start = alnp->left(); 
-                  int32_t stop = alnp->right(); 
-
-                  if (start >= 0 and stop < transcript.RefLength) {
-		      auto desc = transcript.gcDesc(start, stop);
-                      observedGCMass.inc(desc, aln->logProb);
-                   }
-
-          /*
-			    if (start >= 0 and stop < transcript.RefLength) {
-			      int32_t gcFrac = transcript.gcFrac(start, stop);
-			      // Add this fragment's contribution
-			      observedGCMass[gcFrac] = salmon::math::logAdd(observedGCMass[gcFrac], newMass); 
-			    }
-          */
-			  }
+			if (gcBiasCorrect) {
+                if (aln->isPaired()) {
+                    ReadPair* alnp = reinterpret_cast<ReadPair*>(aln);
+                    bam_seq_t* r1 = alnp->read1; 
+                    bam_seq_t* r2 = alnp->read2; 
+                    if (r1 != nullptr and r2 != nullptr) {
+                        bool fwd1{bam_strand(r1) == 0};
+                        bool fwd2{bam_strand(r2) == 0};
+                        int32_t start = alnp->left(); 
+                        int32_t stop = alnp->right(); 
+
+                        if (start >= 0 and stop < transcript.RefLength) {
+                          bool valid{false};
+                          auto desc = transcript.gcDesc(start, stop, valid);
+                          if (valid) { observedGCMass.inc(desc, aln->logProb); }
+                        }
+                    }
+                } else if (expectedLibraryFormat.type == ReadType::SINGLE_END) {
+                    // Both the expected and observed formats should be single-end here
+                    UnpairedRead* alnp = reinterpret_cast<UnpairedRead*>(aln);
+                    bam_seq_t* r = alnp->read;
+                    if (r != nullptr) {
+                        bool fwd{alnp->fwd()};
+                        // For single-end reads, simply assume that every fragment
+                        // has a length equal to the conditional mean (given the 
+                        // current transcript's length).
+                        auto cmeans = alnLib.condMeans();
+                        auto cmean = static_cast<int32_t>((transcript.RefLength >= cmeans.size()) ? cmeans.back() : cmeans[transcript.RefLength]);
+                        int32_t start = fwd ? alnp->pos() : std::max(0, alnp->pos() - cmean);
+                        int32_t stop = start + cmean;
+                        // WITH CONTEXT
+                        if (start >= 0 and stop < transcript.RefLength) {
+                          bool valid{false};
+                          auto desc = transcript.gcDesc(start, stop, valid);
+                          if(valid) { observedGCMass.inc(desc, aln->logProb); }
+                        }
+                    }
+                }
 			}
 			// END: GC-fragment bias
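
Because a single-end alignment has no mate to delimit the fragment, the block above imputes a fragment whose length is the conditional mean fragment length (given the transcript's length) and anchors the GC window at the read's 5' position, extending downstream for forward-strand reads and upstream for reverse-strand reads. A hedged standalone sketch of that window computation (condMeans below stands in for alnLib.condMeans()):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        // Stand-in for alnLib.condMeans(): mean fragment length conditioned
        // on transcript length, indexed by transcript length.
        std::vector<double> condMeans(2001, 250.0);
        int32_t refLength = 1500;
        int32_t readPos = 1400;    // 5' mapping position of the read
        bool fwd = false;          // read maps to the reverse strand

        auto cmean = static_cast<int32_t>(
            (refLength >= static_cast<int32_t>(condMeans.size()))
                ? condMeans.back() : condMeans[refLength]);

        // Forward reads extend downstream; reverse reads extend upstream.
        int32_t start = fwd ? readPos : std::max(0, readPos - cmean);
        int32_t stop = start + cmean;
        bool inBounds = (start >= 0 and stop < refLength);
        std::printf("window [%d, %d), in bounds: %s\n", start, stop,
                    inBounds ? "yes" : "no");
    }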
 			
@@ -736,10 +790,10 @@ void processMiniBatch(AlignmentLibrary<FragT>& alnLib,
                         fspd.update();
                     }
                 }
-                fragLengthDist.cacheCMF();
                 // NOTE: only one thread should succeed here, and that
                 // thread will set burnedIn to true
                 alnLib.updateTranscriptLengthsAtomic(burnedIn);
+                fragLengthDist.cacheCMF();
             }
 
             if (zeroProbFrags > 0) {
@@ -770,7 +824,7 @@ bool quantifyLibrary(
         size_t numRequiredFragments,
         SalmonOpts& salmonOpts) {
 
-    std::atomic<bool> burnedIn{false};
+  std::atomic<bool> burnedIn{salmonOpts.numBurninFrags == 0};
 
     auto& refs = alnLib.transcripts();
     size_t numTranscripts = refs.size();
@@ -1097,16 +1151,14 @@ bool processSample(AlignmentLibrary<ReadT>& alnLib,
     GZipWriter gzw(outputDirectory, jointLog);
     // Write the main results
     gzw.writeAbundances(sopt, alnLib);
-    // Write meta-information about the run
-    gzw.writeMeta(sopt, alnLib, runStartTime);
 
     if (sopt.numGibbsSamples > 0) {
 
         jointLog->info("Starting Gibbs Sampler");
         CollapsedGibbsSampler sampler;
         // The function we'll use as a callback to write samples
-        std::function<bool(const std::vector<int>&)> bsWriter =
-            [&gzw](const std::vector<int>& alphas) -> bool {
+        std::function<bool(const std::vector<double>&)> bsWriter =
+            [&gzw](const std::vector<double>& alphas) -> bool {
                 return gzw.writeBootstrap(alphas);
             };
 
@@ -1161,6 +1213,10 @@ bool processSample(AlignmentLibrary<ReadT>& alnLib,
         }
     }
 
+    sopt.runStopTime = salmon::utils::getCurrentTimeAsString();
+    // Write meta-information about the run
+    gzw.writeMeta(sopt, alnLib);
+
     return true;
 }
 
@@ -1203,14 +1259,18 @@ int salmonAlignmentQuantify(int argc, char* argv[]) {
                         "the observed frequency of different types of mismatches when computing the likelihood of "
                         "a given alignment.")
     ("output,o", po::value<std::string>()->required(), "Output quantification directory.")
+    ("meta", po::bool_switch(&(sopt.meta))->default_value(false), "If you're using Salmon on a metagenomic dataset, "
+     "consider setting this flag to disable parts of the abundance estimation model that make less sense for metagenomic data.")
     ("geneMap,g", po::value<std::string>(), "File containing a mapping of transcripts to genes.  If this file is provided "
                                         "Salmon will output both quant.sf and quant.genes.sf files, where the latter "
                                         "contains aggregated gene-level abundance estimates.  The transcript to gene mapping "
                                         "should be provided as either a GTF file, or a in a simple tab-delimited format "
                                         "where each line contains the name of a transcript and the gene to which it belongs "
                                         "separated by a tab.  The extension of the file is used to determine how the file "
-                                        "should be parsed.  Files ending in \'.gtf\' or \'.gff\' are assumed to be in GTF "
-                                        "format; files with any other extension are assumed to be in the simple format.");
+                                        "should be parsed.  Files ending in \'.gtf\', \'.gff\' or \'.gff3\' are assumed to be in GTF "
+     "format; files with any other extension are assumed to be in the simple format. In GTF / GFF format, the \"transcript_id\" is assumed to contain the "
+     "transcript identifier and the \"gene_id\" is assumed to contain the corresponding "
+     "gene identifier.");
 
     // no sequence bias for now
     sopt.useMassBanking = false;
@@ -1219,6 +1279,10 @@ int salmonAlignmentQuantify(int argc, char* argv[]) {
 
     po::options_description advanced("\nadvanced options");
     advanced.add_options()
+    ("alternativeInitMode", po::bool_switch(&(sopt.alternativeInitMode))->default_value(false),
+       "[Experimental]: Use an alternative strategy (rather than simple interpolation between) the "
+       "online and uniform abundance estimates to initalize the EM / VBEM algorithm."
+    )
     ("auxDir", po::value<std::string>(&(sopt.auxDir))->default_value("aux_info"), "The sub-directory of the quantification directory where auxiliary information "
      			"e.g. bootstraps, bias parameters, etc. will be written.")
     ("noBiasLengthThreshold", po::bool_switch(&(sopt.noBiasLengthThreshold))->default_value(false), "[experimental] : "
@@ -1226,12 +1290,15 @@ int salmonAlignmentQuantify(int argc, char* argv[]) {
           "how short bias correction can make effective lengths. This can increase the precision "
           "of bias correction, but harm robustness.  The default correction applies a thresholdi.")
     ("fldMax" , po::value<size_t>(&(sopt.fragLenDistMax))->default_value(1000), "The maximum fragment length to consider when building the empirical distribution")
-    ("fldMean", po::value<size_t>(&(sopt.fragLenDistPriorMean))->default_value(200), "The mean used in the fragment length distribution prior")
-    ("fldSD" , po::value<size_t>(&(sopt.fragLenDistPriorSD))->default_value(80), "The standard deviation used in the fragment length distribution prior")
+    ("fldMean", po::value<size_t>(&(sopt.fragLenDistPriorMean))->default_value(250), "The mean used in the fragment length distribution prior")
+    ("fldSD" , po::value<size_t>(&(sopt.fragLenDistPriorSD))->default_value(25), "The standard deviation used in the fragment length distribution prior")
     ("forgettingFactor,f", po::value<double>(&(sopt.forgettingFactor))->default_value(0.65), "The forgetting factor used "
                         "in the online learning schedule.  A smaller value results in quicker learning, but higher variance "
                         "and may be unstable.  A larger value results in slower learning but may be more stable.  Value should "
                         "be in the interval (0.5, 1.0].")
+    ("gencode", po::bool_switch(&(sopt.gencodeRef))->default_value(false), "This flag will expect the input transcript fasta to be "
+         "in GENCODE format, and will split the transcript name at the first \'|\' character.  These reduced names will be used in "
+         "the output and when looking for these transcripts in a gene to transcript GTF.")
     ("gcSizeSamp", po::value<std::uint32_t>(&(sopt.gcSampFactor))->default_value(1), "The value by which to down-sample transcripts when representing the "
          "GC content.  Larger values will reduce memory usage, but may decrease the fidelity of bias modeling results.")
    ("biasSpeedSamp",
@@ -1300,7 +1367,10 @@ int salmonAlignmentQuantify(int argc, char* argv[]) {
     ("numGibbsSamples", po::value<uint32_t>(&(sopt.numGibbsSamples))->default_value(0), "Number of Gibbs sampling rounds to "
      "perform.")
     ("numBootstraps", po::value<uint32_t>(&(sopt.numBootstraps))->default_value(0), "Number of bootstrap samples to generate. Note: "
-      "This is mutually exclusive with Gibbs sampling.");
+      "This is mutually exclusive with Gibbs sampling.")
+    ("thinningFactor", po::value<uint32_t>(&(sopt.thinningFactor))->default_value(16), "Number of steps to discard for every sample "
+       "kept from the Gibbs chain. The larger this number, the less chance that subsequent samples are auto-correlated, "
+       "but the slower sampling becomes."); 
 
     po::options_description testing("\n"
             "testing options");
@@ -1318,7 +1388,7 @@ int salmonAlignmentQuantify(int argc, char* argv[]) {
     po::options_description hidden("\nhidden options");
     hidden.add_options()
       (
-       "numGCBins", po::value<size_t>(&(sopt.numFragGCBins))->default_value(100),
+       "numGCBins", po::value<size_t>(&(sopt.numFragGCBins))->default_value(25),
        "Number of bins to use when modeling fragment GC bias")
       (
        "conditionalGCBins", po::value<size_t>(&(sopt.numConditionalGCBins))->default_value(3),
@@ -1371,6 +1441,12 @@ int salmonAlignmentQuantify(int argc, char* argv[]) {
             std::exit(1);
         }
 
+        // Metagenomic option
+        if (sopt.meta) {
+            sopt.initUniform = true;
+            sopt.noRichEqClasses = true;
+        }
+
         std::stringstream commentStream;
         commentStream << "# salmon (alignment-based) v" << salmon::version << "\n";
         commentStream << "# [ program ] => salmon \n";
@@ -1493,7 +1569,9 @@ int salmonAlignmentQuantify(int argc, char* argv[]) {
         spdlog::set_async_mode(max_q_size);
 
         auto fileSink = std::make_shared<spdlog::sinks::simple_file_sink_mt>(logPath.string(), true);
-        auto consoleSink = std::make_shared<spdlog::sinks::stderr_sink_mt>();
+        auto rawConsoleSink = std::make_shared<spdlog::sinks::stderr_sink_mt>();
+        auto consoleSink =
+          std::make_shared<spdlog::sinks::ansicolor_sink>(rawConsoleSink);
         auto consoleLog = spdlog::create("consoleLog", {consoleSink});
         auto fileLog = spdlog::create("fileLog", {fileSink});
         auto jointLog = spdlog::create("jointLog", {fileSink, consoleSink});
@@ -1510,13 +1588,30 @@ int salmonAlignmentQuantify(int argc, char* argv[]) {
 
 	
 	// Verify that no inconsistent options were provided
-        if (sopt.numGibbsSamples > 0 and sopt.numBootstraps > 0) {
+        bool optionsValidate = salmon::utils::validateOptions(sopt);
+        if (!optionsValidate) {
+            sopt.jointLog->flush();
+            spdlog::drop_all();
+            std::exit(1);
+        }
+
+        if (sopt.numGibbsSamples > 0 and sopt.numBootstraps > 0) {
             jointLog->error("You cannot perform both Gibbs sampling and bootstrapping. "
                             "Please choose one.");
             jointLog->flush();
             std::exit(1);
         }
 
+        if (sopt.numGibbsSamples > 0) {
+          if (sopt.thinningFactor < 1) {
+            jointLog->error("The Gibbs sampling thinning factor (--thinningFactor) "
+                            "cannot be smaller than 1.");
+            jointLog->flush();
+            std::exit(1);
+          }
+        }
+
         if (!sopt.sampleOutput and sopt.sampleUnaligned) {
             fmt::MemoryWriter wstr;
             wstr << "WARNING: you passed in the (-u/--sampleUnaligned) flag, but did not request a sampled "
@@ -1587,10 +1682,10 @@ int salmonAlignmentQuantify(int argc, char* argv[]) {
                 {
 		  // We can only do fragment GC bias correction, for the time being, with paired-end reads
 		  if (sopt.gcBiasCorrect) {
-		    jointLog->warn("Fragment GC bias correction is currently only "
-				   "implemented for paired-end libraries.  Disabling "
-				   "fragment GC bias correction for this run");
-		    sopt.gcBiasCorrect = false;
+            jointLog->warn("Fragment GC bias correction is currently *experimental* "
+                           "in single-end libraries.  Please use this option "
+                           "with caution.");
+		    //sopt.gcBiasCorrect = false;
 		  } 
 
 		    AlignmentLibrary<UnpairedRead> alnLib(alignmentFiles,
diff --git a/src/SalmonUtils.cpp b/src/SalmonUtils.cpp
index cdb9a2d..e7f5980 100644
--- a/src/SalmonUtils.cpp
+++ b/src/SalmonUtils.cpp
@@ -294,6 +294,7 @@ void writeAbundancesFromCollapsed(const SalmonOpts& sopt, ExpLib& alnLib,
     double refLength = sopt.noEffectiveLengthCorrection
                            ? transcript.RefLength
                            : std::exp(transcript.getCachedLogEffectiveLength());
+    if (sopt.noLengthCorrection) { refLength = 100.0; }
     tfracDenom += (transcript.projectedCounts / numMappedFrags) / refLength;
   }
 
@@ -306,10 +307,11 @@ void writeAbundancesFromCollapsed(const SalmonOpts& sopt, ExpLib& alnLib,
     double count = transcript.projectedCounts;
     double npm = (transcript.projectedCounts / numMappedFrags);
     double effLength = std::exp(logLength);
+    if (sopt.noLengthCorrection) { effLength = 100.0; }
     double tfrac = (npm / effLength) / tfracDenom;
     double tpm = tfrac * million;
     fmt::print(output.get(), "{}\t{}\t{}\t{}\t{}\n", transcript.RefName,
-               transcript.RefLength, effLength, tpm, count);
+               transcript.CompleteLength, effLength, tpm, count);
   }
 }
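
Regarding the --noLengthCorrection branches above: because every transcript receives the same nominal effective length (100.0), the length term cancels in the TPM computation, and tfrac_i reduces to c_i / sum_j c_j for any shared constant length. A quick numeric check of that cancellation:

    #include <cstdio>
    #include <vector>

    int main() {
        std::vector<double> counts{100.0, 300.0, 600.0};
        double N = 1000.0;       // total mapped fragments
        double L = 100.0;        // shared nominal effective length

        double denom = 0.0;
        for (double c : counts) { denom += (c / N) / L; }
        for (double c : counts) {
            double tpm = (((c / N) / L) / denom) * 1e6;
            // Equals (c / sum(counts)) * 1e6; L cancels entirely.
            std::printf("count %.0f -> TPM %.1f\n", c, tpm);
        }
    }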
 
@@ -402,6 +404,7 @@ void writeAbundances(const SalmonOpts& sopt, ExpLib& alnLib,
     double logLength = sopt.noEffectiveLengthCorrection
                            ? std::log(transcript.RefLength)
                            : transcript.getCachedLogEffectiveLength();
+    if (sopt.noLengthCorrection) { logLength = 1.0; }
     /*
     if (!sopt.noSeqBiasModel) {
         double avgLogBias = transcript.getAverageSequenceBias(
@@ -749,14 +752,14 @@ extractReadLibraries(boost::program_options::parsed_options& orderedOptions) {
     }
     libs.push_back(lib);
   }
-  
+
   auto log = spdlog::get("jointLog");
   size_t numLibs = libs.size();
   if (numLibs == 1) {
       log->info("There is 1 library.");
   } else if (numLibs > 1) {
       log->info("There are {} libraries.", numLibs);
-  } 
+  }
   return libs;
 }
 
@@ -855,7 +858,7 @@ LibraryFormat parseLibraryFormatString(std::string& fmt) {
   bool peekBAMIsPaired(const boost::filesystem::path& file) {
     namespace bfs = boost::filesystem;
     std::string readMode = "r";
-    
+
     if (bfs::is_regular_file(file)) {
       if (bfs::is_empty(file)) {
 	fmt::MemoryWriter errstr;
@@ -885,7 +888,7 @@ LibraryFormat parseLibraryFormatString(std::string& fmt) {
 
     bool didRead = (scram_get_seq(fp, &read) >= 0);
     bool isPaired{false};
-    
+
     if (didRead) {
       isPaired = bam_flag(read) & BAM_FPAIRED;
     } else {
@@ -900,7 +903,7 @@ LibraryFormat parseLibraryFormatString(std::string& fmt) {
     staden::utils::bam_destroy(read);
     return isPaired;
   }
-  
+
 uint64_t encode(uint64_t tid, uint64_t offset) {
   uint64_t res = (((tid & 0xFFFFFFFF) << 32) | (offset & 0xFFFFFFFF));
   return res;
@@ -983,6 +986,10 @@ TranscriptGeneMap transcriptGeneMapFromGTF(const std::string& fname,
   using std::string;
   using std::get;
 
+  // Get the logger
+  auto logger = spdlog::get("jointLog");
+
+
   // Use GffReader to read the file
   GffReader reader(const_cast<char*>(fname.c_str()));
   // Remember the optional attributes
@@ -996,7 +1003,7 @@ TranscriptGeneMap transcriptGeneMapFromGTF(const std::string& fname,
   };
 
   // The user can group transcripts by gene_id, gene_name, or
-  // an optinal attribute that they provide as a string.
+  // an optional attribute that they provide as a string.
   enum class TranscriptKey { GENE_ID, GENE_NAME, DYNAMIC };
 
   // Select the proper attribute by which to group
@@ -1016,7 +1023,7 @@ TranscriptGeneMap transcriptGeneMapFromGTF(const std::string& fname,
   for (int i = 0; i < nfeat; ++i) {
     auto f = reader.gflst[i];
     if (f->isTranscript()) {
-      const char* keyStr;
+      const char* keyStr = nullptr;
       switch (tkey) {
       case TranscriptKey::GENE_ID:
         keyStr = f->getGeneID();
@@ -1028,7 +1035,17 @@ TranscriptGeneMap transcriptGeneMapFromGTF(const std::string& fname,
         keyStr = f->getAttr(key.c_str());
         break;
       }
-      feats.emplace_back(f->getID(), keyStr);
+      if (keyStr != nullptr and f->hasGffID()) {
+        feats.emplace_back(f->getID(), keyStr);
+      } else {
+        if (!f->hasGffID()){
+          logger->warn("Feature has no GFF ID");
+        }
+        if (keyStr == nullptr) {
+          const char* fid = f->hasGffID() ? f->getID() : "NO_GFF_ID";
+          logger->warn("Could not find key for feature {}", fid);
+        }
+      }
     }
   }
 
@@ -1263,6 +1280,54 @@ std::vector<std::string> split(const std::string& str,
   return result;
 }
 
+std::string getCurrentTimeAsString() {
+    // Get the time at the start of the run
+    std::time_t result = std::time(NULL);
+    auto time = std::string(std::asctime(std::localtime(&result)));
+    time.pop_back(); // remove the newline
+    return time;
+}
+
+/**
+ * Validate the options regardless of the mode (quasi or alignment). 
+ * Assumes a logger already exists.
+ **/
+bool validateOptions(SalmonOpts& sopt) {
+
+  // The growing list of thou shalt nots
+
+  /**
+  Since bias correction is dependent on
+  modifying effective lengths, we can not
+  allow it if we are not employing any length
+  correction.
+  **/
+
+  /** Warnings, not errors **/
+  if (sopt.numBurninFrags < sopt.numPreBurninFrags) {
+    sopt.jointLog->warn("You set the number of burnin fragments (--numAuxModelSamples) to be less than the number of \n"
+                   "pre-burnin fragments (--numPreAuxModelSamples), but it must be at least as large.  The \n"
+                   "number of pre-burnin fragments and burnin fragments is being set to the same value "
+                   "({})", sopt.numBurninFrags);
+    sopt.numPreBurninFrags = sopt.numBurninFrags;
+  }
+
+  /** Errors **/
+  if (sopt.noLengthCorrection) {
+    bool anyBiasCorrect =
+      sopt.gcBiasCorrect or sopt.biasCorrect or sopt.posBiasCorrect;
+    if (anyBiasCorrect) {
+      sopt.jointLog->critical("Since bias correction relies on modifying "
+                              "effective lengths, you cannot enable bias "
+                              "correction simultaneously with the --noLengthCorrection "
+                              "option.");
+      return false;
+    }
+  }
+
+  return true;
+}
+
 /**
  * Validate the options for quasi-mapping-based salmon, and create the necessary
  *output directories and
@@ -1280,9 +1345,7 @@ bool processQuantOptions(SalmonOpts& sopt,
   sopt.numBiasSamples.store(numBiasSamples);
 
   // Get the time at the start of the run
-  std::time_t result = std::time(NULL);
-  sopt.runStartTime = std::string(std::asctime(std::localtime(&result)));
-  sopt.runStartTime.pop_back(); // remove the newline
+  sopt.runStartTime = getCurrentTimeAsString();
 
   // Verify the geneMap before we start doing any real work.
   bfs::path geneMapPath;
@@ -1325,25 +1388,31 @@ bool processQuantOptions(SalmonOpts& sopt,
     std::cerr << "Logs will be written to " << logDirectory.string() << "\n";
   }
 
-  // Determine what we'll do with quasi-mapping results 
+  // Metagenomic option
+  if (sopt.meta) {
+      sopt.initUniform = true;
+      sopt.noRichEqClasses = true;
+  }
+
+  // Determine what we'll do with quasi-mapping results
   bool writeQuasimappings = (sopt.qmFileName != "");
 
   bfs::path logPath = logDirectory / "salmon_quant.log";
   // must be a power-of-two
   size_t max_q_size = 2097152;
-                      
-  // make it larger if we're writing mappings or 
+
+  // make it larger if we're writing mappings or
   // unmapped names.
   std::streambuf* qmBuf;
-  if (writeQuasimappings or sopt.writeUnmappedNames) {
+  if (writeQuasimappings or sopt.writeUnmappedNames or sopt.writeOrphanLinks) {
       max_q_size = 16777216;
-  }  
+  }
 
   spdlog::set_async_mode(max_q_size);
 
   auto fileSink = std::make_shared<spdlog::sinks::simple_file_sink_mt>(
       logPath.string());
-  auto rawConsoleSink = std::make_shared<spdlog::sinks::stdout_sink_mt>();
+  auto rawConsoleSink = std::make_shared<spdlog::sinks::stderr_sink_mt>();
   auto consoleSink =
       std::make_shared<spdlog::sinks::ansicolor_sink>(rawConsoleSink);
   auto consoleLog = spdlog::create("stderrLog", {consoleSink});
@@ -1381,6 +1450,7 @@ bool processQuantOptions(SalmonOpts& sopt,
       spdlog::register_logger(outLog);
       outLog->set_pattern("%v");
       sopt.unmappedFile.reset(outFile);
+      sopt.unmappedLog = outLog;
     } else {
       jointLog->error("Couldn't create auxiliary directory in which to place "
                       "\"unmapped_names.txt\"");
@@ -1388,18 +1458,51 @@ bool processQuantOptions(SalmonOpts& sopt,
     }
   }
 
+  // Create the file (and logger) for outputting orphan links, if the user has
+  // asked for it.
+  if (sopt.writeOrphanLinks) {
+    boost::filesystem::path auxDir = sopt.outputDirectory / sopt.auxDir;
+    bool auxSuccess = boost::filesystem::is_directory(auxDir);
+    if (!auxSuccess) {
+      auxSuccess = boost::filesystem::create_directories(auxDir);
+    }
+    if (auxSuccess) {
+      bfs::path orphanLinkFile = auxDir / "orphan_links.txt";
+      std::ofstream* outFile = new std::ofstream(orphanLinkFile.string());
+
+      // Must be a power of 2
+      //size_t queueSize{268435456};
+      //spdlog::set_async_mode(queueSize);
+      auto outputSink =
+          std::make_shared<spdlog::sinks::ostream_sink_mt>(*outFile);
+
+      std::shared_ptr<spdlog::logger> outLog =
+          std::make_shared<spdlog::logger>("orphanLinkLog", outputSink);
+      spdlog::register_logger(outLog);
+      outLog->set_pattern("%v");
+      sopt.orphanLinkFile.reset(outFile);
+      sopt.orphanLinkLog = outLog;
+    } else {
+      jointLog->error("Couldn't create auxiliary directory in which to place "
+                      "\"orphan_links.txt\"");
+      return false;
+    }
+  }
+
   if (writeQuasimappings) {
       // output to stdout
       if (sopt.qmFileName == "-") {
           qmBuf = std::cout.rdbuf();
       } else { // output to the requested path, making the directory if it doesn't exist
-          // get the parent directory
-          bfs::path qmDir = boost::filesystem::path(sopt.qmFileName).parent_path();
+          // get the absolute file path
+          sopt.qmFileName = boost::filesystem::absolute(sopt.qmFileName).string();
+          // get the parent directory
+          bfs::path qmDir = boost::filesystem::path(sopt.qmFileName).parent_path();
           // if it's not already a directory that exists
           bool qmDirSuccess = boost::filesystem::is_directory(qmDir);
           // try to create it
           if (!qmDirSuccess) {
-              qmDirSuccess = boost::filesystem::create_directories(qmDir); 
+              qmDirSuccess = boost::filesystem::create_directories(qmDir);
           }
           // if the directory already existed, or we created it successfully, open the file
           if (qmDirSuccess) {
@@ -1415,11 +1518,11 @@ bool processQuantOptions(SalmonOpts& sopt,
       // Now set the output stream to the buffer, which is
       // either std::cout, or a file.
       sopt.qmStream.reset(new std::ostream(qmBuf));
-      
+
       auto outputSink = std::make_shared<spdlog::sinks::ostream_sink_mt>(*(sopt.qmStream.get()));
       sopt.qmLog = std::make_shared<spdlog::logger>("qmStream", outputSink);
       sopt.qmLog->set_pattern("%v");
-  } 
+  }
 
   // Verify that no inconsistent options were provided
   if (sopt.numGibbsSamples > 0 and sopt.numBootstraps > 0) {
@@ -1428,6 +1531,14 @@ bool processQuantOptions(SalmonOpts& sopt,
     jointLog->flush();
     return false;
   }
+  if (sopt.numGibbsSamples > 0) {
+    if (sopt.thinningFactor < 1) {
+      jointLog->error("The Gibbs sampling thinning factor (--thinningFactor) "
+                      "cannot be smaller than 1.");
+      jointLog->flush();
+      return false;
+    }
+  }
 
   {
     if (sopt.noFragLengthDist and !sopt.noEffectiveLengthCorrection) {
@@ -1448,7 +1559,7 @@ bool processQuantOptions(SalmonOpts& sopt,
     jointLog->flush();
     return false;
   }
-  
+
   // maybe arbitrary, but if it's smaller than this, consider it
   // equal to LOG_0.
   if (sopt.incompatPrior < 1e-320 or sopt.incompatPrior == 0.0) {
@@ -1460,6 +1571,11 @@ bool processQuantOptions(SalmonOpts& sopt,
       sopt.ignoreIncompat = false;
   }
 
+  // Dumping equivalence class weights implies dumping equivalence classes
+  if (sopt.dumpEqWeights and !sopt.dumpEq) {
+    sopt.dumpEq = true;
+    jointLog->info("You specified --dumpEqWeights, which implies --dumpEq; that option has been enabled.");
+  }
   return true;
 }
 
@@ -1478,10 +1594,12 @@ bool processQuantOptions(SalmonOpts& sopt,
 template <typename AbundanceVecT, typename ReadExpT>
 Eigen::VectorXd updateEffectiveLengths(SalmonOpts& sopt, ReadExpT& readExp,
                                        Eigen::VectorXd& effLensIn,
-                                       AbundanceVecT& alphas, bool writeBias) {
+                                       AbundanceVecT& alphas, std::vector<bool>& available, bool writeBias) {
 
   using std::vector;
   using BlockedIndexRange = tbb::blocked_range<size_t>;
+  using salmon::math::EPSILON;
+  using salmon::math::LOG_EPSILON;
 
   double minAlpha = 1e-8;
   double minCDFMass = 1e-10;
@@ -1623,7 +1741,70 @@ Eigen::VectorXd updateEffectiveLengths(SalmonOpts& sopt, ReadExpT& readExp,
   int outsideContext{3};
   int insideContext{2};
 
+  /**
+   * New context counting
+   */
+  
   int contextSize = outsideContext + insideContext;
+  //double cscale = 100.0 / (2 * contextSize);
+  auto populateContextCounts = [outsideContext, insideContext, contextSize](
+      const Transcript& txp, const char* tseq, Eigen::VectorXd& contextCountsFP,
+      Eigen::VectorXd& contextCountsTP,
+      Eigen::VectorXd& windowLensFP,
+      Eigen::VectorXd& windowLensTP) {
+    auto refLen = static_cast<int32_t>(txp.RefLength);
+    auto lastPos = refLen - 1;
+    if (refLen > contextSize) {
+      // window starts like this
+      // -3 === -2 === -1 === 0 === 1
+      //         3'           5'
+      // and then shifts to the right one base at a time.
+      int windowEnd = insideContext - 1;
+      int windowStart = -outsideContext;
+      int fp = 0;
+      int tp = windowStart + (insideContext - 1);
+      double count = txp.gcAt(windowEnd - 1);
+      for (; tp < refLen; ++fp, ++tp) {
+        if (windowStart > 0) {
+          switch (tseq[windowStart-1]) {
+          case 'G':
+          case 'g':
+          case 'C':
+          case 'c':
+          count -= 1;
+          }
+        }
+        if (windowEnd < refLen) {
+          switch (tseq[windowEnd]) {
+          case 'G':
+          case 'g':
+          case 'C':
+          case 'c':
+            count += 1;
+          }
+        }
+        double actualWindowLength = (windowEnd < contextSize) ? windowEnd + 1 : (windowEnd - windowStart + 1);
+        if (fp < refLen) {
+          contextCountsFP[fp] = count;
+          windowLensFP[fp] = actualWindowLength;
+        }
+        if (tp >=0 ) {
+          contextCountsTP[tp] = count;
+          windowLensTP[tp] = actualWindowLength;
+        }
+        // Shift the end of the window right 1 base
+        if (windowEnd < refLen - 1) { ++windowEnd; }
+        ++windowStart;
+      }
+    }
+  };
+  
+
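
The new populateContextCounts above replaces per-window recounting with an incremental slide: moving the window one base to the right drops at most one base on the left (count -= 1 if it was G/C) and admits one on the right (count += 1 if it is G/C), so a whole transcript is processed in O(refLen); the separately tracked window lengths let truncated windows at the transcript ends be normalized by their actual size rather than by the old fixed cscale. A minimal standalone version of the same sliding-window idiom:

    #include <cstdio>
    #include <cstring>
    #include <vector>

    static bool isGC(char c) { return c == 'G' or c == 'g' or c == 'C' or c == 'c'; }

    int main() {
        const char* seq = "ACGTGGCCAT";
        int n = static_cast<int>(std::strlen(seq));
        int w = 5; // window size (outsideContext + insideContext in the diff)

        std::vector<int> gcCount(n, 0);
        int count = 0;
        for (int i = 0; i < n; ++i) {
            if (isGC(seq[i])) { ++count; }                 // base enters on the right
            if (i >= w and isGC(seq[i - w])) { --count; }  // base leaves on the left
            gcCount[i] = count; // GC count of the window ending at position i
            std::printf("pos %d: GC=%d (window len %d)\n", i, count,
                        (i + 1 < w) ? i + 1 : w);
        }
    }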
+  /**
+   * orig context counting
+   **/
+  /*
+int contextSize = outsideContext + insideContext;
   double cscale = 100.0 / (2 * contextSize);
   auto populateContextCounts = [outsideContext, insideContext, contextSize](
       const Transcript& txp, const char* tseq, Eigen::VectorXd& contextCountsFP,
@@ -1666,6 +1847,8 @@ Eigen::VectorXd updateEffectiveLengths(SalmonOpts& sopt, ReadExpT& readExp,
       }
     }
   };
+  */
+
 
   /**
    * The local bias terms from each thread can be combined
@@ -1731,8 +1914,12 @@ Eigen::VectorXd updateEffectiveLengths(SalmonOpts& sopt, ReadExpT& readExp,
 
           Eigen::VectorXd contextCountsFP(refLen);
           Eigen::VectorXd contextCountsTP(refLen);
-          contextCountsFP.setOnes();
-          contextCountsTP.setOnes();
+          Eigen::VectorXd windowLensFP(refLen);
+          Eigen::VectorXd windowLensTP(refLen);
+          contextCountsFP.setZero();
+          contextCountsTP.setZero();
+          windowLensFP.setZero();
+          windowLensTP.setZero();
 
           // This transcript's sequence
           const char* tseq = txp.Sequence();
@@ -1746,7 +1933,9 @@ Eigen::VectorXd updateEffectiveLengths(SalmonOpts& sopt, ReadExpT& readExp,
           int32_t contextLength{expectSeqFW.getContextLength()};
 
           if (gcBiasCorrect and seqBiasCorrect) {
-            populateContextCounts(txp, tseq, contextCountsFP, contextCountsTP);
+            populateContextCounts(txp, tseq,
+                                  contextCountsFP, contextCountsTP,
+                                  windowLensFP, windowLensTP);
           }
 
           // The smallest and largest values of fragment
@@ -1789,9 +1978,17 @@ Eigen::VectorXd updateEffectiveLengths(SalmonOpts& sopt, ReadExpT& readExp,
                 if (fragEnd < refLen) {
                   // The GC fraction for this putative fragment
                   auto gcFrac = txp.gcFrac(fragStart, fragEnd);
+                  /*
                   int32_t contextFrac = std::lrint(
-                      (contextCountsFP[fragStart] + contextCountsTP[fragEnd]) *
-                      cscale);
+                                                   (contextCountsFP[fragStart] + contextCountsTP[fragEnd]) *
+                                                   cscale);
+                  */ 
+                  double contextLength = (windowLensFP[fragStart] + windowLensTP[fragEnd]);
+                  int32_t contextFrac = (contextLength > 0) ?
+                    (std::lrint(100.0 *
+                               (contextCountsFP[fragStart] + contextCountsTP[fragEnd]) / contextLength)) :
+                    0;
+                  
                   GCDesc desc{gcFrac, contextFrac};
                   expectGC.inc(desc,
                                weight * (conditionalCDF(fl) - prevFLMass));
@@ -1808,11 +2005,11 @@ Eigen::VectorXd updateEffectiveLengths(SalmonOpts& sopt, ReadExpT& readExp,
               int32_t maxFragLenRC = fragStartPos;
               auto densityFW = conditionalCDF(maxFragLenFW);
               auto densityRC = conditionalCDF(maxFragLenRC);
-              if (weight * densityFW > 1e-8) {
+              if (weight * densityFW > EPSILON) {
                 expectPos5[txp.lengthClassIndex()].addMass(
                     fragStartPos, txp.RefLength, std::log(weight * densityFW));
               }
-              if (weight * densityRC > 1e-8) {
+              if (weight * densityRC > EPSILON) {
                 expectPos3[txp.lengthClassIndex()].addMass(
                     fragStartPos, txp.RefLength, std::log(weight * densityRC));
               }
@@ -1846,8 +2043,10 @@ Eigen::VectorXd updateEffectiveLengths(SalmonOpts& sopt, ReadExpT& readExp,
    */
   SBModel exp5;
   SBModel exp3;
-  std::vector<SimplePosBias> pos5Exp(5);
-  std::vector<SimplePosBias> pos3Exp(5);
+
+  auto& pos5Exp = readExp.posBiasExpected(salmon::utils::Direction::FORWARD);
+  auto& pos3Exp = readExp.posBiasExpected(salmon::utils::Direction::REVERSE_COMPLEMENT);
+
   auto combineBiasParams =
       [seqBiasCorrect, gcBiasCorrect, posBiasCorrect, &pos5Exp, &pos3Exp, &exp5,
        &exp3, &transcriptGCDist](const CombineableBiasParams& p) -> void {
@@ -1932,8 +2131,10 @@ Eigen::VectorXd updateEffectiveLengths(SalmonOpts& sopt, ReadExpT& readExp,
           int32_t locFLDLow = (refLen < cdfMaxArg) ? 1 : fldLow;
           int32_t locFLDHigh = (refLen < cdfMaxArg) ? cdfMaxArg : fldHigh;
 
-          if (alphas[it] >= minAlpha and unprocessedLen > 0 and
-              cdfMaxVal > minCDFMass) {
+          if (alphas[it] >= minAlpha
+              //available[it]
+              and unprocessedLen > 0
+              and cdfMaxVal > minCDFMass) {
 
             Eigen::VectorXd seqFactorsFW(refLen);
             Eigen::VectorXd seqFactorsRC(refLen);
@@ -1942,8 +2143,12 @@ Eigen::VectorXd updateEffectiveLengths(SalmonOpts& sopt, ReadExpT& readExp,
 
             Eigen::VectorXd contextCountsFP(refLen);
             Eigen::VectorXd contextCountsTP(refLen);
-            contextCountsFP.setOnes();
-            contextCountsTP.setOnes();
+            Eigen::VectorXd windowLensFP(refLen);
+            Eigen::VectorXd windowLensTP(refLen);
+            contextCountsFP.setZero();
+            contextCountsTP.setZero();
+            windowLensFP.setZero();
+            windowLensTP.setZero();
 
             std::vector<double> posFactorsFW(refLen, 1.0);
             std::vector<double> posFactorsRC(refLen, 1.0);
@@ -1958,8 +2163,9 @@ Eigen::VectorXd updateEffectiveLengths(SalmonOpts& sopt, ReadExpT& readExp,
             bool done{fl >= maxLen};
 
             if (gcBiasCorrect and seqBiasCorrect) {
-              populateContextCounts(txp, tseq, contextCountsFP,
-                                    contextCountsTP);
+              populateContextCounts(txp, tseq,
+                                    contextCountsFP, contextCountsTP,
+                                    windowLensFP, windowLensTP);
             }
 
             if (posBiasCorrect) {
@@ -2060,10 +2266,18 @@ Eigen::VectorXd updateEffectiveLengths(SalmonOpts& sopt, ReadExpT& readExp,
                       seqFactorsFW[fragStart] * seqFactorsRC[fragEnd];
                   if (gcBiasCorrect) {
                     auto gcFrac = txp.gcFrac(fragStart, fragEnd);
+                    /*
                     int32_t contextFrac =
-                        std::lrint((contextCountsFP[fragStart] +
-                                    contextCountsTP[fragEnd]) *
-                                   cscale);
+                      std::lrint((contextCountsFP[fragStart] +
+                                  contextCountsTP[fragEnd]) *
+                                 cscale);
+                    */ 
+                    double contextLength = (windowLensFP[fragStart] + windowLensTP[fragEnd]);
+                    int32_t contextFrac = (contextLength > 0) ?
+                      (std::lrint(100.0 *
+                                  (contextCountsFP[fragStart] + contextCountsTP[fragEnd]) / contextLength)) :
+                      0;
+                    
                     GCDesc desc{gcFrac, contextFrac};
                     fragFactor *= gcBias.get(desc);
                     /*
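
The replaced computation scaled the summed context counts by a fixed cscale; the new code also tracks the per-position window lengths and reports context GC as a percentage of the bases the two windows actually cover, so windows clipped at transcript ends no longer deflate the fraction. A standalone sketch of the new computation:

    #include <cmath>
    #include <cstdint>

    // GC content of the two context windows flanking a fragment, expressed
    // as an integer percentage of the bases the windows actually cover.
    // Returns 0 when both windows are empty (e.g. at transcript ends).
    inline int32_t contextGCPercent(double gcCountFP, double gcCountTP,
                                    double windowLenFP, double windowLenTP) {
      const double contextLength = windowLenFP + windowLenTP;
      if (contextLength <= 0) { return 0; }
      return static_cast<int32_t>(
          std::lrint(100.0 * (gcCountFP + gcCountTP) / contextLength));
    }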
@@ -2108,10 +2322,10 @@ Eigen::VectorXd updateEffectiveLengths(SalmonOpts& sopt, ReadExpT& readExp,
 
   // Copy over the expected sequence bias models
   if (seqBiasCorrect) {
-    readExp.setReadBiasModelExpected(std::move(exp5), salmon::utils::Direction::FORWARD); 
-    readExp.setReadBiasModelExpected(std::move(exp3), salmon::utils::Direction::REVERSE_COMPLEMENT); 
+    readExp.setReadBiasModelExpected(std::move(exp5), salmon::utils::Direction::FORWARD);
+    readExp.setReadBiasModelExpected(std::move(exp3), salmon::utils::Direction::REVERSE_COMPLEMENT);
   }
-  
+
   sopt.jointLog->info("processed bias for 100.0% of the transcripts");
   return effLensOut;
 }
@@ -2125,6 +2339,8 @@ void aggregateEstimatesToGeneLevel(TranscriptGeneMap& tgm,
   using std::move;
   using std::cerr;
   using std::max;
+ 
+  auto logger = spdlog::get("jointLog");
 
   constexpr double minTPM = std::numeric_limits<double>::denorm_min();
   std::ifstream expFile(inputPath.string());
@@ -2141,9 +2357,6 @@ void aggregateEstimatesToGeneLevel(TranscriptGeneMap& tgm,
 
   bool headerLine{true};
   while (getline(expFile, l)) {
-    if (++ln % 1000 == 0) {
-      cerr << "\r\rParsed " << ln << " expression lines";
-    }
     auto it =
         find_if(l.begin(), l.end(), [](char c) -> bool { return !isspace(c); });
     if (it != l.end()) {
@@ -2154,7 +2367,12 @@ void aggregateEstimatesToGeneLevel(TranscriptGeneMap& tgm,
         if (!headerLine) {
           vector<string> toks = split(l);
           ExpressionRecord er(toks);
-          auto gn = tgm.geneName(er.target);
+          bool foundGene{false};
+          auto gn = tgm.geneName(er.target, foundGene);
+          if (!foundGene) {
+            logger->warn("couldn't find transcript named [{}] in transcript <-> gene map; "
+                         "returning transcript as its own gene", er.target);
+          }
           geneExps[gn].push_back(move(er));
         } else { // treat the header line as a comment
           comments.push_back(l);
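
geneName now takes an out-parameter reporting whether the transcript was found, and unmapped transcripts fall back to aggregating as their own gene instead of failing silently. A hypothetical sketch of such a lookup, assuming a hash map from transcript names to gene names (t2g and geneNameOrSelf are invented names):

    #include <string>
    #include <unordered_map>

    // Look up the gene for a transcript; if the transcript is missing from
    // the map, report that via `found` and return the transcript name
    // itself so it aggregates as its own gene.
    std::string geneNameOrSelf(
        const std::unordered_map<std::string, std::string>& t2g,
        const std::string& transcript, bool& found) {
      auto it = t2g.find(transcript);
      found = (it != t2g.end());
      return found ? it->second : transcript;
    }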
@@ -2163,10 +2381,9 @@ void aggregateEstimatesToGeneLevel(TranscriptGeneMap& tgm,
       }
     }
   }
-  cerr << "\ndone\n";
   expFile.close();
 
-  cerr << "Aggregating expressions to gene level . . .";
+  logger->info("Aggregating expressions to gene level");
   boost::filesystem::path outputFilePath(inputPath);
   outputFilePath.replace_extension(".genes.sf");
   ofstream outFile(outputFilePath.string());
@@ -2225,20 +2442,21 @@ void aggregateEstimatesToGeneLevel(TranscriptGeneMap& tgm,
   }
 
   outFile.close();
-  cerr << " done\n";
+  logger->info("done");
   //====================== From GeneSum =====================
 }
 
 void generateGeneLevelEstimates(boost::filesystem::path& geneMapPath,
                                 boost::filesystem::path& estDir) {
   namespace bfs = boost::filesystem;
-  std::cerr << "Computing gene-level abundance estimates\n";
-  bfs::path gtfExtension(".gtf");
+  auto logger = spdlog::get("jointLog");
+  logger->info("Computing gene-level abundance estimates");
+  std::set<std::string> validGTFExtensions = {".gtf", ".gff", ".gff3", ".GTF", ".GFF", ".GFF3"};
   auto extension = geneMapPath.extension();
 
   TranscriptGeneMap tranGeneMap;
   // parse the map as a GTF file
-  if (extension == gtfExtension) {
+  if (validGTFExtensions.find(extension.string()) != validGTFExtensions.end()) {
     // Using libgff
     tranGeneMap = salmon::utils::transcriptGeneMapFromGTF(geneMapPath.string(),
                                                           "gene_id");
@@ -2248,9 +2466,8 @@ void generateGeneLevelEstimates(boost::filesystem::path& geneMapPath,
     tgfile.close();
   }
 
-  std::cerr << "There were " << tranGeneMap.numTranscripts()
-            << " transcripts mapping to " << tranGeneMap.numGenes()
-            << " genes\n";
+  logger->info("There were {} transcripts mapping to {} genes",
+               tranGeneMap.numTranscripts(), tranGeneMap.numGenes());
 
   bfs::path estFilePath = estDir / "quant.sf";
   if (!bfs::exists(estFilePath)) {
@@ -2365,38 +2582,38 @@ template Eigen::VectorXd
 salmon::utils::updateEffectiveLengths<std::vector<tbb::atomic<double>>,
                                       ReadExperiment>(
     SalmonOpts& sopt, ReadExperiment& readExp, Eigen::VectorXd& effLensIn,
-    std::vector<tbb::atomic<double>>& alphas, bool finalRound);
+    std::vector<tbb::atomic<double>>& alphas, std::vector<bool>& available, bool finalRound);
 
 template Eigen::VectorXd
 salmon::utils::updateEffectiveLengths<std::vector<double>, ReadExperiment>(
     SalmonOpts& sopt, ReadExperiment& readExp, Eigen::VectorXd& effLensIn,
-    std::vector<double>& alphas, bool finalRound);
+    std::vector<double>& alphas, std::vector<bool>& available, bool finalRound);
 
 template Eigen::VectorXd
 salmon::utils::updateEffectiveLengths<std::vector<tbb::atomic<double>>,
                                       AlignmentLibrary<ReadPair>>(
     SalmonOpts& sopt, AlignmentLibrary<ReadPair>& readExp,
     Eigen::VectorXd& effLensIn, std::vector<tbb::atomic<double>>& alphas,
-    bool finalRound);
+    std::vector<bool>& available, bool finalRound);
 
 template Eigen::VectorXd
 salmon::utils::updateEffectiveLengths<std::vector<double>,
                                       AlignmentLibrary<ReadPair>>(
     SalmonOpts& sopt, AlignmentLibrary<ReadPair>& readExp,
-    Eigen::VectorXd& effLensIn, std::vector<double>& alphas, bool finalRound);
+    Eigen::VectorXd& effLensIn, std::vector<double>& alphas, std::vector<bool>& available, bool finalRound);
 
 template Eigen::VectorXd
 salmon::utils::updateEffectiveLengths<std::vector<tbb::atomic<double>>,
                                       AlignmentLibrary<UnpairedRead>>(
     SalmonOpts& sopt, AlignmentLibrary<UnpairedRead>& readExp,
     Eigen::VectorXd& effLensIn, std::vector<tbb::atomic<double>>& alphas,
-    bool finalRound);
+    std::vector<bool>& available, bool finalRound);
 
 template Eigen::VectorXd
 salmon::utils::updateEffectiveLengths<std::vector<double>,
                                       AlignmentLibrary<UnpairedRead>>(
     SalmonOpts& sopt, AlignmentLibrary<UnpairedRead>& readExp,
-    Eigen::VectorXd& effLensIn, std::vector<double>& alphas, bool finalRound);
+    Eigen::VectorXd& effLensIn, std::vector<double>& alphas, std::vector<bool>& available, bool finalRound);
 
 //// 0th order model --- code for computing bias factors.
 
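Every explicit instantiation of updateEffectiveLengths above now threads the new std::vector<bool>& available parameter through, since the template is defined in this translation unit and callers elsewhere link against these instances. A hypothetical call site (a fragment, not a full program; numTxps, sopt, readExp and effLensIn are assumed to be in scope):

    // Hypothetical call site with the added `available` mask.
    std::vector<double> alphas(numTxps, 0.0);
    std::vector<bool> available(numTxps, true);
    Eigen::VectorXd effLens = salmon::utils::updateEffectiveLengths(
        sopt, readExp, effLensIn, alphas, available, /*finalRound=*/true);
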
diff --git a/src/SimplePosBias.cpp b/src/SimplePosBias.cpp
index 77d822d..563abbc 100644
--- a/src/SimplePosBias.cpp
+++ b/src/SimplePosBias.cpp
@@ -73,6 +73,24 @@ void SimplePosBias::finalize() {
     splineBins[i+1] = positionBins_[i] - 0.01;
   }
   splineBins.back() = 1.0;
-
   s_.set_points(splineBins, splineMass);
+  isLogged_ = false;
+  isFinalized_ = true;
+}
+
+// Serialize this model.
+bool SimplePosBias::writeBinary(boost::iostreams::filtering_ostream& out) const {
+    auto* mutThis = const_cast<SimplePosBias*>(this);
+    // We shouldn't write out a non-finalized model
+    if (!mutThis->isFinalized_) {
+      auto l = spdlog::get("jointLog");
+      l->error("Attempting to write out a non-finalized positional bias model. "
+	       "This should not happen.  Please report this bug on GitHub.");
+      return false;
+    }
+
+    uint32_t modelLen = static_cast<uint32_t>(masses_.size());
+    out.write(reinterpret_cast<char*>(&modelLen), sizeof(modelLen));
+    out.write(reinterpret_cast<char*>(const_cast<decltype(masses_)::value_type*>(masses_.data())), sizeof(masses_.front()) * modelLen);
+    return true;
 }
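
writeBinary emits a uint32_t length prefix followed by the raw mass array. A hypothetical matching reader (not part of this commit; it assumes the masses are stored as doubles, as sizeof(masses_.front()) on the write side suggests):

    #include <cstdint>
    #include <vector>
    #include <boost/iostreams/filtering_stream.hpp>

    // Read back a model written by SimplePosBias::writeBinary: a uint32_t
    // element count, then that many raw mass values.
    bool readPosBiasModel(boost::iostreams::filtering_istream& in,
                          std::vector<double>& masses) {
      uint32_t modelLen{0};
      in.read(reinterpret_cast<char*>(&modelLen), sizeof(modelLen));
      if (!in) { return false; }
      masses.resize(modelLen);
      in.read(reinterpret_cast<char*>(masses.data()),
              sizeof(double) * modelLen);
      return static_cast<bool>(in);
    }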
diff --git a/tests/test_quant.nf b/tests/test_quant.nf
new file mode 100644
index 0000000..5066d8a
--- /dev/null
+++ b/tests/test_quant.nf
@@ -0,0 +1,71 @@
+#!/usr/bin/env nextflow
+
+params.salmon = '/drone/src/github.com/COMBINE-lab/salmon/bin/salmon'
+ref = file('/mnt/data/simulated/sim/Homo_sapiens.GRCh37.75.cdna.pc.fa')
+truthpath = file('/mnt/data/simulated/sim/truth')
+basepath = Channel.from('/mnt/data/simulated/sim/out/out')
+scriptdir = file('/drone/src/github.com/COMBINE-lab/salmon/scripts')
+resdir = './'
+conds = ['A', 'B']
+samples = [1, 2]
+
+process buildIndex {
+        cpus 2
+        input:
+        file ref
+
+        output:
+        file nfindex into index
+
+        """
+        ${params.salmon} index -t $ref -i nfindex
+        """
+}
+
+process quantSim {
+  cpus 16
+
+  input:
+  file index
+  val basepath 
+  each cond from conds
+  each sample from samples
+
+  output:
+  file "sim_quants/${cond}_${sample}" into simqs
+
+	script:
+	if (cond == 'A')
+	    """
+      ${params.salmon} quant -p 16 -i ${index} -l A -1 ${basepath}_${sample}/sample_01_1_shuffled.fa.gz -2 ${basepath}_${sample}/sample_01_2_shuffled.fa.gz -o sim_quants/${cond}_${sample}
+  	    """
+	else
+	    """
+      ${params.salmon} quant -p 16 -i ${index} -l A -1 ${basepath}_${sample}/sample_02_1_shuffled.fa.gz -2 ${basepath}_${sample}/sample_02_2_shuffled.fa.gz -o sim_quants/${cond}_${sample}
+	    """
+}
+
+process evalQuants {
+
+  publishDir "$resdir"
+
+	input:
+  val truthpath
+	file flist from simqs.toList()
+  	
+  output:
+  file sim into simres
+
+	script:
+	"""
+	for f in ${flist}
+	do
+    if [[ \$f == A* ]]
+    then
+      python ${scriptdir}/test_sim_corr.py --sim "\${f}/quant.sf" --est "${truthpath}/truthA.tpm" --out "sim/\${f}_res.json"
+    else
+      python ${scriptdir}/test_sim_corr.py --sim "\${f}/quant.sf" --est "${truthpath}/truthB.tpm" --out "sim/\${f}_res.json"
+    fi
+	done
+	"""
+}
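
Assuming Nextflow is installed and the hard-coded /mnt/data and /drone paths from the CI environment exist, this pipeline would be launched with "nextflow run tests/test_quant.nf"; params.salmon can be overridden on the command line as "--salmon /path/to/salmon".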

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/salmon.git


