[med-svn] [Git][med-team/salmon][upstream] New upstream version 1.10.0+ds1

Andreas Tille (@tille) gitlab at salsa.debian.org
Wed Mar 8 13:23:28 GMT 2023



Andreas Tille pushed to branch upstream at Debian Med / salmon


Commits:
2fde979a by Andreas Tille at 2023-03-08T13:46:24+01:00
New upstream version 1.10.0+ds1
- - - - -


17 changed files:

- CMakeLists.txt
- cmake/Modules/Findlibstadenio.cmake
- cmake/TestSalmonQuasi.cmake
- current_version.txt
- doc/source/conf.py
- docker/Dockerfile
- docker/build_test.sh
- include/AlevinOpts.hpp
- include/SalmonConfig.hpp
- include/SalmonMappingUtils.hpp
- include/Transcript.hpp
- scripts/fetchPufferfish.sh
- scripts/v1_10x/run.sh
- src/Alevin.cpp
- src/AlevinUtils.cpp
- src/CMakeLists.txt
- src/SalmonAlevin.cpp


Changes:

=====================================
CMakeLists.txt
=====================================
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.12)
+cmake_minimum_required(VERSION 3.15)
 
 if(DEFINED ENV{CC})
   set(CC $ENV{CC})
@@ -662,7 +662,7 @@ tar -xzvf v2021.5.tar.gz
 SOURCE_DIR ${TBB_SOURCE_DIR}
 INSTALL_DIR ${TBB_INSTALL_DIR}
 PATCH_COMMAND "${TBB_PATCH_STEP}"
-CMAKE_ARGS -DCMAKE_CXX_FLAGS=${TBB_CXXFLAGS} -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> -DTBB_TEST=OFF -DTBB_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD}
+CMAKE_ARGS -DCMAKE_CXX_FLAGS=${TBB_CXXFLAGS} -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> -DTBB_TEST=OFF -DTBB_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD} -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
 BUILD_IN_SOURCE TRUE
 )
 
@@ -786,7 +786,7 @@ find_package(CURL)
 if (FETCH_STADEN)
   set(LIBSTADEN_FOUND FALSE)
 else ()
-  find_package(libstadenio)
+  find_package(libstadenio 1.14.15)
 endif()
 
 if (NOT LIBSTADENIO_FOUND)
@@ -794,12 +794,12 @@ if (NOT LIBSTADENIO_FOUND)
   message("==================================================================")
   externalproject_add(libstadenio
     DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
-    DOWNLOAD_COMMAND curl -k -L https://github.com/COMBINE-lab/staden-io_lib/archive/v1.14.8.1.tar.gz -o staden-io_lib-v1.14.8.tar.gz &&
-      ${SHASUM} f6f30eefa478cfb708f3109a35fb6ffa0e24951d9d971985df2cef5919dd0bc3 staden-io_lib-v1.14.8.tar.gz &&
-      mkdir -p staden-io_lib-1.14.8 &&
-      tar -xzf staden-io_lib-v1.14.8.tar.gz --strip-components=1 -C staden-io_lib-1.14.8 &&
+    DOWNLOAD_COMMAND curl -k -L https://github.com/jkbonfield/io_lib/releases/download/io_lib-1-14-15/io_lib-1.14.15.tar.gz -o staden-io_lib-v1.14.15.tar.gz &&
+      ${SHASUM} 20814c4365e1e2fe6630fb11d0df370dec4c5688af3871de7f1cb0129671401e staden-io_lib-v1.14.15.tar.gz &&
+      mkdir -p staden-io_lib-1.14.15 &&
+      tar -xzf staden-io_lib-v1.14.15.tar.gz --strip-components=1 -C staden-io_lib-1.14.15 &&
       rm -fr staden-io_lib &&
-      mv -f staden-io_lib-1.14.8 staden-io_lib
+      mv -f staden-io_lib-1.14.15 staden-io_lib
     SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/staden-io_lib
     INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
     CONFIGURE_COMMAND ./configure --enable-shared=no --without-libcurl --prefix=<INSTALL_DIR> LDFLAGS=${LIBSTADEN_LDFLAGS} CFLAGS=${LIBSTADEN_CFLAGS} CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER}
@@ -814,7 +814,7 @@ if (NOT LIBSTADENIO_FOUND)
   endif()
 
   set(FETCHED_STADEN TRUE)
-  set(STADEN_LIBRARIES "${GAT_SOURCE_DIR}/external/install/lib/libstaden-read.a")
+  set(STADEN_LIBRARIES "${GAT_SOURCE_DIR}/external/install/lib/libstaden-read.a;${GAT_SOURCE_DIR}/external/install/lib/libhtscodecs.a")
 endif()
 
 if (ASAN_BUILD)
@@ -906,11 +906,20 @@ set(CPACK_SOURCE_IGNORE_FILES
 
 message("CPACK_SOURCE_IGNORE_FILES = ${CPACK_SOURCE_IGNORE_FILES}")
 
+# we will use this property later
+define_property(TARGET PROPERTY COMPACT_VECTOR_DIR INHERITED 
+   BRIEF_DOCS "the path to the directory containing the compact_vector include tree"
+   FULL_DOCS "the path to the directory containing the compact_vector include tree")
+
 # Recurse into pufferfish source directory 
 # and build the library 
 set(BUILD_PUFF_FOR_SALMON TRUE)
 add_subdirectory(external/pufferfish)
 
+# make sure we know the path to compact_vector
+get_property(COMPACT_VECTOR_INCLUDE_PATH TARGET graphdump PROPERTY COMPACT_VECTOR_DIR)
+message("fetched path for compact_vector as [${COMPACT_VECTOR_INCLUDE_PATH}]")
+
 # and then the main salmon source directory
 add_subdirectory(src)
 


=====================================
cmake/Modules/Findlibstadenio.cmake
=====================================
@@ -10,7 +10,10 @@ find_path(STADEN_INCLUDE_DIR io_lib
 	HINTS ${STADEN_ROOT} ENV STADEN_ROOT
   PATH_SUFFIXES include)
 
-find_library(STADEN_LIBRARY NAMES staden-read libstaden-read
+find_library(STADEN_LIBRARY NAMES staden-read libstaden-read 
+  HINTS ${STADEN_ROOT} ENV STADEN_ROOT PATH_SUFFIXES lib lib64)
+
+find_library(HTSCODEC_LIBRARY NAMES htscodecs libhtscodecs
   HINTS ${STADEN_ROOT} ENV STADEN_ROOT PATH_SUFFIXES lib lib64)
 
 if(STADEN_INCLUDE_DIR)
@@ -30,7 +33,7 @@ find_package_handle_standard_args(libstadenio DEFAULT_MSG
 
 if (LIBSTADENIO_FOUND)
   message(STATUS "Staden IOLib found (include: ${STADEN_INCLUDE_DIR})")
-  set(STADEN_LIBRARIES ${STADEN_LIBRARY})
+  set(STADEN_LIBRARIES "${STADEN_LIBRARY};${HTSCODEC_LIBRARY}")
 endif()
 
 mark_as_advanced(STADEN_INCLUDE_DIR STADEN_LIBRARY)


=====================================
cmake/TestSalmonQuasi.cmake
=====================================
@@ -14,7 +14,10 @@ execute_process(COMMAND ${SALMON_QUASI_INDEX_CMD}
                 )
 
 if (SALMON_QUASI_INDEX_RESULT)
-    message(FATAL_ERROR "Error running ${SALMON_QUASI_INDEX_COMMAND}")
+    message(FATAL_ERROR
+            "While running: ${SALMON_QUASI_INDEX_CMD}\n"
+            "error: ${SALMON_QUASI_INDEX_RESULT}"
+            )
 endif()
 
 set(SALMON_QUANT_COMMAND ${CMAKE_BINARY_DIR}/salmon quant -i sample_salmon_quasi_index -l IU -1 reads_1.fastq -2 reads_2.fastq -o sample_salmon_quasi_quant)
@@ -23,7 +26,10 @@ execute_process(COMMAND ${SALMON_QUANT_COMMAND}
                 RESULT_VARIABLE SALMON_QUASI_QUANT_RESULT
                 )
 if (SALMON_QUASI_QUANT_RESULT)
-    message(FATAL_ERROR "Error running ${SALMON_QUASI_QUANT_RESULT}")
+    message(FATAL_ERROR
+            "While running: ${SALMON_QUANT_COMMAND}\n"
+            "error: ${SALMON_QUASI_QUANT_RESULT}"
+            )
 endif()
 
 if (EXISTS ${TOPLEVEL_DIR}/sample_data/sample_salmon_quasi_quant/quant.sf)


=====================================
current_version.txt
=====================================
@@ -1,3 +1,3 @@
 VERSION_MAJOR 1
-VERSION_MINOR 9
+VERSION_MINOR 10
 VERSION_PATCH 0


=====================================
doc/source/conf.py
=====================================
@@ -55,9 +55,9 @@ copyright = u'2013-2021, Rob Patro, Geet Duggal, Mike Love, Rafael Irizarry and
 # built documents.
 #
 # The short X.Y version.
-version = '1.9'
+version = '1.10'
 # The full version, including alpha/beta/rc tags.
-release = '1.9.0'
+release = '1.10.0'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.


=====================================
docker/Dockerfile
=====================================
@@ -6,7 +6,7 @@ MAINTAINER salmon.maintainer at gmail.com
 
 ENV PACKAGES git gcc make g++ libboost-all-dev liblzma-dev libbz2-dev \
     ca-certificates zlib1g-dev libcurl4-openssl-dev curl unzip autoconf apt-transport-https ca-certificates gnupg software-properties-common wget
-ENV SALMON_VERSION 1.9.0
+ENV SALMON_VERSION 1.10.0
 
 # salmon binary will be installed in /home/salmon/bin/salmon
 


=====================================
docker/build_test.sh
=====================================
@@ -1,3 +1,3 @@
 #! /bin/bash
-SALMON_VERSION=1.9.0
+SALMON_VERSION=1.10.0
 docker build --no-cache -t combinelab/salmon:${SALMON_VERSION} -t combinelab/salmon:latest .


=====================================
include/AlevinOpts.hpp
=====================================
@@ -13,6 +13,10 @@
 
 enum class BarcodeEnd { FIVE = 5, THREE = 3 };
 enum class Sequence { BARCODE, UMI };
+// refers to which reads to use for mapping; use both for 5' libraries
+enum class ReadsToMap { USE_BOTH, USE_FIRST, USE_SECOND };
+// refers to which reads map to transcripts and to which transcripts they map to
+enum class PairingStatus { UNPAIRED_LEFT, UNPAIRED_RIGHT, PAIRED_FR, PAIRED_RF };
 
 template <class protocolT>
 struct AlevinOpts {
@@ -82,11 +86,11 @@ struct AlevinOpts {
   // initialize EM with uniform prior
   bool initUniform;
   //number of cells
-  uint32_t numCells;
+  uint64_t numCells;
   // minimum number of CB to use for low confidence region
-  uint32_t lowRegionMinNumBarcodes;
+  uint64_t lowRegionMinNumBarcodes;
   // maximum number of barcodes to use
-  uint32_t maxNumBarcodes;
+  uint64_t maxNumBarcodes;
   // number of bootstraps to perform
   uint32_t numBootstraps;
   // number of gibbs samples to perform
@@ -115,27 +119,27 @@ struct AlevinOpts {
   boost::filesystem::path bfhFile;
 
   //meta-info related tags
-  uint32_t totalReads;
-  uint32_t totalUsedReads;
-  uint32_t readsThrown;
+  uint64_t totalReads;
+  uint64_t totalUsedReads;
+  uint64_t readsThrown;
 
-  uint32_t totalCBs;
-  uint32_t totalUsedCBs;
+  uint64_t totalCBs;
+  uint64_t totalUsedCBs;
 
-  uint32_t kneeCutoff;
-  uint32_t intelligentCutoff;
-  uint32_t totalLowConfidenceCBs;
-  uint32_t numFeatures;
-  uint32_t numNoMapCB;
+  uint64_t kneeCutoff;
+  uint64_t intelligentCutoff;
+  uint64_t totalLowConfidenceCBs;
+  uint64_t numFeatures;
+  uint64_t numNoMapCB;
 
-  uint32_t eqReads;
-  uint32_t noisyUmis;
+  uint64_t eqReads;
+  uint64_t noisyUmis;
   double mappingRate;
   double keepCBFraction;
   double vbemNorm;
 
-  uint32_t totalDedupUMIs;
-  uint32_t totalExpGenes;
+  uint64_t totalDedupUMIs;
+  uint64_t totalExpGenes;
 };
 
 #endif // ALEVIN_OPTS_HPP


=====================================
include/SalmonConfig.hpp
=====================================
@@ -26,9 +26,9 @@
 
 namespace salmon {
 constexpr char majorVersion[] = "1";
-constexpr char minorVersion[] = "9";
+constexpr char minorVersion[] = "10";
 constexpr char patchVersion[] = "0";
-constexpr char version[] = "1.9.0";
+constexpr char version[] = "1.10.0";
 constexpr uint32_t indexVersion = 5;
 constexpr char requiredQuasiIndexVersion[] = "p7";
 } // namespace salmon


=====================================
include/SalmonMappingUtils.hpp
=====================================
@@ -22,7 +22,6 @@
 #ifndef __SALMON_MAPPING_UTILS__
 #define __SALMON_MAPPING_UTILS__
 
-
 #include <algorithm>
 #include <atomic>
 #include <cassert>
@@ -39,107 +38,122 @@
 // Future C++ convenience classes
 #include "core/range.hpp"
 
-#include "SalmonDefaults.hpp"
-#include "SalmonMath.hpp"
-#include "SalmonUtils.hpp"
-#include "Transcript.hpp"
+#include "AlevinOpts.hpp"
 #include "AlignmentGroup.hpp"
 #include "ProgramOptionsGenerator.hpp"
 #include "ReadExperiment.hpp"
+#include "SalmonDefaults.hpp"
+#include "SalmonMath.hpp"
 #include "SalmonOpts.hpp"
+#include "SalmonUtils.hpp"
+#include "Transcript.hpp"
 
-#include "pufferfish/Util.hpp"
-#include "pufferfish/MemCollector.hpp"
+#include "parallel_hashmap/phmap.h"
 #include "pufferfish/MemChainer.hpp"
-#include "pufferfish/SAMWriter.hpp"
+#include "pufferfish/MemCollector.hpp"
 #include "pufferfish/PuffAligner.hpp"
-#include "pufferfish/ksw2pp/KSW2Aligner.hpp"
-#include "pufferfish/metro/metrohash64.h"
+#include "pufferfish/SAMWriter.hpp"
 #include "pufferfish/SelectiveAlignmentUtils.hpp"
+#include "pufferfish/Util.hpp"
 #include "pufferfish/itlib/small_vector.hpp"
-#include "parallel_hashmap/phmap.h"
+#include "pufferfish/ksw2pp/KSW2Aligner.hpp"
+#include "pufferfish/metro/metrohash64.h"
 
 namespace salmon {
-  namespace mapping_utils {
+namespace mapping_utils {
 
-    using MateStatus = pufferfish::util::MateStatus;
-    constexpr const int32_t invalid_score_ = std::numeric_limits<int32_t>::min();
-    constexpr const int32_t invalid_index_ = std::numeric_limits<int32_t>::min();
-    constexpr const size_t static_vec_size = 32;
+using MateStatus = pufferfish::util::MateStatus;
+constexpr const int32_t invalid_score_ = std::numeric_limits<int32_t>::min();
+constexpr const int32_t invalid_index_ = std::numeric_limits<int32_t>::min();
+constexpr const size_t static_vec_size = 32;
 
-    class MappingScoreInfo {
-    public:
-      MappingScoreInfo()
-          : bestScore(invalid_score_), secondBestScore(invalid_score_),
-            bestDecoyScore(invalid_score_), decoyThresh(1.0), collect_decoy_info_(false) {}
+class MappingScoreInfo {
+public:
+  MappingScoreInfo()
+      : bestScore(invalid_score_), secondBestScore(invalid_score_),
+        bestDecoyScore(invalid_score_), decoyThresh(1.0),
+        collect_decoy_info_(false) {}
 
-      MappingScoreInfo(double decoyThreshIn) : MappingScoreInfo() {
-        decoyThresh = decoyThreshIn;
-      }
+  MappingScoreInfo(double decoyThreshIn) : MappingScoreInfo() {
+    decoyThresh = decoyThreshIn;
+  }
 
-      void collect_decoys(bool do_collect) { collect_decoy_info_ = do_collect; }
-      bool collect_decoys() const { return collect_decoy_info_; }
-
-      // clear everything but the decoy threshold 
-      void clear(size_t num_hits) {
-        bestScore = invalid_score_;
-        secondBestScore = invalid_score_;
-        bestDecoyScore = invalid_score_;
-        scores_.clear(); scores_.resize(num_hits, invalid_score_);
-        bestScorePerTranscript_.clear();
-        perm_.clear();
-        // NOTE: we do _not_ reset decoyThresh here
-        if (collect_decoy_info_) {
-          bool revert_to_static = best_decoy_hits.size() > static_vec_size;
-          best_decoy_hits.clear();
-          if (revert_to_static) { best_decoy_hits.revert_to_static(); }
-        }
-      }
+  void collect_decoys(bool do_collect) { collect_decoy_info_ = do_collect; }
+  bool collect_decoys() const { return collect_decoy_info_; }
 
-      bool haveOnlyDecoyMappings() const {
-        // if the best non-decoy mapping has score less than decoyThresh *
-        // bestDecoyScore and if the bestDecoyScore is a valid value, then we
-        // have no valid non-decoy mappings.
-        return (bestScore < static_cast<int32_t>(decoyThresh * bestDecoyScore)) and
-               (bestDecoyScore > std::numeric_limits<decltype(bestDecoyScore)>::min());
+  // clear everything but the decoy threshold
+  void clear(size_t num_hits) {
+    bestScore = invalid_score_;
+    secondBestScore = invalid_score_;
+    bestDecoyScore = invalid_score_;
+    scores_.clear();
+    scores_.resize(num_hits, invalid_score_);
+    bestScorePerTranscript_.clear();
+    perm_.clear();
+    // NOTE: we do _not_ reset decoyThresh here
+    if (collect_decoy_info_) {
+      bool revert_to_static = best_decoy_hits.size() > static_vec_size;
+      best_decoy_hits.clear();
+      if (revert_to_static) {
+        best_decoy_hits.revert_to_static();
       }
-      
-      inline bool update_decoy_mappings(int32_t hitScore, size_t idx, uint32_t tid) {
-        const bool better_score = hitScore > bestDecoyScore;
-        if (hitScore > bestDecoyScore) {
-          bestDecoyScore = hitScore;
-          if (collect_decoy_info_) { 
-            best_decoy_hits.clear();
-            best_decoy_hits.push_back(std::make_pair(static_cast<int32_t>(idx), static_cast<int32_t>(tid)));
-          }
-        } else if (collect_decoy_info_ and (hitScore == bestDecoyScore)){
-          best_decoy_hits.push_back(std::make_pair(static_cast<int32_t>(idx), static_cast<int32_t>(tid)));
-        }
-        return better_score;
+    }
+  }
+
+  bool haveOnlyDecoyMappings() const {
+    // if the best non-decoy mapping has score less than decoyThresh *
+    // bestDecoyScore and if the bestDecoyScore is a valid value, then we
+    // have no valid non-decoy mappings.
+    return (bestScore < static_cast<int32_t>(decoyThresh * bestDecoyScore)) and
+           (bestDecoyScore >
+            std::numeric_limits<decltype(bestDecoyScore)>::min());
+  }
+
+  inline bool update_decoy_mappings(int32_t hitScore, size_t idx,
+                                    uint32_t tid) {
+    const bool better_score = hitScore > bestDecoyScore;
+    if (hitScore > bestDecoyScore) {
+      bestDecoyScore = hitScore;
+      if (collect_decoy_info_) {
+        best_decoy_hits.clear();
+        best_decoy_hits.push_back(std::make_pair(static_cast<int32_t>(idx),
+                                                 static_cast<int32_t>(tid)));
       }
+    } else if (collect_decoy_info_ and (hitScore == bestDecoyScore)) {
+      best_decoy_hits.push_back(
+          std::make_pair(static_cast<int32_t>(idx), static_cast<int32_t>(tid)));
+    }
+    return better_score;
+  }
 
-      int32_t bestScore;
-      int32_t secondBestScore;
-      int32_t bestDecoyScore;
-      double decoyThresh;
-      itlib::small_vector<std::pair<int32_t, int32_t>> best_decoy_hits;
-      bool collect_decoy_info_;
-      std::vector<int32_t> scores_;
-      phmap::flat_hash_map<uint32_t, std::pair<int32_t, int32_t>> bestScorePerTranscript_;
-      std::vector<std::pair<int32_t, int32_t>> perm_;
-    };
+  int32_t bestScore;
+  int32_t secondBestScore;
+  int32_t bestDecoyScore;
+  double decoyThresh;
+  itlib::small_vector<std::pair<int32_t, int32_t>> best_decoy_hits;
+  bool collect_decoy_info_;
+  std::vector<int32_t> scores_;
+  phmap::flat_hash_map<uint32_t, std::pair<int32_t, int32_t>>
+      bestScorePerTranscript_;
+  std::vector<std::pair<int32_t, int32_t>> perm_;
+};
 
 template <typename IndexT>
-inline bool initMapperSettings(SalmonOpts& salmonOpts, MemCollector<IndexT>& memCollector, ksw2pp::KSW2Aligner& aligner,
-                        pufferfish::util::AlignmentConfig& aconf, pufferfish::util::MappingConstraintPolicy& mpol) {
+inline bool
+initMapperSettings(SalmonOpts& salmonOpts, MemCollector<IndexT>& memCollector,
+                   ksw2pp::KSW2Aligner& aligner,
+                   pufferfish::util::AlignmentConfig& aconf,
+                   pufferfish::util::MappingConstraintPolicy& mpol) {
   memCollector.configureMemClusterer(salmonOpts.maxOccsPerHit);
-  double consensusFraction = (salmonOpts.consensusSlack == 0.0) ? 1.0 : (1.0 - salmonOpts.consensusSlack);
+  double consensusFraction = (salmonOpts.consensusSlack == 0.0)
+                                 ? 1.0
+                                 : (1.0 - salmonOpts.consensusSlack);
   memCollector.setConsensusFraction(consensusFraction);
   memCollector.setHitFilterPolicy(salmonOpts.hitFilterPolicy);
   memCollector.setAltSkip(salmonOpts.mismatchSeedSkip);
   memCollector.setChainSubOptThresh(salmonOpts.pre_merge_chain_sub_thresh);
 
-  //Initialize ksw aligner
+  // Initialize ksw aligner
   ksw2pp::KSW2Config config;
   config.dropoff = -1;
   config.gapo = salmonOpts.gapOpenPenalty;
@@ -166,14 +180,17 @@ inline bool initMapperSettings(SalmonOpts& salmonOpts, MemCollector<IndexT>& mem
   aconf.mimicBT2 = salmonOpts.mimicBT2;
   aconf.mimicBT2Strict = salmonOpts.mimicStrictBT2;
   aconf.allowOverhangSoftclip = salmonOpts.softclipOverhangs;
-  aconf.allowSoftclip  = salmonOpts.softclip; 
+  aconf.allowSoftclip = salmonOpts.softclip;
   aconf.useAlignmentCache = !salmonOpts.disableAlignmentCache;
   aconf.alignmentMode = pufferfish::util::PuffAlignmentMode::SCORE_ONLY;
 
-  // we actually care about the softclips in the cigar string 
-  // if we are writing output and softclipping (or softclipping of overhangs) is enabled
-  if ( (!salmonOpts.qmFileName.empty()) and (salmonOpts.softclip or salmonOpts.softclipOverhangs) ) {
-    aconf.alignmentMode = pufferfish::util::PuffAlignmentMode::APPROXIMATE_CIGAR;
+  // we actually care about the softclips in the cigar string
+  // if we are writing output and softclipping (or softclipping of overhangs) is
+  // enabled
+  if ((!salmonOpts.qmFileName.empty()) and
+      (salmonOpts.softclip or salmonOpts.softclipOverhangs)) {
+    aconf.alignmentMode =
+        pufferfish::util::PuffAlignmentMode::APPROXIMATE_CIGAR;
   }
 
   mpol.noOrphans = !salmonOpts.allowOrphans;
@@ -189,25 +206,26 @@ inline bool initMapperSettings(SalmonOpts& salmonOpts, MemCollector<IndexT>& mem
 
   mpol.setPostMergeChainSubThresh(salmonOpts.post_merge_chain_sub_thresh);
   mpol.setOrphanChainSubThresh(salmonOpts.orphan_chain_sub_thresh);
-  
+
   return true;
 }
 
-
-inline void updateRefMappings(uint32_t tid, int32_t hitScore, bool isCompat, size_t idx,
-                  const std::vector<Transcript>& transcripts,
-                  int32_t invalidScore,
-                  salmon::mapping_utils::MappingScoreInfo& msi) {
+inline void updateRefMappings(uint32_t tid, int32_t hitScore, bool isCompat,
+                              size_t idx,
+                              const std::vector<Transcript>& transcripts,
+                              int32_t invalidScore,
+                              salmon::mapping_utils::MappingScoreInfo& msi) {
   auto& scores = msi.scores_;
   scores[idx] = hitScore;
   auto& t = transcripts[tid];
   bool isDecoy = t.isDecoy();
-  double decoyCutoff = static_cast<int32_t>(msi.decoyThresh * msi.bestDecoyScore);
+  double decoyCutoff =
+      static_cast<int32_t>(msi.decoyThresh * msi.bestDecoyScore);
 
-  //if (hitScore < decoyCutoff or (hitScore == invalidScore)) { }
+  // if (hitScore < decoyCutoff or (hitScore == invalidScore)) { }
 
   if (isDecoy) {
-    // NOTE: decide here if we need to process any of this if the 
+    // NOTE: decide here if we need to process any of this if the
     // current score is < the best (non-decoy) score. I think not.
 
     // if this is a decoy and its score is better than the best decoy score
@@ -215,10 +233,10 @@ inline void updateRefMappings(uint32_t tid, int32_t hitScore, bool isCompat, siz
     (void)did_update;
     return;
   } else if (hitScore < decoyCutoff or (hitScore == invalidScore)) {
-    // if the current score is to a valid target but doesn't 
+    // if the current score is to a valid target but doesn't
     // exceed the necessary decoy threshold, then skip it.
     return;
-  } 
+  }
   // otherwise, we have a "high-scoring" hit to a non-decoy
 
   auto& perm = msi.perm_;
@@ -230,7 +248,8 @@ inline void updateRefMappings(uint32_t tid, int32_t hitScore, bool isCompat, siz
     // this is the current best
     bestScorePerTranscript[tid].first = hitScore;
     bestScorePerTranscript[tid].second = idx;
-  } else if ((hitScore > it->second.first) or (hitScore == it->second.first and isCompat)) {
+  } else if ((hitScore > it->second.first) or
+             (hitScore == it->second.first and isCompat)) {
     // otherwise, if we had an alignment for this transcript and it's
     // better than the current best, then set the best score to this
     // alignment's score, and invalidate the previous alignment
@@ -241,7 +260,7 @@ inline void updateRefMappings(uint32_t tid, int32_t hitScore, bool isCompat, siz
     // otherwise, there is already a better mapping for this transcript.
     scores[idx] = invalidScore;
   }
-  
+
   if (hitScore > msi.bestScore) {
     msi.secondBestScore = msi.bestScore;
     msi.bestScore = hitScore;
@@ -249,44 +268,50 @@ inline void updateRefMappings(uint32_t tid, int32_t hitScore, bool isCompat, siz
   perm.push_back(std::make_pair(idx, tid));
 }
 
-
 inline void filterAndCollectAlignments(
-                                       std::vector<pufferfish::util::JointMems>& jointHits,
-                                       uint32_t readLen,
-                                       uint32_t mateLen,
-                                       bool singleEnd,
-                                       bool tryAlign,
-                                       bool hardFilter,
-                                       double scoreExp,
-                                       double minAlnProb,
-                                       salmon::mapping_utils::MappingScoreInfo& msi,
-                                       std::vector<pufferfish::util::QuasiAlignment>& jointAlignments) {
+    std::vector<pufferfish::util::JointMems>& jointHits, uint32_t readLen,
+    uint32_t mateLen, bool singleEnd, bool tryAlign, bool hardFilter,
+    double scoreExp, double minAlnProb,
+    salmon::mapping_utils::MappingScoreInfo& msi,
+    std::vector<pufferfish::util::QuasiAlignment>& jointAlignments) {
 
   auto invalidScore = std::numeric_limits<decltype(msi.bestDecoyScore)>::min();
-  if (msi.bestDecoyScore == invalidScore) { msi.bestDecoyScore = invalidScore + 1 ;}
+  if (msi.bestDecoyScore == invalidScore) {
+    msi.bestDecoyScore = invalidScore + 1;
+  }
 
-  int32_t decoyThreshold = static_cast<decltype(msi.bestDecoyScore)>(msi.decoyThresh * msi.bestDecoyScore);
+  int32_t decoyThreshold = static_cast<decltype(msi.bestDecoyScore)>(
+      msi.decoyThresh * msi.bestDecoyScore);
 
-  //auto filterScore = (bestDecoyScore < secondBestScore) ? secondBestScore : bestDecoyScore;
+  // auto filterScore = (bestDecoyScore < secondBestScore) ? secondBestScore :
+  // bestDecoyScore;
   auto& scores = msi.scores_;
   auto& perm = msi.perm_;
 
   // throw away any pairs for which we should not produce valid alignments :
   // ======
-  // If we are doing soft-filtering (default), we remove those not exceeding the bestDecoyScore
-  // If we are doing hard-filtering, we remove those less than the bestScore
-  perm.erase(std::remove_if(perm.begin(), perm.end(),
-                            [&scores, hardFilter, &msi, decoyThreshold, invalidScore](const std::pair<int32_t, int32_t>& idxtid) -> bool {
-                              return !hardFilter ?  scores[idxtid.first] < decoyThreshold : scores[idxtid.first] < msi.bestScore;
-                              //return !hardFilter ?  scores[idxtid.first] < filterScore : scores[idxtid.first] < bestScore; 
-                            }), perm.end());
-
-  // Unlike RapMap, pufferfish doesn't guarantee the hits computed above are in order
-  // by transcript, so we find the permutation of indices that puts things in transcript
-  // order.
-  std::sort(perm.begin(), perm.end(), [](const std::pair<int32_t, int32_t>& p1,
-                                         const std::pair<int32_t, int32_t>& p2) {
-                                        return p1.second < p2.second;});
+  // If we are doing soft-filtering (default), we remove those not exceeding the
+  // bestDecoyScore If we are doing hard-filtering, we remove those less than
+  // the bestScore
+  perm.erase(std::remove_if(
+                 perm.begin(), perm.end(),
+                 [&scores, hardFilter, &msi, decoyThreshold, invalidScore](
+                     const std::pair<int32_t, int32_t>& idxtid) -> bool {
+                   return !hardFilter ? scores[idxtid.first] < decoyThreshold
+                                      : scores[idxtid.first] < msi.bestScore;
+                   // return !hardFilter ?  scores[idxtid.first] < filterScore :
+                   // scores[idxtid.first] < bestScore;
+                 }),
+             perm.end());
+
+  // Unlike RapMap, pufferfish doesn't guarantee the hits computed above are in
+  // order by transcript, so we find the permutation of indices that puts things
+  // in transcript order.
+  std::sort(perm.begin(), perm.end(),
+            [](const std::pair<int32_t, int32_t>& p1,
+               const std::pair<int32_t, int32_t>& p2) {
+              return p1.second < p2.second;
+            });
 
   // moving our alinged / score jointMEMs over to QuasiAlignment objects
   double bestScoreD = static_cast<double>(msi.bestScore);
@@ -298,27 +323,33 @@ inline void filterAndCollectAlignments(
     double currScore = scores[ctr];
     double v = bestScoreD - currScore;
     // why -1?
-    double estAlnProb = hardFilter ? -1.0 : std::exp(- scoreExp * v );
+    double estAlnProb = hardFilter ? -1.0 : std::exp(-scoreExp * v);
     // skip any alignment with aln prob < minAlnProb
-    if (!hardFilter and (estAlnProb < minAlnProb)) { continue; }
+    if (!hardFilter and (estAlnProb < minAlnProb)) {
+      continue;
+    }
 
     if (singleEnd or jointHit.isOrphan()) {
       readLen = jointHit.isLeftAvailable() ? readLen : mateLen;
-      jointAlignments.emplace_back(tid,           // reference id
-                                   jointHit.orphanClust()->getTrFirstHitPos(),     // reference pos
-                                   jointHit.orphanClust()->isFw,     // fwd direction
-                                   readLen, // read length
-                                   jointHit.orphanClust()->cigar, // cigar string
-                                   jointHit.fragmentLen,       // fragment length
-                                   false);
-      auto &qaln = jointAlignments.back();
+      jointAlignments.emplace_back(
+          tid,                                        // reference id
+          jointHit.orphanClust()->getTrFirstHitPos(), // reference pos
+          jointHit.orphanClust()->isFw,               // fwd direction
+          readLen,                                    // read length
+          jointHit.orphanClust()->cigar,              // cigar string
+          jointHit.fragmentLen,                       // fragment length
+          false);
+      auto& qaln = jointAlignments.back();
       // NOTE : score should not be filled in from a double
-      qaln.score = !tryAlign ? static_cast<int32_t >(jointHit.orphanClust()->coverage):jointHit.alignmentScore;
+      qaln.score = !tryAlign
+                       ? static_cast<int32_t>(jointHit.orphanClust()->coverage)
+                       : jointHit.alignmentScore;
       qaln.estAlnProb(estAlnProb);
       // NOTE : wth is numHits?
-      qaln.numHits = static_cast<uint32_t >(jointHits.size());//orphanClust()->coverage;
+      qaln.numHits =
+          static_cast<uint32_t>(jointHits.size()); // orphanClust()->coverage;
       qaln.mateStatus = jointHit.mateStatus;
-      if(singleEnd) {
+      if (singleEnd) {
         qaln.mateLen = readLen;
         qaln.mateCigar.clear();
         qaln.matePos = 0;
@@ -327,116 +358,732 @@ inline void filterAndCollectAlignments(
         qaln.mateStatus = MateStatus::SINGLE_END;
       }
     } else {
-      jointAlignments.emplace_back(tid,           // reference id
-                                   jointHit.leftClust->getTrFirstHitPos(),     // reference pos
-                                   jointHit.leftClust->isFw,     // fwd direction
-                                   readLen, // read length
-                                   jointHit.leftClust->cigar, // cigar string
-                                   jointHit.fragmentLen,       // fragment length
-                                   true);         // properly paired
+      jointAlignments.emplace_back(
+          tid,                                    // reference id
+          jointHit.leftClust->getTrFirstHitPos(), // reference pos
+          jointHit.leftClust->isFw,               // fwd direction
+          readLen,                                // read length
+          jointHit.leftClust->cigar,              // cigar string
+          jointHit.fragmentLen,                   // fragment length
+          true);                                  // properly paired
       // Fill in the mate info
-      auto &qaln = jointAlignments.back();
+      auto& qaln = jointAlignments.back();
       qaln.mateLen = mateLen;
       qaln.mateCigar = jointHit.rightClust->cigar;
-      qaln.matePos = static_cast<int32_t >(jointHit.rightClust->getTrFirstHitPos());
+      qaln.matePos =
+          static_cast<int32_t>(jointHit.rightClust->getTrFirstHitPos());
       qaln.mateIsFwd = jointHit.rightClust->isFw;
       qaln.mateStatus = MateStatus::PAIRED_END_PAIRED;
       // NOTE : wth is numHits?
-      qaln.numHits = static_cast<uint32_t >(jointHits.size());
+      qaln.numHits = static_cast<uint32_t>(jointHits.size());
       // NOTE : score should not be filled in from a double
-      qaln.score = !tryAlign ? static_cast<int32_t >(jointHit.leftClust->coverage):jointHit.alignmentScore;
+      qaln.score = !tryAlign
+                       ? static_cast<int32_t>(jointHit.leftClust->coverage)
+                       : jointHit.alignmentScore;
       qaln.estAlnProb(estAlnProb);
-      qaln.mateScore = !tryAlign ? static_cast<int32_t >(jointHit.rightClust->coverage):jointHit.mateAlignmentScore;
+      qaln.mateScore = !tryAlign
+                           ? static_cast<int32_t>(jointHit.rightClust->coverage)
+                           : jointHit.mateAlignmentScore;
     }
   }
   // done moving our alinged / score jointMEMs over to QuasiAlignment objects
 }
 
-
 inline void filterAndCollectAlignmentsDecoy(
-                                       std::vector<pufferfish::util::JointMems>& jointHits,
-                                       uint32_t readLen,
-                                       uint32_t mateLen,
-                                       bool singleEnd,
-                                       bool tryAlign,
-                                       bool hardFilter,
-                                       double scoreExp,
-                                       double minAlnProb,
-                                       salmon::mapping_utils::MappingScoreInfo& msi,
-                                       std::vector<pufferfish::util::QuasiAlignment>& jointAlignments) {
-// NOTE: this function should only be called in the case that we have valid decoy mappings to report.
-// Currently, this happens only when there are *no valid non-decoy* mappings.
-// Further, this function will only add equally *best* decoy mappings to the output jointAlignments object
-// regardless of the the status of hardFilter (i.e. no sub-optimal decoy mappings will be reported).
-(void) hardFilter;
-(void) minAlnProb;
-(void) scoreExp;
-double estAlnProb = 1.0; //std::exp(-scoreExp * 0.0);
-for (auto& idxTxp : msi.best_decoy_hits) {
-  int32_t ctr = idxTxp.first;
-  int32_t tid = idxTxp.second;
-  auto& jointHit = jointHits[ctr];
-
-  if (singleEnd or jointHit.isOrphan()) {
-    readLen = jointHit.isLeftAvailable() ? readLen : mateLen;
-    jointAlignments.emplace_back(
-        tid,                                        // reference id
-        jointHit.orphanClust()->getTrFirstHitPos(), // reference pos
-        jointHit.orphanClust()->isFw,               // fwd direction
-        readLen,                                    // read length
-        jointHit.orphanClust()->cigar,              // cigar string
-        jointHit.fragmentLen,                       // fragment length
-        false);
-    auto& qaln = jointAlignments.back();
-    // NOTE : score should not be filled in from a double
-    qaln.score = !tryAlign
-                     ? static_cast<int32_t>(jointHit.orphanClust()->coverage)
-                     : jointHit.alignmentScore;
-    qaln.estAlnProb(estAlnProb);
-    // NOTE : wth is numHits?
-    qaln.numHits =
-        static_cast<uint32_t>(jointHits.size()); // orphanClust()->coverage;
-    qaln.mateStatus = jointHit.mateStatus;
-    if (singleEnd) {
-      qaln.mateLen = readLen;
-      qaln.mateCigar.clear();
-      qaln.matePos = 0;
-      qaln.mateIsFwd = true;
-      qaln.mateScore = 0;
-      qaln.mateStatus = MateStatus::SINGLE_END;
+    std::vector<pufferfish::util::JointMems>& jointHits, uint32_t readLen,
+    uint32_t mateLen, bool singleEnd, bool tryAlign, bool hardFilter,
+    double scoreExp, double minAlnProb,
+    salmon::mapping_utils::MappingScoreInfo& msi,
+    std::vector<pufferfish::util::QuasiAlignment>& jointAlignments) {
+  // NOTE: this function should only be called in the case that we have valid
+  // decoy mappings to report. Currently, this happens only when there are *no
+  // valid non-decoy* mappings. Further, this function will only add equally
+  // *best* decoy mappings to the output jointAlignments object regardless of
+  // the the status of hardFilter (i.e. no sub-optimal decoy mappings will be
+  // reported).
+  (void)hardFilter;
+  (void)minAlnProb;
+  (void)scoreExp;
+  double estAlnProb = 1.0; // std::exp(-scoreExp * 0.0);
+  for (auto& idxTxp : msi.best_decoy_hits) {
+    int32_t ctr = idxTxp.first;
+    int32_t tid = idxTxp.second;
+    auto& jointHit = jointHits[ctr];
+
+    if (singleEnd or jointHit.isOrphan()) {
+      readLen = jointHit.isLeftAvailable() ? readLen : mateLen;
+      jointAlignments.emplace_back(
+          tid,                                        // reference id
+          jointHit.orphanClust()->getTrFirstHitPos(), // reference pos
+          jointHit.orphanClust()->isFw,               // fwd direction
+          readLen,                                    // read length
+          jointHit.orphanClust()->cigar,              // cigar string
+          jointHit.fragmentLen,                       // fragment length
+          false);
+      auto& qaln = jointAlignments.back();
+      // NOTE : score should not be filled in from a double
+      qaln.score = !tryAlign
+                       ? static_cast<int32_t>(jointHit.orphanClust()->coverage)
+                       : jointHit.alignmentScore;
+      qaln.estAlnProb(estAlnProb);
+      // NOTE : wth is numHits?
+      qaln.numHits =
+          static_cast<uint32_t>(jointHits.size()); // orphanClust()->coverage;
+      qaln.mateStatus = jointHit.mateStatus;
+      if (singleEnd) {
+        qaln.mateLen = readLen;
+        qaln.mateCigar.clear();
+        qaln.matePos = 0;
+        qaln.mateIsFwd = true;
+        qaln.mateScore = 0;
+        qaln.mateStatus = MateStatus::SINGLE_END;
+      }
+    } else {
+      jointAlignments.emplace_back(
+          tid,                                    // reference id
+          jointHit.leftClust->getTrFirstHitPos(), // reference pos
+          jointHit.leftClust->isFw,               // fwd direction
+          readLen,                                // read length
+          jointHit.leftClust->cigar,              // cigar string
+          jointHit.fragmentLen,                   // fragment length
+          true);                                  // properly paired
+      // Fill in the mate info
+      auto& qaln = jointAlignments.back();
+      qaln.mateLen = mateLen;
+      qaln.mateCigar = jointHit.rightClust->cigar;
+      qaln.matePos =
+          static_cast<int32_t>(jointHit.rightClust->getTrFirstHitPos());
+      qaln.mateIsFwd = jointHit.rightClust->isFw;
+      qaln.mateStatus = MateStatus::PAIRED_END_PAIRED;
+      // NOTE : wth is numHits?
+      qaln.numHits = static_cast<uint32_t>(jointHits.size());
+      // NOTE : score should not be filled in from a double
+      qaln.score = !tryAlign
+                       ? static_cast<int32_t>(jointHit.leftClust->coverage)
+                       : jointHit.alignmentScore;
+      qaln.estAlnProb(estAlnProb);
+      qaln.mateScore = !tryAlign
+                           ? static_cast<int32_t>(jointHit.rightClust->coverage)
+                           : jointHit.mateAlignmentScore;
     }
-  } else {
-    jointAlignments.emplace_back(
-        tid,                                    // reference id
-        jointHit.leftClust->getTrFirstHitPos(), // reference pos
-        jointHit.leftClust->isFw,               // fwd direction
-        readLen,                                // read length
-        jointHit.leftClust->cigar,              // cigar string
-        jointHit.fragmentLen,                   // fragment length
-        true);                                  // properly paired
-    // Fill in the mate info
-    auto& qaln = jointAlignments.back();
-    qaln.mateLen = mateLen;
-    qaln.mateCigar = jointHit.rightClust->cigar;
-    qaln.matePos =
-        static_cast<int32_t>(jointHit.rightClust->getTrFirstHitPos());
-    qaln.mateIsFwd = jointHit.rightClust->isFw;
-    qaln.mateStatus = MateStatus::PAIRED_END_PAIRED;
-    // NOTE : wth is numHits?
-    qaln.numHits = static_cast<uint32_t>(jointHits.size());
-    // NOTE : score should not be filled in from a double
-    qaln.score = !tryAlign ? static_cast<int32_t>(jointHit.leftClust->coverage)
-                           : jointHit.alignmentScore;
-    qaln.estAlnProb(estAlnProb);
-    qaln.mateScore = !tryAlign
-                         ? static_cast<int32_t>(jointHit.rightClust->coverage)
-                         : jointHit.mateAlignmentScore;
+  } // end for over best decoy hits
+}
+
+namespace pasc {
+
+constexpr int32_t invalid_frag_len = std::numeric_limits<int32_t>::min();
+constexpr int32_t invalid_mate_pos = std::numeric_limits<int32_t>::min();
+
+/* from pesc
+struct simple_hit {
+    bool is_fw{false};
+    bool mate_is_fw{false};
+    int32_t pos{-1};
+    float score{0.0};
+    uint32_t num_hits{0};
+    uint32_t tid{std::numeric_limits<uint32_t>::max()};
+    int32_t mate_pos{std::numeric_limits<int32_t>::max()};
+    int32_t fragment_length{std::numeric_limits<int32_t>::max()};
+    inline bool valid_pos(int32_t read_len, uint32_t txp_len, int32_t max_over)
+{ int32_t signed_txp_len = static_cast<int32_t>(txp_len); return (pos >
+-max_over) and ((pos + read_len) < (signed_txp_len + max_over));
+    }
+    inline bool has_mate() const { return mate_pos != invalid_mate_pos; }
+    inline bool mate_is_mapped() const { return mate_pos != invalid_mate_pos; }
+    inline int32_t frag_len() const {
+        return (fragment_length != invalid_frag_len) ? fragment_length : 0;
+    }
+};
+*/
+struct simple_hit {
+  bool is_fw{false};
+  int32_t pos{-1};
+  float score{0.0};
+  uint32_t num_hits{0};
+  uint32_t tid{std::numeric_limits<uint32_t>::max()};
+  // int32_t mate_pos{std::numeric_limits<int32_t>::max()};
+  // int32_t fragment_length{std::numeric_limits<int32_t>::max()};
+
+  // UNPAIRED_LEFT, UNPAIRED_RIGHT, PAIRED_FR, PAIRED_RF
+  PairingStatus pairing_status{PairingStatus::UNPAIRED_RIGHT};
+
+  bool valid_pos(int32_t read_len, uint32_t txp_len, int32_t max_over) {
+    int32_t signed_txp_len = static_cast<int32_t>(txp_len);
+    return (pos > -max_over) and
+           ((pos + read_len) < (signed_txp_len + max_over));
+  }
+
+  bool operator<(const simple_hit& hit2) {
+    if (tid != hit2.tid) { // hit1 and hit2 are on different transcripts
+      return tid < hit2.tid;
+    } else { // hit1 and hit2 are on the same transcript
+      // NOTE: Is this what we actually want?
+      if (is_fw) { // fw < rc
+        return true;
+      } else {
+        return false;
+      }
+    }
+  }
+};
+
+enum class HitDirection : uint8_t { FW, RC, BOTH };
+
+/*
+struct sketch_hit_info {
+  // add a hit to the current target that occurs in the forward
+  // orientation with respect to the target.
+  bool add_fw(int32_t ref_pos, int32_t read_pos, int32_t rl, int32_t k,
+              int32_t max_stretch, float score_inc) {
+    (void)rl;
+    (void)k;
+    bool added{false};
+
+    // since hits are collected by moving _forward_ in the
+    // read, if this is a fw hit, it should be moving
+    // forward in the reference. Only add it if this is
+    // the case.  This ensure that we don't
+    // double-count a k-mer that might occur twice on
+    // this target.
+    if (ref_pos > last_ref_pos_fw and read_pos > last_read_pos_fw) {
+      if (last_read_pos_fw == -1) {
+        approx_pos_fw = ref_pos - read_pos;
+      } else {
+        if ((ref_pos - approx_pos_fw) > max_stretch) {
+          return false;
+        }
+      }
+      // if (last_ref_pos_fw > -1 and (ref_pos > last_ref_pos_fw + 15)) { return
+      // false; }
+      last_ref_pos_fw = ref_pos;
+      last_read_pos_fw = read_pos;
+      fw_score += score_inc;
+      ++fw_hits;
+      added = true;
+    }
+    return added;
+  }
+
+  // add a hit to the current target that occurs in the forward
+  // orientation with respect to the target.
+  bool add_rc(int32_t ref_pos, int32_t read_pos, int32_t rl, int32_t k,
+              int32_t max_stretch, float score_inc) {
+
+    bool added{false};
+    // since hits are collected by moving _forward_ in the
+    // read, if this is an rc hit, it should be moving
+    // backwards in the reference. Only add it if this is
+    // the case.
+    // This ensures that we don't double-count a k-mer that
+    // might occur twice on this target.
+    if (ref_pos < last_ref_pos_rc and read_pos > last_read_pos_rc) {
+      approx_pos_rc = (ref_pos - (rl - (read_pos + k)));
+      if (last_read_pos_rc == -1) {
+        approx_end_pos_rc = ref_pos + read_pos;
+      } else {
+        if (approx_end_pos_rc - approx_pos_rc > max_stretch) {
+          return false;
+        }
+      }
+      // if (last_ref_pos_rc > -1 and ref_pos < last_ref_pos_rc - 15) { return
+      // false; }
+      last_ref_pos_rc = ref_pos;
+      last_read_pos_rc = read_pos;
+      rc_score += score_inc;
+      ++rc_hits;
+      added = true;
+    }
+    return added;
   }
-} // end for over best decoy hits
+
+  inline uint32_t max_hits_for_target() { return std::max(fw_hits, rc_hits); }
+
+  // true if forward, false if rc
+  // second element is score
+  inline HitDirection best_hit_direction() {
+    int32_t fw_minus_rc =
+        static_cast<int32_t>(fw_hits) - static_cast<int32_t>(rc_hits);
+    return (fw_minus_rc > 0)
+               ? HitDirection::FW
+               : ((fw_minus_rc < 0) ? HitDirection::RC : HitDirection::BOTH);
+  }
+
+  inline simple_hit get_fw_hit(PairingStatus ps) {
+    return simple_hit{true,
+                      approx_pos_fw,
+                      fw_score,
+                      fw_hits,
+                      std::numeric_limits<uint32_t>::max(),
+                      ps};
+  }
+
+  inline simple_hit get_rc_hit(PairingStatus ps) {
+    return simple_hit{false,
+                      approx_pos_rc,
+                      rc_score,
+                      rc_hits,
+                      std::numeric_limits<uint32_t>::max(),
+                      ps};
+  }
+
+  inline std::string to_string() {
+    std::stringstream ss;
+    ss << "fw_hits: " << fw_hits << ", fw_score : " << fw_score
+       << ", fw_pos : " << approx_pos_fw << " || rc_hits: " << rc_hits
+       << ", rc_score: " << rc_score << ", rc_pos: " << approx_pos_rc;
+    return ss.str();
+  }
+
+  int32_t last_read_pos_fw{-1};
+  int32_t last_read_pos_rc{-1};
+
+  int32_t last_ref_pos_fw{-1};
+  int32_t last_ref_pos_rc{std::numeric_limits<int32_t>::max()};
+
+  int32_t approx_pos_fw{-1};
+  int32_t approx_pos_rc{-1};
+  int32_t approx_end_pos_rc{-1};
+
+  uint32_t fw_hits{0};
+  uint32_t rc_hits{0};
+  float fw_score{0.0};
+  float rc_score{0.0};
+};*/
+
+struct sketch_hit_info {
+    // add a hit to the current target that occurs in the forward
+    // orientation with respect to the target.
+    bool add_fw(int32_t ref_pos, int32_t read_pos, int32_t rl, int32_t k, int32_t max_stretch,
+                float score_inc) {
+        (void)rl;
+        (void)k;
+        bool added{false};
+
+        // since hits are collected by moving _forward_ in the
+        // read, if this is a fw hit, it should be moving
+        // forward in the reference. Only add it if this is
+        // the case.  This ensure that we don't
+        // double-count a k-mer that might occur twice on
+        // this target.
+        if (ref_pos > last_ref_pos_fw and read_pos > last_read_pos_fw) {
+            if (last_read_pos_fw == -1) {
+                approx_pos_fw = ref_pos - read_pos;
+            } else {
+                if ((ref_pos - approx_pos_fw) > max_stretch) { return false; }
+            }
+            last_ref_pos_fw = ref_pos;
+            last_read_pos_fw = read_pos;
+            fw_score += score_inc;
+            ++fw_hits;
+            added = true;
+        }
+        return added;
+    }
+
+    // add a hit to the current target that occurs in the forward
+    // orientation with respect to the target.
+    bool add_rc(int32_t ref_pos, int32_t read_pos, int32_t rl, int32_t k, int32_t max_stretch,
+                float score_inc) {
+        bool added{false};
+        // since hits are collected by moving _forward_ in the
+        // read, if this is an rc hit, it should be moving
+        // backwards in the reference. 
+        // In general, we only add the hit if this is the case.
+        // This ensures that we don't double-count a k-mer that
+        // might occur twice on this target.
+
+        // we have a special case here; what if the same exact 
+        // k-mer (i.e. not just the same sequence but same position
+        // on the query) occurs more than one time on this refernece?
+        //
+        // In that case, the GENERAL case code will already have 
+        // processed and seen a k-mer with the read position 
+        // equal to `read_pos`.  In the case below, we see 
+        // a hit with the *same* read pos again (one or more times).
+        // 
+        // Here, we swap out the previous hit having this read_pos 
+        // if the position of the current hit on the read is
+        // the same and the position on the reference is greater 
+        // (this is a heuristic to help in the case of tandem repeats or 
+        // highly-repetitive subsequence).
+        // NOTE: consider if a similar heuristic should be
+        // adopted for the forward case.
+        if ((read_pos == last_read_pos_rc) and (ref_pos > last_ref_pos_rc) and
+            (ref_pos < rightmost_bound_rc)) {
+
+            last_ref_pos_rc = ref_pos;
+            // if the read_pos was the same as the first read pos
+            // then also update the approx_end_pos_rc accordingly
+            // NOTE: for the time being don't mess with this position
+            // empirically this does better, but if we really want 
+            // to optimize this for accuracy we need a better general
+            // heuristic.
+            // if (read_pos == first_read_pos_rc) {
+            //   approx_end_pos_rc = ref_pos + read_pos;
+            //}
+
+            return added;
+        }
+
+        // GENERAL case
+        if (ref_pos < last_ref_pos_rc and read_pos > last_read_pos_rc) {
+            approx_pos_rc = (ref_pos - (rl - (read_pos + k)));
+            if (last_read_pos_rc == -1) {
+                approx_end_pos_rc = ref_pos + read_pos;
+                first_read_pos_rc = read_pos;
+            } else {
+                if (approx_end_pos_rc - approx_pos_rc > max_stretch) { return false; }
+            }
+            rc_score += score_inc;
+            ++rc_hits;
+
+            // new
+            rightmost_bound_rc = last_ref_pos_rc;
+
+            last_ref_pos_rc = ref_pos;
+            last_read_pos_rc = read_pos;
+            added = true;
+        }
+        return added;
+    }
+
+    inline uint32_t max_hits_for_target() { return std::max(fw_hits, rc_hits); }
+
+    // true if forward, false if rc
+    // second element is score
+    inline HitDirection best_hit_direction() {
+        int32_t fw_minus_rc = static_cast<int32_t>(fw_hits) - static_cast<int32_t>(rc_hits);
+        return (fw_minus_rc > 0) ? HitDirection::FW
+                                 : ((fw_minus_rc < 0) ? HitDirection::RC : HitDirection::BOTH);
+    }
+
+    inline simple_hit get_fw_hit(PairingStatus ps) {
+        return simple_hit{true, approx_pos_fw,
+                          fw_score, fw_hits, std::numeric_limits<uint32_t>::max(), ps};
+    }
+
+    inline simple_hit get_rc_hit(PairingStatus ps) {
+        return simple_hit{false, approx_pos_rc,
+                          rc_score, rc_hits, std::numeric_limits<uint32_t>::max(), ps};
+    }
+
+    inline std::string to_string() {
+        std::stringstream ss;
+        ss << "fw_hits: " << fw_hits << ", fw_score : " << fw_score
+           << ", fw_pos : " << approx_pos_fw << " || rc_hits: " << rc_hits
+           << ", rc_score: " << rc_score << ", rc_pos: " << approx_pos_rc;
+        return ss.str();
+    }
+
+    int32_t last_read_pos_fw{-1};
+    int32_t last_read_pos_rc{-1};
+    int32_t rightmost_bound_rc{std::numeric_limits<int32_t>::max()};
+
+    // marks the read position (key) of the 
+    // first hit we see in the rc direction
+    int32_t first_read_pos_rc{-1};
+
+    int32_t last_ref_pos_fw{-1};
+    int32_t last_ref_pos_rc{std::numeric_limits<int32_t>::max()};
+
+    int32_t approx_pos_fw{-1};
+    int32_t approx_pos_rc{-1};
+    int32_t approx_end_pos_rc{-1};
+
+    uint32_t fw_hits{0};
+    uint32_t rc_hits{0};
+    float fw_score{0.0};
+    float rc_score{0.0};
+};
+
+/**
+ * This contains the information necessary to collect hits and perform
+ * mapping on a read using PASC.
+ **/
+template <typename IndexT> struct mapping_cache_info {
+public:
+  mapping_cache_info(IndexT* midx, size_t max_occ_default_in,
+                     size_t max_occ_recover_in, size_t max_read_occ_in)
+      : idx(midx), k(midx->k()), hit_searcher(midx),
+        max_occ_default(max_occ_default_in),
+        max_occ_recover(max_occ_recover_in),
+        attempt_occ_recover(max_occ_recover > max_occ_default),
+        max_read_occ(max_read_occ_in), alt_max_occ(max_read_occ_in) {}
+
+  inline void clear() {
+    map_type = salmon::utils::MappingType::UNMAPPED;
+    hit_searcher.clear();
+    hit_map.clear();
+    accepted_hits.clear();
+    alt_max_occ = max_read_occ;
+    has_matching_kmers = false;
+  }
+
+  // will store how the read mapped
+  salmon::utils::MappingType map_type{salmon::utils::MappingType::UNMAPPED};
+
+  // map from reference id to hit info
+  phmap::flat_hash_map<uint32_t, sketch_hit_info> hit_map;
+
+  // where the mappings that pass will be stored
+  std::vector<simple_hit> accepted_hits;
+
+  // map to recall the number of unmapped reads we see
+  // for each barcode
+  phmap::flat_hash_map<uint64_t, uint32_t> unmapped_bc_map;
+
+  // pointer to the underlying index
+  IndexT* idx{nullptr};
+
+  // the k-mer size of our index
+  size_t k{0};
+
+  // implements the PASC algorithm
+  MemCollector<IndexT> hit_searcher;
+
+  // the number of occurrences of a hit above which we
+  // won't consider it
+  size_t max_occ_default;
+
+  // the number of occurences of a hit above which we
+  // won't consider it, even in recovery mode
+  size_t max_occ_recover;
+
+  // will we attempt recovery if there are no hits with
+  // frequency below max_occ_default?
+  bool attempt_occ_recover;
+
+  // maximum number of places a read can occur and still
+  // be considered.
+  size_t max_read_occ;
+
+  size_t alt_max_occ;
+
+  // to perform queries
+  pufferfish::util::QueryCache qc;
+
+  // regardless of having full mappings, did any k-mers match
+  bool has_matching_kmers{false};
+};
+
+/**
+ * This function will map the read given by `read_seq`
+ * using PASC. The relevant parameters (e.g. maximum ambiguity
+ * of seed and maximum number of allowable mappings) are stored 
+ * in the `map_cache` structure.  The `PairingStatus` is passed in 
+ * by the caller and designates if the read being mapped is the 
+ * left end or the right end.
+ **/
+template <typename IndexT>
+inline bool map_read(std::string* read_seq,
+                     mapping_cache_info<IndexT>& map_cache, PairingStatus ps,
+                     bool verbose = false) {
+  map_cache.clear();
+  // rebind map_cache variables to
+  // local names
+  IndexT* qidx = map_cache.idx;
+  auto& qc = map_cache.qc;
+  auto& hit_searcher = map_cache.hit_searcher;
+  auto& hit_map = map_cache.hit_map;
+  auto& accepted_hits = map_cache.accepted_hits;
+  auto& map_type = map_cache.map_type;
+  const bool attempt_occ_recover = map_cache.attempt_occ_recover;
+  auto k = map_cache.k;
+  int32_t signed_k = static_cast<int32_t>(k);
+
+  // collect the set of matching seeds
+  map_cache.has_matching_kmers =
+      hit_searcher.get_raw_hits_sketch(*read_seq, qc, true, false);
+  bool early_stop = false;
+
+  // if there were hits
+  if (map_cache.has_matching_kmers) {
+    uint32_t num_valid_hits{0};
+    uint64_t total_occs{0};
+    uint64_t largest_occ{0};
+    auto& raw_hits = hit_searcher.get_left_hits();
+
+    // SANITY
+    decltype(raw_hits[0].first) prev_read_pos = -1;
+    // the maximum span the supporting k-mers of a
+    // mapping position are allowed to have.
+    // NOTE this is still > read_length b/c the stretch is measured wrt the
+    // START of the terminal k-mer.
+    int32_t max_stretch = static_cast<int32_t>(read_seq->length() * 1.0);
+
+    // a raw hit is a pair of read_pos and a projected hit
+
+    // the least frequent hit for this fragment.
+    uint64_t min_occ = std::numeric_limits<uint64_t>::max();
+
+    // this is false by default and will be set to true
+    // if *every* collected hit for this fragment occurs
+    // max_occ_default times or more.
+    bool had_alt_max_occ = false;
+    int32_t signed_rl = static_cast<int32_t>(read_seq->length());
+    auto collect_mappings_from_hits =
+        [&max_stretch, &min_occ, &hit_map, &num_valid_hits, &total_occs,
+         &largest_occ, &early_stop, signed_rl, k, signed_k, qidx,
+         verbose](auto& raw_hits, auto& prev_read_pos, auto& max_allowed_occ,
+                  auto& had_alt_max_occ) -> bool {
+      for (auto& raw_hit : raw_hits) {
+        auto& read_pos = raw_hit.first;
+        auto& proj_hits = raw_hit.second;
+        auto& refs = proj_hits.refRange;
+
+        // Keep track of the *least* ambiguous hit we see.
+        // If all hits occur more than `max_allowed_occ` times,
+        // but the least ambiguous hit (which occurs `min_occ`
+        // times) has < `max_occ_recover` occurrences, then 
+        // we will use all hits occurring `min_occ` number 
+        // of times or less to try top recover the read's mappings.
+        uint64_t num_occ = static_cast<uint64_t>(refs.size());
+        min_occ = std::min(min_occ, num_occ);
+        had_alt_max_occ = true;
+
+        // we visit every hit that occurs the allowable number of time
+        bool still_have_valid_target = false;
+        prev_read_pos = read_pos;
+        if (num_occ <= max_allowed_occ) {
+          total_occs += num_occ;
+          largest_occ = (num_occ > largest_occ) ? num_occ : largest_occ;
+          float score_inc = 1.0;
+
+          // vist each mapped reference position (mrp) where this hit 
+          // occurs.
+          for (auto& pos_it : refs) {
+            const auto& ref_pos_ori = proj_hits.decodeHit(pos_it);
+            uint32_t tid =
+                static_cast<uint32_t>(qidx->getRefId(pos_it.transcript_id()));
+            int32_t pos = static_cast<int32_t>(ref_pos_ori.pos);
+            bool ori = ref_pos_ori.isFW;
+            auto& target = hit_map[tid];
+
+            // Why >= here instead of == ?
+            // Because hits can happen on the same target in both the forward
+            // and rc orientations, it is possible that we start the loop with
+            // the target having num_valid_hits hits in a given orientation (o)
+            // we see a new hit for this target in oriention o (now it has
+            // num_valid_hits + 1) then we see a hit for this target in
+            // orientation rc(o).  We still want to add / consider this hit, but
+            // max_hits_for_target() > num_valid_hits. So, we must allow for
+            // that here.
+            if (target.max_hits_for_target() >= num_valid_hits) {
+              if (ori) {
+                target.add_fw(pos, static_cast<int32_t>(read_pos), signed_rl,
+                              signed_k, max_stretch, score_inc);
+              } else {
+                target.add_rc(pos, static_cast<int32_t>(read_pos), signed_rl,
+                              signed_k, max_stretch, score_inc);
+              }
+
+              // if this target is still valid after evaluating this hit 
+              // then we will have at least one globally valid target. If 
+              // no targets are valid after evaluating all mrps for this 
+              // hit then we can exit the search early.
+              still_have_valid_target |=
+                  (target.max_hits_for_target() >= num_valid_hits + 1);
+            }
+
+          } // DONE: for (auto &pos_it : refs)
+
+          ++num_valid_hits;
+
+          // if there are no targets reaching the valid hit threshold, then
+          // break early
+          if (!still_have_valid_target) {
+            return true;
+          }
+
+        } // DONE : if (num_occ <= max_allowed_occ)
+      }   // DONE : for (auto& raw_hit : raw_hits)
+
+      return false;
+    };
+
+    bool _discard = false;
+    auto mao_first_pass = map_cache.max_occ_default - 1;
+    early_stop = collect_mappings_from_hits(raw_hits, prev_read_pos,
+                                            mao_first_pass, _discard);
+
+    // If our default threshold was too stringent, then fallback to a more
+    // liberal threshold and look up the k-mers that occur the least frequently.
+    // Specifically, if the min occuring hits have frequency < max_occ_recover
+    // (2500 by default) times, then collect the min occuring hits to get the
+    // mapping.
+    if (attempt_occ_recover and (min_occ >= map_cache.max_occ_default) and
+        (min_occ < map_cache.max_occ_recover)) {
+      prev_read_pos = -1;
+      uint64_t max_allowed_occ = min_occ;
+      early_stop = collect_mappings_from_hits(raw_hits, prev_read_pos,
+                                              max_allowed_occ, had_alt_max_occ);
+    }
+
+    uint32_t best_alt_hits = 0;
+
+    for (auto& kv : hit_map) {
+      auto best_hit_dir = kv.second.best_hit_direction();
+      // if the best direction is FW or BOTH, add the fw hit
+      // otherwise add the RC.
+      auto mapping = (best_hit_dir != HitDirection::RC)
+                         ? kv.second.get_fw_hit(ps)
+                         : kv.second.get_rc_hit(ps);
+
+      if (mapping.num_hits >= num_valid_hits) {
+        mapping.tid = kv.first;
+        accepted_hits.emplace_back(mapping);
+        // if we had equally good hits in both directions
+        // add the rc hit here (since we added the fw)
+        // above if the best hit was either FW or BOTH
+        if (best_hit_dir == HitDirection::BOTH) {
+          auto second_hit = kv.second.get_rc_hit(ps);
+          second_hit.tid = kv.first;
+          accepted_hits.emplace_back(second_hit);
+        }
+      } else {
+        // best_alt_score = mapping.score > best_alt_score ? mapping.score :
+        // best_alt_score;
+        best_alt_hits =
+            mapping.num_hits > best_alt_hits ? mapping.num_hits : best_alt_hits;
+      }
+    }
+
+    map_cache.alt_max_occ =
+        had_alt_max_occ ? accepted_hits.size() : map_cache.max_occ_default;
+
+    /*
+     * This rule; if enabled, allows through mappings missing a single hit, if
+    there
+     * was no mapping with all hits. NOTE: this won't work with the current
+    early-exit
+     * optimization however.
+     if (accepted_hits.empty() and (num_valid_hits > 1) and (best_alt_hits >=
+     num_valid_hits
+     - 1)) { for (auto& kv : hit_map) { auto mapping = kv.second.get_best_hit();
+    if (mapping.num_hits >= best_alt_hits) {
+    //if (mapping.valid_pos(signed_read_len, transcripts[kv.first].RefLength,
+    10)) { mapping.tid = kv.first; accepted_hits.emplace_back(mapping);
+    //}
+    }
+    }
+    }
+    */
+  } // DONE : if (rh)
+
+  // If the read mapped to > maxReadOccs places, discard it
+  if (accepted_hits.size() > map_cache.alt_max_occ) {
+    accepted_hits.clear();
+    map_type = salmon::utils::MappingType::UNMAPPED;
+  } else if (!accepted_hits.empty()) {
+    map_type = salmon::utils::MappingType::SINGLE_MAPPED;
+  }
+
+  return early_stop;
 }
 
+} // namespace pasc
 
-  } // namespace mapping_utils
+} // namespace mapping_utils
 } // namespace salmon
 
 #endif //__SALMON_MAPPING_UTILS__


=====================================
include/Transcript.hpp
=====================================
@@ -14,8 +14,7 @@
 #include <limits>
 #include <memory>
 
-//#include "rapmap/bit_array.h"
-#include "pufferfish/compact_vector/compact_vector.hpp"
+#include "compact_vector/compact_vector.hpp"
 #include "pufferfish/rank9b.hpp"
 
 class Transcript {


=====================================
scripts/fetchPufferfish.sh
=====================================
@@ -23,11 +23,11 @@ if [ -d ${INSTALL_DIR}/src/pufferfish ] ; then
     rm -fr ${INSTALL_DIR}/src/pufferfish
 fi
 
-SVER=salmon-v1.9.0
+SVER=salmon-v1.10.0
 #SVER=develop
 #SVER=sketch-mode
 
-EXPECTED_SHA256=2a862daeff95a19c9b188bc26a5d02fc0ecc5b9c9ae5523a67c9d14e62551c5d
+EXPECTED_SHA256=c961b9c252856b53c6538d22103b711c924ea1e649516de81efb85870aa8b143
 
 mkdir -p ${EXTERNAL_DIR}
 curl -k -L https://github.com/COMBINE-lab/pufferfish/archive/${SVER}.zip -o ${EXTERNAL_DIR}/pufferfish.zip
@@ -92,7 +92,8 @@ cp ${EXTERNAL_DIR}/pufferfish/include/BinWriter.hpp ${INSTALL_DIR}/include/puffe
 
 cp -r ${EXTERNAL_DIR}/pufferfish/include/libdivide ${INSTALL_DIR}/include/pufferfish
 cp -r ${EXTERNAL_DIR}/pufferfish/include/ksw2pp ${INSTALL_DIR}/include/pufferfish
-cp -r ${EXTERNAL_DIR}/pufferfish/include/compact_vector ${INSTALL_DIR}/include/pufferfish
+# this is now automatically tracked and inherited via twopaco (on which libpuffer depends)
+# cp -r ${EXTERNAL_DIR}/pufferfish/include/compact_vector ${INSTALL_DIR}/include/pufferfish
 cp -r ${EXTERNAL_DIR}/pufferfish/include/metro ${INSTALL_DIR}/include/pufferfish
 cp -r ${EXTERNAL_DIR}/pufferfish/include/itlib ${INSTALL_DIR}/include/pufferfish
 cp -r ${EXTERNAL_DIR}/pufferfish/include/sparsepp ${INSTALL_DIR}/include/pufferfish


=====================================
scripts/v1_10x/run.sh
=====================================
@@ -13,8 +13,8 @@ p2="$tmpdir/p2.fa"
 mkfifo $p1
 mkfifo $p2
 
-i1=`ls $base*I1*`
-wrapper <(cat $base*I1*) <(cat $base*RA*) >> $p1 2>> $p2 &
+i1=`ls $base/*I1*`
+wrapper <(cat $base/*I1*) <(cat $base/*RA*) >> $p1 2>> $p2 &
 
 echo "Running command [${new_cmd} -1 $p1 -2 $p2 -r $i1]"
 ${new_cmd} -1 $p1 -2 $p2 -r $i1


=====================================
src/Alevin.cpp
=====================================
@@ -164,10 +164,10 @@ std::vector<size_t> sort_indexes(const std::vector<T> &v) {
 
 // reference from https://github.com/scipy/scipy/blob/master/scipy/stats/kde.py
 // and https://github.com/CGATOxford/UMI-tools/blob/master/umi_tools/umi_methods.py#L193
-bool gaussianKDE(const std::vector<uint32_t>& freqCounter,
+bool gaussianKDE(const std::vector<uint64_t>& freqCounter,
                  const std::vector<size_t>& sortedIdx,
                  double& invCovariance, double& normFactor,
-                 uint32_t expect_cells, uint32_t& predicted_cells){
+                 uint64_t expect_cells, uint64_t& predicted_cells){
   double covariance{0.0}, mean{0.0}, bw_method {0.01}, threshold = 0.001*freqCounter[sortedIdx[0]];
   std::vector<double> logDataset;
   size_t numElem, xSpace {10000};
@@ -218,7 +218,7 @@ bool gaussianKDE(const std::vector<uint32_t>& freqCounter,
   }
 
   //calculating the argrelextrema
-  std::vector<uint32_t> local_mins;
+  std::vector<uint64_t> local_mins;
   for (size_t i=1; i<xSpace-1; i++){
     if (density[i-1] > density[i] and density[i] < density[i+1]){
       local_mins.emplace_back(i);
@@ -247,26 +247,26 @@ bool gaussianKDE(const std::vector<uint32_t>& freqCounter,
   return false;
 }
 
-uint32_t getLeftBoundary(std::vector<size_t>& sortedIdx,
-                         uint32_t topxBarcodes,
-                         const std::vector<uint32_t>& freqCounter){
+uint64_t getLeftBoundary(std::vector<size_t>& sortedIdx,
+                         uint64_t topxBarcodes,
+                         const std::vector<uint64_t>& freqCounter){
   // iterate in reverse order since sortedIdx is sorted in decreasing
   // order
   double cumCount{0.0};
   std::vector<double> freqs(topxBarcodes);
-  for(uint32_t i = 0; i < topxBarcodes; i++){
+  for(uint64_t i = 0; i < topxBarcodes; i++){
     size_t ind = sortedIdx[topxBarcodes-i-1];
     cumCount += freqCounter[ind];
     freqs[i] = std::log(cumCount);
   }
 
-  std::vector<uint32_t> X(topxBarcodes);
+  std::vector<uint64_t> X(topxBarcodes);
   std::iota(X.begin(), X.end(), 0);
 
   bool isUp;
-  uint32_t x;
+  uint64_t x;
   double y, slope, leftExtreme{freqs[0]};
-  for(uint32_t j=0; j<topxBarcodes; j++){
+  for(uint64_t j=0; j<topxBarcodes; j++){
     x = X[j];
     y = freqs[j];
 
@@ -279,7 +279,7 @@ uint32_t getLeftBoundary(std::vector<size_t>& sortedIdx,
     slope = y/x;
     // fill in the values for fitted line
     std::transform(X.begin()+nextBcIdx, X.end(), Y.begin(),
-                   [slope](uint32_t i) {return i*slope;} );
+                   [slope](uint64_t i) {return i*slope;} );
 
     isUp = false;
     double curveY, lineY;
@@ -294,9 +294,9 @@ uint32_t getLeftBoundary(std::vector<size_t>& sortedIdx,
 
     if(isUp == false){
       // ignoring all the frequencies having same frequency as cutoff
-      uint32_t cutoff = topxBarcodes-j;
-      uint32_t cutoffFrequency = freqCounter[sortedIdx[cutoff]];
-      uint32_t nearestLeftFrequency = cutoffFrequency;
+      uint64_t cutoff = topxBarcodes-j;
+      uint64_t cutoffFrequency = freqCounter[sortedIdx[cutoff]];
+      uint64_t nearestLeftFrequency = cutoffFrequency;
       while(nearestLeftFrequency == cutoffFrequency){
         nearestLeftFrequency = freqCounter[sortedIdx[--cutoff]];
       }
@@ -310,13 +310,13 @@ uint32_t getLeftBoundary(std::vector<size_t>& sortedIdx,
 
 void dumpFeatures(std::string& frequencyFileName,
                   std::vector<size_t>& sortedIdx,
-                  const std::vector<uint32_t>& freqCounter,
+                  const std::vector<uint64_t>& freqCounter,
                   std::unordered_map<uint32_t, nonstd::basic_string_view<char>>& colMap
                   ){
   std::ofstream freqFile;
   freqFile.open(frequencyFileName);
   for (auto i:sortedIdx){
-    uint32_t count = freqCounter[i];
+    uint64_t count = freqCounter[i];
     if (count == 0){
       break;
     }
@@ -330,7 +330,7 @@ void dumpFeatures(std::string& frequencyFileName,
   Knee calculation and sample true set of barcodes
  */
 template <typename ProtocolT>
-void sampleTrueBarcodes(const std::vector<uint32_t>& freqCounter,
+void sampleTrueBarcodes(const std::vector<uint64_t>& freqCounter,
                         TrueBcsT& trueBarcodes, size_t& lowRegionNumBarcodes,
                         std::unordered_map<uint32_t, nonstd::basic_string_view<char>> colMap,
                         AlevinOpts<ProtocolT>& aopt){
@@ -338,7 +338,7 @@ void sampleTrueBarcodes(const std::vector<uint32_t>& freqCounter,
   size_t maxNumBarcodes { aopt.maxNumBarcodes }, lowRegionMaxNumBarcodes { 1000 };
   size_t lowRegionMinNumBarcodes { aopt.lowRegionMinNumBarcodes };
   double lowConfidenceFraction { 0.5 };
-  uint32_t topxBarcodes = std::min(maxNumBarcodes, freqCounter.size());
+  uint64_t topxBarcodes = std::min(maxNumBarcodes, freqCounter.size());
   uint64_t history { 0 };
 
   if (aopt.forceCells > 0) {
@@ -358,13 +358,13 @@ void sampleTrueBarcodes(const std::vector<uint32_t>& freqCounter,
     // Expect Cells algorithm is taken from
     // https://github.com/10XGenomics/cellranger/blob/e5396c6c444acec6af84caa7d3655dd33a162852/lib/python/cellranger/stats.py#L138
 
-    constexpr uint32_t MAX_FILTERED_CELLS = 2;
+    constexpr uint64_t MAX_FILTERED_CELLS = 2;
     constexpr double UPPER_CELL_QUANTILE = 0.01;
     constexpr double FRACTION_MAX_FREQUENCY = 0.1;
 
-    uint32_t baselineBcs = std::max(1, static_cast<int32_t>( aopt.expectCells*UPPER_CELL_QUANTILE ));
+    uint64_t baselineBcs = std::max(static_cast<uint64_t>(1), static_cast<uint64_t>( aopt.expectCells*UPPER_CELL_QUANTILE ));
     double cutoffFrequency = std::max(1.0, freqCounter[sortedIdx[baselineBcs]] * FRACTION_MAX_FREQUENCY);
-    uint32_t maxNumCells = std::min(static_cast<uint32_t>(freqCounter.size()), aopt.expectCells * MAX_FILTERED_CELLS);
+    uint64_t maxNumCells = std::min(static_cast<uint64_t>(freqCounter.size()), aopt.expectCells * MAX_FILTERED_CELLS);
 
     topxBarcodes = maxNumCells;
     for(size_t i=baselineBcs; i<maxNumCells; i++){
@@ -395,7 +395,7 @@ void sampleTrueBarcodes(const std::vector<uint32_t>& freqCounter,
                           green, topxBarcodes, RESET_COLOR);
 
       double invCovariance {0.0}, normFactor{0.0};
-      uint32_t gaussThreshold;
+      uint64_t gaussThreshold;
       bool isGaussOk = gaussianKDE(freqCounter, sortedIdx,
                                    invCovariance, normFactor,
                                    topxBarcodes, gaussThreshold);
@@ -421,7 +421,7 @@ void sampleTrueBarcodes(const std::vector<uint32_t>& freqCounter,
     }
   }//end-left-knee finding case
 
-  uint32_t fractionTrueBarcodes = static_cast<int>(lowConfidenceFraction * topxBarcodes);
+  uint64_t fractionTrueBarcodes = static_cast<uint64_t>(lowConfidenceFraction * topxBarcodes);
 
   if (fractionTrueBarcodes < lowRegionMinNumBarcodes){
     lowRegionNumBarcodes = lowRegionMinNumBarcodes;
@@ -442,8 +442,8 @@ void sampleTrueBarcodes(const std::vector<uint32_t>& freqCounter,
   }
   topxBarcodes += lowRegionNumBarcodes;
 
-  uint32_t cutoffFrequency = freqCounter[sortedIdx[ topxBarcodes ]];
-  uint32_t nearestLeftFrequency = cutoffFrequency;
+  uint64_t cutoffFrequency = freqCounter[sortedIdx[ topxBarcodes ]];
+  uint64_t nearestLeftFrequency = cutoffFrequency;
   while(nearestLeftFrequency == cutoffFrequency && lowRegionNumBarcodes != 0){
     nearestLeftFrequency = freqCounter[sortedIdx[--topxBarcodes]];
     if (lowRegionNumBarcodes > lowRegionMinNumBarcodes)  { lowRegionNumBarcodes--; }
@@ -484,7 +484,7 @@ void indexBarcodes(AlevinOpts<ProtocolT>& aopt,
                    SoftMapT& barcodeSoftMap){
   std::unordered_set<std::string> neighbors;
   std::unordered_map<std::string, std::vector<std::string>> ZMatrix;
-  uint32_t wrngWhiteListCount{0};
+  uint64_t wrngWhiteListCount{0};
   for (const auto trueBarcode: trueBarcodes){
     neighbors.clear();
     //find all neighbors to this true barcode
@@ -493,7 +493,7 @@ void indexBarcodes(AlevinOpts<ProtocolT>& aopt,
                        neighbors);
 
     for(const auto& neighbor : neighbors){
-      uint32_t freq{0};
+      uint64_t freq{0};
       auto indexIt = freqCounter.find(neighbor);
       bool indexOk = indexIt != freqCounter.end();
       //bool indexOk = freqCounter.find(neighbor, freq);
@@ -542,7 +542,7 @@ bool writeFastq(AlevinOpts<ProtocolT>& aopt,
                 SoftMapT& barcodeMap,
                 TrueBcsT& trueBarcodes){
   size_t rangeSize{0};
-  uint32_t totNumBarcodes{0};
+  uint64_t totNumBarcodes{0};
   std::string barcode( aopt.protocol.barcodeLength, 'N');
   std::string umi( aopt.protocol.umiLength, 'N');
 
@@ -717,7 +717,7 @@ void processBarcodes(std::vector<std::string>& barcodeFiles,
     if (aopt.dumpfeatures) {
       aopt.jointLog->info("Sorting and dumping raw barcodes");
       auto frequencyFileName = (aopt.outputDirectory / "raw_cb_frequency.txt").string();
-      std::vector<uint32_t> collapsedfrequency;
+      std::vector<uint64_t> collapsedfrequency;
       std::unordered_map<uint32_t, nonstd::basic_string_view<char>> collapMap;
       size_t ind{0};
       for(auto fqIt = freqCounter.begin(); fqIt != freqCounter.end(); ++fqIt){
@@ -731,7 +731,7 @@ void processBarcodes(std::vector<std::string>& barcodeFiles,
     }
   }
   else {
-    std::vector<uint32_t> collapsedfrequency;
+    std::vector<uint64_t> collapsedfrequency;
     std::unordered_map<uint32_t, nonstd::basic_string_view<char>> collapMap;
     size_t ind{0};
     for(auto fqIt = freqCounter.begin(); fqIt != freqCounter.end(); ++fqIt){
@@ -741,7 +741,7 @@ void processBarcodes(std::vector<std::string>& barcodeFiles,
     }
 
     if (aopt.keepCBFraction != 0.0) {
-      aopt.forceCells = std::min(static_cast<uint32_t>(aopt.keepCBFraction * freqCounter.size()),
+      aopt.forceCells = std::min(static_cast<uint64_t>(aopt.keepCBFraction * freqCounter.size()),
                                  aopt.maxNumBarcodes);
       aopt.jointLog->info("Forcing to use {} cells", aopt.forceCells);
       aopt.jointLog->flush();


=====================================
src/AlevinUtils.cpp
=====================================
@@ -348,7 +348,7 @@ namespace alevin {
       (void)read2;
       pt.anchorPos = read.find(pt.anchorSeq);
       if (pt.anchorPos != std::string::npos && ( pt.anchorPos == pt.maxHairpinIndexLen || pt.anchorPos == pt.maxHairpinIndexLen -1) // only 2 possible values of pt.anchorPos
-         && read.length() >= pt.barcodeLength + pt.umiLength + pt.anchorSeqLen) {
+         && read.length() >= pt.barcodeLength + pt.umiLength + pt.anchorSeqLen -2) { // Subtract 2 to counter artificial increase in barcode length
            std::string bcAssign = read.substr(0,pt.anchorPos) + read.substr(pt.anchorPos + pt.anchorSeqLen + pt.umiLength, pt.rtIdxLen);
         if (pt.anchorPos < pt.maxHairpinIndexLen) { // hairpin index can be 9 or 10 bp
            bcAssign += "AC";


=====================================
src/CMakeLists.txt
=====================================
@@ -141,7 +141,6 @@ else()
   set(CMAKE_INSTALL_RPATH_USE_LINK_PATH FALSE)
 endif()
 
-
 set (TGT_RELEASE_FLAGS "${TGT_COMPILE_FLAGS};${TGT_WARN_FLAGS}")
 set (TGT_DEBUG_FLAGS "-g;${TGT_COMPILE_FLAGS};${TGT_WARN_FLAGS}")
 
@@ -155,6 +154,8 @@ HAVE_SSTREAM=1
 STX_NO_STD_STRING_VIEW=1
 span_FEATURE_MAKE_SPAN_TO_STD=14
 )
+target_include_directories(salmon_core PUBLIC ${COMPACT_VECTOR_INCLUDE_PATH})
+
 
 if (USE_ARM)
 target_compile_definitions(salmon_core PUBLIC KSW_USE_ARM=1)
@@ -176,6 +177,7 @@ HAVE_ANSI_TERM=1
 HAVE_SSTREAM=1
 span_FEATURE_MAKE_SPAN_TO_STD=14
 )
+target_include_directories(alevin_core PUBLIC ${COMPACT_VECTOR_INCLUDE_PATH})
 
 if (USE_ARM)
 target_compile_definitions(alevin_core PUBLIC KSW_USE_ARM=1)
@@ -190,6 +192,7 @@ endif()
 
 # Build the salmon executable
 add_executable(salmon ${SALMON_MAIN_SRCS} ${SALMON_ALIGN_SRCS})
+target_include_directories(salmon PUBLIC ${COMPACT_VECTOR_INCLUDE_PATH})
 
 if(HAS_IPO AND (NOT NO_IPO))
   set_property(TARGET salmon PROPERTY INTERPROCEDURAL_OPTIMIZATION True)
@@ -202,6 +205,7 @@ target_compile_options(UnitTestsMain PUBLIC "$<$<CONFIG:RELEASE>:${TGT_RELEASE_F
 add_executable(unitTests ${UNIT_TESTS_INDIVIDUAL_SRCS} ${GAT_SOURCE_DIR}/tests/catch.hpp)
 target_compile_options(unitTests PUBLIC "$<$<CONFIG:DEBUG>:${TGT_DEBUG_FLAGS}>")
 target_compile_options(unitTests PUBLIC "$<$<CONFIG:RELEASE>:${TGT_RELEASE_FLAGS}>")
+target_include_directories(unitTests PUBLIC ${COMPACT_VECTOR_INCLUDE_PATH})
 
 #add_executable(salmon-read ${SALMON_READ_SRCS})
 #set_target_properties(salmon-read PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_LIBPTHREAD -D_PBGZF_USE -fopenmp"
@@ -225,6 +229,7 @@ add_dependencies(salmon ksw2pp)
 add_dependencies(salmon salmon_core)
 add_dependencies(salmon alevin_core)
 if(TBB_RECONFIGURE OR TBB_TARGET_EXISTED)
+
 # Link the executable
 target_link_libraries(salmon
     Threads::Threads 
@@ -236,9 +241,10 @@ target_link_libraries(salmon
     gff
     ${Boost_LIBRARIES}
     ${ICU_LIBS}
-    ${STADEN_LIBRARIES} ${CURL_LIBRARIES}
+    ${CURL_LIBRARIES}
     ${ZLIB_LIBRARY}
     m
+    ${STADEN_LIBRARIES} 
     ${LIBLZMA_LIBRARIES}
     ${BZIP2_LIBRARIES}
     ${LIBSALMON_LINKER_FLAGS}
@@ -269,12 +275,13 @@ if(TBB_RECONFIGURE OR TBB_TARGET_EXISTED)
 target_link_libraries(unitTests
     Threads::Threads
 ## PUFF_INTEGRATION
+    ${Boost_LIBRARIES}
     salmon_core
     alevin_core
     gff
     UnitTestsMain
-    ${STADEN_LIBRARIES} 
     ${Boost_LIBRARIES}
+    ${STADEN_LIBRARIES} 
     ${ICU_LIBS}
     ${CURL_LIBRARIES}
     ${ZLIB_LIBRARY}


=====================================
src/SalmonAlevin.cpp
=====================================
@@ -414,15 +414,10 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
   salmon::utils::ShortFragStats shortFragStats;
   double maxZeroFrac{0.0};
 
-  // Write unmapped reads
-  fmt::MemoryWriter unmappedNames;
-  bool writeUnmapped = salmonOpts.writeUnmappedNames;
-  spdlog::logger* unmappedLogger = (writeUnmapped) ? salmonOpts.unmappedLog.get() : nullptr;
-
-  // Write unmapped reads
-  fmt::MemoryWriter orphanLinks;
-  bool writeOrphanLinks = salmonOpts.writeOrphanLinks;
-  spdlog::logger* orphanLinkLogger = (writeOrphanLinks) ? salmonOpts.orphanLinkLog.get() : nullptr;
+  bool quiet = salmonOpts.quiet;
+  if (salmonOpts.writeQualities) {
+    salmonOpts.jointLog->warn("The --writeQualities flag has no effect when writing results to a RAD file.");
+  }
 
   BasicBinWriter bw;
   uint32_t num_reads_in_chunk{0};
@@ -436,200 +431,49 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
   uint64_t localNumAssignedFragments{0};
   uint64_t localNumUniqueMappings{0};
 
-  bool consistentHits = salmonOpts.consistentHits;
-  bool quiet = salmonOpts.quiet;
-
-  size_t maxNumHits{salmonOpts.maxReadOccs};
   size_t readLenLeft{0};
   size_t readLenRight{0};
 
-  constexpr const int32_t invalidScore = std::numeric_limits<int32_t>::min();
-  MemCollector<IndexT> memCollector(qidx);
-  ksw2pp::KSW2Aligner aligner;
-  pufferfish::util::AlignmentConfig aconf;
-  pufferfish::util::MappingConstraintPolicy mpol;
-  bool initOK = salmon::mapping_utils::initMapperSettings(salmonOpts, memCollector, aligner, aconf, mpol);
-  PuffAligner puffaligner(qidx->refseq_, qidx->refAccumLengths_, qidx->k(), aconf, aligner);
-
-  pufferfish::util::CachedVectorMap<size_t, std::vector<pufferfish::util::MemCluster>, std::hash<size_t>> hits;
-  std::vector<pufferfish::util::MemCluster> recoveredHits;
-  std::vector<pufferfish::util::JointMems> jointHits;
-  PairedAlignmentFormatter<IndexT*> formatter(qidx);
-  if (salmonOpts.writeQualities) {
-    salmonOpts.jointLog->warn("The --writeQualities flag has no effect when writing results to a RAD file.");
-  }
-  pufferfish::util::QueryCache qc;
-
-  bool mimicStrictBT2 = salmonOpts.mimicStrictBT2;
-  bool mimicBT2 = salmonOpts.mimicBT2;
-  bool noDovetail = !salmonOpts.allowDovetail;
-  bool useChainingHeuristic = !salmonOpts.disableChainingHeuristic;
-
-  pufferfish::util::HitCounters hctr;
   salmon::utils::MappingType mapType{salmon::utils::MappingType::UNMAPPED};
-  bool hardFilter = salmonOpts.hardFilter;
 
   fmt::MemoryWriter sstream;
   auto* qmLog = salmonOpts.qmLog.get();
   bool writeQuasimappings = (qmLog != nullptr);
-
-  struct SimpleHit {
-    bool is_fw{false};
-    int32_t pos{-1};
-    float score{0.0};
-    uint32_t num_hits{0};
-    uint32_t tid{std::numeric_limits<uint32_t>::max()};
-    bool valid_pos(int32_t read_len, uint32_t txp_len, int32_t max_over) {
-      int32_t signed_txp_len = static_cast<int32_t>(txp_len);
-      return (pos > -max_over) and ((pos + read_len) < (signed_txp_len + max_over));
-    }
-  };
-
-  enum class HitDirection : uint8_t {FW, RC, BOTH};
-
-  struct SketchHitInfo {
-    // add a hit to the current target that occurs in the forward 
-    // orientation with respect to the target.
-    bool add_fw(int32_t ref_pos, int32_t read_pos, int32_t rl, int32_t k, int32_t max_stretch, float score_inc) {
-      (void)rl;
-      (void)k;
-      bool added{false};
-      
-      // since hits are collected by moving _forward_ in the
-      // read, if this is a fw hit, it should be moving 
-      // forward in the reference. Only add it if this is
-      // the case.  This ensure that we don't 
-      // double-count a k-mer that might occur twice on
-      // this target.
-      if (ref_pos > last_ref_pos_fw and read_pos > last_read_pos_fw) {
-        if (last_read_pos_fw == -1) {
-          approx_pos_fw = ref_pos - read_pos;
-        } else {
-          if ((ref_pos - approx_pos_fw) > max_stretch) { return false; }
-        }
-        //if (last_ref_pos_fw > -1 and (ref_pos > last_ref_pos_fw + 15)) { return false; }
-        last_ref_pos_fw = ref_pos;
-        last_read_pos_fw = read_pos;
-        fw_score += score_inc;
-        ++fw_hits;
-        added = true;
-      }   
-      return added;
-    }
-
-    // add a hit to the current target that occurs in the forward 
-    // orientation with respect to the target.
-    bool add_rc(int32_t ref_pos, int32_t read_pos, int32_t rl, int32_t k, int32_t max_stretch, float score_inc) {
-
-      bool added{false};
-      // since hits are collected by moving _forward_ in the
-      // read, if this is an rc hit, it should be moving 
-      // backwards in the reference. Only add it if this is
-      // the case.
-      // This ensures that we don't double-count a k-mer that 
-      // might occur twice on this target.
-      if (ref_pos < last_ref_pos_rc and read_pos > last_read_pos_rc) {
-        approx_pos_rc = (ref_pos - (rl - (read_pos + k)));
-        if (last_read_pos_rc == -1) { 
-          approx_end_pos_rc = ref_pos + read_pos; 
-        } else {
-          if (approx_end_pos_rc - approx_pos_rc > max_stretch) { return false;}
-        }
-        //if (last_ref_pos_rc > -1 and ref_pos < last_ref_pos_rc - 15) { return false; }
-        last_ref_pos_rc = ref_pos;
-        last_read_pos_rc = read_pos;
-        rc_score += score_inc;
-        ++rc_hits;
-        added = true;
-        
-      }
-      return added;
-    }
-
-    inline uint32_t max_hits_for_target() {
-      return std::max(fw_hits, rc_hits);
-    }
-
-    // true if forward, false if rc
-    // second element is score
-    inline HitDirection best_hit_direction() {
-      int32_t fw_minus_rc = static_cast<int32_t>(fw_hits) - static_cast<int32_t>(rc_hits);
-      return (fw_minus_rc > 0) ? HitDirection::FW : 
-             ((fw_minus_rc < 0) ? HitDirection::RC : HitDirection::BOTH);
-    }
-
-    inline SimpleHit get_fw_hit() {
-      return SimpleHit{true, approx_pos_fw, fw_score, fw_hits, std::numeric_limits<uint32_t>::max()}; 
-    }
-
-    inline SimpleHit get_rc_hit() {
-      return SimpleHit{false, approx_pos_rc, rc_score, rc_hits, std::numeric_limits<uint32_t>::max()};
-    }
-
-    inline SimpleHit get_best_hit() {
-      auto best_direction = best_hit_direction();
-      return (best_direction != HitDirection::RC) ? 
-        SimpleHit{true, approx_pos_fw, fw_score, fw_hits, std::numeric_limits<uint32_t>::max()} : 
-        SimpleHit{false, approx_pos_rc, rc_score, rc_hits, std::numeric_limits<uint32_t>::max()};
-    }
-
-    inline std::string to_string() {
-      std::stringstream ss;
-      ss << "fw_hits: " << fw_hits << ", fw_score : " << fw_score << ", fw_pos : " << approx_pos_fw
-         << " || rc_hits: " << rc_hits << ", rc_score: " << rc_score << ", rc_pos: " << approx_pos_rc;
-      return ss.str();
-    }
-
-    int32_t last_read_pos_fw{-1};
-    int32_t last_read_pos_rc{-1};
-
-    int32_t last_ref_pos_fw{-1};
-    int32_t last_ref_pos_rc{std::numeric_limits<int32_t>::max()};
-    
-    int32_t approx_pos_fw{-1};
-    int32_t approx_pos_rc{-1};
-    int32_t approx_end_pos_rc{-1};
-
-    uint32_t fw_hits{0};
-    uint32_t rc_hits{0};
-    float fw_score{0.0};
-    float rc_score{0.0}; 
-  };
-
+  PairedAlignmentFormatter<IndexT*> formatter(qidx);
+  
   // map to recall the number of unmapped reads we see 
   // for each barcode
   phmap::flat_hash_map<uint64_t, uint32_t> unmapped_bc_map; 
 
-  // map from transcript id to hit info
-  phmap::flat_hash_map<uint32_t, SketchHitInfo> hit_map; 
-  std::vector<SimpleHit> accepted_hits;
-
-  //////////////////////
-  // NOTE: validation mapping based new parameters
-  std::string rc1; rc1.reserve(300);
-
-
   // check the frequency and decide here if we should
   // be attempting recovery of highly-multimapping reads
   const size_t max_occ_default = salmonOpts.maxReadOccs;
   const size_t max_occ_recover = salmonOpts.maxRecoverReadOccs;
   const bool attempt_occ_recover = (max_occ_recover > max_occ_default);
 
-  size_t numMappingsDropped{0};
-  size_t numDecoyFrags{0};
-  const double decoyThreshold = salmonOpts.decoyThreshold;
+  // think if this should be different since it's at the read level
+  const size_t max_read_occs = salmonOpts.maxReadOccs;
+  size_t alt_max_occ = max_read_occs;
 
-  salmon::mapping_utils::MappingScoreInfo msi(decoyThreshold);
-  // we only collect detailed decoy information if we will be 
-  // writing output to SAM.
-  msi.collect_decoys(writeQuasimappings);
+  // we'll use this for mapping
+  salmon::mapping_utils::pasc::mapping_cache_info<IndexT> map_cache_first(qidx, max_occ_default, max_occ_recover, max_read_occs);
+  salmon::mapping_utils::pasc::mapping_cache_info<IndexT> map_cache_second(qidx, max_occ_default, max_occ_recover, max_read_occs);
+
+  std::vector<salmon::mapping_utils::pasc::simple_hit>* accepted_hits_left_ptr;
+  std::vector<salmon::mapping_utils::pasc::simple_hit>* accepted_hits_right_ptr;
+  std::vector<salmon::mapping_utils::pasc::simple_hit>* accepted_hits_ptr;
+
+  // buffer in which to accumulate accepted hits if we are merging for left 
+  // and right reads
+  std::vector<salmon::mapping_utils::pasc::simple_hit> accepted_hits_buffer;
 
   std::string readBuffer;
   std::string umi(alevinOpts.protocol.umiLength, 'N');
   std::string barcode(alevinOpts.protocol.barcodeLength, 'N');
   //////////////////////
 
-  bool tryAlign{salmonOpts.validateMappings};
+  auto localProtocol = alevinOpts.protocol;
+  // start parsing reads
   auto rg = parser->getReadGroup();
   while (parser->refill(rg)) {
     rangeSize = rg.size();
@@ -650,45 +494,35 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
     	extraBAMtags.reserve(reserveSize);
     }
 
-    auto localProtocol = alevinOpts.protocol;
     for (size_t i = 0; i < rangeSize; ++i) { // For all the read in this batch
       auto& rp = rg[i];
       readLenLeft = rp.first.seq.length();
       readLenRight= rp.second.seq.length();
 
+      // while the pasc code path will use the jointHitGroup to 
+      // store UMI and BarCode info, it won't actually use the 
+      // "alignment" fields
       bool tooShortRight = (readLenRight < (minK+alevinOpts.trimRight));
-      //localUpperBoundHits = 0;
       auto& jointHitGroup = structureVec[i];
       jointHitGroup.clearAlignments();
-      auto& jointAlignments= jointHitGroup.alignments();
 
-      hits.clear();
-      jointHits.clear();
-      memCollector.clear();
-      hit_map.clear(); 
-      accepted_hits.clear();
-      //jointAlignments.clear();
-      //readSubSeq.clear();
       mapType = salmon::utils::MappingType::UNMAPPED;
+      // ensure that accepted_hits_ptr points to a valid 
+      // vector in case it's not set below
+      accepted_hits_ptr = &accepted_hits_buffer;
+      alt_max_occ = max_read_occs;
 
       //////////////////////////////////////////////////////////////
       // extracting barcodes
       size_t barcodeLength = alevinOpts.protocol.barcodeLength;
       size_t umiLength = alevinOpts.protocol.umiLength;
-      //umi.clear();
-      //barcode.clear();
+
       nonstd::optional<uint32_t> barcodeIdx;
       extraBAMtags.clear();
       bool seqOk = false;
 
-      // keep track of the *least* freqeuntly 
-      // occurring hit in this fragment to consider 
-      // alternative processing of fragments where 
-      // all observed hits have high frequency.
-      uint64_t alt_max_occ = 0;
-
       if (alevinOpts.protocol.end == bcEnd::FIVE ||
-          alevinOpts.protocol.end == bcEnd::THREE){
+          alevinOpts.protocol.end == bcEnd::THREE) {
         // If the barcode sequence could be extracted, then this is set to true,
         // but the barcode sequence itself may still be invalid (e.g. contain `N` characters).
         // However, if extracted_barcode is false here, there is no hope to even recover the
@@ -721,164 +555,15 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
 
             if (isUmiIdxOk) {
               jointHitGroup.setUMI(umiIdx.word(0));
-              bool rh = false;
-              std::string* readSubSeq = aut::getReadSequence(localProtocol, rp.first.seq, rp.second.seq, readBuffer);
-              rh = tooShortRight
-                       ? false
-                       : memCollector.get_raw_hits_sketch(*readSubSeq,
-                                                          qc,
-                                                          true, // isLeft
-                                                          false // verbose
-                         );
-              // if there were hits
-              if (rh) {
-                uint32_t num_valid_hits{0};
-                uint64_t total_occs{0};
-                uint64_t largest_occ{0};
-                auto& raw_hits = memCollector.get_left_hits();
-
-                // SANITY
-                decltype(raw_hits[0].first) prev_read_pos = -1;
-                // the maximum span the supporting k-mers of a
-                // mapping position are allowed to have.
-                // NOTE this is still > read_length b/c the stretch is measured wrt the
-                // START of the terminal k-mer.
-                int32_t signed_rl = static_cast<int32_t>(readSubSeq->length());
-                int32_t max_stretch = static_cast<int32_t>(signed_rl * 1.0);
-
-                // a raw hit is a pair of read_pos and a projected hit
-
-                // the least frequent hit for this fragment.
-                uint64_t min_occ = std::numeric_limits<uint64_t>::max();
-
-                // this is false by default and will be set to true
-                // if *every* collected hit for this fragment occurs
-                // salmonOpts.maxReadOccs times or more.
-                bool had_alt_max_occ = false;
-
-                auto collect_mappings_from_hits = [&max_stretch, &min_occ, &hit_map,
-                                                   &salmonOpts, &num_valid_hits, &total_occs,
-                                                   &largest_occ, &qidx, signed_rl, signed_k](
-                  auto& raw_hits, auto& prev_read_pos,
-                  auto& max_allowed_occ, auto& had_alt_max_occ
-                ) -> bool {
-                  for (auto& raw_hit : raw_hits) {
-                    auto& read_pos = raw_hit.first;
-                    auto& proj_hits = raw_hit.second;
-                    auto& refs = proj_hits.refRange;
-                    uint64_t num_occ = static_cast<uint64_t>(refs.size());
-                    min_occ = std::min(min_occ, num_occ);
-                    had_alt_max_occ = true;
-
-                    bool still_have_valid_target = false;
-                    prev_read_pos = read_pos;
-                    if (num_occ <= max_allowed_occ) {
-
-                      total_occs += num_occ;
-                      largest_occ = (num_occ > largest_occ) ? num_occ : largest_occ;
-                      float score_inc = 1.0;
-
-                      for (auto &pos_it : refs) {
-                        const auto& ref_pos_ori = proj_hits.decodeHit(pos_it);
-                        uint32_t tid = static_cast<uint32_t>(qidx->getRefId(pos_it.transcript_id()));
-                        int32_t pos = static_cast<int32_t>(ref_pos_ori.pos);
-                        bool ori = ref_pos_ori.isFW;
-                        auto& target = hit_map[tid];
-
-                        // Why >= here instead of == ?
-                        // Because hits can happen on the same target in both the forward
-                        // and rc orientations, it is possible that we start the loop with
-                        // the target having num_valid_hits hits in a given orientation (o)
-                        // we see a new hit for this target in oriention o (now it has num_valid_hits + 1)
-                        // then we see a hit for this target in orientation rc(o).  We still want to
-                        // add / consider this hit, but max_hits_for_target() > num_valid_hits.
-                        // So, we must allow for that here.
-                        if (target.max_hits_for_target() >= num_valid_hits) {
-                          if (ori) {
-                            target.add_fw(pos, static_cast<int32_t>(read_pos), 
-                                          signed_rl, signed_k, max_stretch, score_inc);
-                          } else {
-                            target.add_rc(pos, static_cast<int32_t>(read_pos), 
-                                          signed_rl, signed_k, max_stretch, score_inc);
-                          }
-
-                          still_have_valid_target |= (target.max_hits_for_target() >= num_valid_hits + 1);
-                        }
-
-                      } // DONE: for (auto &pos_it : refs)
-
-                      ++num_valid_hits;
-
-                      // if there are no targets reaching the valid hit threshold, then break early
-                      if (!still_have_valid_target) { return true; }
-
-                    } // DONE : if (num_occ <= max_allowed_occ)
-                  } // DONE : for (auto& raw_hit : raw_hits)
-
-                  return false;
-                };
-
-                bool _discard = false;
-                auto mao_first_pass = max_occ_default - 1;
-                bool early_stop = collect_mappings_from_hits(raw_hits, prev_read_pos, mao_first_pass, _discard);
-
-                // If our default threshold was too stringent, then fallback to a more liberal
-                // threshold and look up the k-mers that occur the least frequently.
-                // Specifically, if the min occuring hits have frequency < max_occ_recover (2500 by default)
-                // times, then collect the min occuring hits to get the mapping.
-                if (attempt_occ_recover and (min_occ >= max_occ_default) and (min_occ < max_occ_recover)) {
-                  prev_read_pos = -1;
-                  uint64_t max_allowed_occ = min_occ;
-                  early_stop = collect_mappings_from_hits(raw_hits, prev_read_pos, max_allowed_occ, had_alt_max_occ);
-                }
 
-                uint32_t best_alt_hits = 0;
-                int32_t signed_read_len = static_cast<int32_t>(readSubSeq->length());
-
-                for (auto& kv : hit_map) {
-                  auto best_hit_dir = kv.second.best_hit_direction();
-                  // if the best direction is FW or BOTH, add the fw hit
-                  // otherwise add the RC.
-                  auto simple_hit = (best_hit_dir != HitDirection::RC) ? 
-                                    kv.second.get_fw_hit() : kv.second.get_rc_hit();
-
-                  if (simple_hit.num_hits >= num_valid_hits) { 
-                    simple_hit.tid = kv.first;
-                    accepted_hits.emplace_back(simple_hit);
-                    // if we had equally good hits in both directions
-                    // add the rc hit here (since we added the fw)
-                    // above if the best hit was either FW or BOTH
-                    if (best_hit_dir == HitDirection::BOTH) {
-                      auto second_hit = kv.second.get_rc_hit();
-                      second_hit.tid = kv.first;
-                      accepted_hits.emplace_back(second_hit);
-                    }
-                  } else {
-                    //best_alt_score = simple_hit.score > best_alt_score ? simple_hit.score : best_alt_score;
-                    best_alt_hits = simple_hit.num_hits > best_alt_hits ? simple_hit.num_hits : best_alt_hits;
-                  }
-                }
-
-                alt_max_occ = had_alt_max_occ ? accepted_hits.size() : salmonOpts.maxReadOccs;
-
-                /*
-                 * This rule; if enabled, allows through mappings missing a single hit, if there
-                 * was no mapping with all hits. NOTE: this won't work with the current early-exit
-                 * optimization however.
-                if (accepted_hits.empty() and (num_valid_hits > 1) and (best_alt_hits >= num_valid_hits - 1)) {
-                  for (auto& kv : hit_map) { 
-                    auto simple_hit = kv.second.get_best_hit();
-                    if (simple_hit.num_hits >= best_alt_hits) {
-                      //if (simple_hit.valid_pos(signed_read_len, transcripts[kv.first].RefLength, 10)) {
-                        simple_hit.tid = kv.first;
-                        accepted_hits.emplace_back(simple_hit);
-                      //}
-                    }
-                  }
-                }
-                */
-              } // DONE : if (rh)
+              // get the subsequence for the alignable portion of the read
+              std::string* readSubSeq = aut::getReadSequence(localProtocol, rp.first.seq, rp.second.seq, readBuffer);
 
+              // try and map it
+              bool early_exit = salmon::mapping_utils::pasc::map_read(readSubSeq, map_cache_second, PairingStatus::UNPAIRED_RIGHT);
+              // in this case accepted_hits_ptr gets set to what we found
+              accepted_hits_ptr = &map_cache_second.accepted_hits;
+              alt_max_occ = map_cache_second.alt_max_occ;
             } else {
               nSeqs += 1;
             }
@@ -900,19 +585,20 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
       }
 
       // If the read mapped to > maxReadOccs places, discard it
-      if (accepted_hits.size() > alt_max_occ ) { 
-        accepted_hits.clear();
-      } else if (!accepted_hits.empty()) {
+      if (accepted_hits_ptr->size() > alt_max_occ) { 
+        accepted_hits_ptr->clear();
+      } else if (!accepted_hits_ptr->empty()) {
         mapType = salmon::utils::MappingType::SINGLE_MAPPED;
       }
 
+
       alevin::types::AlevinCellBarcodeKmer bck;
       bool barcode_ok = bck.fromChars(barcode);
  
       // NOTE: Think if we should put decoy mappings in the RAD file
       if (mapType == salmon::utils::MappingType::SINGLE_MAPPED) {
         // num aln
-        bw << static_cast<uint32_t>(accepted_hits.size()); 
+        bw << static_cast<uint32_t>(accepted_hits_ptr->size()); 
 
         // bc
         // if we can fit the barcode into an integer 
@@ -941,7 +627,7 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
           bw << umi;
         }
 
-        for (auto& aln : accepted_hits) {
+        for (auto& aln : (*accepted_hits_ptr)) {
           uint32_t fw_mask = aln.is_fw ? 0x80000000 : 0x00000000;
           bw << (aln.tid | fw_mask);
         }
@@ -969,9 +655,9 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
         bw << num_reads_in_chunk;
       }
 
-      validHits += accepted_hits.size();
-      localNumAssignedFragments += (accepted_hits.size() > 0);
-      localNumUniqueMappings += (accepted_hits.size() == 1) ? 1 : 0;
+      validHits += accepted_hits_ptr->size();
+      localNumAssignedFragments += (accepted_hits_ptr->size() > 0);
+      localNumUniqueMappings += (accepted_hits_ptr->size() == 1) ? 1 : 0;
       locRead++;
       ++numObservedFragments;
       jointHitGroup.clearAlignments();
@@ -1028,7 +714,7 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
   }
   numAssignedFragments += localNumAssignedFragments;
   numUniqueMappings += localNumUniqueMappings;
-  mstats.numDecoyFragments += numDecoyFrags;
+  mstats.numDecoyFragments += 0; // no decoys with pasc (yet?)
   readExp.updateShortFrags(shortFragStats);
 }
 



View it on GitLab: https://salsa.debian.org/med-team/salmon/-/commit/2fde979a6b705a0380d2428de0026ede19bd241d

-- 
View it on GitLab: https://salsa.debian.org/med-team/salmon/-/commit/2fde979a6b705a0380d2428de0026ede19bd241d
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20230308/623c51ef/attachment-0001.htm>


More information about the debian-med-commit mailing list