[med-svn] [Git][med-team/salmon][upstream] New upstream version 1.10.0+ds1
Andreas Tille (@tille)
gitlab at salsa.debian.org
Wed Mar 8 13:23:28 GMT 2023
Andreas Tille pushed to branch upstream at Debian Med / salmon
Commits:
2fde979a by Andreas Tille at 2023-03-08T13:46:24+01:00
New upstream version 1.10.0+ds1
- - - - -
17 changed files:
- CMakeLists.txt
- cmake/Modules/Findlibstadenio.cmake
- cmake/TestSalmonQuasi.cmake
- current_version.txt
- doc/source/conf.py
- docker/Dockerfile
- docker/build_test.sh
- include/AlevinOpts.hpp
- include/SalmonConfig.hpp
- include/SalmonMappingUtils.hpp
- include/Transcript.hpp
- scripts/fetchPufferfish.sh
- scripts/v1_10x/run.sh
- src/Alevin.cpp
- src/AlevinUtils.cpp
- src/CMakeLists.txt
- src/SalmonAlevin.cpp
Changes:
=====================================
CMakeLists.txt
=====================================
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.12)
+cmake_minimum_required(VERSION 3.15)
if(DEFINED ENV{CC})
set(CC $ENV{CC})
@@ -662,7 +662,7 @@ tar -xzvf v2021.5.tar.gz
SOURCE_DIR ${TBB_SOURCE_DIR}
INSTALL_DIR ${TBB_INSTALL_DIR}
PATCH_COMMAND "${TBB_PATCH_STEP}"
-CMAKE_ARGS -DCMAKE_CXX_FLAGS=${TBB_CXXFLAGS} -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> -DTBB_TEST=OFF -DTBB_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD}
+CMAKE_ARGS -DCMAKE_CXX_FLAGS=${TBB_CXXFLAGS} -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> -DTBB_TEST=OFF -DTBB_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD} -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
BUILD_IN_SOURCE TRUE
)
@@ -786,7 +786,7 @@ find_package(CURL)
if (FETCH_STADEN)
set(LIBSTADEN_FOUND FALSE)
else ()
- find_package(libstadenio)
+ find_package(libstadenio 1.14.15)
endif()
if (NOT LIBSTADENIO_FOUND)
@@ -794,12 +794,12 @@ if (NOT LIBSTADENIO_FOUND)
message("==================================================================")
externalproject_add(libstadenio
DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external
- DOWNLOAD_COMMAND curl -k -L https://github.com/COMBINE-lab/staden-io_lib/archive/v1.14.8.1.tar.gz -o staden-io_lib-v1.14.8.tar.gz &&
- ${SHASUM} f6f30eefa478cfb708f3109a35fb6ffa0e24951d9d971985df2cef5919dd0bc3 staden-io_lib-v1.14.8.tar.gz &&
- mkdir -p staden-io_lib-1.14.8 &&
- tar -xzf staden-io_lib-v1.14.8.tar.gz --strip-components=1 -C staden-io_lib-1.14.8 &&
+ DOWNLOAD_COMMAND curl -k -L https://github.com/jkbonfield/io_lib/releases/download/io_lib-1-14-15/io_lib-1.14.15.tar.gz -o staden-io_lib-v1.14.15.tar.gz &&
+ ${SHASUM} 20814c4365e1e2fe6630fb11d0df370dec4c5688af3871de7f1cb0129671401e staden-io_lib-v1.14.15.tar.gz &&
+ mkdir -p staden-io_lib-1.14.15 &&
+ tar -xzf staden-io_lib-v1.14.15.tar.gz --strip-components=1 -C staden-io_lib-1.14.15 &&
rm -fr staden-io_lib &&
- mv -f staden-io_lib-1.14.8 staden-io_lib
+ mv -f staden-io_lib-1.14.15 staden-io_lib
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/staden-io_lib
INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/install
CONFIGURE_COMMAND ./configure --enable-shared=no --without-libcurl --prefix=<INSTALL_DIR> LDFLAGS=${LIBSTADEN_LDFLAGS} CFLAGS=${LIBSTADEN_CFLAGS} CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER}
@@ -814,7 +814,7 @@ if (NOT LIBSTADENIO_FOUND)
endif()
set(FETCHED_STADEN TRUE)
- set(STADEN_LIBRARIES "${GAT_SOURCE_DIR}/external/install/lib/libstaden-read.a")
+ set(STADEN_LIBRARIES "${GAT_SOURCE_DIR}/external/install/lib/libstaden-read.a;${GAT_SOURCE_DIR}/external/install/lib/libhtscodecs.a")
endif()
if (ASAN_BUILD)
@@ -906,11 +906,20 @@ set(CPACK_SOURCE_IGNORE_FILES
message("CPACK_SOURCE_IGNORE_FILES = ${CPACK_SOURCE_IGNORE_FILES}")
+# we will use this property later
+define_property(TARGET PROPERTY COMPACT_VECTOR_DIR INHERITED
+ BRIEF_DOCS "the path to the directory containing the compact_vector include tree"
+ FULL_DOCS "the path to the directory containing the compact_vector include tree")
+
# Recurse into pufferfish source directory
# and build the library
set(BUILD_PUFF_FOR_SALMON TRUE)
add_subdirectory(external/pufferfish)
+# make sure we know the path to compact_vector
+get_property(COMPACT_VECTOR_INCLUDE_PATH TARGET graphdump PROPERTY COMPACT_VECTOR_DIR)
+message("fetched path for compact_vector as [${COMPACT_VECTOR_INCLUDE_PATH}]")
+
# and then the main salmon source directory
add_subdirectory(src)
=====================================
cmake/Modules/Findlibstadenio.cmake
=====================================
@@ -10,7 +10,10 @@ find_path(STADEN_INCLUDE_DIR io_lib
HINTS ${STADEN_ROOT} ENV STADEN_ROOT
PATH_SUFFIXES include)
-find_library(STADEN_LIBRARY NAMES staden-read libstaden-read
+find_library(STADEN_LIBRARY NAMES staden-read libstaden-read
+ HINTS ${STADEN_ROOT} ENV STADEN_ROOT PATH_SUFFIXES lib lib64)
+
+find_library(HTSCODEC_LIBRARY NAMES htscodecs libhtscodecs
HINTS ${STADEN_ROOT} ENV STADEN_ROOT PATH_SUFFIXES lib lib64)
if(STADEN_INCLUDE_DIR)
@@ -30,7 +33,7 @@ find_package_handle_standard_args(libstadenio DEFAULT_MSG
if (LIBSTADENIO_FOUND)
message(STATUS "Staden IOLib found (include: ${STADEN_INCLUDE_DIR})")
- set(STADEN_LIBRARIES ${STADEN_LIBRARY})
+ set(STADEN_LIBRARIES "${STADEN_LIBRARY};${HTSCODEC_LIBRARY}")
endif()
mark_as_advanced(STADEN_INCLUDE_DIR STADEN_LIBRARY)
=====================================
cmake/TestSalmonQuasi.cmake
=====================================
@@ -14,7 +14,10 @@ execute_process(COMMAND ${SALMON_QUASI_INDEX_CMD}
)
if (SALMON_QUASI_INDEX_RESULT)
- message(FATAL_ERROR "Error running ${SALMON_QUASI_INDEX_COMMAND}")
+ message(FATAL_ERROR
+ "While running: ${SALMON_QUASI_INDEX_CMD}\n"
+ "error: ${SALMON_QUASI_INDEX_RESULT}"
+ )
endif()
set(SALMON_QUANT_COMMAND ${CMAKE_BINARY_DIR}/salmon quant -i sample_salmon_quasi_index -l IU -1 reads_1.fastq -2 reads_2.fastq -o sample_salmon_quasi_quant)
@@ -23,7 +26,10 @@ execute_process(COMMAND ${SALMON_QUANT_COMMAND}
RESULT_VARIABLE SALMON_QUASI_QUANT_RESULT
)
if (SALMON_QUASI_QUANT_RESULT)
- message(FATAL_ERROR "Error running ${SALMON_QUASI_QUANT_RESULT}")
+ message(FATAL_ERROR
+ "While running: ${SALMON_QUANT_COMMAND}\n"
+ "error: ${SALMON_QUASI_QUANT_RESULT}"
+ )
endif()
if (EXISTS ${TOPLEVEL_DIR}/sample_data/sample_salmon_quasi_quant/quant.sf)
=====================================
current_version.txt
=====================================
@@ -1,3 +1,3 @@
VERSION_MAJOR 1
-VERSION_MINOR 9
+VERSION_MINOR 10
VERSION_PATCH 0
=====================================
doc/source/conf.py
=====================================
@@ -55,9 +55,9 @@ copyright = u'2013-2021, Rob Patro, Geet Duggal, Mike Love, Rafael Irizarry and
# built documents.
#
# The short X.Y version.
-version = '1.9'
+version = '1.10'
# The full version, including alpha/beta/rc tags.
-release = '1.9.0'
+release = '1.10.0'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
=====================================
docker/Dockerfile
=====================================
@@ -6,7 +6,7 @@ MAINTAINER salmon.maintainer at gmail.com
ENV PACKAGES git gcc make g++ libboost-all-dev liblzma-dev libbz2-dev \
ca-certificates zlib1g-dev libcurl4-openssl-dev curl unzip autoconf apt-transport-https ca-certificates gnupg software-properties-common wget
-ENV SALMON_VERSION 1.9.0
+ENV SALMON_VERSION 1.10.0
# salmon binary will be installed in /home/salmon/bin/salmon
=====================================
docker/build_test.sh
=====================================
@@ -1,3 +1,3 @@
#! /bin/bash
-SALMON_VERSION=1.9.0
+SALMON_VERSION=1.10.0
docker build --no-cache -t combinelab/salmon:${SALMON_VERSION} -t combinelab/salmon:latest .
=====================================
include/AlevinOpts.hpp
=====================================
@@ -13,6 +13,10 @@
enum class BarcodeEnd { FIVE = 5, THREE = 3 };
enum class Sequence { BARCODE, UMI };
+// refers to which reads to use for mapping; use both for 5' libraries
+enum class ReadsToMap { USE_BOTH, USE_FIRST, USE_SECOND };
+// refers to which reads map to transcripts and to which transcripts they map to
+enum class PairingStatus { UNPAIRED_LEFT, UNPAIRED_RIGHT, PAIRED_FR, PAIRED_RF };
template <class protocolT>
struct AlevinOpts {
@@ -82,11 +86,11 @@ struct AlevinOpts {
// initialize EM with uniform prior
bool initUniform;
//number of cells
- uint32_t numCells;
+ uint64_t numCells;
// minimum number of CB to use for low confidence region
- uint32_t lowRegionMinNumBarcodes;
+ uint64_t lowRegionMinNumBarcodes;
// maximum number of barcodes to use
- uint32_t maxNumBarcodes;
+ uint64_t maxNumBarcodes;
// number of bootstraps to perform
uint32_t numBootstraps;
// number of gibbs samples to perform
@@ -115,27 +119,27 @@ struct AlevinOpts {
boost::filesystem::path bfhFile;
//meta-info related tags
- uint32_t totalReads;
- uint32_t totalUsedReads;
- uint32_t readsThrown;
+ uint64_t totalReads;
+ uint64_t totalUsedReads;
+ uint64_t readsThrown;
- uint32_t totalCBs;
- uint32_t totalUsedCBs;
+ uint64_t totalCBs;
+ uint64_t totalUsedCBs;
- uint32_t kneeCutoff;
- uint32_t intelligentCutoff;
- uint32_t totalLowConfidenceCBs;
- uint32_t numFeatures;
- uint32_t numNoMapCB;
+ uint64_t kneeCutoff;
+ uint64_t intelligentCutoff;
+ uint64_t totalLowConfidenceCBs;
+ uint64_t numFeatures;
+ uint64_t numNoMapCB;
- uint32_t eqReads;
- uint32_t noisyUmis;
+ uint64_t eqReads;
+ uint64_t noisyUmis;
double mappingRate;
double keepCBFraction;
double vbemNorm;
- uint32_t totalDedupUMIs;
- uint32_t totalExpGenes;
+ uint64_t totalDedupUMIs;
+ uint64_t totalExpGenes;
};
#endif // ALEVIN_OPTS_HPP
=====================================
include/SalmonConfig.hpp
=====================================
@@ -26,9 +26,9 @@
namespace salmon {
constexpr char majorVersion[] = "1";
-constexpr char minorVersion[] = "9";
+constexpr char minorVersion[] = "10";
constexpr char patchVersion[] = "0";
-constexpr char version[] = "1.9.0";
+constexpr char version[] = "1.10.0";
constexpr uint32_t indexVersion = 5;
constexpr char requiredQuasiIndexVersion[] = "p7";
} // namespace salmon
=====================================
include/SalmonMappingUtils.hpp
=====================================
@@ -22,7 +22,6 @@
#ifndef __SALMON_MAPPING_UTILS__
#define __SALMON_MAPPING_UTILS__
-
#include <algorithm>
#include <atomic>
#include <cassert>
@@ -39,107 +38,122 @@
// Future C++ convenience classes
#include "core/range.hpp"
-#include "SalmonDefaults.hpp"
-#include "SalmonMath.hpp"
-#include "SalmonUtils.hpp"
-#include "Transcript.hpp"
+#include "AlevinOpts.hpp"
#include "AlignmentGroup.hpp"
#include "ProgramOptionsGenerator.hpp"
#include "ReadExperiment.hpp"
+#include "SalmonDefaults.hpp"
+#include "SalmonMath.hpp"
#include "SalmonOpts.hpp"
+#include "SalmonUtils.hpp"
+#include "Transcript.hpp"
-#include "pufferfish/Util.hpp"
-#include "pufferfish/MemCollector.hpp"
+#include "parallel_hashmap/phmap.h"
#include "pufferfish/MemChainer.hpp"
-#include "pufferfish/SAMWriter.hpp"
+#include "pufferfish/MemCollector.hpp"
#include "pufferfish/PuffAligner.hpp"
-#include "pufferfish/ksw2pp/KSW2Aligner.hpp"
-#include "pufferfish/metro/metrohash64.h"
+#include "pufferfish/SAMWriter.hpp"
#include "pufferfish/SelectiveAlignmentUtils.hpp"
+#include "pufferfish/Util.hpp"
#include "pufferfish/itlib/small_vector.hpp"
-#include "parallel_hashmap/phmap.h"
+#include "pufferfish/ksw2pp/KSW2Aligner.hpp"
+#include "pufferfish/metro/metrohash64.h"
namespace salmon {
- namespace mapping_utils {
+namespace mapping_utils {
- using MateStatus = pufferfish::util::MateStatus;
- constexpr const int32_t invalid_score_ = std::numeric_limits<int32_t>::min();
- constexpr const int32_t invalid_index_ = std::numeric_limits<int32_t>::min();
- constexpr const size_t static_vec_size = 32;
+using MateStatus = pufferfish::util::MateStatus;
+constexpr const int32_t invalid_score_ = std::numeric_limits<int32_t>::min();
+constexpr const int32_t invalid_index_ = std::numeric_limits<int32_t>::min();
+constexpr const size_t static_vec_size = 32;
- class MappingScoreInfo {
- public:
- MappingScoreInfo()
- : bestScore(invalid_score_), secondBestScore(invalid_score_),
- bestDecoyScore(invalid_score_), decoyThresh(1.0), collect_decoy_info_(false) {}
+class MappingScoreInfo {
+public:
+ MappingScoreInfo()
+ : bestScore(invalid_score_), secondBestScore(invalid_score_),
+ bestDecoyScore(invalid_score_), decoyThresh(1.0),
+ collect_decoy_info_(false) {}
- MappingScoreInfo(double decoyThreshIn) : MappingScoreInfo() {
- decoyThresh = decoyThreshIn;
- }
+ MappingScoreInfo(double decoyThreshIn) : MappingScoreInfo() {
+ decoyThresh = decoyThreshIn;
+ }
- void collect_decoys(bool do_collect) { collect_decoy_info_ = do_collect; }
- bool collect_decoys() const { return collect_decoy_info_; }
-
- // clear everything but the decoy threshold
- void clear(size_t num_hits) {
- bestScore = invalid_score_;
- secondBestScore = invalid_score_;
- bestDecoyScore = invalid_score_;
- scores_.clear(); scores_.resize(num_hits, invalid_score_);
- bestScorePerTranscript_.clear();
- perm_.clear();
- // NOTE: we do _not_ reset decoyThresh here
- if (collect_decoy_info_) {
- bool revert_to_static = best_decoy_hits.size() > static_vec_size;
- best_decoy_hits.clear();
- if (revert_to_static) { best_decoy_hits.revert_to_static(); }
- }
- }
+ void collect_decoys(bool do_collect) { collect_decoy_info_ = do_collect; }
+ bool collect_decoys() const { return collect_decoy_info_; }
- bool haveOnlyDecoyMappings() const {
- // if the best non-decoy mapping has score less than decoyThresh *
- // bestDecoyScore and if the bestDecoyScore is a valid value, then we
- // have no valid non-decoy mappings.
- return (bestScore < static_cast<int32_t>(decoyThresh * bestDecoyScore)) and
- (bestDecoyScore > std::numeric_limits<decltype(bestDecoyScore)>::min());
+ // clear everything but the decoy threshold
+ void clear(size_t num_hits) {
+ bestScore = invalid_score_;
+ secondBestScore = invalid_score_;
+ bestDecoyScore = invalid_score_;
+ scores_.clear();
+ scores_.resize(num_hits, invalid_score_);
+ bestScorePerTranscript_.clear();
+ perm_.clear();
+ // NOTE: we do _not_ reset decoyThresh here
+ if (collect_decoy_info_) {
+ bool revert_to_static = best_decoy_hits.size() > static_vec_size;
+ best_decoy_hits.clear();
+ if (revert_to_static) {
+ best_decoy_hits.revert_to_static();
}
-
- inline bool update_decoy_mappings(int32_t hitScore, size_t idx, uint32_t tid) {
- const bool better_score = hitScore > bestDecoyScore;
- if (hitScore > bestDecoyScore) {
- bestDecoyScore = hitScore;
- if (collect_decoy_info_) {
- best_decoy_hits.clear();
- best_decoy_hits.push_back(std::make_pair(static_cast<int32_t>(idx), static_cast<int32_t>(tid)));
- }
- } else if (collect_decoy_info_ and (hitScore == bestDecoyScore)){
- best_decoy_hits.push_back(std::make_pair(static_cast<int32_t>(idx), static_cast<int32_t>(tid)));
- }
- return better_score;
+ }
+ }
+
+ bool haveOnlyDecoyMappings() const {
+ // if the best non-decoy mapping has score less than decoyThresh *
+ // bestDecoyScore and if the bestDecoyScore is a valid value, then we
+ // have no valid non-decoy mappings.
+ return (bestScore < static_cast<int32_t>(decoyThresh * bestDecoyScore)) and
+ (bestDecoyScore >
+ std::numeric_limits<decltype(bestDecoyScore)>::min());
+ }
+
+ inline bool update_decoy_mappings(int32_t hitScore, size_t idx,
+ uint32_t tid) {
+ const bool better_score = hitScore > bestDecoyScore;
+ if (hitScore > bestDecoyScore) {
+ bestDecoyScore = hitScore;
+ if (collect_decoy_info_) {
+ best_decoy_hits.clear();
+ best_decoy_hits.push_back(std::make_pair(static_cast<int32_t>(idx),
+ static_cast<int32_t>(tid)));
}
+ } else if (collect_decoy_info_ and (hitScore == bestDecoyScore)) {
+ best_decoy_hits.push_back(
+ std::make_pair(static_cast<int32_t>(idx), static_cast<int32_t>(tid)));
+ }
+ return better_score;
+ }
- int32_t bestScore;
- int32_t secondBestScore;
- int32_t bestDecoyScore;
- double decoyThresh;
- itlib::small_vector<std::pair<int32_t, int32_t>> best_decoy_hits;
- bool collect_decoy_info_;
- std::vector<int32_t> scores_;
- phmap::flat_hash_map<uint32_t, std::pair<int32_t, int32_t>> bestScorePerTranscript_;
- std::vector<std::pair<int32_t, int32_t>> perm_;
- };
+ int32_t bestScore;
+ int32_t secondBestScore;
+ int32_t bestDecoyScore;
+ double decoyThresh;
+ itlib::small_vector<std::pair<int32_t, int32_t>> best_decoy_hits;
+ bool collect_decoy_info_;
+ std::vector<int32_t> scores_;
+ phmap::flat_hash_map<uint32_t, std::pair<int32_t, int32_t>>
+ bestScorePerTranscript_;
+ std::vector<std::pair<int32_t, int32_t>> perm_;
+};
template <typename IndexT>
-inline bool initMapperSettings(SalmonOpts& salmonOpts, MemCollector<IndexT>& memCollector, ksw2pp::KSW2Aligner& aligner,
- pufferfish::util::AlignmentConfig& aconf, pufferfish::util::MappingConstraintPolicy& mpol) {
+inline bool
+initMapperSettings(SalmonOpts& salmonOpts, MemCollector<IndexT>& memCollector,
+ ksw2pp::KSW2Aligner& aligner,
+ pufferfish::util::AlignmentConfig& aconf,
+ pufferfish::util::MappingConstraintPolicy& mpol) {
memCollector.configureMemClusterer(salmonOpts.maxOccsPerHit);
- double consensusFraction = (salmonOpts.consensusSlack == 0.0) ? 1.0 : (1.0 - salmonOpts.consensusSlack);
+ double consensusFraction = (salmonOpts.consensusSlack == 0.0)
+ ? 1.0
+ : (1.0 - salmonOpts.consensusSlack);
memCollector.setConsensusFraction(consensusFraction);
memCollector.setHitFilterPolicy(salmonOpts.hitFilterPolicy);
memCollector.setAltSkip(salmonOpts.mismatchSeedSkip);
memCollector.setChainSubOptThresh(salmonOpts.pre_merge_chain_sub_thresh);
- //Initialize ksw aligner
+ // Initialize ksw aligner
ksw2pp::KSW2Config config;
config.dropoff = -1;
config.gapo = salmonOpts.gapOpenPenalty;
@@ -166,14 +180,17 @@ inline bool initMapperSettings(SalmonOpts& salmonOpts, MemCollector<IndexT>& mem
aconf.mimicBT2 = salmonOpts.mimicBT2;
aconf.mimicBT2Strict = salmonOpts.mimicStrictBT2;
aconf.allowOverhangSoftclip = salmonOpts.softclipOverhangs;
- aconf.allowSoftclip = salmonOpts.softclip;
+ aconf.allowSoftclip = salmonOpts.softclip;
aconf.useAlignmentCache = !salmonOpts.disableAlignmentCache;
aconf.alignmentMode = pufferfish::util::PuffAlignmentMode::SCORE_ONLY;
- // we actually care about the softclips in the cigar string
- // if we are writing output and softclipping (or softclipping of overhangs) is enabled
- if ( (!salmonOpts.qmFileName.empty()) and (salmonOpts.softclip or salmonOpts.softclipOverhangs) ) {
- aconf.alignmentMode = pufferfish::util::PuffAlignmentMode::APPROXIMATE_CIGAR;
+ // we actually care about the softclips in the cigar string
+ // if we are writing output and softclipping (or softclipping of overhangs) is
+ // enabled
+ if ((!salmonOpts.qmFileName.empty()) and
+ (salmonOpts.softclip or salmonOpts.softclipOverhangs)) {
+ aconf.alignmentMode =
+ pufferfish::util::PuffAlignmentMode::APPROXIMATE_CIGAR;
}
mpol.noOrphans = !salmonOpts.allowOrphans;
@@ -189,25 +206,26 @@ inline bool initMapperSettings(SalmonOpts& salmonOpts, MemCollector<IndexT>& mem
mpol.setPostMergeChainSubThresh(salmonOpts.post_merge_chain_sub_thresh);
mpol.setOrphanChainSubThresh(salmonOpts.orphan_chain_sub_thresh);
-
+
return true;
}
-
-inline void updateRefMappings(uint32_t tid, int32_t hitScore, bool isCompat, size_t idx,
- const std::vector<Transcript>& transcripts,
- int32_t invalidScore,
- salmon::mapping_utils::MappingScoreInfo& msi) {
+inline void updateRefMappings(uint32_t tid, int32_t hitScore, bool isCompat,
+ size_t idx,
+ const std::vector<Transcript>& transcripts,
+ int32_t invalidScore,
+ salmon::mapping_utils::MappingScoreInfo& msi) {
auto& scores = msi.scores_;
scores[idx] = hitScore;
auto& t = transcripts[tid];
bool isDecoy = t.isDecoy();
- double decoyCutoff = static_cast<int32_t>(msi.decoyThresh * msi.bestDecoyScore);
+ double decoyCutoff =
+ static_cast<int32_t>(msi.decoyThresh * msi.bestDecoyScore);
- //if (hitScore < decoyCutoff or (hitScore == invalidScore)) { }
+ // if (hitScore < decoyCutoff or (hitScore == invalidScore)) { }
if (isDecoy) {
- // NOTE: decide here if we need to process any of this if the
+ // NOTE: decide here if we need to process any of this if the
// current score is < the best (non-decoy) score. I think not.
// if this is a decoy and its score is better than the best decoy score
@@ -215,10 +233,10 @@ inline void updateRefMappings(uint32_t tid, int32_t hitScore, bool isCompat, siz
(void)did_update;
return;
} else if (hitScore < decoyCutoff or (hitScore == invalidScore)) {
- // if the current score is to a valid target but doesn't
+ // if the current score is to a valid target but doesn't
// exceed the necessary decoy threshold, then skip it.
return;
- }
+ }
// otherwise, we have a "high-scoring" hit to a non-decoy
auto& perm = msi.perm_;
@@ -230,7 +248,8 @@ inline void updateRefMappings(uint32_t tid, int32_t hitScore, bool isCompat, siz
// this is the current best
bestScorePerTranscript[tid].first = hitScore;
bestScorePerTranscript[tid].second = idx;
- } else if ((hitScore > it->second.first) or (hitScore == it->second.first and isCompat)) {
+ } else if ((hitScore > it->second.first) or
+ (hitScore == it->second.first and isCompat)) {
// otherwise, if we had an alignment for this transcript and it's
// better than the current best, then set the best score to this
// alignment's score, and invalidate the previous alignment
@@ -241,7 +260,7 @@ inline void updateRefMappings(uint32_t tid, int32_t hitScore, bool isCompat, siz
// otherwise, there is already a better mapping for this transcript.
scores[idx] = invalidScore;
}
-
+
if (hitScore > msi.bestScore) {
msi.secondBestScore = msi.bestScore;
msi.bestScore = hitScore;
@@ -249,44 +268,50 @@ inline void updateRefMappings(uint32_t tid, int32_t hitScore, bool isCompat, siz
perm.push_back(std::make_pair(idx, tid));
}
-
inline void filterAndCollectAlignments(
- std::vector<pufferfish::util::JointMems>& jointHits,
- uint32_t readLen,
- uint32_t mateLen,
- bool singleEnd,
- bool tryAlign,
- bool hardFilter,
- double scoreExp,
- double minAlnProb,
- salmon::mapping_utils::MappingScoreInfo& msi,
- std::vector<pufferfish::util::QuasiAlignment>& jointAlignments) {
+ std::vector<pufferfish::util::JointMems>& jointHits, uint32_t readLen,
+ uint32_t mateLen, bool singleEnd, bool tryAlign, bool hardFilter,
+ double scoreExp, double minAlnProb,
+ salmon::mapping_utils::MappingScoreInfo& msi,
+ std::vector<pufferfish::util::QuasiAlignment>& jointAlignments) {
auto invalidScore = std::numeric_limits<decltype(msi.bestDecoyScore)>::min();
- if (msi.bestDecoyScore == invalidScore) { msi.bestDecoyScore = invalidScore + 1 ;}
+ if (msi.bestDecoyScore == invalidScore) {
+ msi.bestDecoyScore = invalidScore + 1;
+ }
- int32_t decoyThreshold = static_cast<decltype(msi.bestDecoyScore)>(msi.decoyThresh * msi.bestDecoyScore);
+ int32_t decoyThreshold = static_cast<decltype(msi.bestDecoyScore)>(
+ msi.decoyThresh * msi.bestDecoyScore);
- //auto filterScore = (bestDecoyScore < secondBestScore) ? secondBestScore : bestDecoyScore;
+ // auto filterScore = (bestDecoyScore < secondBestScore) ? secondBestScore :
+ // bestDecoyScore;
auto& scores = msi.scores_;
auto& perm = msi.perm_;
// throw away any pairs for which we should not produce valid alignments :
// ======
- // If we are doing soft-filtering (default), we remove those not exceeding the bestDecoyScore
- // If we are doing hard-filtering, we remove those less than the bestScore
- perm.erase(std::remove_if(perm.begin(), perm.end(),
- [&scores, hardFilter, &msi, decoyThreshold, invalidScore](const std::pair<int32_t, int32_t>& idxtid) -> bool {
- return !hardFilter ? scores[idxtid.first] < decoyThreshold : scores[idxtid.first] < msi.bestScore;
- //return !hardFilter ? scores[idxtid.first] < filterScore : scores[idxtid.first] < bestScore;
- }), perm.end());
-
- // Unlike RapMap, pufferfish doesn't guarantee the hits computed above are in order
- // by transcript, so we find the permutation of indices that puts things in transcript
- // order.
- std::sort(perm.begin(), perm.end(), [](const std::pair<int32_t, int32_t>& p1,
- const std::pair<int32_t, int32_t>& p2) {
- return p1.second < p2.second;});
+ // If we are doing soft-filtering (default), we remove those not exceeding the
+ // bestDecoyScore If we are doing hard-filtering, we remove those less than
+ // the bestScore
+ perm.erase(std::remove_if(
+ perm.begin(), perm.end(),
+ [&scores, hardFilter, &msi, decoyThreshold, invalidScore](
+ const std::pair<int32_t, int32_t>& idxtid) -> bool {
+ return !hardFilter ? scores[idxtid.first] < decoyThreshold
+ : scores[idxtid.first] < msi.bestScore;
+ // return !hardFilter ? scores[idxtid.first] < filterScore :
+ // scores[idxtid.first] < bestScore;
+ }),
+ perm.end());
+
+ // Unlike RapMap, pufferfish doesn't guarantee the hits computed above are in
+ // order by transcript, so we find the permutation of indices that puts things
+ // in transcript order.
+ std::sort(perm.begin(), perm.end(),
+ [](const std::pair<int32_t, int32_t>& p1,
+ const std::pair<int32_t, int32_t>& p2) {
+ return p1.second < p2.second;
+ });
// moving our alinged / score jointMEMs over to QuasiAlignment objects
double bestScoreD = static_cast<double>(msi.bestScore);
@@ -298,27 +323,33 @@ inline void filterAndCollectAlignments(
double currScore = scores[ctr];
double v = bestScoreD - currScore;
// why -1?
- double estAlnProb = hardFilter ? -1.0 : std::exp(- scoreExp * v );
+ double estAlnProb = hardFilter ? -1.0 : std::exp(-scoreExp * v);
// skip any alignment with aln prob < minAlnProb
- if (!hardFilter and (estAlnProb < minAlnProb)) { continue; }
+ if (!hardFilter and (estAlnProb < minAlnProb)) {
+ continue;
+ }
if (singleEnd or jointHit.isOrphan()) {
readLen = jointHit.isLeftAvailable() ? readLen : mateLen;
- jointAlignments.emplace_back(tid, // reference id
- jointHit.orphanClust()->getTrFirstHitPos(), // reference pos
- jointHit.orphanClust()->isFw, // fwd direction
- readLen, // read length
- jointHit.orphanClust()->cigar, // cigar string
- jointHit.fragmentLen, // fragment length
- false);
- auto &qaln = jointAlignments.back();
+ jointAlignments.emplace_back(
+ tid, // reference id
+ jointHit.orphanClust()->getTrFirstHitPos(), // reference pos
+ jointHit.orphanClust()->isFw, // fwd direction
+ readLen, // read length
+ jointHit.orphanClust()->cigar, // cigar string
+ jointHit.fragmentLen, // fragment length
+ false);
+ auto& qaln = jointAlignments.back();
// NOTE : score should not be filled in from a double
- qaln.score = !tryAlign ? static_cast<int32_t >(jointHit.orphanClust()->coverage):jointHit.alignmentScore;
+ qaln.score = !tryAlign
+ ? static_cast<int32_t>(jointHit.orphanClust()->coverage)
+ : jointHit.alignmentScore;
qaln.estAlnProb(estAlnProb);
// NOTE : wth is numHits?
- qaln.numHits = static_cast<uint32_t >(jointHits.size());//orphanClust()->coverage;
+ qaln.numHits =
+ static_cast<uint32_t>(jointHits.size()); // orphanClust()->coverage;
qaln.mateStatus = jointHit.mateStatus;
- if(singleEnd) {
+ if (singleEnd) {
qaln.mateLen = readLen;
qaln.mateCigar.clear();
qaln.matePos = 0;
@@ -327,116 +358,732 @@ inline void filterAndCollectAlignments(
qaln.mateStatus = MateStatus::SINGLE_END;
}
} else {
- jointAlignments.emplace_back(tid, // reference id
- jointHit.leftClust->getTrFirstHitPos(), // reference pos
- jointHit.leftClust->isFw, // fwd direction
- readLen, // read length
- jointHit.leftClust->cigar, // cigar string
- jointHit.fragmentLen, // fragment length
- true); // properly paired
+ jointAlignments.emplace_back(
+ tid, // reference id
+ jointHit.leftClust->getTrFirstHitPos(), // reference pos
+ jointHit.leftClust->isFw, // fwd direction
+ readLen, // read length
+ jointHit.leftClust->cigar, // cigar string
+ jointHit.fragmentLen, // fragment length
+ true); // properly paired
// Fill in the mate info
- auto &qaln = jointAlignments.back();
+ auto& qaln = jointAlignments.back();
qaln.mateLen = mateLen;
qaln.mateCigar = jointHit.rightClust->cigar;
- qaln.matePos = static_cast<int32_t >(jointHit.rightClust->getTrFirstHitPos());
+ qaln.matePos =
+ static_cast<int32_t>(jointHit.rightClust->getTrFirstHitPos());
qaln.mateIsFwd = jointHit.rightClust->isFw;
qaln.mateStatus = MateStatus::PAIRED_END_PAIRED;
// NOTE : wth is numHits?
- qaln.numHits = static_cast<uint32_t >(jointHits.size());
+ qaln.numHits = static_cast<uint32_t>(jointHits.size());
// NOTE : score should not be filled in from a double
- qaln.score = !tryAlign ? static_cast<int32_t >(jointHit.leftClust->coverage):jointHit.alignmentScore;
+ qaln.score = !tryAlign
+ ? static_cast<int32_t>(jointHit.leftClust->coverage)
+ : jointHit.alignmentScore;
qaln.estAlnProb(estAlnProb);
- qaln.mateScore = !tryAlign ? static_cast<int32_t >(jointHit.rightClust->coverage):jointHit.mateAlignmentScore;
+ qaln.mateScore = !tryAlign
+ ? static_cast<int32_t>(jointHit.rightClust->coverage)
+ : jointHit.mateAlignmentScore;
}
}
// done moving our alinged / score jointMEMs over to QuasiAlignment objects
}
-
inline void filterAndCollectAlignmentsDecoy(
- std::vector<pufferfish::util::JointMems>& jointHits,
- uint32_t readLen,
- uint32_t mateLen,
- bool singleEnd,
- bool tryAlign,
- bool hardFilter,
- double scoreExp,
- double minAlnProb,
- salmon::mapping_utils::MappingScoreInfo& msi,
- std::vector<pufferfish::util::QuasiAlignment>& jointAlignments) {
-// NOTE: this function should only be called in the case that we have valid decoy mappings to report.
-// Currently, this happens only when there are *no valid non-decoy* mappings.
-// Further, this function will only add equally *best* decoy mappings to the output jointAlignments object
-// regardless of the the status of hardFilter (i.e. no sub-optimal decoy mappings will be reported).
-(void) hardFilter;
-(void) minAlnProb;
-(void) scoreExp;
-double estAlnProb = 1.0; //std::exp(-scoreExp * 0.0);
-for (auto& idxTxp : msi.best_decoy_hits) {
- int32_t ctr = idxTxp.first;
- int32_t tid = idxTxp.second;
- auto& jointHit = jointHits[ctr];
-
- if (singleEnd or jointHit.isOrphan()) {
- readLen = jointHit.isLeftAvailable() ? readLen : mateLen;
- jointAlignments.emplace_back(
- tid, // reference id
- jointHit.orphanClust()->getTrFirstHitPos(), // reference pos
- jointHit.orphanClust()->isFw, // fwd direction
- readLen, // read length
- jointHit.orphanClust()->cigar, // cigar string
- jointHit.fragmentLen, // fragment length
- false);
- auto& qaln = jointAlignments.back();
- // NOTE : score should not be filled in from a double
- qaln.score = !tryAlign
- ? static_cast<int32_t>(jointHit.orphanClust()->coverage)
- : jointHit.alignmentScore;
- qaln.estAlnProb(estAlnProb);
- // NOTE : wth is numHits?
- qaln.numHits =
- static_cast<uint32_t>(jointHits.size()); // orphanClust()->coverage;
- qaln.mateStatus = jointHit.mateStatus;
- if (singleEnd) {
- qaln.mateLen = readLen;
- qaln.mateCigar.clear();
- qaln.matePos = 0;
- qaln.mateIsFwd = true;
- qaln.mateScore = 0;
- qaln.mateStatus = MateStatus::SINGLE_END;
+ std::vector<pufferfish::util::JointMems>& jointHits, uint32_t readLen,
+ uint32_t mateLen, bool singleEnd, bool tryAlign, bool hardFilter,
+ double scoreExp, double minAlnProb,
+ salmon::mapping_utils::MappingScoreInfo& msi,
+ std::vector<pufferfish::util::QuasiAlignment>& jointAlignments) {
+ // NOTE: this function should only be called in the case that we have valid
+ // decoy mappings to report. Currently, this happens only when there are *no
+ // valid non-decoy* mappings. Further, this function will only add equally
+ // *best* decoy mappings to the output jointAlignments object regardless of
+ // the the status of hardFilter (i.e. no sub-optimal decoy mappings will be
+ // reported).
+ (void)hardFilter;
+ (void)minAlnProb;
+ (void)scoreExp;
+ double estAlnProb = 1.0; // std::exp(-scoreExp * 0.0);
+ for (auto& idxTxp : msi.best_decoy_hits) {
+ int32_t ctr = idxTxp.first;
+ int32_t tid = idxTxp.second;
+ auto& jointHit = jointHits[ctr];
+
+ if (singleEnd or jointHit.isOrphan()) {
+ readLen = jointHit.isLeftAvailable() ? readLen : mateLen;
+ jointAlignments.emplace_back(
+ tid, // reference id
+ jointHit.orphanClust()->getTrFirstHitPos(), // reference pos
+ jointHit.orphanClust()->isFw, // fwd direction
+ readLen, // read length
+ jointHit.orphanClust()->cigar, // cigar string
+ jointHit.fragmentLen, // fragment length
+ false);
+ auto& qaln = jointAlignments.back();
+ // NOTE : score should not be filled in from a double
+ qaln.score = !tryAlign
+ ? static_cast<int32_t>(jointHit.orphanClust()->coverage)
+ : jointHit.alignmentScore;
+ qaln.estAlnProb(estAlnProb);
+ // NOTE : wth is numHits?
+ qaln.numHits =
+ static_cast<uint32_t>(jointHits.size()); // orphanClust()->coverage;
+ qaln.mateStatus = jointHit.mateStatus;
+ if (singleEnd) {
+ qaln.mateLen = readLen;
+ qaln.mateCigar.clear();
+ qaln.matePos = 0;
+ qaln.mateIsFwd = true;
+ qaln.mateScore = 0;
+ qaln.mateStatus = MateStatus::SINGLE_END;
+ }
+ } else {
+ jointAlignments.emplace_back(
+ tid, // reference id
+ jointHit.leftClust->getTrFirstHitPos(), // reference pos
+ jointHit.leftClust->isFw, // fwd direction
+ readLen, // read length
+ jointHit.leftClust->cigar, // cigar string
+ jointHit.fragmentLen, // fragment length
+ true); // properly paired
+ // Fill in the mate info
+ auto& qaln = jointAlignments.back();
+ qaln.mateLen = mateLen;
+ qaln.mateCigar = jointHit.rightClust->cigar;
+ qaln.matePos =
+ static_cast<int32_t>(jointHit.rightClust->getTrFirstHitPos());
+ qaln.mateIsFwd = jointHit.rightClust->isFw;
+ qaln.mateStatus = MateStatus::PAIRED_END_PAIRED;
+ // NOTE : wth is numHits?
+ qaln.numHits = static_cast<uint32_t>(jointHits.size());
+ // NOTE : score should not be filled in from a double
+ qaln.score = !tryAlign
+ ? static_cast<int32_t>(jointHit.leftClust->coverage)
+ : jointHit.alignmentScore;
+ qaln.estAlnProb(estAlnProb);
+ qaln.mateScore = !tryAlign
+ ? static_cast<int32_t>(jointHit.rightClust->coverage)
+ : jointHit.mateAlignmentScore;
}
- } else {
- jointAlignments.emplace_back(
- tid, // reference id
- jointHit.leftClust->getTrFirstHitPos(), // reference pos
- jointHit.leftClust->isFw, // fwd direction
- readLen, // read length
- jointHit.leftClust->cigar, // cigar string
- jointHit.fragmentLen, // fragment length
- true); // properly paired
- // Fill in the mate info
- auto& qaln = jointAlignments.back();
- qaln.mateLen = mateLen;
- qaln.mateCigar = jointHit.rightClust->cigar;
- qaln.matePos =
- static_cast<int32_t>(jointHit.rightClust->getTrFirstHitPos());
- qaln.mateIsFwd = jointHit.rightClust->isFw;
- qaln.mateStatus = MateStatus::PAIRED_END_PAIRED;
- // NOTE : wth is numHits?
- qaln.numHits = static_cast<uint32_t>(jointHits.size());
- // NOTE : score should not be filled in from a double
- qaln.score = !tryAlign ? static_cast<int32_t>(jointHit.leftClust->coverage)
- : jointHit.alignmentScore;
- qaln.estAlnProb(estAlnProb);
- qaln.mateScore = !tryAlign
- ? static_cast<int32_t>(jointHit.rightClust->coverage)
- : jointHit.mateAlignmentScore;
+ } // end for over best decoy hits
+}
+
+namespace pasc {
+
+constexpr int32_t invalid_frag_len = std::numeric_limits<int32_t>::min();
+constexpr int32_t invalid_mate_pos = std::numeric_limits<int32_t>::min();
+
+/* from pesc
+struct simple_hit {
+ bool is_fw{false};
+ bool mate_is_fw{false};
+ int32_t pos{-1};
+ float score{0.0};
+ uint32_t num_hits{0};
+ uint32_t tid{std::numeric_limits<uint32_t>::max()};
+ int32_t mate_pos{std::numeric_limits<int32_t>::max()};
+ int32_t fragment_length{std::numeric_limits<int32_t>::max()};
+ inline bool valid_pos(int32_t read_len, uint32_t txp_len, int32_t max_over)
+{ int32_t signed_txp_len = static_cast<int32_t>(txp_len); return (pos >
+-max_over) and ((pos + read_len) < (signed_txp_len + max_over));
+ }
+ inline bool has_mate() const { return mate_pos != invalid_mate_pos; }
+ inline bool mate_is_mapped() const { return mate_pos != invalid_mate_pos; }
+ inline int32_t frag_len() const {
+ return (fragment_length != invalid_frag_len) ? fragment_length : 0;
+ }
+};
+*/
+struct simple_hit {
+ bool is_fw{false};
+ int32_t pos{-1};
+ float score{0.0};
+ uint32_t num_hits{0};
+ uint32_t tid{std::numeric_limits<uint32_t>::max()};
+ // int32_t mate_pos{std::numeric_limits<int32_t>::max()};
+ // int32_t fragment_length{std::numeric_limits<int32_t>::max()};
+
+ // UNPAIRED_LEFT, UNPAIRED_RIGHT, PAIRED_FR, PAIRED_RF
+ PairingStatus pairing_status{PairingStatus::UNPAIRED_RIGHT};
+
+ bool valid_pos(int32_t read_len, uint32_t txp_len, int32_t max_over) {
+ int32_t signed_txp_len = static_cast<int32_t>(txp_len);
+ return (pos > -max_over) and
+ ((pos + read_len) < (signed_txp_len + max_over));
+ }
+
+ bool operator<(const simple_hit& hit2) {
+ if (tid != hit2.tid) { // hit1 and hit2 are on different transcripts
+ return tid < hit2.tid;
+ } else { // hit1 and hit2 are on the same transcript
+ // NOTE: Is this what we actually want?
+ if (is_fw) { // fw < rc
+ return true;
+ } else {
+ return false;
+ }
+ }
+ }
+};
+
+enum class HitDirection : uint8_t { FW, RC, BOTH };
+
+/*
+struct sketch_hit_info {
+ // add a hit to the current target that occurs in the forward
+ // orientation with respect to the target.
+ bool add_fw(int32_t ref_pos, int32_t read_pos, int32_t rl, int32_t k,
+ int32_t max_stretch, float score_inc) {
+ (void)rl;
+ (void)k;
+ bool added{false};
+
+ // since hits are collected by moving _forward_ in the
+ // read, if this is a fw hit, it should be moving
+ // forward in the reference. Only add it if this is
+ // the case. This ensure that we don't
+ // double-count a k-mer that might occur twice on
+ // this target.
+ if (ref_pos > last_ref_pos_fw and read_pos > last_read_pos_fw) {
+ if (last_read_pos_fw == -1) {
+ approx_pos_fw = ref_pos - read_pos;
+ } else {
+ if ((ref_pos - approx_pos_fw) > max_stretch) {
+ return false;
+ }
+ }
+ // if (last_ref_pos_fw > -1 and (ref_pos > last_ref_pos_fw + 15)) { return
+ // false; }
+ last_ref_pos_fw = ref_pos;
+ last_read_pos_fw = read_pos;
+ fw_score += score_inc;
+ ++fw_hits;
+ added = true;
+ }
+ return added;
+ }
+
+ // add a hit to the current target that occurs in the forward
+ // orientation with respect to the target.
+ bool add_rc(int32_t ref_pos, int32_t read_pos, int32_t rl, int32_t k,
+ int32_t max_stretch, float score_inc) {
+
+ bool added{false};
+ // since hits are collected by moving _forward_ in the
+ // read, if this is an rc hit, it should be moving
+ // backwards in the reference. Only add it if this is
+ // the case.
+ // This ensures that we don't double-count a k-mer that
+ // might occur twice on this target.
+ if (ref_pos < last_ref_pos_rc and read_pos > last_read_pos_rc) {
+ approx_pos_rc = (ref_pos - (rl - (read_pos + k)));
+ if (last_read_pos_rc == -1) {
+ approx_end_pos_rc = ref_pos + read_pos;
+ } else {
+ if (approx_end_pos_rc - approx_pos_rc > max_stretch) {
+ return false;
+ }
+ }
+ // if (last_ref_pos_rc > -1 and ref_pos < last_ref_pos_rc - 15) { return
+ // false; }
+ last_ref_pos_rc = ref_pos;
+ last_read_pos_rc = read_pos;
+ rc_score += score_inc;
+ ++rc_hits;
+ added = true;
+ }
+ return added;
}
-} // end for over best decoy hits
+
+ inline uint32_t max_hits_for_target() { return std::max(fw_hits, rc_hits); }
+
+ // true if forward, false if rc
+ // second element is score
+ inline HitDirection best_hit_direction() {
+ int32_t fw_minus_rc =
+ static_cast<int32_t>(fw_hits) - static_cast<int32_t>(rc_hits);
+ return (fw_minus_rc > 0)
+ ? HitDirection::FW
+ : ((fw_minus_rc < 0) ? HitDirection::RC : HitDirection::BOTH);
+ }
+
+ inline simple_hit get_fw_hit(PairingStatus ps) {
+ return simple_hit{true,
+ approx_pos_fw,
+ fw_score,
+ fw_hits,
+ std::numeric_limits<uint32_t>::max(),
+ ps};
+ }
+
+ inline simple_hit get_rc_hit(PairingStatus ps) {
+ return simple_hit{false,
+ approx_pos_rc,
+ rc_score,
+ rc_hits,
+ std::numeric_limits<uint32_t>::max(),
+ ps};
+ }
+
+ inline std::string to_string() {
+ std::stringstream ss;
+ ss << "fw_hits: " << fw_hits << ", fw_score : " << fw_score
+ << ", fw_pos : " << approx_pos_fw << " || rc_hits: " << rc_hits
+ << ", rc_score: " << rc_score << ", rc_pos: " << approx_pos_rc;
+ return ss.str();
+ }
+
+ int32_t last_read_pos_fw{-1};
+ int32_t last_read_pos_rc{-1};
+
+ int32_t last_ref_pos_fw{-1};
+ int32_t last_ref_pos_rc{std::numeric_limits<int32_t>::max()};
+
+ int32_t approx_pos_fw{-1};
+ int32_t approx_pos_rc{-1};
+ int32_t approx_end_pos_rc{-1};
+
+ uint32_t fw_hits{0};
+ uint32_t rc_hits{0};
+ float fw_score{0.0};
+ float rc_score{0.0};
+};*/
+
+struct sketch_hit_info {
+ // add a hit to the current target that occurs in the forward
+ // orientation with respect to the target.
+ bool add_fw(int32_t ref_pos, int32_t read_pos, int32_t rl, int32_t k, int32_t max_stretch,
+ float score_inc) {
+ (void)rl;
+ (void)k;
+ bool added{false};
+
+ // since hits are collected by moving _forward_ in the
+ // read, if this is a fw hit, it should be moving
+ // forward in the reference. Only add it if this is
+ // the case. This ensure that we don't
+ // double-count a k-mer that might occur twice on
+ // this target.
+ if (ref_pos > last_ref_pos_fw and read_pos > last_read_pos_fw) {
+ if (last_read_pos_fw == -1) {
+ approx_pos_fw = ref_pos - read_pos;
+ } else {
+ if ((ref_pos - approx_pos_fw) > max_stretch) { return false; }
+ }
+ last_ref_pos_fw = ref_pos;
+ last_read_pos_fw = read_pos;
+ fw_score += score_inc;
+ ++fw_hits;
+ added = true;
+ }
+ return added;
+ }
+
+ // add a hit to the current target that occurs in the forward
+ // orientation with respect to the target.
+ bool add_rc(int32_t ref_pos, int32_t read_pos, int32_t rl, int32_t k, int32_t max_stretch,
+ float score_inc) {
+ bool added{false};
+ // since hits are collected by moving _forward_ in the
+ // read, if this is an rc hit, it should be moving
+ // backwards in the reference.
+ // In general, we only add the hit if this is the case.
+ // This ensures that we don't double-count a k-mer that
+ // might occur twice on this target.
+
+ // we have a special case here; what if the same exact
+ // k-mer (i.e. not just the same sequence but same position
+ // on the query) occurs more than one time on this refernece?
+ //
+ // In that case, the GENERAL case code will already have
+ // processed and seen a k-mer with the read position
+ // equal to `read_pos`. In the case below, we see
+ // a hit with the *same* read pos again (one or more times).
+ //
+ // Here, we swap out the previous hit having this read_pos
+ // if the position of the current hit on the read is
+ // the same and the position on the reference is greater
+ // (this is a heuristic to help in the case of tandem repeats or
+ // highly-repetitive subsequence).
+ // NOTE: consider if a similar heuristic should be
+ // adopted for the forward case.
+ if ((read_pos == last_read_pos_rc) and (ref_pos > last_ref_pos_rc) and
+ (ref_pos < rightmost_bound_rc)) {
+
+ last_ref_pos_rc = ref_pos;
+ // if the read_pos was the same as the first read pos
+ // then also update the approx_end_pos_rc accordingly
+ // NOTE: for the time being don't mess with this position
+ // empirically this does better, but if we really want
+ // to optimize this for accuracy we need a better general
+ // heuristic.
+ // if (read_pos == first_read_pos_rc) {
+ // approx_end_pos_rc = ref_pos + read_pos;
+ //}
+
+ return added;
+ }
+
+ // GENERAL case
+ if (ref_pos < last_ref_pos_rc and read_pos > last_read_pos_rc) {
+ approx_pos_rc = (ref_pos - (rl - (read_pos + k)));
+ if (last_read_pos_rc == -1) {
+ approx_end_pos_rc = ref_pos + read_pos;
+ first_read_pos_rc = read_pos;
+ } else {
+ if (approx_end_pos_rc - approx_pos_rc > max_stretch) { return false; }
+ }
+ rc_score += score_inc;
+ ++rc_hits;
+
+ // new
+ rightmost_bound_rc = last_ref_pos_rc;
+
+ last_ref_pos_rc = ref_pos;
+ last_read_pos_rc = read_pos;
+ added = true;
+ }
+ return added;
+ }
+
+ inline uint32_t max_hits_for_target() { return std::max(fw_hits, rc_hits); }
+
+ // true if forward, false if rc
+ // second element is score
+ inline HitDirection best_hit_direction() {
+ int32_t fw_minus_rc = static_cast<int32_t>(fw_hits) - static_cast<int32_t>(rc_hits);
+ return (fw_minus_rc > 0) ? HitDirection::FW
+ : ((fw_minus_rc < 0) ? HitDirection::RC : HitDirection::BOTH);
+ }
+
+ inline simple_hit get_fw_hit(PairingStatus ps) {
+ return simple_hit{true, approx_pos_fw,
+ fw_score, fw_hits, std::numeric_limits<uint32_t>::max(), ps};
+ }
+
+ inline simple_hit get_rc_hit(PairingStatus ps) {
+ return simple_hit{false, approx_pos_rc,
+ rc_score, rc_hits, std::numeric_limits<uint32_t>::max(), ps};
+ }
+
+ inline std::string to_string() {
+ std::stringstream ss;
+ ss << "fw_hits: " << fw_hits << ", fw_score : " << fw_score
+ << ", fw_pos : " << approx_pos_fw << " || rc_hits: " << rc_hits
+ << ", rc_score: " << rc_score << ", rc_pos: " << approx_pos_rc;
+ return ss.str();
+ }
+
+ int32_t last_read_pos_fw{-1};
+ int32_t last_read_pos_rc{-1};
+ int32_t rightmost_bound_rc{std::numeric_limits<int32_t>::max()};
+
+ // marks the read position (key) of the
+ // first hit we see in the rc direction
+ int32_t first_read_pos_rc{-1};
+
+ int32_t last_ref_pos_fw{-1};
+ int32_t last_ref_pos_rc{std::numeric_limits<int32_t>::max()};
+
+ int32_t approx_pos_fw{-1};
+ int32_t approx_pos_rc{-1};
+ int32_t approx_end_pos_rc{-1};
+
+ uint32_t fw_hits{0};
+ uint32_t rc_hits{0};
+ float fw_score{0.0};
+ float rc_score{0.0};
+};
+
+/**
+ * This contains the information necessary to collect hits and perform
+ * mapping on a read using PASC.
+ **/
+template <typename IndexT> struct mapping_cache_info {
+public:
+ mapping_cache_info(IndexT* midx, size_t max_occ_default_in,
+ size_t max_occ_recover_in, size_t max_read_occ_in)
+ : idx(midx), k(midx->k()), hit_searcher(midx),
+ max_occ_default(max_occ_default_in),
+ max_occ_recover(max_occ_recover_in),
+ attempt_occ_recover(max_occ_recover > max_occ_default),
+ max_read_occ(max_read_occ_in), alt_max_occ(max_read_occ_in) {}
+
+ inline void clear() {
+ map_type = salmon::utils::MappingType::UNMAPPED;
+ hit_searcher.clear();
+ hit_map.clear();
+ accepted_hits.clear();
+ alt_max_occ = max_read_occ;
+ has_matching_kmers = false;
+ }
+
+ // will store how the read mapped
+ salmon::utils::MappingType map_type{salmon::utils::MappingType::UNMAPPED};
+
+ // map from reference id to hit info
+ phmap::flat_hash_map<uint32_t, sketch_hit_info> hit_map;
+
+ // where the mappings that pass will be stored
+ std::vector<simple_hit> accepted_hits;
+
+ // map to recall the number of unmapped reads we see
+ // for each barcode
+ phmap::flat_hash_map<uint64_t, uint32_t> unmapped_bc_map;
+
+ // pointer to the underlying index
+ IndexT* idx{nullptr};
+
+ // the k-mer size of our index
+ size_t k{0};
+
+ // implements the PASC algorithm
+ MemCollector<IndexT> hit_searcher;
+
+ // the number of occurrences of a hit above which we
+ // won't consider it
+ size_t max_occ_default;
+
+ // the number of occurences of a hit above which we
+ // won't consider it, even in recovery mode
+ size_t max_occ_recover;
+
+ // will we attempt recovery if there are no hits with
+ // frequency below max_occ_default?
+ bool attempt_occ_recover;
+
+ // maximum number of places a read can occur and still
+ // be considered.
+ size_t max_read_occ;
+
+ size_t alt_max_occ;
+
+ // to perform queries
+ pufferfish::util::QueryCache qc;
+
+ // regardless of having full mappings, did any k-mers match
+ bool has_matching_kmers{false};
+};
+
+/**
+ * This function will map the read given by `read_seq`
+ * using PASC. The relevant parameters (e.g. maximum ambiguity
+ * of seed and maximum number of allowable mappings) are stored
+ * in the `map_cache` structure. The `PairingStatus` is passed in
+ * by the caller and designates if the read being mapped is the
+ * left end or the right end.
+ **/
+template <typename IndexT>
+inline bool map_read(std::string* read_seq,
+ mapping_cache_info<IndexT>& map_cache, PairingStatus ps,
+ bool verbose = false) {
+ map_cache.clear();
+ // rebind map_cache variables to
+ // local names
+ IndexT* qidx = map_cache.idx;
+ auto& qc = map_cache.qc;
+ auto& hit_searcher = map_cache.hit_searcher;
+ auto& hit_map = map_cache.hit_map;
+ auto& accepted_hits = map_cache.accepted_hits;
+ auto& map_type = map_cache.map_type;
+ const bool attempt_occ_recover = map_cache.attempt_occ_recover;
+ auto k = map_cache.k;
+ int32_t signed_k = static_cast<int32_t>(k);
+
+ // collect the set of matching seeds
+ map_cache.has_matching_kmers =
+ hit_searcher.get_raw_hits_sketch(*read_seq, qc, true, false);
+ bool early_stop = false;
+
+ // if there were hits
+ if (map_cache.has_matching_kmers) {
+ uint32_t num_valid_hits{0};
+ uint64_t total_occs{0};
+ uint64_t largest_occ{0};
+ auto& raw_hits = hit_searcher.get_left_hits();
+
+ // SANITY
+ decltype(raw_hits[0].first) prev_read_pos = -1;
+ // the maximum span the supporting k-mers of a
+ // mapping position are allowed to have.
+ // NOTE this is still > read_length b/c the stretch is measured wrt the
+ // START of the terminal k-mer.
+ int32_t max_stretch = static_cast<int32_t>(read_seq->length() * 1.0);
+
+ // a raw hit is a pair of read_pos and a projected hit
+
+ // the least frequent hit for this fragment.
+ uint64_t min_occ = std::numeric_limits<uint64_t>::max();
+
+ // this is false by default and will be set to true
+ // if *every* collected hit for this fragment occurs
+ // max_occ_default times or more.
+ bool had_alt_max_occ = false;
+ int32_t signed_rl = static_cast<int32_t>(read_seq->length());
+ auto collect_mappings_from_hits =
+ [&max_stretch, &min_occ, &hit_map, &num_valid_hits, &total_occs,
+ &largest_occ, &early_stop, signed_rl, k, signed_k, qidx,
+ verbose](auto& raw_hits, auto& prev_read_pos, auto& max_allowed_occ,
+ auto& had_alt_max_occ) -> bool {
+ for (auto& raw_hit : raw_hits) {
+ auto& read_pos = raw_hit.first;
+ auto& proj_hits = raw_hit.second;
+ auto& refs = proj_hits.refRange;
+
+ // Keep track of the *least* ambiguous hit we see.
+ // If all hits occur more than `max_allowed_occ` times,
+ // but the least ambiguous hit (which occurs `min_occ`
+ // times) has < `max_occ_recover` occurrences, then
+ // we will use all hits occurring `min_occ` number
+ // of times or less to try top recover the read's mappings.
+ uint64_t num_occ = static_cast<uint64_t>(refs.size());
+ min_occ = std::min(min_occ, num_occ);
+ had_alt_max_occ = true;
+
+ // we visit every hit that occurs the allowable number of time
+ bool still_have_valid_target = false;
+ prev_read_pos = read_pos;
+ if (num_occ <= max_allowed_occ) {
+ total_occs += num_occ;
+ largest_occ = (num_occ > largest_occ) ? num_occ : largest_occ;
+ float score_inc = 1.0;
+
+ // vist each mapped reference position (mrp) where this hit
+ // occurs.
+ for (auto& pos_it : refs) {
+ const auto& ref_pos_ori = proj_hits.decodeHit(pos_it);
+ uint32_t tid =
+ static_cast<uint32_t>(qidx->getRefId(pos_it.transcript_id()));
+ int32_t pos = static_cast<int32_t>(ref_pos_ori.pos);
+ bool ori = ref_pos_ori.isFW;
+ auto& target = hit_map[tid];
+
+ // Why >= here instead of == ?
+ // Because hits can happen on the same target in both the forward
+ // and rc orientations, it is possible that we start the loop with
+ // the target having num_valid_hits hits in a given orientation (o)
+ // we see a new hit for this target in oriention o (now it has
+ // num_valid_hits + 1) then we see a hit for this target in
+ // orientation rc(o). We still want to add / consider this hit, but
+ // max_hits_for_target() > num_valid_hits. So, we must allow for
+ // that here.
+ if (target.max_hits_for_target() >= num_valid_hits) {
+ if (ori) {
+ target.add_fw(pos, static_cast<int32_t>(read_pos), signed_rl,
+ signed_k, max_stretch, score_inc);
+ } else {
+ target.add_rc(pos, static_cast<int32_t>(read_pos), signed_rl,
+ signed_k, max_stretch, score_inc);
+ }
+
+ // if this target is still valid after evaluating this hit
+ // then we will have at least one globally valid target. If
+ // no targets are valid after evaluating all mrps for this
+ // hit then we can exit the search early.
+ still_have_valid_target |=
+ (target.max_hits_for_target() >= num_valid_hits + 1);
+ }
+
+ } // DONE: for (auto &pos_it : refs)
+
+ ++num_valid_hits;
+
+ // if there are no targets reaching the valid hit threshold, then
+ // break early
+ if (!still_have_valid_target) {
+ return true;
+ }
+
+ } // DONE : if (num_occ <= max_allowed_occ)
+ } // DONE : for (auto& raw_hit : raw_hits)
+
+ return false;
+ };
+
+ bool _discard = false;
+ auto mao_first_pass = map_cache.max_occ_default - 1;
+ early_stop = collect_mappings_from_hits(raw_hits, prev_read_pos,
+ mao_first_pass, _discard);
+
+ // If our default threshold was too stringent, then fallback to a more
+ // liberal threshold and look up the k-mers that occur the least frequently.
+ // Specifically, if the min occuring hits have frequency < max_occ_recover
+ // (2500 by default) times, then collect the min occuring hits to get the
+ // mapping.
+ if (attempt_occ_recover and (min_occ >= map_cache.max_occ_default) and
+ (min_occ < map_cache.max_occ_recover)) {
+ prev_read_pos = -1;
+ uint64_t max_allowed_occ = min_occ;
+ early_stop = collect_mappings_from_hits(raw_hits, prev_read_pos,
+ max_allowed_occ, had_alt_max_occ);
+ }
+
+ uint32_t best_alt_hits = 0;
+
+ for (auto& kv : hit_map) {
+ auto best_hit_dir = kv.second.best_hit_direction();
+ // if the best direction is FW or BOTH, add the fw hit
+ // otherwise add the RC.
+ auto mapping = (best_hit_dir != HitDirection::RC)
+ ? kv.second.get_fw_hit(ps)
+ : kv.second.get_rc_hit(ps);
+
+ if (mapping.num_hits >= num_valid_hits) {
+ mapping.tid = kv.first;
+ accepted_hits.emplace_back(mapping);
+ // if we had equally good hits in both directions
+ // add the rc hit here (since we added the fw)
+ // above if the best hit was either FW or BOTH
+ if (best_hit_dir == HitDirection::BOTH) {
+ auto second_hit = kv.second.get_rc_hit(ps);
+ second_hit.tid = kv.first;
+ accepted_hits.emplace_back(second_hit);
+ }
+ } else {
+ // best_alt_score = mapping.score > best_alt_score ? mapping.score :
+ // best_alt_score;
+ best_alt_hits =
+ mapping.num_hits > best_alt_hits ? mapping.num_hits : best_alt_hits;
+ }
+ }
+
+ map_cache.alt_max_occ =
+ had_alt_max_occ ? accepted_hits.size() : map_cache.max_occ_default;
+
+ /*
+ * This rule; if enabled, allows through mappings missing a single hit, if
+ there
+ * was no mapping with all hits. NOTE: this won't work with the current
+ early-exit
+ * optimization however.
+ if (accepted_hits.empty() and (num_valid_hits > 1) and (best_alt_hits >=
+ num_valid_hits
+ - 1)) { for (auto& kv : hit_map) { auto mapping = kv.second.get_best_hit();
+ if (mapping.num_hits >= best_alt_hits) {
+ //if (mapping.valid_pos(signed_read_len, transcripts[kv.first].RefLength,
+ 10)) { mapping.tid = kv.first; accepted_hits.emplace_back(mapping);
+ //}
+ }
+ }
+ }
+ */
+ } // DONE : if (rh)
+
+ // If the read mapped to > maxReadOccs places, discard it
+ if (accepted_hits.size() > map_cache.alt_max_occ) {
+ accepted_hits.clear();
+ map_type = salmon::utils::MappingType::UNMAPPED;
+ } else if (!accepted_hits.empty()) {
+ map_type = salmon::utils::MappingType::SINGLE_MAPPED;
+ }
+
+ return early_stop;
}
+} // namespace pasc
- } // namespace mapping_utils
+} // namespace mapping_utils
} // namespace salmon
#endif //__SALMON_MAPPING_UTILS__
=====================================
include/Transcript.hpp
=====================================
@@ -14,8 +14,7 @@
#include <limits>
#include <memory>
-//#include "rapmap/bit_array.h"
-#include "pufferfish/compact_vector/compact_vector.hpp"
+#include "compact_vector/compact_vector.hpp"
#include "pufferfish/rank9b.hpp"
class Transcript {
=====================================
scripts/fetchPufferfish.sh
=====================================
@@ -23,11 +23,11 @@ if [ -d ${INSTALL_DIR}/src/pufferfish ] ; then
rm -fr ${INSTALL_DIR}/src/pufferfish
fi
-SVER=salmon-v1.9.0
+SVER=salmon-v1.10.0
#SVER=develop
#SVER=sketch-mode
-EXPECTED_SHA256=2a862daeff95a19c9b188bc26a5d02fc0ecc5b9c9ae5523a67c9d14e62551c5d
+EXPECTED_SHA256=c961b9c252856b53c6538d22103b711c924ea1e649516de81efb85870aa8b143
mkdir -p ${EXTERNAL_DIR}
curl -k -L https://github.com/COMBINE-lab/pufferfish/archive/${SVER}.zip -o ${EXTERNAL_DIR}/pufferfish.zip
@@ -92,7 +92,8 @@ cp ${EXTERNAL_DIR}/pufferfish/include/BinWriter.hpp ${INSTALL_DIR}/include/puffe
cp -r ${EXTERNAL_DIR}/pufferfish/include/libdivide ${INSTALL_DIR}/include/pufferfish
cp -r ${EXTERNAL_DIR}/pufferfish/include/ksw2pp ${INSTALL_DIR}/include/pufferfish
-cp -r ${EXTERNAL_DIR}/pufferfish/include/compact_vector ${INSTALL_DIR}/include/pufferfish
+# this is now automatically tracked and inherited via twopaco (on which libpuffer depends)
+# cp -r ${EXTERNAL_DIR}/pufferfish/include/compact_vector ${INSTALL_DIR}/include/pufferfish
cp -r ${EXTERNAL_DIR}/pufferfish/include/metro ${INSTALL_DIR}/include/pufferfish
cp -r ${EXTERNAL_DIR}/pufferfish/include/itlib ${INSTALL_DIR}/include/pufferfish
cp -r ${EXTERNAL_DIR}/pufferfish/include/sparsepp ${INSTALL_DIR}/include/pufferfish
=====================================
scripts/v1_10x/run.sh
=====================================
@@ -13,8 +13,8 @@ p2="$tmpdir/p2.fa"
mkfifo $p1
mkfifo $p2
-i1=`ls $base*I1*`
-wrapper <(cat $base*I1*) <(cat $base*RA*) >> $p1 2>> $p2 &
+i1=`ls $base/*I1*`
+wrapper <(cat $base/*I1*) <(cat $base/*RA*) >> $p1 2>> $p2 &
echo "Running command [${new_cmd} -1 $p1 -2 $p2 -r $i1]"
${new_cmd} -1 $p1 -2 $p2 -r $i1
=====================================
src/Alevin.cpp
=====================================
@@ -164,10 +164,10 @@ std::vector<size_t> sort_indexes(const std::vector<T> &v) {
// reference from https://github.com/scipy/scipy/blob/master/scipy/stats/kde.py
// and https://github.com/CGATOxford/UMI-tools/blob/master/umi_tools/umi_methods.py#L193
-bool gaussianKDE(const std::vector<uint32_t>& freqCounter,
+bool gaussianKDE(const std::vector<uint64_t>& freqCounter,
const std::vector<size_t>& sortedIdx,
double& invCovariance, double& normFactor,
- uint32_t expect_cells, uint32_t& predicted_cells){
+ uint64_t expect_cells, uint64_t& predicted_cells){
double covariance{0.0}, mean{0.0}, bw_method {0.01}, threshold = 0.001*freqCounter[sortedIdx[0]];
std::vector<double> logDataset;
size_t numElem, xSpace {10000};
@@ -218,7 +218,7 @@ bool gaussianKDE(const std::vector<uint32_t>& freqCounter,
}
//calculating the argrelextrema
- std::vector<uint32_t> local_mins;
+ std::vector<uint64_t> local_mins;
for (size_t i=1; i<xSpace-1; i++){
if (density[i-1] > density[i] and density[i] < density[i+1]){
local_mins.emplace_back(i);
@@ -247,26 +247,26 @@ bool gaussianKDE(const std::vector<uint32_t>& freqCounter,
return false;
}
-uint32_t getLeftBoundary(std::vector<size_t>& sortedIdx,
- uint32_t topxBarcodes,
- const std::vector<uint32_t>& freqCounter){
+uint64_t getLeftBoundary(std::vector<size_t>& sortedIdx,
+ uint64_t topxBarcodes,
+ const std::vector<uint64_t>& freqCounter){
// iterate in reverse order since sortedIdx is sorted in decreasing
// order
double cumCount{0.0};
std::vector<double> freqs(topxBarcodes);
- for(uint32_t i = 0; i < topxBarcodes; i++){
+ for(uint64_t i = 0; i < topxBarcodes; i++){
size_t ind = sortedIdx[topxBarcodes-i-1];
cumCount += freqCounter[ind];
freqs[i] = std::log(cumCount);
}
- std::vector<uint32_t> X(topxBarcodes);
+ std::vector<uint64_t> X(topxBarcodes);
std::iota(X.begin(), X.end(), 0);
bool isUp;
- uint32_t x;
+ uint64_t x;
double y, slope, leftExtreme{freqs[0]};
- for(uint32_t j=0; j<topxBarcodes; j++){
+ for(uint64_t j=0; j<topxBarcodes; j++){
x = X[j];
y = freqs[j];
@@ -279,7 +279,7 @@ uint32_t getLeftBoundary(std::vector<size_t>& sortedIdx,
slope = y/x;
// fill in the values for fitted line
std::transform(X.begin()+nextBcIdx, X.end(), Y.begin(),
- [slope](uint32_t i) {return i*slope;} );
+ [slope](uint64_t i) {return i*slope;} );
isUp = false;
double curveY, lineY;
@@ -294,9 +294,9 @@ uint32_t getLeftBoundary(std::vector<size_t>& sortedIdx,
if(isUp == false){
// ignoring all the frequencies having same frequency as cutoff
- uint32_t cutoff = topxBarcodes-j;
- uint32_t cutoffFrequency = freqCounter[sortedIdx[cutoff]];
- uint32_t nearestLeftFrequency = cutoffFrequency;
+ uint64_t cutoff = topxBarcodes-j;
+ uint64_t cutoffFrequency = freqCounter[sortedIdx[cutoff]];
+ uint64_t nearestLeftFrequency = cutoffFrequency;
while(nearestLeftFrequency == cutoffFrequency){
nearestLeftFrequency = freqCounter[sortedIdx[--cutoff]];
}
@@ -310,13 +310,13 @@ uint32_t getLeftBoundary(std::vector<size_t>& sortedIdx,
void dumpFeatures(std::string& frequencyFileName,
std::vector<size_t>& sortedIdx,
- const std::vector<uint32_t>& freqCounter,
+ const std::vector<uint64_t>& freqCounter,
std::unordered_map<uint32_t, nonstd::basic_string_view<char>>& colMap
){
std::ofstream freqFile;
freqFile.open(frequencyFileName);
for (auto i:sortedIdx){
- uint32_t count = freqCounter[i];
+ uint64_t count = freqCounter[i];
if (count == 0){
break;
}
@@ -330,7 +330,7 @@ void dumpFeatures(std::string& frequencyFileName,
Knee calculation and sample true set of barcodes
*/
template <typename ProtocolT>
-void sampleTrueBarcodes(const std::vector<uint32_t>& freqCounter,
+void sampleTrueBarcodes(const std::vector<uint64_t>& freqCounter,
TrueBcsT& trueBarcodes, size_t& lowRegionNumBarcodes,
std::unordered_map<uint32_t, nonstd::basic_string_view<char>> colMap,
AlevinOpts<ProtocolT>& aopt){
@@ -338,7 +338,7 @@ void sampleTrueBarcodes(const std::vector<uint32_t>& freqCounter,
size_t maxNumBarcodes { aopt.maxNumBarcodes }, lowRegionMaxNumBarcodes { 1000 };
size_t lowRegionMinNumBarcodes { aopt.lowRegionMinNumBarcodes };
double lowConfidenceFraction { 0.5 };
- uint32_t topxBarcodes = std::min(maxNumBarcodes, freqCounter.size());
+ uint64_t topxBarcodes = std::min(maxNumBarcodes, freqCounter.size());
uint64_t history { 0 };
if (aopt.forceCells > 0) {
@@ -358,13 +358,13 @@ void sampleTrueBarcodes(const std::vector<uint32_t>& freqCounter,
// Expect Cells algorithm is taken from
// https://github.com/10XGenomics/cellranger/blob/e5396c6c444acec6af84caa7d3655dd33a162852/lib/python/cellranger/stats.py#L138
- constexpr uint32_t MAX_FILTERED_CELLS = 2;
+ constexpr uint64_t MAX_FILTERED_CELLS = 2;
constexpr double UPPER_CELL_QUANTILE = 0.01;
constexpr double FRACTION_MAX_FREQUENCY = 0.1;
- uint32_t baselineBcs = std::max(1, static_cast<int32_t>( aopt.expectCells*UPPER_CELL_QUANTILE ));
+ uint64_t baselineBcs = std::max(static_cast<uint64_t>(1), static_cast<uint64_t>( aopt.expectCells*UPPER_CELL_QUANTILE ));
double cutoffFrequency = std::max(1.0, freqCounter[sortedIdx[baselineBcs]] * FRACTION_MAX_FREQUENCY);
- uint32_t maxNumCells = std::min(static_cast<uint32_t>(freqCounter.size()), aopt.expectCells * MAX_FILTERED_CELLS);
+ uint64_t maxNumCells = std::min(static_cast<uint64_t>(freqCounter.size()), aopt.expectCells * MAX_FILTERED_CELLS);
topxBarcodes = maxNumCells;
for(size_t i=baselineBcs; i<maxNumCells; i++){
@@ -395,7 +395,7 @@ void sampleTrueBarcodes(const std::vector<uint32_t>& freqCounter,
green, topxBarcodes, RESET_COLOR);
double invCovariance {0.0}, normFactor{0.0};
- uint32_t gaussThreshold;
+ uint64_t gaussThreshold;
bool isGaussOk = gaussianKDE(freqCounter, sortedIdx,
invCovariance, normFactor,
topxBarcodes, gaussThreshold);
@@ -421,7 +421,7 @@ void sampleTrueBarcodes(const std::vector<uint32_t>& freqCounter,
}
}//end-left-knee finding case
- uint32_t fractionTrueBarcodes = static_cast<int>(lowConfidenceFraction * topxBarcodes);
+ uint64_t fractionTrueBarcodes = static_cast<uint64_t>(lowConfidenceFraction * topxBarcodes);
if (fractionTrueBarcodes < lowRegionMinNumBarcodes){
lowRegionNumBarcodes = lowRegionMinNumBarcodes;
@@ -442,8 +442,8 @@ void sampleTrueBarcodes(const std::vector<uint32_t>& freqCounter,
}
topxBarcodes += lowRegionNumBarcodes;
- uint32_t cutoffFrequency = freqCounter[sortedIdx[ topxBarcodes ]];
- uint32_t nearestLeftFrequency = cutoffFrequency;
+ uint64_t cutoffFrequency = freqCounter[sortedIdx[ topxBarcodes ]];
+ uint64_t nearestLeftFrequency = cutoffFrequency;
while(nearestLeftFrequency == cutoffFrequency && lowRegionNumBarcodes != 0){
nearestLeftFrequency = freqCounter[sortedIdx[--topxBarcodes]];
if (lowRegionNumBarcodes > lowRegionMinNumBarcodes) { lowRegionNumBarcodes--; }
@@ -484,7 +484,7 @@ void indexBarcodes(AlevinOpts<ProtocolT>& aopt,
SoftMapT& barcodeSoftMap){
std::unordered_set<std::string> neighbors;
std::unordered_map<std::string, std::vector<std::string>> ZMatrix;
- uint32_t wrngWhiteListCount{0};
+ uint64_t wrngWhiteListCount{0};
for (const auto trueBarcode: trueBarcodes){
neighbors.clear();
//find all neighbors to this true barcode
@@ -493,7 +493,7 @@ void indexBarcodes(AlevinOpts<ProtocolT>& aopt,
neighbors);
for(const auto& neighbor : neighbors){
- uint32_t freq{0};
+ uint64_t freq{0};
auto indexIt = freqCounter.find(neighbor);
bool indexOk = indexIt != freqCounter.end();
//bool indexOk = freqCounter.find(neighbor, freq);
@@ -542,7 +542,7 @@ bool writeFastq(AlevinOpts<ProtocolT>& aopt,
SoftMapT& barcodeMap,
TrueBcsT& trueBarcodes){
size_t rangeSize{0};
- uint32_t totNumBarcodes{0};
+ uint64_t totNumBarcodes{0};
std::string barcode( aopt.protocol.barcodeLength, 'N');
std::string umi( aopt.protocol.umiLength, 'N');
@@ -717,7 +717,7 @@ void processBarcodes(std::vector<std::string>& barcodeFiles,
if (aopt.dumpfeatures) {
aopt.jointLog->info("Sorting and dumping raw barcodes");
auto frequencyFileName = (aopt.outputDirectory / "raw_cb_frequency.txt").string();
- std::vector<uint32_t> collapsedfrequency;
+ std::vector<uint64_t> collapsedfrequency;
std::unordered_map<uint32_t, nonstd::basic_string_view<char>> collapMap;
size_t ind{0};
for(auto fqIt = freqCounter.begin(); fqIt != freqCounter.end(); ++fqIt){
@@ -731,7 +731,7 @@ void processBarcodes(std::vector<std::string>& barcodeFiles,
}
}
else {
- std::vector<uint32_t> collapsedfrequency;
+ std::vector<uint64_t> collapsedfrequency;
std::unordered_map<uint32_t, nonstd::basic_string_view<char>> collapMap;
size_t ind{0};
for(auto fqIt = freqCounter.begin(); fqIt != freqCounter.end(); ++fqIt){
@@ -741,7 +741,7 @@ void processBarcodes(std::vector<std::string>& barcodeFiles,
}
if (aopt.keepCBFraction != 0.0) {
- aopt.forceCells = std::min(static_cast<uint32_t>(aopt.keepCBFraction * freqCounter.size()),
+ aopt.forceCells = std::min(static_cast<uint64_t>(aopt.keepCBFraction * freqCounter.size()),
aopt.maxNumBarcodes);
aopt.jointLog->info("Forcing to use {} cells", aopt.forceCells);
aopt.jointLog->flush();
=====================================
src/AlevinUtils.cpp
=====================================
@@ -348,7 +348,7 @@ namespace alevin {
(void)read2;
pt.anchorPos = read.find(pt.anchorSeq);
if (pt.anchorPos != std::string::npos && ( pt.anchorPos == pt.maxHairpinIndexLen || pt.anchorPos == pt.maxHairpinIndexLen -1) // only 2 possible values of pt.anchorPos
- && read.length() >= pt.barcodeLength + pt.umiLength + pt.anchorSeqLen) {
+ && read.length() >= pt.barcodeLength + pt.umiLength + pt.anchorSeqLen -2) { // Subtract 2 to counter artificial increase in barcode length
std::string bcAssign = read.substr(0,pt.anchorPos) + read.substr(pt.anchorPos + pt.anchorSeqLen + pt.umiLength, pt.rtIdxLen);
if (pt.anchorPos < pt.maxHairpinIndexLen) { // hairpin index can be 9 or 10 bp
bcAssign += "AC";
=====================================
src/CMakeLists.txt
=====================================
@@ -141,7 +141,6 @@ else()
set(CMAKE_INSTALL_RPATH_USE_LINK_PATH FALSE)
endif()
-
set (TGT_RELEASE_FLAGS "${TGT_COMPILE_FLAGS};${TGT_WARN_FLAGS}")
set (TGT_DEBUG_FLAGS "-g;${TGT_COMPILE_FLAGS};${TGT_WARN_FLAGS}")
@@ -155,6 +154,8 @@ HAVE_SSTREAM=1
STX_NO_STD_STRING_VIEW=1
span_FEATURE_MAKE_SPAN_TO_STD=14
)
+target_include_directories(salmon_core PUBLIC ${COMPACT_VECTOR_INCLUDE_PATH})
+
if (USE_ARM)
target_compile_definitions(salmon_core PUBLIC KSW_USE_ARM=1)
@@ -176,6 +177,7 @@ HAVE_ANSI_TERM=1
HAVE_SSTREAM=1
span_FEATURE_MAKE_SPAN_TO_STD=14
)
+target_include_directories(alevin_core PUBLIC ${COMPACT_VECTOR_INCLUDE_PATH})
if (USE_ARM)
target_compile_definitions(alevin_core PUBLIC KSW_USE_ARM=1)
@@ -190,6 +192,7 @@ endif()
# Build the salmon executable
add_executable(salmon ${SALMON_MAIN_SRCS} ${SALMON_ALIGN_SRCS})
+target_include_directories(salmon PUBLIC ${COMPACT_VECTOR_INCLUDE_PATH})
if(HAS_IPO AND (NOT NO_IPO))
set_property(TARGET salmon PROPERTY INTERPROCEDURAL_OPTIMIZATION True)
@@ -202,6 +205,7 @@ target_compile_options(UnitTestsMain PUBLIC "$<$<CONFIG:RELEASE>:${TGT_RELEASE_F
add_executable(unitTests ${UNIT_TESTS_INDIVIDUAL_SRCS} ${GAT_SOURCE_DIR}/tests/catch.hpp)
target_compile_options(unitTests PUBLIC "$<$<CONFIG:DEBUG>:${TGT_DEBUG_FLAGS}>")
target_compile_options(unitTests PUBLIC "$<$<CONFIG:RELEASE>:${TGT_RELEASE_FLAGS}>")
+target_include_directories(unitTests PUBLIC ${COMPACT_VECTOR_INCLUDE_PATH})
#add_executable(salmon-read ${SALMON_READ_SRCS})
#set_target_properties(salmon-read PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_LIBPTHREAD -D_PBGZF_USE -fopenmp"
@@ -225,6 +229,7 @@ add_dependencies(salmon ksw2pp)
add_dependencies(salmon salmon_core)
add_dependencies(salmon alevin_core)
if(TBB_RECONFIGURE OR TBB_TARGET_EXISTED)
+
# Link the executable
target_link_libraries(salmon
Threads::Threads
@@ -236,9 +241,10 @@ target_link_libraries(salmon
gff
${Boost_LIBRARIES}
${ICU_LIBS}
- ${STADEN_LIBRARIES} ${CURL_LIBRARIES}
+ ${CURL_LIBRARIES}
${ZLIB_LIBRARY}
m
+ ${STADEN_LIBRARIES}
${LIBLZMA_LIBRARIES}
${BZIP2_LIBRARIES}
${LIBSALMON_LINKER_FLAGS}
@@ -269,12 +275,13 @@ if(TBB_RECONFIGURE OR TBB_TARGET_EXISTED)
target_link_libraries(unitTests
Threads::Threads
## PUFF_INTEGRATION
+ ${Boost_LIBRARIES}
salmon_core
alevin_core
gff
UnitTestsMain
- ${STADEN_LIBRARIES}
${Boost_LIBRARIES}
+ ${STADEN_LIBRARIES}
${ICU_LIBS}
${CURL_LIBRARIES}
${ZLIB_LIBRARY}
=====================================
src/SalmonAlevin.cpp
=====================================
@@ -414,15 +414,10 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
salmon::utils::ShortFragStats shortFragStats;
double maxZeroFrac{0.0};
- // Write unmapped reads
- fmt::MemoryWriter unmappedNames;
- bool writeUnmapped = salmonOpts.writeUnmappedNames;
- spdlog::logger* unmappedLogger = (writeUnmapped) ? salmonOpts.unmappedLog.get() : nullptr;
-
- // Write unmapped reads
- fmt::MemoryWriter orphanLinks;
- bool writeOrphanLinks = salmonOpts.writeOrphanLinks;
- spdlog::logger* orphanLinkLogger = (writeOrphanLinks) ? salmonOpts.orphanLinkLog.get() : nullptr;
+ bool quiet = salmonOpts.quiet;
+ if (salmonOpts.writeQualities) {
+ salmonOpts.jointLog->warn("The --writeQualities flag has no effect when writing results to a RAD file.");
+ }
BasicBinWriter bw;
uint32_t num_reads_in_chunk{0};
@@ -436,200 +431,49 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
uint64_t localNumAssignedFragments{0};
uint64_t localNumUniqueMappings{0};
- bool consistentHits = salmonOpts.consistentHits;
- bool quiet = salmonOpts.quiet;
-
- size_t maxNumHits{salmonOpts.maxReadOccs};
size_t readLenLeft{0};
size_t readLenRight{0};
- constexpr const int32_t invalidScore = std::numeric_limits<int32_t>::min();
- MemCollector<IndexT> memCollector(qidx);
- ksw2pp::KSW2Aligner aligner;
- pufferfish::util::AlignmentConfig aconf;
- pufferfish::util::MappingConstraintPolicy mpol;
- bool initOK = salmon::mapping_utils::initMapperSettings(salmonOpts, memCollector, aligner, aconf, mpol);
- PuffAligner puffaligner(qidx->refseq_, qidx->refAccumLengths_, qidx->k(), aconf, aligner);
-
- pufferfish::util::CachedVectorMap<size_t, std::vector<pufferfish::util::MemCluster>, std::hash<size_t>> hits;
- std::vector<pufferfish::util::MemCluster> recoveredHits;
- std::vector<pufferfish::util::JointMems> jointHits;
- PairedAlignmentFormatter<IndexT*> formatter(qidx);
- if (salmonOpts.writeQualities) {
- salmonOpts.jointLog->warn("The --writeQualities flag has no effect when writing results to a RAD file.");
- }
- pufferfish::util::QueryCache qc;
-
- bool mimicStrictBT2 = salmonOpts.mimicStrictBT2;
- bool mimicBT2 = salmonOpts.mimicBT2;
- bool noDovetail = !salmonOpts.allowDovetail;
- bool useChainingHeuristic = !salmonOpts.disableChainingHeuristic;
-
- pufferfish::util::HitCounters hctr;
salmon::utils::MappingType mapType{salmon::utils::MappingType::UNMAPPED};
- bool hardFilter = salmonOpts.hardFilter;
fmt::MemoryWriter sstream;
auto* qmLog = salmonOpts.qmLog.get();
bool writeQuasimappings = (qmLog != nullptr);
-
- struct SimpleHit {
- bool is_fw{false};
- int32_t pos{-1};
- float score{0.0};
- uint32_t num_hits{0};
- uint32_t tid{std::numeric_limits<uint32_t>::max()};
- bool valid_pos(int32_t read_len, uint32_t txp_len, int32_t max_over) {
- int32_t signed_txp_len = static_cast<int32_t>(txp_len);
- return (pos > -max_over) and ((pos + read_len) < (signed_txp_len + max_over));
- }
- };
-
- enum class HitDirection : uint8_t {FW, RC, BOTH};
-
- struct SketchHitInfo {
- // add a hit to the current target that occurs in the forward
- // orientation with respect to the target.
- bool add_fw(int32_t ref_pos, int32_t read_pos, int32_t rl, int32_t k, int32_t max_stretch, float score_inc) {
- (void)rl;
- (void)k;
- bool added{false};
-
- // since hits are collected by moving _forward_ in the
- // read, if this is a fw hit, it should be moving
- // forward in the reference. Only add it if this is
- // the case. This ensure that we don't
- // double-count a k-mer that might occur twice on
- // this target.
- if (ref_pos > last_ref_pos_fw and read_pos > last_read_pos_fw) {
- if (last_read_pos_fw == -1) {
- approx_pos_fw = ref_pos - read_pos;
- } else {
- if ((ref_pos - approx_pos_fw) > max_stretch) { return false; }
- }
- //if (last_ref_pos_fw > -1 and (ref_pos > last_ref_pos_fw + 15)) { return false; }
- last_ref_pos_fw = ref_pos;
- last_read_pos_fw = read_pos;
- fw_score += score_inc;
- ++fw_hits;
- added = true;
- }
- return added;
- }
-
- // add a hit to the current target that occurs in the forward
- // orientation with respect to the target.
- bool add_rc(int32_t ref_pos, int32_t read_pos, int32_t rl, int32_t k, int32_t max_stretch, float score_inc) {
-
- bool added{false};
- // since hits are collected by moving _forward_ in the
- // read, if this is an rc hit, it should be moving
- // backwards in the reference. Only add it if this is
- // the case.
- // This ensures that we don't double-count a k-mer that
- // might occur twice on this target.
- if (ref_pos < last_ref_pos_rc and read_pos > last_read_pos_rc) {
- approx_pos_rc = (ref_pos - (rl - (read_pos + k)));
- if (last_read_pos_rc == -1) {
- approx_end_pos_rc = ref_pos + read_pos;
- } else {
- if (approx_end_pos_rc - approx_pos_rc > max_stretch) { return false;}
- }
- //if (last_ref_pos_rc > -1 and ref_pos < last_ref_pos_rc - 15) { return false; }
- last_ref_pos_rc = ref_pos;
- last_read_pos_rc = read_pos;
- rc_score += score_inc;
- ++rc_hits;
- added = true;
-
- }
- return added;
- }
-
- inline uint32_t max_hits_for_target() {
- return std::max(fw_hits, rc_hits);
- }
-
- // true if forward, false if rc
- // second element is score
- inline HitDirection best_hit_direction() {
- int32_t fw_minus_rc = static_cast<int32_t>(fw_hits) - static_cast<int32_t>(rc_hits);
- return (fw_minus_rc > 0) ? HitDirection::FW :
- ((fw_minus_rc < 0) ? HitDirection::RC : HitDirection::BOTH);
- }
-
- inline SimpleHit get_fw_hit() {
- return SimpleHit{true, approx_pos_fw, fw_score, fw_hits, std::numeric_limits<uint32_t>::max()};
- }
-
- inline SimpleHit get_rc_hit() {
- return SimpleHit{false, approx_pos_rc, rc_score, rc_hits, std::numeric_limits<uint32_t>::max()};
- }
-
- inline SimpleHit get_best_hit() {
- auto best_direction = best_hit_direction();
- return (best_direction != HitDirection::RC) ?
- SimpleHit{true, approx_pos_fw, fw_score, fw_hits, std::numeric_limits<uint32_t>::max()} :
- SimpleHit{false, approx_pos_rc, rc_score, rc_hits, std::numeric_limits<uint32_t>::max()};
- }
-
- inline std::string to_string() {
- std::stringstream ss;
- ss << "fw_hits: " << fw_hits << ", fw_score : " << fw_score << ", fw_pos : " << approx_pos_fw
- << " || rc_hits: " << rc_hits << ", rc_score: " << rc_score << ", rc_pos: " << approx_pos_rc;
- return ss.str();
- }
-
- int32_t last_read_pos_fw{-1};
- int32_t last_read_pos_rc{-1};
-
- int32_t last_ref_pos_fw{-1};
- int32_t last_ref_pos_rc{std::numeric_limits<int32_t>::max()};
-
- int32_t approx_pos_fw{-1};
- int32_t approx_pos_rc{-1};
- int32_t approx_end_pos_rc{-1};
-
- uint32_t fw_hits{0};
- uint32_t rc_hits{0};
- float fw_score{0.0};
- float rc_score{0.0};
- };
-
+ PairedAlignmentFormatter<IndexT*> formatter(qidx);
+
// map to recall the number of unmapped reads we see
// for each barcode
phmap::flat_hash_map<uint64_t, uint32_t> unmapped_bc_map;
- // map from transcript id to hit info
- phmap::flat_hash_map<uint32_t, SketchHitInfo> hit_map;
- std::vector<SimpleHit> accepted_hits;
-
- //////////////////////
- // NOTE: validation mapping based new parameters
- std::string rc1; rc1.reserve(300);
-
-
// check the frequency and decide here if we should
// be attempting recovery of highly-multimapping reads
const size_t max_occ_default = salmonOpts.maxReadOccs;
const size_t max_occ_recover = salmonOpts.maxRecoverReadOccs;
const bool attempt_occ_recover = (max_occ_recover > max_occ_default);
- size_t numMappingsDropped{0};
- size_t numDecoyFrags{0};
- const double decoyThreshold = salmonOpts.decoyThreshold;
+ // think if this should be different since it's at the read level
+ const size_t max_read_occs = salmonOpts.maxReadOccs;
+ size_t alt_max_occ = max_read_occs;
- salmon::mapping_utils::MappingScoreInfo msi(decoyThreshold);
- // we only collect detailed decoy information if we will be
- // writing output to SAM.
- msi.collect_decoys(writeQuasimappings);
+ // we'll use this for mapping
+ salmon::mapping_utils::pasc::mapping_cache_info<IndexT> map_cache_first(qidx, max_occ_default, max_occ_recover, max_read_occs);
+ salmon::mapping_utils::pasc::mapping_cache_info<IndexT> map_cache_second(qidx, max_occ_default, max_occ_recover, max_read_occs);
+
+ std::vector<salmon::mapping_utils::pasc::simple_hit>* accepted_hits_left_ptr;
+ std::vector<salmon::mapping_utils::pasc::simple_hit>* accepted_hits_right_ptr;
+ std::vector<salmon::mapping_utils::pasc::simple_hit>* accepted_hits_ptr;
+
+ // buffer in which to accumulate accepted hits if we are merging for left
+ // and right reads
+ std::vector<salmon::mapping_utils::pasc::simple_hit> accepted_hits_buffer;
std::string readBuffer;
std::string umi(alevinOpts.protocol.umiLength, 'N');
std::string barcode(alevinOpts.protocol.barcodeLength, 'N');
//////////////////////
- bool tryAlign{salmonOpts.validateMappings};
+ auto localProtocol = alevinOpts.protocol;
+ // start parsing reads
auto rg = parser->getReadGroup();
while (parser->refill(rg)) {
rangeSize = rg.size();
@@ -650,45 +494,35 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
extraBAMtags.reserve(reserveSize);
}
- auto localProtocol = alevinOpts.protocol;
for (size_t i = 0; i < rangeSize; ++i) { // For all the read in this batch
auto& rp = rg[i];
readLenLeft = rp.first.seq.length();
readLenRight= rp.second.seq.length();
+ // while the pasc code path will use the jointHitGroup to
+ // store UMI and BarCode info, it won't actually use the
+ // "alignment" fields
bool tooShortRight = (readLenRight < (minK+alevinOpts.trimRight));
- //localUpperBoundHits = 0;
auto& jointHitGroup = structureVec[i];
jointHitGroup.clearAlignments();
- auto& jointAlignments= jointHitGroup.alignments();
- hits.clear();
- jointHits.clear();
- memCollector.clear();
- hit_map.clear();
- accepted_hits.clear();
- //jointAlignments.clear();
- //readSubSeq.clear();
mapType = salmon::utils::MappingType::UNMAPPED;
+ // ensure that accepted_hits_ptr points to a valid
+ // vector in case it's not set below
+ accepted_hits_ptr = &accepted_hits_buffer;
+ alt_max_occ = max_read_occs;
//////////////////////////////////////////////////////////////
// extracting barcodes
size_t barcodeLength = alevinOpts.protocol.barcodeLength;
size_t umiLength = alevinOpts.protocol.umiLength;
- //umi.clear();
- //barcode.clear();
+
nonstd::optional<uint32_t> barcodeIdx;
extraBAMtags.clear();
bool seqOk = false;
- // keep track of the *least* freqeuntly
- // occurring hit in this fragment to consider
- // alternative processing of fragments where
- // all observed hits have high frequency.
- uint64_t alt_max_occ = 0;
-
if (alevinOpts.protocol.end == bcEnd::FIVE ||
- alevinOpts.protocol.end == bcEnd::THREE){
+ alevinOpts.protocol.end == bcEnd::THREE) {
// If the barcode sequence could be extracted, then this is set to true,
// but the barcode sequence itself may still be invalid (e.g. contain `N` characters).
// However, if extracted_barcode is false here, there is no hope to even recover the
@@ -721,164 +555,15 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
if (isUmiIdxOk) {
jointHitGroup.setUMI(umiIdx.word(0));
- bool rh = false;
- std::string* readSubSeq = aut::getReadSequence(localProtocol, rp.first.seq, rp.second.seq, readBuffer);
- rh = tooShortRight
- ? false
- : memCollector.get_raw_hits_sketch(*readSubSeq,
- qc,
- true, // isLeft
- false // verbose
- );
- // if there were hits
- if (rh) {
- uint32_t num_valid_hits{0};
- uint64_t total_occs{0};
- uint64_t largest_occ{0};
- auto& raw_hits = memCollector.get_left_hits();
-
- // SANITY
- decltype(raw_hits[0].first) prev_read_pos = -1;
- // the maximum span the supporting k-mers of a
- // mapping position are allowed to have.
- // NOTE this is still > read_length b/c the stretch is measured wrt the
- // START of the terminal k-mer.
- int32_t signed_rl = static_cast<int32_t>(readSubSeq->length());
- int32_t max_stretch = static_cast<int32_t>(signed_rl * 1.0);
-
- // a raw hit is a pair of read_pos and a projected hit
-
- // the least frequent hit for this fragment.
- uint64_t min_occ = std::numeric_limits<uint64_t>::max();
-
- // this is false by default and will be set to true
- // if *every* collected hit for this fragment occurs
- // salmonOpts.maxReadOccs times or more.
- bool had_alt_max_occ = false;
-
- auto collect_mappings_from_hits = [&max_stretch, &min_occ, &hit_map,
- &salmonOpts, &num_valid_hits, &total_occs,
- &largest_occ, &qidx, signed_rl, signed_k](
- auto& raw_hits, auto& prev_read_pos,
- auto& max_allowed_occ, auto& had_alt_max_occ
- ) -> bool {
- for (auto& raw_hit : raw_hits) {
- auto& read_pos = raw_hit.first;
- auto& proj_hits = raw_hit.second;
- auto& refs = proj_hits.refRange;
- uint64_t num_occ = static_cast<uint64_t>(refs.size());
- min_occ = std::min(min_occ, num_occ);
- had_alt_max_occ = true;
-
- bool still_have_valid_target = false;
- prev_read_pos = read_pos;
- if (num_occ <= max_allowed_occ) {
-
- total_occs += num_occ;
- largest_occ = (num_occ > largest_occ) ? num_occ : largest_occ;
- float score_inc = 1.0;
-
- for (auto &pos_it : refs) {
- const auto& ref_pos_ori = proj_hits.decodeHit(pos_it);
- uint32_t tid = static_cast<uint32_t>(qidx->getRefId(pos_it.transcript_id()));
- int32_t pos = static_cast<int32_t>(ref_pos_ori.pos);
- bool ori = ref_pos_ori.isFW;
- auto& target = hit_map[tid];
-
- // Why >= here instead of == ?
- // Because hits can happen on the same target in both the forward
- // and rc orientations, it is possible that we start the loop with
- // the target having num_valid_hits hits in a given orientation (o)
- // we see a new hit for this target in oriention o (now it has num_valid_hits + 1)
- // then we see a hit for this target in orientation rc(o). We still want to
- // add / consider this hit, but max_hits_for_target() > num_valid_hits.
- // So, we must allow for that here.
- if (target.max_hits_for_target() >= num_valid_hits) {
- if (ori) {
- target.add_fw(pos, static_cast<int32_t>(read_pos),
- signed_rl, signed_k, max_stretch, score_inc);
- } else {
- target.add_rc(pos, static_cast<int32_t>(read_pos),
- signed_rl, signed_k, max_stretch, score_inc);
- }
-
- still_have_valid_target |= (target.max_hits_for_target() >= num_valid_hits + 1);
- }
-
- } // DONE: for (auto &pos_it : refs)
-
- ++num_valid_hits;
-
- // if there are no targets reaching the valid hit threshold, then break early
- if (!still_have_valid_target) { return true; }
-
- } // DONE : if (num_occ <= max_allowed_occ)
- } // DONE : for (auto& raw_hit : raw_hits)
-
- return false;
- };
-
- bool _discard = false;
- auto mao_first_pass = max_occ_default - 1;
- bool early_stop = collect_mappings_from_hits(raw_hits, prev_read_pos, mao_first_pass, _discard);
-
- // If our default threshold was too stringent, then fallback to a more liberal
- // threshold and look up the k-mers that occur the least frequently.
- // Specifically, if the min occuring hits have frequency < max_occ_recover (2500 by default)
- // times, then collect the min occuring hits to get the mapping.
- if (attempt_occ_recover and (min_occ >= max_occ_default) and (min_occ < max_occ_recover)) {
- prev_read_pos = -1;
- uint64_t max_allowed_occ = min_occ;
- early_stop = collect_mappings_from_hits(raw_hits, prev_read_pos, max_allowed_occ, had_alt_max_occ);
- }
- uint32_t best_alt_hits = 0;
- int32_t signed_read_len = static_cast<int32_t>(readSubSeq->length());
-
- for (auto& kv : hit_map) {
- auto best_hit_dir = kv.second.best_hit_direction();
- // if the best direction is FW or BOTH, add the fw hit
- // otherwise add the RC.
- auto simple_hit = (best_hit_dir != HitDirection::RC) ?
- kv.second.get_fw_hit() : kv.second.get_rc_hit();
-
- if (simple_hit.num_hits >= num_valid_hits) {
- simple_hit.tid = kv.first;
- accepted_hits.emplace_back(simple_hit);
- // if we had equally good hits in both directions
- // add the rc hit here (since we added the fw)
- // above if the best hit was either FW or BOTH
- if (best_hit_dir == HitDirection::BOTH) {
- auto second_hit = kv.second.get_rc_hit();
- second_hit.tid = kv.first;
- accepted_hits.emplace_back(second_hit);
- }
- } else {
- //best_alt_score = simple_hit.score > best_alt_score ? simple_hit.score : best_alt_score;
- best_alt_hits = simple_hit.num_hits > best_alt_hits ? simple_hit.num_hits : best_alt_hits;
- }
- }
-
- alt_max_occ = had_alt_max_occ ? accepted_hits.size() : salmonOpts.maxReadOccs;
-
- /*
- * This rule; if enabled, allows through mappings missing a single hit, if there
- * was no mapping with all hits. NOTE: this won't work with the current early-exit
- * optimization however.
- if (accepted_hits.empty() and (num_valid_hits > 1) and (best_alt_hits >= num_valid_hits - 1)) {
- for (auto& kv : hit_map) {
- auto simple_hit = kv.second.get_best_hit();
- if (simple_hit.num_hits >= best_alt_hits) {
- //if (simple_hit.valid_pos(signed_read_len, transcripts[kv.first].RefLength, 10)) {
- simple_hit.tid = kv.first;
- accepted_hits.emplace_back(simple_hit);
- //}
- }
- }
- }
- */
- } // DONE : if (rh)
+ // get the subsequence for the alignable portion of the read
+ std::string* readSubSeq = aut::getReadSequence(localProtocol, rp.first.seq, rp.second.seq, readBuffer);
+ // try and map it
+ bool early_exit = salmon::mapping_utils::pasc::map_read(readSubSeq, map_cache_second, PairingStatus::UNPAIRED_RIGHT);
+ // in this case accepted_hits_ptr gets set to what we found
+ accepted_hits_ptr = &map_cache_second.accepted_hits;
+ alt_max_occ = map_cache_second.alt_max_occ;
} else {
nSeqs += 1;
}
@@ -900,19 +585,20 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
}
// If the read mapped to > maxReadOccs places, discard it
- if (accepted_hits.size() > alt_max_occ ) {
- accepted_hits.clear();
- } else if (!accepted_hits.empty()) {
+ if (accepted_hits_ptr->size() > alt_max_occ) {
+ accepted_hits_ptr->clear();
+ } else if (!accepted_hits_ptr->empty()) {
mapType = salmon::utils::MappingType::SINGLE_MAPPED;
}
+
alevin::types::AlevinCellBarcodeKmer bck;
bool barcode_ok = bck.fromChars(barcode);
// NOTE: Think if we should put decoy mappings in the RAD file
if (mapType == salmon::utils::MappingType::SINGLE_MAPPED) {
// num aln
- bw << static_cast<uint32_t>(accepted_hits.size());
+ bw << static_cast<uint32_t>(accepted_hits_ptr->size());
// bc
// if we can fit the barcode into an integer
@@ -941,7 +627,7 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
bw << umi;
}
- for (auto& aln : accepted_hits) {
+ for (auto& aln : (*accepted_hits_ptr)) {
uint32_t fw_mask = aln.is_fw ? 0x80000000 : 0x00000000;
bw << (aln.tid | fw_mask);
}
@@ -969,9 +655,9 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
bw << num_reads_in_chunk;
}
- validHits += accepted_hits.size();
- localNumAssignedFragments += (accepted_hits.size() > 0);
- localNumUniqueMappings += (accepted_hits.size() == 1) ? 1 : 0;
+ validHits += accepted_hits_ptr->size();
+ localNumAssignedFragments += (accepted_hits_ptr->size() > 0);
+ localNumUniqueMappings += (accepted_hits_ptr->size() == 1) ? 1 : 0;
locRead++;
++numObservedFragments;
jointHitGroup.clearAlignments();
@@ -1028,7 +714,7 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
}
numAssignedFragments += localNumAssignedFragments;
numUniqueMappings += localNumUniqueMappings;
- mstats.numDecoyFragments += numDecoyFrags;
+ mstats.numDecoyFragments += 0; // no decoys with pasc (yet?)
readExp.updateShortFrags(shortFragStats);
}
View it on GitLab: https://salsa.debian.org/med-team/salmon/-/commit/2fde979a6b705a0380d2428de0026ede19bd241d
--
View it on GitLab: https://salsa.debian.org/med-team/salmon/-/commit/2fde979a6b705a0380d2428de0026ede19bd241d
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20230308/623c51ef/attachment-0001.htm>
More information about the debian-med-commit
mailing list