[med-svn] [Git][med-team/kallisto][master] 7 commits: New upstream version 0.51.1+dfsg
Andreas Tille (@tille)
gitlab at salsa.debian.org
Sun Oct 13 16:23:14 BST 2024
Andreas Tille pushed to branch master at Debian Med / kallisto
Commits:
fab07f24 by Andreas Tille at 2024-10-13T17:09:54+02:00
New upstream version 0.51.1+dfsg
- - - - -
a0ac45c5 by Andreas Tille at 2024-10-13T17:09:54+02:00
New upstream version
- - - - -
5b06a039 by Andreas Tille at 2024-10-13T17:09:57+02:00
Update upstream source from tag 'upstream/0.51.1+dfsg'
Update to upstream version '0.51.1+dfsg'
with Debian dir 2ef0c7f320c130718513472579aabb90837032e6
- - - - -
a437f60a by Andreas Tille at 2024-10-13T17:09:57+02:00
Standards-Version: 4.7.0 (routine-update)
- - - - -
ef34b105 by Andreas Tille at 2024-10-13T15:10:01+00:00
Remove duplicate line from changelog.
Changes-By: lintian-brush
- - - - -
b24bfb4a by Andreas Tille at 2024-10-13T17:18:55+02:00
Remove patch Check_busOptions.seq_length_before_accessing.patch that is missing from debian/patches/series.
- - - - -
84c31f65 by Andreas Tille at 2024-10-13T17:19:34+02:00
Adapt patches
- - - - -
23 changed files:
- CMakeLists.txt
- debian/changelog
- debian/control
- − debian/patches/Check_busOptions.seq_length_before_accessing.patch
- debian/patches/use_debian_packaged_htslib.patch
- ext/bifrost/CMakeLists.txt
- ext/bifrost/src/BlockedBloomFilter.hpp
- ext/bifrost/src/CMakeLists.txt
- ext/bifrost/src/CompactedDBG.hpp
- ext/bifrost/src/StreamCounter.hpp
- src/CMakeLists.txt
- src/EMAlgorithm.h
- src/H5Writer.cpp
- src/KmerIndex.cpp
- src/KmerIndex.h
- src/MinCollector.cpp
- src/MinCollector.h
- src/PlaintextWriter.cpp
- src/PlaintextWriter.h
- src/ProcessReads.cpp
- src/ProcessReads.h
- src/common.h
- src/main.cpp
Changes:
=====================================
CMakeLists.txt
=====================================
@@ -4,6 +4,23 @@ project(kallisto)
include(GNUInstallDirs)
+if(NOT MAX_KMER_SIZE)
+set(MAX_KMER_SIZE "32")
+endif()
+
+set(DO_ENABLE_AVX2 "")
+set(DO_ENABLE_COMPILATION_ARCH "")
+if(ENABLE_AVX2 MATCHES "OFF")
+add_compile_definitions("ENABLE_AVX2=OFF")
+set(DO_ENABLE_AVX2 "-DENABLE_AVX2=OFF")
+endif(ENABLE_AVX2 MATCHES "OFF")
+if(COMPILATION_ARCH MATCHES "OFF")
+add_compile_definitions("COMPILATION_ARCH=OFF")
+set(DO_ENABLE_COMPILATION_ARCH "-DCOMPILATION_ARCH=OFF")
+endif(COMPILATION_ARCH MATCHES "OFF")
+
+add_compile_definitions("MAX_KMER_SIZE=${MAX_KMER_SIZE}")
+
option(USE_HDF5 "Compile with HDF5 support" OFF) #OFF by default
option(USE_BAM "Compile with HTSLIB support" OFF) # OFF by default
@@ -72,7 +89,7 @@ ExternalProject_Add(bifrost
PREFIX ${PROJECT_SOURCE_DIR}/ext/bifrost
SOURCE_DIR ${PROJECT_SOURCE_DIR}/ext/bifrost
BUILD_IN_SOURCE 1
- CONFIGURE_COMMAND mkdir -p build && cd build && cmake .. -DCMAKE_INSTALL_PREFIX=${PREFIX} -DCMAKE_CXX_FLAGS=${PROJECT_BIFROST_CMAKE_CXX_FLAGS}
+ CONFIGURE_COMMAND mkdir -p build && cd build && cmake .. -DMAX_KMER_SIZE=${MAX_KMER_SIZE} -DCMAKE_INSTALL_PREFIX=${PREFIX} -DCMAKE_CXX_FLAGS=${PROJECT_BIFROST_CMAKE_CXX_FLAGS} ${DO_ENABLE_AVX2} ${DO_ENABLE_COMPILATION_ARCH}
BUILD_COMMAND cd build && make
INSTALL_COMMAND ""
)
=====================================
debian/changelog
=====================================
@@ -1,11 +1,13 @@
-kallisto (0.50.1+dfsg-1) UNRELEASED; urgency=medium
+kallisto (0.51.1+dfsg-1) UNRELEASED; urgency=medium
[ Andreas Tille ]
* New upstream version
- * Standards-Version: 4.6.2 (routine-update)
+ * Standards-Version: 4.7.0 (routine-update)
* d/watch: Fix download name
* Add support for loongarch64 in d/control
Closes: #1057131
+ * Remove patch Check_busOptions.seq_length_before_accessing.patch that is
+ missing from debian/patches/series.
[ Michael R. Crusoe ]
* d/control: simplify architecture specifications by using provisions
@@ -15,7 +17,7 @@ kallisto (0.50.1+dfsg-1) UNRELEASED; urgency=medium
* Upstream has forked bifrost, use their copy instead.
* d/rules: need cmake option needed: USE_BAM.
- -- Andreas Tille <tille at debian.org> Wed, 08 Nov 2023 09:43:17 +0100
+ -- Andreas Tille <tille at debian.org> Sun, 13 Oct 2024 17:09:54 +0200
kallisto (0.48.0+dfsg-4) unstable; urgency=medium
=====================================
debian/control
=====================================
@@ -11,7 +11,7 @@ Build-Depends: debhelper-compat (= 13),
libhdf5-dev,
libhts-dev,
snakemake <!nocheck>,
-Standards-Version: 4.6.2
+Standards-Version: 4.7.0
Vcs-Browser: https://salsa.debian.org/med-team/kallisto
Vcs-Git: https://salsa.debian.org/med-team/kallisto.git
Homepage: https://pachterlab.github.io/kallisto
=====================================
debian/patches/Check_busOptions.seq_length_before_accessing.patch deleted
=====================================
@@ -1,33 +0,0 @@
-Description: Check busOptions.seq.empty() before trying to access it,
- to avoid a segfault. More details can be found at the upstream bug.
-Author: Pranav Ballaney <ballaneypranav at gmail.com>
-Bug: https://github.com/pachterlab/kallisto/issues/254
-Bug-Debian: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=960368
-Last-Update: 2020-05-16
----
-This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
---- a/src/ProcessReads.cpp
-+++ b/src/ProcessReads.cpp
-@@ -1408,6 +1408,13 @@ void BUSProcessor::processBuffer() {
- std::vector<int> l(jmax,0);
-
- bool singleSeq = busopt.seq.size() ==1 ;
-+
-+ if (mp.opt.busOptions.seq.empty()) {
-+ std::cerr << "No seq technology specified.\n";
-+ std::cerr << "If you believe this is a bug, please report using reportbug.";
-+ std::cerr.flush();
-+ exit(1);
-+ }
- const BUSOptionSubstr seqopt = busopt.seq.front();
-
-
-@@ -2145,7 +2152,7 @@ void AlnProcessor::processBufferGenome()
- rlen2 = seqs[si2].second;
- }
-
-- if (mp.opt.busOptions.seq.front().fileno == 1) {
-+ if (!mp.opt.busOptions.seq.empty() && mp.opt.busOptions.seq.front().fileno == 1) {
- std::swap(si1,si2);
- std::swap(rlen1,rlen2);
- }
=====================================
debian/patches/use_debian_packaged_htslib.patch
=====================================
@@ -2,9 +2,9 @@ Author: Andreas Tille <tille at debian.org>
Last-Update: Thu, 21 Mar 2019 15:53:04 +0100
Description: Adapt build system to Debian packaged libhts
---- kallisto.orig/CMakeLists.txt
-+++ kallisto/CMakeLists.txt
-@@ -54,8 +54,17 @@
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -71,8 +71,17 @@ ELSE(LINK MATCHES shared)
message("shared build")
ENDIF(LINK MATCHES static)
@@ -23,23 +23,7 @@ Description: Adapt build system to Debian packaged libhts
if (USE_BAM)
ExternalProject_Add(htslib
PREFIX ${PROJECT_SOURCE_DIR}/ext/htslib
-@@ -68,15 +77,6 @@
- )
- endif(USE_BAM)
-
--ExternalProject_Add(bifrost
-- PREFIX ${PROJECT_SOURCE_DIR}/ext/bifrost
-- SOURCE_DIR ${PROJECT_SOURCE_DIR}/ext/bifrost
-- BUILD_IN_SOURCE 1
-- CONFIGURE_COMMAND mkdir -p build && cd build && cmake .. -DCMAKE_INSTALL_PREFIX=${PREFIX} -DCMAKE_CXX_FLAGS=${PROJECT_BIFROST_CMAKE_CXX_FLAGS}
-- BUILD_COMMAND cd build && make
-- INSTALL_COMMAND ""
--)
--
- if (ZLIBNG)
- message("zlib-ng enabled.")
- ExternalProject_Add(zlib-ng
-@@ -92,13 +92,13 @@
+@@ -109,13 +118,13 @@ endif(ZLIBNG)
if (USE_BAM)
include_directories(${htslib_PREFIX}/src/htslib)
endif(USE_BAM)
@@ -54,9 +38,9 @@ Description: Adapt build system to Debian packaged libhts
# add_compile_options(-Wdeprecated-register)
add_subdirectory(src)
---- kallisto.orig/src/CMakeLists.txt
-+++ kallisto/src/CMakeLists.txt
-@@ -3,9 +3,11 @@
+--- a/src/CMakeLists.txt
++++ b/src/CMakeLists.txt
+@@ -3,9 +3,11 @@ file(GLOB headers *.h *.hpp)
list(REMOVE_ITEM sources main.cpp)
@@ -66,17 +50,17 @@ Description: Adapt build system to Debian packaged libhts
endif(USE_BAM)
+endif()
- add_compile_options(-Wno-subobject-linkage) # Suppress bifrost warning
-
-@@ -16,11 +18,16 @@
+ if(NOT MAX_KMER_SIZE)
+ set(MAX_KMER_SIZE "32")
+@@ -20,11 +22,16 @@ add_executable(kallisto main.cpp)
find_package( Threads REQUIRED )
ExternalProject_Get_Property(bifrost install_dir)
+if(!DEBIAN_BUILD)
if (USE_BAM)
- target_link_libraries(kallisto kallisto_core pthread ${CMAKE_CURRENT_SOURCE_DIR}/../ext/htslib/libhts.a ${install_dir}/build/src/libbifrost.a)
+ target_link_libraries(kallisto kallisto_core pthread ${CMAKE_CURRENT_SOURCE_DIR}/../ext/htslib/libhts.a ${install_dir}/build/src/libbifrost.a -DMAX_KMER_SIZE=${MAX_KMER_SIZE})
else()
- target_link_libraries(kallisto kallisto_core pthread ${install_dir}/build/src/libbifrost.a)
+ target_link_libraries(kallisto kallisto_core pthread ${install_dir}/build/src/libbifrost.a -DMAX_KMER_SIZE=${MAX_KMER_SIZE})
endif(USE_BAM)
+else()
+# FIXME: once bifrost is accepted this should be linked against the Debian packaged library
@@ -85,7 +69,7 @@ Description: Adapt build system to Debian packaged libhts
if(LINK MATCHES static)
set(BUILD_SHARED_LIBS OFF)
-@@ -84,4 +91,4 @@
+@@ -94,4 +101,4 @@ else()
endif(LINK MATCHES static)
=====================================
ext/bifrost/CMakeLists.txt
=====================================
@@ -5,7 +5,6 @@ project(Bifrost)
find_package(Threads REQUIRED)
# To enable a larger default k-mer size, replace MAX_KMER_SIZE with a larger multiple of 32: actual maximum k-mer size will be MAX_KMER_SIZE-1.
-SET(MAX_KMER_SIZE "32" CACHE STRING "MAX_KMER_SIZE")
SET(MAX_GMER_SIZE "${MAX_KMER_SIZE}" CACHE STRING "MAX_GMER_SIZE")
# Enable architecture optimizations
SET(COMPILATION_ARCH "native" CACHE STRING "COMPILATION_ARCH")
=====================================
ext/bifrost/src/BlockedBloomFilter.hpp
=====================================
@@ -3,6 +3,7 @@
#include <array>
#include <atomic>
+#include <limits>
#include <cmath>
#include <cstdlib>
#include <cstring>
=====================================
ext/bifrost/src/CMakeLists.txt
=====================================
@@ -26,7 +26,13 @@ target_link_libraries(bifrost_static ${ZLIB_LIBRARIES})
target_link_libraries(bifrost_dynamic ${ZLIB_LIBRARIES})
if (ZLIB_FOUND)
- include_directories(${ZLIB_INCLUDE_DIRS})
+ if (DEFINED ZLIB_INCLUDE_DIRS)
+ include_directories( ${ZLIB_INCLUDE_DIRS} )
+ elseif (DEFINED ZLIB_INCLUDE_DIR)
+ include_directories( ${ZLIB_INCLUDE_DIR} )
+ else()
+ message(FATAL_ERROR "zlib found but no include directories are set.")
+ endif()
else()
message(FATAL_ERROR "zlib not found. Required for to output files")
endif(ZLIB_FOUND)
=====================================
ext/bifrost/src/CompactedDBG.hpp
=====================================
@@ -1,6 +1,8 @@
#ifndef BIFROST_COMPACTED_DBG_HPP
#define BIFROST_COMPACTED_DBG_HPP
+#include <stddef.h>
+#include <limits>
#include <cmath>
#include <cstdlib>
#include <cstring>
=====================================
ext/bifrost/src/StreamCounter.hpp
=====================================
@@ -4,6 +4,7 @@
#include <sstream>
#include <stdint.h>
#include <string>
+#include <limits>
#include <cmath>
#include <fstream>
#include <assert.h>
=====================================
src/CMakeLists.txt
=====================================
@@ -7,6 +7,10 @@ if (USE_BAM)
include_directories(../ext/htslib)
endif(USE_BAM)
+if(NOT MAX_KMER_SIZE)
+set(MAX_KMER_SIZE "32")
+endif()
+
add_compile_options(-Wno-subobject-linkage) # Suppress bifrost warning
add_library(kallisto_core ${sources} ${headers})
@@ -17,9 +21,9 @@ add_executable(kallisto main.cpp)
find_package( Threads REQUIRED )
ExternalProject_Get_Property(bifrost install_dir)
if (USE_BAM)
-target_link_libraries(kallisto kallisto_core pthread ${CMAKE_CURRENT_SOURCE_DIR}/../ext/htslib/libhts.a ${install_dir}/build/src/libbifrost.a)
+target_link_libraries(kallisto kallisto_core pthread ${CMAKE_CURRENT_SOURCE_DIR}/../ext/htslib/libhts.a ${install_dir}/build/src/libbifrost.a -DMAX_KMER_SIZE=${MAX_KMER_SIZE})
else()
-target_link_libraries(kallisto kallisto_core pthread ${install_dir}/build/src/libbifrost.a)
+target_link_libraries(kallisto kallisto_core pthread ${install_dir}/build/src/libbifrost.a -DMAX_KMER_SIZE=${MAX_KMER_SIZE})
endif(USE_BAM)
if(LINK MATCHES static)
@@ -45,7 +49,13 @@ if (NOT ZLIBNG)
find_package( ZLIB REQUIRED )
if ( ZLIB_FOUND )
- include_directories( ${ZLIB_INCLUDE_DIRS} )
+ if (DEFINED ZLIB_INCLUDE_DIRS)
+ include_directories( ${ZLIB_INCLUDE_DIRS} )
+ elseif (DEFINED ZLIB_INCLUDE_DIR)
+ include_directories( ${ZLIB_INCLUDE_DIR} )
+ else()
+ message(FATAL_ERROR "zlib found but no include directories are set.")
+ endif()
target_link_libraries(kallisto kallisto_core ${ZLIB_LIBRARIES})
else()
message(FATAL_ERROR "zlib not found. Required for to output files" )
=====================================
src/EMAlgorithm.h
=====================================
@@ -85,7 +85,7 @@ struct EMAlgorithm {
if (alpha_.size() == priors.size()) {
alpha_.assign(priors.begin(), priors.end());
- } else {
+ } else if (priors.size() != 0) {
std::cerr << "[ em] number of priors does not match number of transcripts." << std::endl;
std::cerr << " defaulting to uniform priors." << std::endl;
@@ -108,6 +108,7 @@ struct EMAlgorithm {
}
int i;
+ if (!opt.long_read || (opt.long_read && opt.platform == "ONT")){
for (i = 0; i < n_iter; ++i) {
if (recomputeEffLen && (i == min_rounds || i == min_rounds + 500)) {
eff_lens_ = update_eff_lens(all_fl_means, tc_, index_, alpha_, eff_lens_, post_bias_, opt);
@@ -220,6 +221,140 @@ struct EMAlgorithm {
}
}
+ } else {
+ for (i = 0; i < n_iter; ++i) {
+ if (recomputeEffLen && (i == min_rounds || i == min_rounds + 500) && !opt.long_read) {
+ eff_lens_ = update_eff_lens(all_fl_means, tc_, index_, alpha_, eff_lens_, post_bias_, opt);
+ weight_map_ = calc_weights (tc_.counts, index_.ecmapinv, eff_lens_);
+ }
+
+ if (recomputeEffLen && (i == min_rounds || i % min_rounds == 0) && opt.long_read) {
+ weight_map_ = calc_weights (tc_.counts, index_.ecmapinv, eff_lens_);
+ }
+
+
+ /*** should this go after main loop?
+ for (const auto& it : ecmapinv_) {
+ if (it.first.cardinality() == 1) {
+ next_alpha[it.first.maximum()] = counts_[it.second];
+ //std::cerr << "cardinality 1 : " << it.first.maximum() << " it.second: " << it.second << " counts_[it.second]: " << counts_[it.second] << std::endl; std::cerr.flush();
+ }
+ }
+ ***/
+
+ for (const auto& it : ecmapinv_) {
+ if (it.first.cardinality() == 1) { // Individual transcript
+ //std::cerr << "Should always enter here with toy error free simulation" << std::endl; std::cerr.flush();
+ continue;
+ }
+
+ denom = 0.0;
+
+ if (counts_[it.second] == 0) {
+ continue;
+ }
+
+ // first, compute the denominator: a normalizer
+ // iterate over targets in EC map
+ auto& wv = weight_map_[it.second];
+
+ // everything in ecmap should be in weight_map
+ //assert( w_search != weight_map_.end() );
+ //assert( w_search->second.size() == ec_kv.second.size() );
+
+ // wv is weights vector
+ // trs is a vector of transcript ids
+
+ auto& r = it.first; //ecmap_.find(ec)->second;
+ auto numEC = r.cardinality();
+ uint32_t* trs = new uint32_t[numEC];
+ r.toUint32Array(trs);
+
+ for (auto t_it = 0; t_it < numEC; ++t_it) {
+ denom += alpha_[trs[t_it]] * wv[t_it];
+ }
+
+ if (denom < TOLERANCE) {
+ continue;
+ }
+
+ // compute the update step
+ auto countNorm = counts_[it.second] / denom;
+
+ //std::cerr << "countNorm: " << countNorm << std::endl; std::cerr.flush();
+ //std::cerr <<"numEC is " << numEC << std::endl; std::cerr.flush();
+ for (auto t_it = 0; t_it < numEC; ++t_it) {
+ //std::cerr <<"t_it is: " << t_it << ", alpha is: " << alpha_[trs[t_it]] << " wv[t_it] is: " << wv[t_it] << " trs[t_it] is:" << trs[t_it] << std::endl; std::cerr.flush();
+ next_alpha[trs[t_it]] += (wv[t_it] * alpha_[trs[t_it]]) * countNorm;
+ }
+
+ delete[] trs;
+ trs = nullptr;
+
+ }
+
+ // TODO: check for relative difference for convergence in EM
+
+ bool stopEM = false; //!finalRound && (i >= min_rounds); // false initially
+ //double maxChange = 0.0;
+ int chcount = 0;
+ for (int ec = 0; ec < num_trans_; ec++) {
+ if (next_alpha[ec] > alpha_change_limit && (std::fabs(next_alpha[ec] - alpha_[ec]) / next_alpha[ec]) > alpha_change) {
+ chcount++;
+ }
+
+ //if (stopEM && next_alpha[ec] >= alpha_limit) {
+
+ /* double reldiff = abs(next_alpha[ec]-alpha_[ec]) / next_alpha[ec];
+ if (reldiff >= alpha_change) {
+ stopEM = false;
+ }*/
+ //}
+
+ /*
+ if (next_alpha[ec] > alpha_limit) {
+ maxChange = std::max(maxChange,std::fabs(next_alpha[ec]-alpha_[ec]) / next_alpha[ec]);
+ }
+ */
+ // reassign alpha_ to next_alpha
+ alpha_[ec] = next_alpha[ec];
+
+ // clear all next_alpha values 0 for next iteration
+ next_alpha[ec] = 0.0;
+ }
+
+ //std::cout << chcount << std::endl;
+ if (chcount == 0 && i > min_rounds) {
+
+ stopEM=true;
+ }
+
+ if (finalRound) {
+ break;
+ }
+
+ // std::cout << maxChange << std::endl;
+ if (stopEM) {
+ finalRound = true;
+ alpha_before_zeroes_.resize( alpha_.size() );
+ for (int ec = 0; ec < num_trans_; ec++) {
+ alpha_before_zeroes_[ec] = alpha_[ec];
+ if (alpha_[ec] < alpha_limit/10.0) {
+ alpha_[ec] = 0.0;
+ }
+ }
+ }
+
+ }
+
+ //TRYING PLACING HERE INSTEAD
+ for (const auto& it : ecmapinv_) {
+ if (it.first.cardinality() == 1) {
+ alpha_[it.first.maximum()] += counts_[it.second];
+ //std::cerr << "cardinality 1 : " << it.first.maximum() << " it.second: " << it.second << " counts_[it.second]: " << counts_[it.second] << std::endl; std::cerr.flush();
+ }
+ }
+ }
// ran for the maximum number of iterations
if (n_iter == i) {
=====================================
src/H5Writer.cpp
=====================================
@@ -166,6 +166,7 @@ void H5Converter::write_aux() {
std::string(std::to_string(-1)),
kal_version_,
std::string(std::to_string(idx_version_)),
+ std::string("dummy k-mer length"),
start_time_,
call_
);
=====================================
src/KmerIndex.cpp
=====================================
@@ -10,6 +10,7 @@
#include <iostream>
#include <unordered_map>
#include <string>
+#include <set>
#include "ColoredCDBG.hpp"
// --aa option helper functions
@@ -231,6 +232,18 @@ std::pair<size_t,size_t> KmerIndex::getECInfo() const {
return std::make_pair(max_ec_len, cardinality_zero_encounters);
}
+// Begin Shading
+std::pair<std::string,std::string> shadedTargetName(std::string& name) {
+ if (name.find("_shade_") != std::string::npos) {
+ std::string name_header = "_shade_";
+ std::string tname = name.substr(0, name.find(name_header));
+ std::string variant = name.substr(name.find(name_header)+name_header.length(), name.size());
+ return std::make_pair(tname,variant); // Return the target name and the the associated shade
+ }
+ return std::make_pair("",""); // Not a shade
+}
+// End Shading
+
void KmerIndex::BuildTranscripts(const ProgramOptions& opt, std::ofstream& out) {
// read input
u_set_<std::string> unique_names;
@@ -354,6 +367,18 @@ void KmerIndex::BuildTranscripts(const ProgramOptions& opt, std::ofstream& out)
}
unique_names.insert(name);
target_names_.push_back(name);
+
+ // Begin Shading
+ auto shade_info = shadedTargetName(name);
+ if (shade_info.first != "") {
+ std::string tname = shade_info.first;
+ std::string variant = shade_info.second;
+ auto it = std::find(target_names_.begin(), target_names_.end(), tname);
+ if (it != target_names_.end()) {
+ shadeToColorTranscriptMap[target_names_.size()-1] = std::distance(target_names_.begin(), it);
+ }
+ }
+ // End Shading
}
}
@@ -399,6 +424,9 @@ void KmerIndex::BuildDistinguishingGraph(const ProgramOptions& opt, std::ofstrea
size_t num_seqs = 0;
int max_color = 0;
u_set_<int> external_input_names;
+ // Begin Shading
+ std::set<std::string> variants_set; // Ordered set to keep track of variants (i.e. colors with shades)
+ // End Shading
for (int i = 0; i < opt.transfasta.size(); i++) { // Currently, this should only be one file
auto fasta = opt.transfasta[i];
fp = opt.transfasta.size() == 1 && opt.transfasta[0] == "-" ? gzdopen(fileno(stdin), "r") : gzopen(fasta.c_str(), "r");
@@ -414,9 +442,23 @@ void KmerIndex::BuildDistinguishingGraph(const ProgramOptions& opt, std::ofstrea
continue;
}
int color = std::atoi(strname.c_str());
+ // Begin Shading
+ std::string variant;
+ auto shade_info = shadedTargetName(strname);
+ if (shade_info.first != "") {
+ std::string tname = shade_info.first;
+ variant = shade_info.second;
+ color = std::atoi(tname.c_str());
+ variants_set.insert(std::to_string(color) + "_shade_" + variant);
+ }
+ // End Shading
external_input_names.insert(color);
if (color > max_color) max_color = color;
- of << ">" << std::to_string(color) << "\n" << str << std::endl;
+ of << ">" << std::to_string(color);
+ // Begin Shading
+ if (!variant.empty()) of << "_shade_" << variant;
+ // End Shading
+ of << "\n" << str << std::endl;
num_seqs++;
}
gzclose(fp);
@@ -437,6 +479,16 @@ void KmerIndex::BuildDistinguishingGraph(const ProgramOptions& opt, std::ofstrea
target_names_.push_back(std::to_string(i));
target_lens_.push_back(k); // dummy length (k-mer size)
}
+ // Begin Shading
+ for (const auto& v : variants_set) {
+ num_trans++; // Each color-shade duo counts as an additional target
+ target_names_.push_back(v);
+ target_lens_.push_back(k); // dummy length (k-mer size)
+ }
+ if (num_trans != ncolors) {
+ std::cerr << "[build] Detected " << std::to_string(num_trans-ncolors) << " shades" << std::endl;
+ }
+ // End Shading
std::cerr << "[build] Building graph from k-mers" << std::endl;
BuildDeBruijnGraph(opt, tmp_file2, out);
@@ -449,6 +501,9 @@ void KmerIndex::BuildDistinguishingGraph(const ProgramOptions& opt, std::ofstrea
std::vector<std::vector<TRInfo> > trinfos(dbg.size());
std::ifstream infile_a(tmp_file2);
int current_color = 0;
+ // Begin Shading
+ std::string current_variant;
+ // End Shading
std::string line;
while (std::getline(infile_a, line)) {
if (line.length() == 0) {
@@ -458,6 +513,16 @@ void KmerIndex::BuildDistinguishingGraph(const ProgramOptions& opt, std::ofstrea
current_color = onlist_sequences.cardinality();
} else {
current_color = std::atoi(line.c_str()+1);
+ // Begin Shading
+ current_variant = "";
+ std::string name = line.substr(1);
+ auto shade_info = shadedTargetName(name);
+ if (shade_info.first != "") {
+ std::string tname = shade_info.first;
+ current_variant = shade_info.second;
+ current_color = std::atoi(tname.c_str());
+ }
+ // End Shading
}
continue;
}
@@ -481,8 +546,16 @@ void KmerIndex::BuildDistinguishingGraph(const ProgramOptions& opt, std::ofstrea
tr.pos = (proc-um.len) | (!um.strand ? sense : missense);
tr.start = um.dist;
tr.stop = um.dist + um.len;
-
trinfos[n->id].push_back(tr);
+
+ // Begin Shading
+ if (!current_variant.empty()) {
+ auto it = variants_set.find(std::to_string(current_color) + "_shade_" + current_variant);
+ assert(it != variants_set.end());
+ tr.trid = ncolors + std::distance(variants_set.begin(), it);
+ trinfos[n->id].push_back(tr);
+ }
+ // End Shading
}
}
infile_a.close();
@@ -995,6 +1068,13 @@ void KmerIndex::BuildEquivalenceClasses(const ProgramOptions& opt, const std::st
tr.stop = um.dist + um.len;
trinfos[n->id].push_back(tr);
+ // Begin Shading
+ auto it = shadeToColorTranscriptMap.find(tr.trid);
+ if (it != shadeToColorTranscriptMap.end()) {
+ tr.trid = shadeToColorTranscriptMap[tr.trid];
+ trinfos[n->id].push_back(tr); // Add the color of the original transcript as well
+ }
+ // End Shading
}
j++;
}
@@ -1020,6 +1100,11 @@ void KmerIndex::BuildEquivalenceClasses(const ProgramOptions& opt, const std::st
std::cerr << "[build] target de Bruijn graph has k-mer length " << dbg.getK() << " and minimizer length " << dbg.getG() << std::endl;
std::cerr << "[build] target de Bruijn graph has " << dbg.size() << " contigs and contains " << dbg.nbKmers() << " k-mers " << std::endl;
+ // Begin Shading
+ if (shadeToColorTranscriptMap.size() != 0) {
+ std::cerr << "[build] number of shades: " << std::to_string(shadeToColorTranscriptMap.size()) << std::endl;
+ }
+ // End Shading
}
void KmerIndex::PopulateMosaicECs(std::vector<std::vector<TRInfo> >& trinfos) {
@@ -1209,6 +1294,7 @@ void KmerIndex::write(const std::string& index_out, bool writeKmerTable, int thr
}
// 4. write number of targets
+ num_trans -= d_list.size();
out.write((char *)&num_trans, sizeof(num_trans));
// 5. write out target lengths
@@ -1417,6 +1503,19 @@ void KmerIndex::load(ProgramOptions& opt, bool loadKmerTable, bool loadDlist) {
in.read(buffer, tmp_size);
target_names_.push_back(std::string(buffer));
+ // Begin Shading
+ auto shade_info = shadedTargetName(target_names_[target_names_.size()-1]);
+ if (shade_info.first != "") {
+ std::string tname = shade_info.first;
+ std::string variant = shade_info.second;
+ auto it = std::find(target_names_.begin(), target_names_.end(), tname);
+ if (it != target_names_.end()) {
+ shadeToColorTranscriptMap[i] = std::distance(target_names_.begin(), it);
+ }
+ use_shade = true;
+ shade_sequences.add(i);
+ }
+ // End Shading
}
delete[] buffer;
@@ -1437,6 +1536,11 @@ void KmerIndex::load(ProgramOptions& opt, bool loadKmerTable, bool loadDlist) {
if (num_trans-onlist_sequences.cardinality() > 0) {
std::cerr << "[index] number of D-list k-mers: " << pretty_num(static_cast<size_t>(num_trans-onlist_sequences.cardinality())) << std::endl;
}
+ // Begin Shading
+ if (shadeToColorTranscriptMap.size() != 0) {
+ std::cerr << "[build] number of shades: " << std::to_string(shadeToColorTranscriptMap.size()) << std::endl;
+ }
+ // End Shading
in.close();
@@ -1593,6 +1697,11 @@ int KmerIndex::mapPair(const char *s1, int l1, const char *s2, int l2) const {
// post: v contains all equiv classes for the k-mers in s
void KmerIndex::match(const char *s, int l, std::vector<std::pair<const_UnitigMap<Node>, int>>& v, bool partial, bool cfc) const{
const Node* n;
+
+ // Begin Shading
+ if (use_shade) partial = false;
+ // End Shading
+ if (do_union) partial = false;
// TODO:
// Rework KmerIndex::match() such that it uses the following type of logic
@@ -1663,6 +1772,8 @@ void KmerIndex::match(const char *s, int l, std::vector<std::pair<const_UnitigMa
}
v.push_back({um, kit->second});
+
+ if (no_jump) continue;
// Find start and end of O.G. kallisto contig w.r.t. the bifrost-kallisto
// unitig
@@ -1828,6 +1939,238 @@ donejumping:
}
}
+// use: match_long(s,l,v)
+// pre: v is initialized
+// post: v contains all equiv classes for the k-mers in s
+double KmerIndex::match_long(const char *s, int l, std::vector<std::pair<const_UnitigMap<Node>, int>>& v, bool partial, bool cfc) const {
+ const Node* n;
+ double unmapped_ratio;
+
+ std::string s_string;
+ if (cfc) {
+ // translate nucleotide sequence to comma-free code (cfc)
+ s_string = nn_to_cfc(s, l);
+ s = s_string.c_str();
+ }
+ int empty_count = 0;
+
+ Roaring rtmp;
+ KmerIterator kit(s), kit_end;
+ size_t proc = 0;
+ while (kit != kit_end) { //should be + 2?
+ if (proc < l-k){
+ const_UnitigMap<Node> umU = dbg.findUnitig(s, proc, l);
+ if (!umU.isEmpty) {
+ int numK = 0;
+ do {
+ v.push_back({umU, proc});
+ numK++;
+ } while (numK < int(umU.len/k));
+ }
+ }
+
+ const_UnitigMap<Node> um = dbg.find(kit->first);
+ //const_UnitigMap<Node> um = dbg.findUnitig(s, proc, l);
+
+ n = um.getData();
+
+ int pos = kit->second;
+ if (um.isEmpty) {
+ empty_count++;
+ } else {
+
+ if (partial) {
+ if (rtmp.isEmpty()) {
+ const auto& rtmp2 = um.getData()->ec[um.dist].getIndices();
+ if (!rtmp2.isEmpty()) rtmp = std::move(rtmp2);
+ } else {
+ const auto& rtmp2 = um.getData()->ec[um.dist].getIndices();
+ if (!rtmp2.isEmpty()) rtmp &= rtmp2;
+ if (rtmp.isEmpty()) {
+ v.clear();
+ unmapped_ratio = (double)empty_count/(double)(l - k);
+ return unmapped_ratio;
+ }
+ }
+ }
+
+ v.push_back({um, kit->second});
+
+ // Find start and end of O.G. kallisto contig w.r.t. the bifrost-kallisto
+ // unitig
+ size_t contig_start = 0, contig_length = um.size - k + 1;
+ auto p = n->get_mc_contig(um.dist);
+ contig_start += p.first;
+ contig_length = p.second - contig_start;
+
+ // Looks like kallisto thinks that canonical kmer means forward strand?
+ //bool forward = (um.strand == (kit->first == kit->first.rep()));
+ bool forward = um.strand;
+ int dist = (forward) ? (contig_length - 1 - (um.dist - contig_start)) : um.dist - contig_start;
+
+ // see if we can skip ahead
+ if (dist >= 2) {
+ // where should we jump to?
+ int nextPos = pos+dist; // default jump
+
+ if (pos + dist >= l-k) { //should be + 1?
+ // if we can jump beyond the read, check the end
+ nextPos = l-k; //should be +1?
+ }
+
+ // check next position
+ KmerIterator kit2(kit);
+ kit2 += nextPos-pos;
+ if (kit2 != kit_end) { //(nextPos < l-k) { //should be +1?
+ const_UnitigMap<Node> um2 = dbg.find(kit2->first);
+ //const_UnitigMap<Node> um2 = dbg.findUnitig(s, nextPos, l);
+ bool found2 = false;
+ int found2pos = pos+dist;
+ if (um2.isEmpty) {
+ found2=true;
+ found2pos = pos;
+ } else if (um.isSameReferenceUnitig(um2) &&
+ n->ec[um.dist] == um2.getData()->ec[um2.dist]) {
+ // um and um2 are on the same unitig and also share the same EC
+ found2=true;
+ found2pos = pos+dist;
+ }
+ if (found2) {
+ // great, a match (or nothing) see if we can move the k-mer forward
+ if (found2pos >= l-k) { //should be +1?
+ v.push_back({um, l-k}); // push back a fake position //should be +2?
+ break; //
+ } else {
+ v.push_back({um, found2pos});
+ //trying incremental search
+ proc=found2pos;
+ kit = kit2; // move iterator to this new position
+ }
+ } else {
+ // this is weird, let's try the middle k-mer
+ bool foundMiddle = false;
+ if (dist > 4) {
+ int middlePos = (pos + nextPos)/2;
+ int middleContig = -1;
+ int found3pos = pos + dist; //middlePos+dist; //formerly pos+dist which is same as found2pos, but I think should be middlePos+dist
+ KmerIterator kit3(kit);
+ kit3 += middlePos-pos;
+
+ if (kit3 != kit_end) { //(found3pos < l-k) {
+ const_UnitigMap<Node> um3 = dbg.find(kit3->first);
+ //const_UnitigMap<Node> um3 = dbg.findUnitig(s, middlePos, l);
+ if (!um3.isEmpty) {
+ if (um.isSameReferenceUnitig(um3) &&
+ n->ec[um.dist] == um3.getData()->ec[um3.dist]) {
+ foundMiddle = true;
+ found3pos = middlePos;
+ } else if (um2.isSameReferenceUnitig(um3) &&
+ um2.getData()->ec[um2.dist] == um3.getData()->ec[um3.dist]) {
+ foundMiddle = true;
+ found3pos = pos+dist;
+ }
+ }
+
+ if (foundMiddle) {
+ if (partial) {
+ if (rtmp.isEmpty()) {
+ const auto& rtmp2 = um3.getData()->ec[um3.dist].getIndices();
+ if (!rtmp2.isEmpty()) rtmp = std::move(rtmp2);
+ } else {
+ const auto& rtmp2 = um3.getData()->ec[um3.dist].getIndices();
+ if (!rtmp2.isEmpty()) rtmp &= rtmp2;
+ if (rtmp.isEmpty()) {
+ v.clear();
+ unmapped_ratio = (double)empty_count/(double)(l - k);
+ return unmapped_ratio;
+ }
+ }
+ }
+ v.push_back({um3, found3pos});
+ if (nextPos >= l-k) { //should be +2?
+ break;
+ } else {
+ //trying incremental search
+ proc=found2pos;
+ kit = kit2;
+ }
+ }
+ }
+ }
+
+ if (!foundMiddle) {
+ ++proc;
+ ++kit;
+ // backup plan, let's play it safe and search incrementally for the rest, until nextStop
+ for (int j = 0; kit != kit_end; ++kit,++j) {
+ if (j==skip) {
+ j=0;
+ }
+ if (j==0) {
+ // need to check it
+ const_UnitigMap<Node> um4 = dbg.find(kit->first);
+ //const_UnitigMap<Node> um4 = dbg.findUnitig(s, proc, l);;
+ if (!um4.isEmpty) {
+ // if k-mer found
+ if (partial) {
+ if (rtmp.isEmpty()) {
+ const auto& rtmp2 = um4.getData()->ec[um4.dist].getIndices();
+ if (!rtmp2.isEmpty()) rtmp = std::move(rtmp2);
+ } else {
+ const auto& rtmp2 = um4.getData()->ec[um4.dist].getIndices();
+ if (!rtmp2.isEmpty()) rtmp &= rtmp2;
+ if (rtmp.isEmpty()) {
+ v.clear();
+ unmapped_ratio = (double)empty_count/(double)(l - k);
+ return unmapped_ratio;
+ }
+ }
+ }
+ v.push_back({um4, proc}); // add equivalence class, and position
+ }
+ }
+ if (kit->second >= nextPos) {
+ //continue;
+ //trying out the not breaking case to do a complete incremental search until kit_end
+ break;
+ }
+ ++proc;
+ }
+ //kit += (l-k)-nextPos;
+ //proc = l; //this is checking if jump logic works how I am suspecting it is working
+ }
+ }
+ }
+ //} //NOTE!!! When this if is outside of the check if nextPos is at the end of the kmer, then we allow for incrementally searching which is the correct behavior?
+ else {
+ // the sequence is messed up at this point, let's just take the match
+ //v.push_back({dbGraph.ecs[val.contig], l-k});
+ break;
+ }
+ } //adding this corresponding to NOTE!!!
+ }
+ kit++;
+ proc++;
+ }
+
+ // D-list:
+ KmerIterator kit_dlist(s);
+ if (d_list.size() > 0 && (v.size() > 0 || !partial)) {
+ for (int i = 0; kit_dlist != kit_end; ++i,++kit_dlist) {
+ // need to check it
+ bool is_in_dlist = (d_list.find((kit_dlist->first).rep()) != d_list.end());
+ if (is_in_dlist) {
+ v.push_back({um_dummy, kit_dlist->second});
+ break;
+ }
+ }
+ }
+
+ //std::cerr << "l: " << l << " empty_count: " << empty_count << std::endl;
+ unmapped_ratio = (double)empty_count/(double)(l - k);
+ return unmapped_ratio;
+}
+
std::pair<int,bool> KmerIndex::findPosition(int tr, Kmer km, int p) const{
const_UnitigMap<Node> um = dbg.find(km);
if (!um.isEmpty) {
=====================================
src/KmerIndex.h
=====================================
@@ -77,11 +77,17 @@ struct KmerIndex {
//LoadTranscripts(opt.transfasta);
load_positional_info = opt.bias || opt.pseudobam || opt.genomebam || !opt.single_overhang;
dfk_onlist = opt.dfk_onlist;
+ do_union = opt.do_union;
+ no_jump = opt.no_jump;
+ // Begin Shading
+ use_shade = false;
+ // End Shading
}
~KmerIndex() {}
std::pair<size_t,size_t> getECInfo() const; // Get max EC size encountered and second element is the number of nodes in which an EC is empty (b/c it was discarded)
+ double match_long(const char *s, int l, std::vector<std::pair<const_UnitigMap<Node>, int>>& v, bool partial = false, bool cfc = false) const;
void match(const char *s, int l, std::vector<std::pair<const_UnitigMap<Node>, int>>& v, bool partial = false, bool cfc = false) const;
// bool matchEnd(const char *s, int l, std::vector<std::pair<int, int>>& v, int p) const;
@@ -130,6 +136,8 @@ struct KmerIndex {
std::vector<std::string> target_names_;
std::vector<std::string> target_seqs_; // populated on demand
bool dfk_onlist; // If we want to not use D-list in intersecting ECs
+ bool do_union; // If we want to do "pseudoalignment" via a "union" rather than an "intersection"
+ bool no_jump; // If we want to skip the jumping logic during pseudoalignment
bool target_seqs_loaded;
bool load_positional_info; // when should we load positional info in addition to strandedness
@@ -140,6 +148,13 @@ struct KmerIndex {
u_set_<Kmer, KmerHash> d_list;
Kmer dummy_dfk;
const_UnitigMap<Node> um_dummy;
+
+ // Begin Shading
+ // Here, we use the concepts of "shades" as proposed by in Ornaments by Adduri & Kim, 2024 for bias-corrected allele-specific expression estimation
+ std::unordered_map<int, int> shadeToColorTranscriptMap;
+ Roaring shade_sequences;
+ bool use_shade;
+ // End Shading
};
#endif // KALLISTO_KMERINDEX_H
=====================================
src/MinCollector.cpp
=====================================
@@ -118,10 +118,56 @@ int MinCollector::intersectKmersCFC(std::vector<std::pair<const_UnitigMap<Node>,
return 1;
}
-int MinCollector::intersectKmers(std::vector<std::pair<const_UnitigMap<Node>, int32_t>>& v1,
+int MinCollector::modeKmers(std::vector<std::pair<const_UnitigMap<Node>, int32_t>>& v1,
std::vector<std::pair<const_UnitigMap<Node>, int32_t>>& v2, bool nonpaired, Roaring& r) const {
Roaring u1 = intersectECs(v1);
+ if (u1.isEmpty()) { u1 = modeECs(v1); }
+
Roaring u2 = intersectECs(v2);
+ if (u2.isEmpty()) { u2 = modeECs(v2); }
+
+ if (u1.isEmpty() && u2.isEmpty()) {
+ return -1;
+ }
+
+ // non-strict intersection.
+ if (u1.isEmpty()) {
+ if (v1.empty()) {
+ r = u2;
+ } else {
+ return -1;
+ }
+ } else if (u2.isEmpty()) {
+ if (v2.empty()) {
+ r = u1;
+ } else {
+ return -1;
+ }
+ } else {
+ if (index.dfk_onlist) { // In case we want to not intersect D-list targets
+ includeDList(u1, u2, index.onlist_sequences);
+ }
+ r = u1 & u2;
+ }
+
+ if (r.isEmpty()) {
+ return -1;
+ }
+ return 1;
+}
+
+
+int MinCollector::intersectKmers(std::vector<std::pair<const_UnitigMap<Node>, int32_t>>& v1,
+ std::vector<std::pair<const_UnitigMap<Node>, int32_t>>& v2, bool nonpaired, Roaring& r) const {
+ Roaring u1, u2;
+ if (!index.do_union) {
+ u1 = intersectECs(v1);
+ u2 = intersectECs(v2);
+ } else {
+ u1 = unionECs(v1);
+ u2 = unionECs(v2);
+ }
+
if (u1.isEmpty() && u2.isEmpty()) {
return -1;
@@ -144,12 +190,30 @@ int MinCollector::intersectKmers(std::vector<std::pair<const_UnitigMap<Node>, in
if (index.dfk_onlist) { // In case we want to not intersect D-list targets
includeDList(u1, u2, index.onlist_sequences);
}
+ // Begin Shading
+ if (index.use_shade) u1 = u1 - index.shade_sequences;
+ if (index.use_shade) u2 = u2 - index.shade_sequences;
+ // End Shading
r = u1 & u2;
}
if (r.isEmpty()) {
return -1;
}
+ // Begin Shading
+ if (index.use_shade) {
+ // Take the union of the shades
+ u1 = unionECs(v1);
+ u2 = unionECs(v2);
+ Roaring r_shade = (u1 | u2) & index.shade_sequences;
+ Roaring r_shade_final;
+ for (auto shade : r_shade) { // Make sure the shades correspond to the targets in the intersection
+ auto color = index.shadeToColorTranscriptMap[shade];
+ if (r.contains(color)) r_shade_final.add(shade);
+ }
+ r |= r_shade_final; // Add the shades to the equivalence class
+ }
+ // End Shading
return 1;
}
@@ -216,7 +280,81 @@ struct ComparePairsBySecond {
}
};
-Roaring MinCollector::intersectECs(std::vector<std::pair<const_UnitigMap<Node>, int32_t>>& v) const {
+Roaring MinCollector::modeECs(std::vector<std::pair<const_UnitigMap<Node>, int32_t>>& v) const {
+ Roaring mode;
+ if (v.empty()) {
+ return mode;
+ }
+/***
+ sort(v.begin(), v.end(), [&](const std::pair<const_UnitigMap<Node>, int>& a, const std::pair<const_UnitigMap<Node>, int>& b)
+ {
+ if (a.first.isSameReferenceUnitig(b.first) &&
+ a.first.getData()->ec[a.first.dist] == b.first.getData()->ec[b.first.dist]) {
+ return a.second < b.second;
+ } else {
+ return a.first.getData()->id < b.first.getData()->id;
+ }
+ }); // sort by contig, and then first position
+***/
+
+ mode = v[0].first.getData()->ec[v[0].first.dist].getIndices();
+ bool found_nonempty = !mode.isEmpty();
+ bool modeMultiMapping = false;
+ Roaring lastEC = mode;
+ Roaring ec;
+ int modeCount = 0, curCount = 0;
+ for (int i = 1; i < v.size(); i++) {
+ // Find a non-empty EC before we start taking the intersection
+ if (!found_nonempty) {
+ mode = v[i].first.getData()->ec[v[i].first.dist].getIndices();
+ found_nonempty = !mode.isEmpty();
+ if (found_nonempty && mode.cardinality() == 1 ) {
+ modeMultiMapping = true;
+ }
+ }
+ if (!v[i].first.isSameReferenceUnitig(v[i-1].first) ||
+ !(v[i].first.getData()->ec[v[i].first.dist] == v[i-1].first.getData()->ec[v[i-1].first.dist])) {
+ ec = v[i].first.getData()->ec[v[i].first.dist].getIndices();
+ if (ec == lastEC && !ec.isEmpty()) {
+ curCount += 1;
+ }
+
+ // Don't intersect empty EC (because of thresholding)
+ if (!(ec == lastEC) && !ec.isEmpty()) {
+ if (index.dfk_onlist) { // In case we want to not intersect D-list targets
+ includeDList(mode, ec, index.onlist_sequences);
+ }
+ if (curCount > modeCount && (ec.cardinality() == 1 || modeMultiMapping)) {
+ if (ec.cardinality() == 1) {
+ modeMultiMapping = false;
+ }
+ mode = std::move(lastEC);
+ modeCount = curCount;
+ //curCount = 0; //Technically, not correct mode, but for some reason was giving better results than mode...
+ }
+ curCount = 0;
+ lastEC = std::move(ec);
+ }
+ }
+ }
+ // find the range of support
+ int minpos = std::numeric_limits<int>::max();
+ int maxpos = 0;
+ for (auto& x : v) {
+ minpos = std::min(minpos, x.second);
+ maxpos = std::max(maxpos, x.second);
+ }
+ if ((maxpos-minpos + k) < min_range) {
+ return {};
+ }
+ if (modeCount > 0) {
+ return mode;
+ } else {
+ return {};
+ }
+}
+
+Roaring MinCollector::intersectECs_long(std::vector<std::pair<const_UnitigMap<Node>, int32_t>>& v) const {
Roaring r;
if (v.empty()) {
return r;
@@ -236,6 +374,7 @@ Roaring MinCollector::intersectECs(std::vector<std::pair<const_UnitigMap<Node>,
bool found_nonempty = !r.isEmpty();
Roaring lastEC = r;
Roaring ec;
+ int curCount = 0;
for (int i = 1; i < v.size(); i++) {
@@ -245,10 +384,86 @@ Roaring MinCollector::intersectECs(std::vector<std::pair<const_UnitigMap<Node>,
found_nonempty = !r.isEmpty();
}
+ if (!v[i].first.isSameReferenceUnitig(v[i-1].first) ||
+ !(v[i].first.getData()->ec[v[i].first.dist] == v[i-1].first.getData()->ec[v[i-1].first.dist])) {
+ curCount++;
+ ec = v[i].first.getData()->ec[v[i].first.dist].getIndices();
+
+ // Don't intersect empty EC (because of thresholding)
+ if (!(ec == lastEC) && !ec.isEmpty()) {
+ if (index.dfk_onlist) { // In case we want to not intersect D-list targets
+ includeDList(r, ec, index.onlist_sequences);
+ }
+ if (curCount > 1) {
+ r &= ec;
+ }
+ if (r.isEmpty()) {
+ return r;
+ }
+ lastEC = std::move(ec);
+ curCount = 0;
+ }
+ }
+ }
+
+ // find the range of support
+ int minpos = std::numeric_limits<int>::max();
+ int maxpos = 0;
+
+ for (auto& x : v) {
+ minpos = std::min(minpos, x.second);
+ maxpos = std::max(maxpos, x.second);
+ }
+
+ if ((maxpos-minpos + k) < min_range) {
+ return {};
+ }
+
+ return r;
+}
+
+Roaring MinCollector::intersectECs(std::vector<std::pair<const_UnitigMap<Node>, int32_t>>& v) const {
+ Roaring r;
+ if (v.empty()) {
+ return r;
+ }
+ sort(v.begin(), v.end(), [&](const std::pair<const_UnitigMap<Node>, int>& a, const std::pair<const_UnitigMap<Node>, int>& b)
+ {
+ if (a.first.isSameReferenceUnitig(b.first) &&
+ a.first.getData()->ec[a.first.dist] == b.first.getData()->ec[b.first.dist]) {
+ return a.second < b.second;
+ } else {
+ return a.first.getData()->id < b.first.getData()->id;
+ }
+ }); // sort by contig, and then first position
+
+
+ r = v[0].first.getData()->ec[v[0].first.dist].getIndices();
+ // Begin Shading
+ if (index.use_shade) r = r - index.shade_sequences;
+ // End Shading
+ bool found_nonempty = !r.isEmpty();
+ Roaring lastEC = r;
+ Roaring ec;
+
+ for (int i = 1; i < v.size(); i++) {
+
+ // Find a non-empty EC before we start taking the intersection
+ if (!found_nonempty) {
+ r = v[i].first.getData()->ec[v[i].first.dist].getIndices();
+ // Begin Shading
+ if (index.use_shade) r = r - index.shade_sequences;
+ // End Shading
+ found_nonempty = !r.isEmpty();
+ }
+
if (!v[i].first.isSameReferenceUnitig(v[i-1].first) ||
!(v[i].first.getData()->ec[v[i].first.dist] == v[i-1].first.getData()->ec[v[i-1].first.dist])) {
ec = v[i].first.getData()->ec[v[i].first.dist].getIndices();
+ // Begin Shading
+ if (index.use_shade) ec = ec - index.shade_sequences;
+ // End Shading
// Don't intersect empty EC (because of thresholding)
if (!(ec == lastEC) && !ec.isEmpty()) {
@@ -276,7 +491,57 @@ Roaring MinCollector::intersectECs(std::vector<std::pair<const_UnitigMap<Node>,
if ((maxpos-minpos + k) < min_range) {
return {};
}
+
+ return r;
+}
+Roaring MinCollector::unionECs(std::vector<std::pair<const_UnitigMap<Node>, int32_t>>& v) const {
+ Roaring r;
+ if (v.empty()) {
+ return r;
+ }
+ sort(v.begin(), v.end(), [&](const std::pair<const_UnitigMap<Node>, int>& a, const std::pair<const_UnitigMap<Node>, int>& b)
+ {
+ if (a.first.isSameReferenceUnitig(b.first) &&
+ a.first.getData()->ec[a.first.dist] == b.first.getData()->ec[b.first.dist]) {
+ return a.second < b.second;
+ } else {
+ return a.first.getData()->id < b.first.getData()->id;
+ }
+ }); // sort by contig, and then first position
+
+ r = v[0].first.getData()->ec[v[0].first.dist].getIndices();
+ bool found_nonempty = !r.isEmpty();
+ Roaring lastEC = r;
+ Roaring ec;
+
+ for (int i = 1; i < v.size(); i++) {
+ // Find a non-empty EC before we start taking the intersection
+ if (!found_nonempty) {
+ r = v[i].first.getData()->ec[v[i].first.dist].getIndices();
+ found_nonempty = !r.isEmpty();
+ }
+
+ if (!v[i].first.isSameReferenceUnitig(v[i-1].first) ||
+ !(v[i].first.getData()->ec[v[i].first.dist] == v[i-1].first.getData()->ec[v[i-1].first.dist])) {
+ ec = v[i].first.getData()->ec[v[i].first.dist].getIndices();
+ r |= ec;
+ }
+ }
+
+ // find the range of support
+ int minpos = std::numeric_limits<int>::max();
+ int maxpos = 0;
+
+ for (auto& x : v) {
+ minpos = std::min(minpos, x.second);
+ maxpos = std::max(maxpos, x.second);
+ }
+
+ if ((maxpos-minpos + k) < min_range) {
+ return {};
+ }
+
return r;
}
=====================================
src/MinCollector.h
=====================================
@@ -21,6 +21,9 @@ struct MinCollector {
index(ind),
counts(index.ecmapinv.size(), 0),
flens(MAX_FRAG_LEN),
+ unmapped_list(3000000),
+ flens_lr(index.target_lens_.size()),
+ flens_lr_c(index.target_lens_.size()),
bias3(4096),
bias5(4096),
min_range(opt.min_range),
@@ -51,6 +54,9 @@ struct MinCollector {
int decreaseCount(const int ec);
Roaring intersectECs(std::vector<std::pair<const_UnitigMap<Node>, int32_t>>& v) const;
+ Roaring intersectECs_long(std::vector<std::pair<const_UnitigMap<Node>, int32_t>>& v) const;
+ Roaring unionECs(std::vector<std::pair<const_UnitigMap<Node>, int32_t>>& v) const;
+ Roaring modeECs(std::vector<std::pair<const_UnitigMap<Node>, int32_t>>& v) const;
int intersectKmersCFC(std::vector<std::pair<const_UnitigMap<Node>, int32_t>>& v1,
std::vector<std::pair<const_UnitigMap<Node>, int32_t>>& v3,
std::vector<std::pair<const_UnitigMap<Node>, int32_t>>& v4,
@@ -59,6 +65,8 @@ struct MinCollector {
std::vector<std::pair<const_UnitigMap<Node>, int32_t>>& v7, Roaring& r) const;
int intersectKmers(std::vector<std::pair<const_UnitigMap<Node>, int32_t>>& v1,
std::vector<std::pair<const_UnitigMap<Node>, int32_t>>& v2, bool nonpaired, Roaring& r) const;
+ int modeKmers(std::vector<std::pair<const_UnitigMap<Node>, int32_t>>& v1,
+ std::vector<std::pair<const_UnitigMap<Node>, int32_t>>& v2, bool nonpaired, Roaring& r) const;
int findEC(const std::vector<int32_t>& u) const;
@@ -88,6 +96,9 @@ struct MinCollector {
KmerIndex& index;
std::vector<uint32_t> counts;
std::vector<uint32_t> flens;
+ std::vector<double> unmapped_list;
+ std::vector<uint32_t> flens_lr;
+ std::vector<uint32_t> flens_lr_c;
std::vector<int32_t> bias3, bias5;
int min_range;
int k;
=====================================
src/PlaintextWriter.cpp
=====================================
@@ -146,6 +146,7 @@ void plaintext_aux(
const std::string& n_unique,
const std::string& version,
const std::string& index_v,
+ const std::string& index_k,
const std::string& start_time,
const std::string& call,
const std::string& cardinality_clashes) {
@@ -184,6 +185,7 @@ void plaintext_aux(
to_json("p_unique", p_uniq_s, false) << std::endl <<
to_json("kallisto_version", version, true) << std::endl <<
to_json("index_version", index_v, false) << std::endl <<
+ to_json("k-mer length", index_k, false) << std::endl <<
to_json("start_time", start_time, true) << std::endl;
if (cardinality_clashes != "") {
of << to_json("call", call, true) << std::endl;
=====================================
src/PlaintextWriter.h
=====================================
@@ -42,6 +42,7 @@ void plaintext_aux(
const std::string& n_unique,
const std::string& version,
const std::string& index_v,
+ const std::string& index_k,
const std::string& start_time,
const std::string& call,
const std::string& cardinality_clashes="");
=====================================
src/ProcessReads.cpp
=====================================
@@ -58,7 +58,28 @@ std::pair<const_UnitigMap<Node>, int> findFirstMappingKmer(const std::vector<std
return std::make_pair(um, p);
}
-void doStrandSpecificity(Roaring& u, const ProgramOptions::StrandType strand, const std::vector<std::pair<const_UnitigMap<Node>, int32_t> >& v, const std::vector<std::pair<const_UnitigMap<Node>, int32_t> >& v2) {
+void doStrandSpecificity(Roaring& u, const ProgramOptions::StrandType strand, const std::vector<std::pair<const_UnitigMap<Node>, int32_t> >& v, const std::vector<std::pair<const_UnitigMap<Node>, int32_t> >& v2, bool comprehensive = false) {
+ if (comprehensive) { // Comprehensive means we check every position's strand specificity (not used in standard usage)
+ Roaring final_u;
+ for (int i = 0; i < v.size(); i++) {
+ Roaring u_ = u; // for union and shades and no-jump
+ std::vector<std::pair<const_UnitigMap<Node>, int32_t> > v_;
+ std::vector<std::pair<const_UnitigMap<Node>, int32_t> > v2_;
+ v_.push_back(v[i]);
+ doStrandSpecificity(u_, strand, v_, v2, false);
+ final_u |= u_;
+ }
+ for (int i = 0; i < v2.size(); i++) {
+ Roaring u_ = u; // for union and shades and no-jump
+ std::vector<std::pair<const_UnitigMap<Node>, int32_t> > v_;
+ std::vector<std::pair<const_UnitigMap<Node>, int32_t> > v2_;
+ v2_.push_back(v2[i]);
+ doStrandSpecificity(u_, strand, v_, v2, false);
+ final_u |= u_;
+ }
+ u = final_u;
+ return;
+ }
int p = -1;
const_UnitigMap<Node> um;
Roaring vtmp;
@@ -121,10 +142,12 @@ int64_t ProcessBatchReads(MasterProcessor& MP, const ProgramOptions& opt) {
size_t numreads = 0;
size_t nummapped = 0;
- bool paired = !opt.single_end;
+ bool paired = !opt.single_end && !opt.long_read;
if (paired) {
std::cerr << "[quant] running in paired-end mode" << std::endl;
+ } else if (opt.long_read) {
+ std::cerr << "[quant] running in long read mode" << std::endl;
} else {
std::cerr << "[quant] running in single-end mode" << std::endl;
}
@@ -171,11 +194,13 @@ int64_t ProcessReads(MasterProcessor& MP, const ProgramOptions& opt) {
//int tlencount = (opt.fld == 0.0) ? 10000 : 0;
size_t numreads = 0;
size_t nummapped = 0;
- bool paired = !opt.single_end;
+ bool paired = !opt.single_end && !opt.long_read;
if (paired) {
std::cerr << "[quant] running in paired-end mode" << std::endl;
+ } else if (opt.long_read) {
+ std::cerr << "[quant] running in long read mode" << std::endl;
} else {
std::cerr << "[quant] running in single-end mode" << std::endl;
}
@@ -214,7 +239,6 @@ int64_t ProcessReads(MasterProcessor& MP, const ProgramOptions& opt) {
if (nummapped == 0) {
std::cerr << "[~warn] no reads pseudoaligned." << std::endl;
}
-
// write output to outdir
if (opt.write_index) {
std::string outfile = opt.output + "/counts.txt";
@@ -224,7 +248,6 @@ int64_t ProcessReads(MasterProcessor& MP, const ProgramOptions& opt) {
of.close();
}
-
return numreads;
}
@@ -237,8 +260,15 @@ int64_t ProcessBUSReads(MasterProcessor& MP, const ProgramOptions& opt) {
//int tlencount = (opt.fld == 0.0) ? 10000 : 0;
size_t numreads = 0;
size_t nummapped = 0;
- bool paired = !opt.single_end;
+ bool paired = !opt.single_end && !opt.long_read;
+ if (paired) {
+ std::cerr << "[quant] running in paired-end mode" << std::endl;
+ } else if (opt.long_read) {
+ std::cerr << "[quant] running in long read mode" << std::endl;
+ } else {
+ std::cerr << "[quant] running in single-end mode" << std::endl;
+ }
for (int i = 0, si=1; i < opt.files.size(); si++) {
auto& busopt = opt.busOptions;
std::cerr << "[quant] will process sample " << si<< ": ";
@@ -268,7 +298,6 @@ int64_t ProcessBUSReads(MasterProcessor& MP, const ProgramOptions& opt) {
if (nummapped == 0) {
std::cerr << "[~warn] no reads pseudoaligned." << std::endl;
}
-
return numreads;
}
@@ -358,7 +387,7 @@ void MasterProcessor::processReads() {
batchSR.files = opt.batch_files[id+i];
batchSR.nfiles = opt.batch_files[id+i].size();
batchSR.reserveNfiles(opt.batch_files[id+i].size());
- batchSR.paired = !opt.single_end;
+ batchSR.paired = !opt.single_end && !opt.long_read;
FSRs.push_back(std::move(batchSR));
}
@@ -394,7 +423,7 @@ void MasterProcessor::processReads() {
void MasterProcessor::update(const std::vector<uint32_t>& c, const std::vector<Roaring> &newEcs,
std::vector<std::pair<Roaring, std::string>>& ec_umi, std::vector<std::pair<Roaring, std::string>> &new_ec_umi,
- int n, std::vector<int>& flens, std::vector<int> &bias, const PseudoAlignmentBatch& pseudobatch, std::vector<BUSData> &bv, std::vector<std::pair<BUSData, Roaring>> newBP, int *bc_len, int *umi_len, int id, int local_id) {
+ int n, std::vector<int>& flens, std::vector<double>& unmapped_list, std::vector<int>& flens_lr, std::vector<int>& flens_lr_c, std::vector<int> &bias, const PseudoAlignmentBatch& pseudobatch, std::vector<BUSData> &bv, std::vector<std::pair<BUSData, Roaring>> newBP, int *bc_len, int *umi_len, int id, int local_id) {
// acquire the writer lock
std::lock_guard<std::mutex> lock(this->writer_lock);
size_t num_new_ecs = 0;
@@ -470,6 +499,45 @@ void MasterProcessor::update(const std::vector<uint32_t>& c, const std::vector<R
tlencount += local_tlencount;
}
}
+ if (opt.unmapped) {
+ if (opt.batch_mode) {
+ /***for (int i = 0; i < batchUnmapped_list[id].size(); i++) {
+ std::cerr << batchUnmapped_list[id][i] << "," << std::endl;
+ tc.unmapped_list.push_back(batchUnmapped_list[id][i]);
+ }***/
+ for (int i = 0; i < unmapped_list.size(); i++) {
+ //std::cerr << unmapped_list[i] << "," << std::endl;
+ tc.unmapped_list.push_back(unmapped_list[i]);
+ }
+ } else {
+ for (int i = 0; i < unmapped_list.size(); i++) {
+ tc.unmapped_list.push_back(unmapped_list[i]);
+ }
+ }
+ }
+
+ if (!flens_lr.empty()) {
+ if (opt.batch_mode) {
+ auto &bflen_lr = batchFlens_lr[id];
+ auto &bflen_lr_c = batchFlens_lr_c[id];
+ auto &tcount = batchFlens_lr_c[id];
+ for (int i = 0; i < flens_lr.size(); i++) {
+ flens_lr[i] += bflen_lr[i];
+ flens_lr_c[i] += bflen_lr_c[i];
+ tcount[i] += bflen_lr_c[i];
+ }
+
+ } else {
+ auto &local_tlencount = flens_lr_c;
+ for (int i = 0; i < flens_lr.size(); i++) {
+ tc.flens_lr[i] += flens_lr[i];
+ tc.flens_lr_c[i] += flens_lr_c[i];
+ local_tlencount[i] += flens_lr_c[i];
+ tlencount += local_tlencount[i];
+ }
+ }
+ }
+
if (!bias.empty()) {
int local_biasCount = 0;
@@ -562,7 +630,7 @@ void MasterProcessor::update(const std::vector<uint32_t>& c, const std::vector<R
attempt_transfer_ecs(num_new_ecs);
numreads += n;
-
+
//if (opt.verbose) {
counter += n;
if (counter >= 1000000) {
@@ -800,9 +868,16 @@ void MasterProcessor::outputFusion(const std::stringstream &o) {
}
}
+void MasterProcessor::outputNovel(const std::stringstream &o) {
+ std::string os = o.str();
+ if (!os.empty()) {
+ std::lock_guard<std::mutex> lock(this->writer_lock);
+ ofusion << os << "\n";
+ }
+}
ReadProcessor::ReadProcessor(const KmerIndex& index, const ProgramOptions& opt, const MinCollector& tc, MasterProcessor& mp, int _id, int _local_id) :
- paired(!opt.single_end), tc(tc), index(index), mp(mp), id(_id), local_id(_local_id) {
+ paired(!opt.single_end && !opt.long_read), tc(tc), index(index), mp(mp), id(_id), local_id(_local_id) {
// initialize buffer
bufsize = mp.bufsize;
buffer = new char[bufsize];
@@ -812,11 +887,12 @@ ReadProcessor::ReadProcessor(const KmerIndex& index, const ProgramOptions& opt,
batchSR.files = opt.batch_files[id];
batchSR.nfiles = opt.batch_files[id].size();
batchSR.reserveNfiles(opt.batch_files[id].size());
- batchSR.paired = !opt.single_end;
+ batchSR.paired = !opt.single_end && !opt.long_read;
}
seqs.reserve(bufsize/50);
newEcs.reserve(1000);
+ if (opt.unmapped) unmapped_list.reserve(seqs.size());
counts.reserve((int) (tc.counts.size() * 1.25));
clear();
}
@@ -837,6 +913,9 @@ ReadProcessor::ReadProcessor(ReadProcessor && o) :
umis(std::move(o.umis)),
newEcs(std::move(o.newEcs)),
flens(std::move(o.flens)),
+ unmapped_list(std::move(o.unmapped_list)),
+ flens_lr(std::move(o.flens_lr)),
+ flens_lr_c(std::move(o.flens_lr_c)),
bias5(std::move(o.bias5)),
batchSR(std::move(o.batchSR)),
counts(std::move(o.counts)) {
@@ -881,7 +960,7 @@ void ReadProcessor::operator()() {
// update the results, MP acquires the lock
std::vector<BUSData> tmp_v{};
- mp.update(counts, newEcs, ec_umi, new_ec_umi, paired ? seqs.size()/2 : seqs.size(), flens, bias5, pseudobatch, tmp_v, std::vector<std::pair<BUSData, Roaring>>{}, nullptr, nullptr, id, local_id);
+ mp.update(counts, newEcs, ec_umi, new_ec_umi, paired ? seqs.size()/2 : seqs.size(), flens, unmapped_list, flens_lr, flens_lr_c, bias5, pseudobatch, tmp_v, std::vector<std::pair<BUSData, Roaring>>{}, nullptr, nullptr, id, local_id);
clear();
}
}
@@ -899,20 +978,41 @@ void ReadProcessor::processBuffer() {
const char* s2 = 0;
int l1, l2;
- bool findFragmentLength = (mp.opt.fld == 0) && (mp.tlencount < 10000);
+ bool findFragmentLength;
+ if (mp.opt.long_read) {
+ findFragmentLength = (mp.tlencount < 1000000);
+ } else {
+ findFragmentLength = (mp.opt.fld == 0) && (mp.tlencount < 10000);
+ }
if (mp.opt.batch_mode) {
- findFragmentLength = (mp.opt.fld == 0) && (mp.tlencounts[id] < 10000);
+ if (mp.opt.long_read) {
+ findFragmentLength = (mp.tlencount < 1000000);
+ } else {
+ findFragmentLength = (mp.opt.fld == 0) && (mp.tlencount < 10000);
+ }
}
int flengoal = 0;
flens.clear();
+ unmapped_list.clear();
+ flens_lr.clear();
+ flens_lr_c.clear();
if (findFragmentLength) {
- flengoal = (10000 - mp.tlencount);
+ if (mp.opt.long_read) {
+ flengoal = (1000000 - mp.tlencount);
+ } else {
+ flengoal = (10000 - mp.tlencount);
+ }
if (flengoal <= 0) {
findFragmentLength = false;
flengoal = 0;
} else {
- flens.resize(tc.flens.size(), 0);
+ if (mp.opt.long_read) {
+ flens_lr.resize(tc.flens_lr.size(), 0);
+ flens_lr_c.resize(tc.flens_lr_c.size(), 0);
+ } else {
+ flens.resize(tc.flens.size(), 0);
+ }
}
}
@@ -947,13 +1047,27 @@ void ReadProcessor::processBuffer() {
u = Roaring();
// process read
- index.match(s1, l1, v1, !paired);
+
+ bool novel = false;
+ double unmapped_r=0;
+ if (mp.opt.long_read) {
+ unmapped_r = index.match_long(s1, l1, v1, !paired);
+ std::cerr << unmapped_r << "," << std::endl;
+ novel = (unmapped_r > mp.opt.threshold*l1);
+ } else {
+ index.match(s1, l1, v1, !paired);
+ }
if (paired) {
index.match(s2, l2, v2, !paired);
}
// collect the target information
- int r = tc.intersectKmers(v1, v2, !paired, u);
+ int r = 0;
+ if (mp.opt.long_read) {
+ r = tc.modeKmers(v1, v2, !paired, u);
+ } else {
+ r = tc.intersectKmers(v1, v2, !paired, u);
+ }
// Mask out off-listed kmers
u &= index.onlist_sequences;
@@ -963,128 +1077,172 @@ void ReadProcessor::processBuffer() {
exit(1);
//searchFusion(index,mp.opt,tc,mp,ec,names[i-1].first,s1,v1,names[i].first,s2,v2,paired);
}
+ novel = true;
}
- /* -- possibly modify the pseudoalignment -- */
-
- // If we have paired end reads where one end maps or single end reads, check if some transcsripts
- // are not compatible with the mean fragment length
- if (!mp.opt.single_overhang && !u.isEmpty() && (!paired || v1.empty() || v2.empty()) && tc.has_mean_fl) {
- vtmp = Roaring();
- // inspect the positions
- int fl = (int) tc.get_mean_frag_len();
- int p = -1;
- const_UnitigMap<Node> um;
- Kmer km;
-
- if (!v1.empty()) {
- auto res = findFirstMappingKmer(v1);
- um = res.first;
- p = res.second;
- km = um.getMappedHead();
- }
- if (!v2.empty()) {
- auto res = findFirstMappingKmer(v2);
- um = res.first;
- p = res.second;
- km = um.getMappedHead();
- }
-
- // for each transcript in the pseudoalignment
- for (auto tr : u) {
-
- auto x = index.findPosition(tr, km, um, p);
- // if the fragment is within bounds for this transcript, keep it
- if (x.second && x.first + fl <= (int)index.target_lens_[tr]) {
- vtmp.add(tr);
- } else {
- //pass
- }
- if (!x.second && x.first - fl >= 0) {
- vtmp.add(tr);
- } else {
- //pass
- }
- }
-
- if (vtmp.cardinality() < u.cardinality()) {
- u = vtmp; // copy
- }
+ if (mp.opt.long_read && (r == -1 || u.isEmpty()) && mp.opt.unmapped) {
+ std::stringstream ss;
+ std::string s(s1);
+ s = "@unmapped\n"+s;
+ ss << s;
+ mp.outputNovel(ss);
}
- if (mp.opt.strand_specific && !u.isEmpty()) {
- doStrandSpecificity(u, mp.opt.strand, v1, v2);
- }
-
- // find the ec
- if (!u.isEmpty()) {
- std::lock_guard<std::mutex> lock(mp.transfer_locks[local_id]);
-
- // count the pseudoalignment
- auto elem = index.ecmapinv.find(u);
- if (elem == index.ecmapinv.end()) {
- // something we haven't seen before
- newEcs.push_back(u);
- } else if (elem->second >= counts.size()) { // handle race condition (if counts isn't updated yet)
- newEcs.push_back(u);
+ /* -- possibly modify the pseudoalignment -- */
+ if (!novel || !mp.opt.long_read) {
+ // If we have paired end reads where one end maps or single end reads, check if some transcsripts
+ // are not compatible with the mean fragment length
+ if (!mp.opt.single_overhang && !u.isEmpty() && (!paired || v1.empty() || v2.empty()) && tc.has_mean_fl) {
+ vtmp = Roaring();
+ // inspect the positions
+ int fl = (int) tc.get_mean_frag_len();
+ int p = -1;
+ const_UnitigMap<Node> um;
+ Kmer km;
+
+ if (!v1.empty()) {
+ auto res = findFirstMappingKmer(v1);
+ um = res.first;
+ p = res.second;
+ km = um.getMappedHead();
+ }
+ if (!v2.empty()) {
+ auto res = findFirstMappingKmer(v2);
+ um = res.first;
+ p = res.second;
+ km = um.getMappedHead();
+ }
+
+ // for each transcript in the pseudoalignment
+ for (auto tr : u) {
+
+ auto x = index.findPosition(tr, km, um, p);
+ // if the fragment is within bounds for this transcript, keep it
+ if (x.second && x.first + fl <= (int)index.target_lens_[tr]) {
+ vtmp.add(tr);
+ } else {
+ //pass
+ }
+ if (!x.second && x.first - fl >= 0) {
+ vtmp.add(tr);
+ } else {
+ //pass
+ }
+ }
+
+ if (vtmp.cardinality() < u.cardinality()) {
+ u = vtmp; // copy
+ }
+ }
+
+ if (mp.opt.strand_specific && !u.isEmpty()) {
+ bool comprehensive = false;
+ if (mp.opt.do_union || mp.opt.no_jump) comprehensive = true;
+ // Begin Shading
+ if (index.use_shade) comprehensive = true;
+ // End Shading
+ doStrandSpecificity(u, mp.opt.strand, v1, v2, comprehensive);
+ }
+
+ // find the ec
+ if (!u.isEmpty()) {
+ std::lock_guard<std::mutex> lock(mp.transfer_locks[local_id]);
+
+ // count the pseudoalignment
+ auto elem = index.ecmapinv.find(u);
+ if (elem == index.ecmapinv.end()) {
+ // something we haven't seen before
+ newEcs.push_back(u);
+ } else if (elem->second >= counts.size()) { // handle race condition (if counts isn't updated yet)
+ newEcs.push_back(u);
+ } else {
+ // add to count vector
+ ++counts[elem->second];
+ }
+
+ /* -- collect extra information -- */
+ // collect bias info
+ if (findBias && !u.isEmpty() && biasgoal > 0) {
+ // collect sequence specific bias info
+
+ if (tc.countBias(s1, (paired) ? s2 : nullptr, v1, v2, paired, bias5)) {
+ biasgoal--;
+ }
+ }
+
+ // collect fragment length info
+ if (!mp.opt.long_read && findFragmentLength && flengoal > 0 && paired && u.cardinality() == 1 && !v1.empty() && !v2.empty()) {
+ // try to map the reads
+ int tl = index.mapPair(s1, l1, s2, l2);
+ if (0 < tl && tl < flens.size()) {
+ flens[tl]++;
+ flengoal--;
+ }
+ }
+
+ if (mp.opt.long_read) {
+ if (findFragmentLength && flengoal > 0 && u.cardinality() == 1 && !v1.empty()) {
+ for ( auto tr : u) {
+ flens_lr[tr] += l1;
+ flens_lr_c[tr]++;
+ flengoal--;
+ }
+ }
+ if (findFragmentLength && mp.opt.unmapped)
+ {
+ unmapped_list.push_back(unmapped_r);
+ }
+ }
+ }
+ // pseudobam
+ if (mp.opt.pseudobam) {
+ PseudoAlignmentInfo info;
+ info.id = (paired) ? (i/2) : i; // read id
+ info.paired = paired;
+ if (!u.isEmpty()) {
+ info.r1empty = v1.empty();
+ info.r2empty = v2.empty();
+ const_UnitigMap<Node> um;
+ info.k1pos = -1;
+ info.k2pos = -1;
+ if (!info.r1empty) {
+ auto res = findFirstMappingKmer(v1);
+ info.k1pos = res.second;
+ }
+ if (!info.r2empty) {
+ auto res = findFirstMappingKmer(v2);
+ info.k1pos = res.second;
+ }
+
+ info.ec = u;
+ }
+ pseudobatch.aln.push_back(std::move(info));
+ }
+
+ } else { // now address reads that are considered novel and need to be written to novel fastq and processed later.
+ std::stringstream ss;
+ if (u.isEmpty()) {
+ std::string s(s1);
+ s = "@novel_disjointIntersect\n"+s;
+ ss << s;
+ mp.outputNovel(ss);
} else {
- // add to count vector
- ++counts[elem->second];
- }
-
- /* -- collect extra information -- */
- // collect bias info
- if (findBias && !u.isEmpty() && biasgoal > 0) {
- // collect sequence specific bias info
-
- if (tc.countBias(s1, (paired) ? s2 : nullptr, v1, v2, paired, bias5)) {
- biasgoal--;
- }
- }
-
- // collect fragment length info
- if (findFragmentLength && flengoal > 0 && paired && u.cardinality() == 1 && !v1.empty() && !v2.empty()) {
- // try to map the reads
- int tl = index.mapPair(s1, l1, s2, l2);
- if (0 < tl && tl < flens.size()) {
- flens[tl]++;
- flengoal--;
- }
+ std::string s(s1);
+ s = "@novel_tooManyEmptyKmers\n"+s;
+ ss << s;
+ mp.outputNovel(ss);
}
}
-
- // pseudobam
-
- if (mp.opt.pseudobam) {
- PseudoAlignmentInfo info;
- info.id = (paired) ? (i/2) : i; // read id
- info.paired = paired;
- if (!u.isEmpty()) {
- info.r1empty = v1.empty();
- info.r2empty = v2.empty();
- const_UnitigMap<Node> um;
- info.k1pos = -1;
- info.k2pos = -1;
- if (!info.r1empty) {
- auto res = findFirstMappingKmer(v1);
- info.k1pos = res.second;
- }
- if (!info.r2empty) {
- auto res = findFirstMappingKmer(v2);
- info.k1pos = res.second;
- }
-
- info.ec = u;
- }
- pseudobatch.aln.push_back(std::move(info));
- }
- }
+ }
}
void ReadProcessor::clear() {
numreads=0;
memset(buffer,0,bufsize);
newEcs.clear();
+ flens_lr.clear();
+ flens_lr_c.clear();
+ unmapped_list.clear();
counts.clear();
counts.resize(tc.counts.size(),0);
ec_umi.clear();
@@ -1100,6 +1258,7 @@ BUSProcessor::BUSProcessor(/*const*/ KmerIndex& index, const ProgramOptions& opt
buffer = new char[bufsize];
seqs.reserve(bufsize/50);
newEcs.reserve(1000);
+ if (opt.unmapped) unmapped_list.reserve(seqs.size());
bv.reserve(1000);
counts.reserve((int) (tc.counts.size() * 1.25));
memset(&bc_len[0],0,sizeof(bc_len));
@@ -1124,7 +1283,10 @@ BUSProcessor::BUSProcessor(BUSProcessor && o) :
quals(std::move(o.quals)),
flags(std::move(o.flags)),
newEcs(std::move(o.newEcs)),
+ unmapped_list(std::move(o.unmapped_list)),
flens(std::move(o.flens)),
+ flens_lr(std::move(o.flens_lr)),
+ flens_lr_c(std::move(o.flens_lr_c)),
bias5(std::move(o.bias5)),
bv(std::move(o.bv)),
batchSR(std::move(o.batchSR)),
@@ -1207,7 +1369,7 @@ void BUSProcessor::operator()() {
// update the results, MP acquires the lock
std::vector<std::pair<Roaring, std::string>> ec_umi;
std::vector<std::pair<Roaring, std::string>> new_ec_umi;
- mp.update(counts, newEcs, ec_umi, new_ec_umi, seqs.size() / mp.opt.busOptions.nfiles , flens, bias5, pseudobatch, bv, std::move(newB), &bc_len[0], &umi_len[0], id, local_id);
+ mp.update(counts, newEcs, ec_umi, new_ec_umi, seqs.size() / mp.opt.busOptions.nfiles , flens, unmapped_list, flens_lr, flens_lr_c, bias5, pseudobatch, bv, std::move(newB), &bc_len[0], &umi_len[0], id, local_id);
clear();
if (mp.opt.max_num_reads != 0 && mp.numreads >= mp.opt.max_num_reads) {
return;
@@ -1236,18 +1398,40 @@ void BUSProcessor::processBuffer() {
tcount = mp.tlencounts[id];
}
bool findFragmentLength = tcount < 10000 && busopt.paired;
+ if (busopt.long_read) {
+ findFragmentLength = tcount < 1000000;
+ }
+ if (mp.opt.batch_mode) {
+ if (busopt.long_read) {
+ findFragmentLength = (mp.tlencount < 1000000);
+ }
+ }
+
int flengoal = 0;
flens.clear();
+ //unmapped_list.clear();
+ flens_lr.clear();
+ flens_lr_c.clear();
if (findFragmentLength) {
- flengoal = (10000 - tcount);
+ if (busopt.long_read) {
+ flengoal = (1000000 - tcount);
+ } else {
+ flengoal = (10000 - tcount);
+ }
if (flengoal <= 0) {
findFragmentLength = false;
flengoal = 0;
} else {
- flens.resize(tc.flens.size(), 0);
+ if (busopt.long_read) {
+ flens_lr.resize(tc.flens_lr.size(), 0);
+ flens_lr_c.resize(tc.flens_lr_c.size(), 0);
+ } else {
+ flens.resize(tc.flens.size(), 0);
+ }
}
}
+
char buffer[100];
memset(buffer, 0, 100);
char *umi = &(buffer[50]);
@@ -1448,7 +1632,16 @@ void BUSProcessor::processBuffer() {
bool match_partial = !busopt.paired && !index.dfk_onlist && !busopt.aa;
- index.match(seq, seqlen, v, match_partial, busopt.aa);
+ bool novel = false;
+ double unmapped_r = 0;
+ if (mp.opt.long_read) {
+ unmapped_r = index.match_long(seq, seqlen, v, match_partial, busopt.aa);
+ //std::cerr << unmapped_r << "," << std::endl;
+ unmapped_list.push_back(unmapped_r);
+ novel = (unmapped_r > mp.opt.threshold*seqlen);
+ } else {
+ index.match(seq, seqlen, v, match_partial, busopt.aa);
+ }
// process 2nd read
if (busopt.paired) {
@@ -1457,6 +1650,7 @@ void BUSProcessor::processBuffer() {
}
// process frames for commafree (to do: extend to paired-end reads)
+ int r = 0;
if (busopt.aa) {
// initiate equivalence classes
std::vector<std::pair<const_UnitigMap<Node>, int>> v3, v4, v5, v6, v7;
@@ -1497,73 +1691,120 @@ void BUSProcessor::processBuffer() {
// intersect set of equivalence classes for each frame
// NOTE: intersectKmers is called again further up. to-do: Do I need to modify that too?
- int r = tc.intersectKmersCFC(v, v3, v4, v5, v6, v7, u);
+ r = tc.intersectKmersCFC(v, v3, v4, v5, v6, v7, u);
}
else {
// collect the target information
- int r = tc.intersectKmers(v, v2, !busopt.paired, u);
- }
- if (!u.isEmpty()) {
- if (index.dfk_onlist) { // In case we want to not intersect D-list targets
- auto usize = u.cardinality();
- u &= index.onlist_sequences;
- if (u.cardinality() != usize && !(usize > 0 && u.cardinality() == 0)) {
- // Add if a D-list elem exists but not if ALL elems are D-listed
- u.add(index.onlist_sequences.cardinality());
- }
- } else { // Normal/standard workflow:
- // Mask out off-listed kmers
- u &= index.onlist_sequences;
+ if (mp.opt.long_read) {
+ r = tc.modeKmers(v, v2, !busopt.paired, u);
+ } else {
+ r = tc.intersectKmers(v, v2, !busopt.paired, u);
}
}
- if (doStrandSpecificityIfPossible && mp.opt.strand_specific && !u.isEmpty()) { // Strand-specificity
- doStrandSpecificity(u, mp.opt.strand, v, v2);
+ if (mp.opt.long_read && (r == -1 || u.isEmpty()) && mp.opt.unmapped) {
+ std::stringstream ss;
+ std::string s(seq);
+ s = "@unmapped\n"+s;
+ ss << s;
+ mp.outputNovel(ss);
}
- // find the ec
- if (!u.isEmpty()) {
- BUSData b;
- uint32_t f = 0;
- b.flags = 0;
- b.barcode = stringToBinary(bc, blen, f);
- b.flags |= f;
- b.UMI = check_tag_sequence || bulk_like ? umi_binary : stringToBinary(umi, ulen, f);
- b.flags |= (f) << 8;
- b.count = 1;
- if (num) {
- b.flags = (uint32_t) flags[i / jmax];
- }
-
- if (busopt.paired && getFragLenIfPaired) {
- if (findFragmentLength && flengoal > 0 && u.cardinality() == 1 && !v.empty() && !v2.empty()) {
- // try to map the reads
- int tl = index.mapPair(seq, seqlen, seq2, seqlen2);
- if (0 < tl && tl < flens.size()) {
- flens[tl]++;
- flengoal--;
+ if (!novel || !mp.opt.long_read) {
+ if (!u.isEmpty()) {
+ if (index.dfk_onlist) { // In case we want to not intersect D-list targets
+ auto usize = u.cardinality();
+ u &= index.onlist_sequences;
+ if (u.cardinality() != usize && !(usize > 0 && u.cardinality() == 0)) {
+ // Add if a D-list elem exists but not if ALL elems are D-listed
+ u.add(index.onlist_sequences.cardinality());
}
+ } else { // Normal/standard workflow:
+ // Mask out off-listed kmers
+ u &= index.onlist_sequences;
}
}
-
- // count the pseudoalignment
- std::lock_guard<std::mutex> lock(mp.transfer_locks[local_id]);
- auto elem = index.ecmapinv.find(u);
- if (elem == index.ecmapinv.end()) {
- // something we haven't seen before
- //newEcs.push_back(u); // don't store newEcs (it's redundant)
- newB.push_back({b, u});
- } /*else if (elem->second >= counts.size()) {
- newEcs.push_back(u);
+
+ if (doStrandSpecificityIfPossible && mp.opt.strand_specific && !u.isEmpty()) { // Strand-specificity
+ bool comprehensive = false;
+ if (mp.opt.do_union || mp.opt.no_jump) comprehensive = true;
+ // Begin Shading
+ if (index.use_shade) comprehensive = true;
+ // End Shading
+ doStrandSpecificity(u, mp.opt.strand, v, v2, comprehensive);
+ }
+
+ // find the ec
+ if (!u.isEmpty()) {
+ BUSData b;
+ uint32_t f = 0;
+ b.flags = 0;
+ b.barcode = stringToBinary(bc, blen, f);
+ b.flags |= f;
+ b.UMI = check_tag_sequence || bulk_like ? umi_binary : stringToBinary(umi, ulen, f);
+ b.flags |= (f) << 8;
+ b.count = 1;
+ if (num) {
+ b.flags = (uint32_t) flags[i / jmax];
+ }
+
+ if (busopt.paired && getFragLenIfPaired && !mp.opt.long_read) {
+ if (findFragmentLength && flengoal > 0 && u.cardinality() == 1 && !v.empty() && !v2.empty()) {
+ // try to map the reads
+ int tl = index.mapPair(seq, seqlen, seq2, seqlen2);
+ if (0 < tl && tl < flens.size()) {
+ flens[tl]++;
+ flengoal--;
+ }
+ }
+ }
+
+ if (mp.opt.long_read) {
+ if (findFragmentLength && flengoal > 0 && u.cardinality() == 1 && !v.empty()) {
+ for ( auto tr : u) {
+ flens_lr[tr] += seqlen;
+ flens_lr_c[tr]++;
+ flengoal--;
+ }
+ }
+ if(findFragmentLength && mp.opt.unmapped) {
+ unmapped_list.push_back(unmapped_r);
+ }
+ }
+
+ // count the pseudoalignment
+ std::lock_guard<std::mutex> lock(mp.transfer_locks[local_id]);
+ auto elem = index.ecmapinv.find(u);
+ if (elem == index.ecmapinv.end()) {
+ // something we haven't seen before
+ //newEcs.push_back(u); // don't store newEcs (it's redundant)
newB.push_back({b, u});
- } */ else {
- // add to count vector
- //++counts[elem->second]; // don't store counts; we have BUS records that we can count up
- // push back BUS record
- b.ec = elem->second;
- bv.push_back(b);
+ } /*else if (elem->second >= counts.size()) {
+ newEcs.push_back(u);
+ newB.push_back({b, u});
+ } */ else {
+ // add to count vector
+ //++counts[elem->second]; // don't store counts; we have BUS records that we can count up
+ // push back BUS record
+ b.ec = elem->second;
+ bv.push_back(b);
+ }
}
- }
+ } else {
+ // now address reads that are considered novel and need to be written to novel fastq and processed later.
+ std::stringstream ss;
+ if (u.isEmpty()) {
+ std::string s(seq);
+ s = "@novel_disjointIntersect\n"+s;
+ ss << s;
+ mp.outputNovel(ss);
+ } else {
+ std::string s(seq);
+ s = "@novel_tooManyEmptyKmers\n"+s;
+ ss << s;
+ mp.outputNovel(ss);
+ }
+ }
if (mp.opt.pseudobam) {
PseudoAlignmentInfo info;
@@ -1594,6 +1835,7 @@ void BUSProcessor::clear() {
numreads=0;
memset(buffer,0,bufsize);
newEcs.clear();
+ unmapped_list.clear();
counts.clear();
//counts.resize(tc.counts.size(), 0);
bv.clear();
=====================================
src/ProcessReads.h
=====================================
@@ -196,9 +196,18 @@ public:
transfer_locks.swap(mutexes);
if (opt.batch_mode) { // Set up recording of lengths individually for each batch
- tlencounts.assign(opt.batch_ids.size(), 0);
- batchFlens.assign(opt.batch_ids.size(), std::vector<uint32_t>(1000,0));
- }
+ if (opt.long_read) {
+ tlencounts.assign(opt.batch_ids.size(), 0);
+ //std::cerr << "Is this assignment of batchFlens space causing abort?" << std::endl; std::cerr.flush();
+ batchFlens_lr.assign(opt.batch_ids.size(), std::vector<uint32_t>(index.target_lens_.size(),0));
+ batchFlens_lr_c.assign(opt.batch_ids.size(), std::vector<uint32_t>(index.target_lens_.size(),0));
+ //std::cerr << "Passed assignment of batchFlens" << std::endl; std::cerr.flush();
+ batchUnmapped_list.assign(opt.batch_ids.size(), std::vector<double>(3000000/opt.batch_ids.size(),0));
+ } else {
+ tlencounts.assign(opt.batch_ids.size(), 0);
+ batchFlens.assign(opt.batch_ids.size(), std::vector<uint32_t>(1000,0));
+ }
+ }
if (opt.batch_ids.size() > 0) {
std::unordered_map<std::string,int> batch_map;
batch_id_mapping.resize(opt.batch_ids.size());
@@ -217,6 +226,9 @@ public:
ofusion.open(opt.output + "/fusion.txt");
ofusion << "TYPE\tNAME1\tSEQ1\tKPOS1\tNAME2\tSEQ2\tKPOS2\tINFO\tPOS1\tPOS2\n";
}
+ if (opt.long_read) {
+ ofusion.open(opt.output + "/novel.fastq");
+ }
if (opt.pseudobam) {
pseudobatchf_out.open(opt.output + "/pseudoaln.bin", std::ios::out | std::ios::binary);
}
@@ -293,7 +305,10 @@ public:
std::atomic<int> tlencount;
std::vector<int> tlencounts;
std::atomic<int> biasCount;
+ std::vector<std::vector<double>> batchUnmapped_list;
std::vector<std::vector<uint32_t>> batchFlens;
+ std::vector<std::vector<uint32_t>> batchFlens_lr;
+ std::vector<std::vector<uint32_t>> batchFlens_lr_c;
std::vector<std::vector<int32_t>> tmp_bc;
const int maxBiasCount;
u_map_<Roaring, uint32_t, RoaringHasher> newECcount;
@@ -302,12 +317,14 @@ public:
std::vector<int> batch_id_mapping; // minimal perfect mapping of batch ID
std::ofstream ofusion;
+ std::ofstream onovel;
std::ofstream pseudobatchf_out;
std::ofstream busf_out;
std::ifstream pseudobatchf_in;
std::vector<PseudoAlignmentBatch> pseudobatch_stragglers;
int last_pseudobatch_id;
void outputFusion(const std::stringstream &o);
+ void outputNovel(const std::stringstream &o);
void processReads();
#ifndef NO_HTSLIB
htsFile *bamfp;
@@ -318,7 +335,7 @@ public:
void writeSortedPseudobam(const std::vector<std::vector<bam1_t>> &bvv);
#endif
std::vector<uint64_t> breakpoints;
- void update(const std::vector<uint32_t>& c, const std::vector<Roaring>& newEcs, std::vector<std::pair<Roaring, std::string>>& ec_umi, std::vector<std::pair<Roaring, std::string>> &new_ec_umi, int n, std::vector<int>& flens, std::vector<int> &bias, const PseudoAlignmentBatch& pseudobatch, std::vector<BUSData> &bv, std::vector<std::pair<BUSData, Roaring>> newB, int *bc_len, int *umi_len, int id = -1, int local_id = -1);
+ void update(const std::vector<uint32_t>& c, const std::vector<Roaring>& newEcs, std::vector<std::pair<Roaring, std::string>>& ec_umi, std::vector<std::pair<Roaring, std::string>> &new_ec_umi, int n, std::vector<int>& flens, std::vector<double>& unmapped_list, std::vector<int>& flens_lr, std::vector<int>& flens_lr_c, std::vector<int> &bias, const PseudoAlignmentBatch& pseudobatch, std::vector<BUSData> &bv, std::vector<std::pair<BUSData, Roaring>> newB, int *bc_len, int *umi_len, int id = -1, int local_id = -1);
};
class ReadProcessor {
@@ -347,7 +364,10 @@ public:
std::vector<uint32_t> flags;
std::vector<std::string> umis;
std::vector<Roaring> newEcs;
+ std::vector<double> unmapped_list;
std::vector<int> flens;
+ std::vector<int> flens_lr;
+ std::vector<int> flens_lr_c;
std::vector<int> bias5;
std::vector<uint32_t> counts;
@@ -387,7 +407,10 @@ public:
std::vector<uint32_t> flags;
std::vector<Roaring> newEcs;
+ std::vector<double> unmapped_list;
std::vector<int> flens;
+ std::vector<int> flens_lr;
+ std::vector<int> flens_lr_c;
std::vector<int> bias5;
std::vector<uint32_t> counts;
std::vector<BUSData> bv;
=====================================
src/common.h
=====================================
@@ -1,7 +1,7 @@
#ifndef KALLISTO_COMMON_H
#define KALLISTO_COMMON_H
-#define KALLISTO_VERSION "0.50.1"
+#define KALLISTO_VERSION "0.51.1"
// NOTE: MAKE SURE THIS FILE GETS INCLUDED FIRST IN ALL OTHER FILES AND BEFORE ANY EXTERNAL LIBRARIES
@@ -51,6 +51,10 @@ struct BUSOptions {
std::vector<BUSOptionSubstr> seq;
bool paired;
+ bool long_read;
+ bool unmapped;
+ double error_rate;
+ double threshold;
bool aa;
int getBCLength() const {
@@ -99,6 +103,8 @@ struct ProgramOptions {
std::string output;
int skip;
size_t seed;
+ double error_rate;
+ double threshold;
double fld;
double sd;
int min_range;
@@ -119,6 +125,8 @@ struct ProgramOptions {
bool plaintext;
bool write_index;
bool single_end;
+ bool long_read;
+ bool unmapped;
bool strand_specific;
bool peek; // only used for H5Dump
bool bias;
@@ -127,6 +135,8 @@ struct ProgramOptions {
bool make_unique;
bool fusion;
bool dfk_onlist;
+ bool do_union;
+ bool no_jump;
enum class StrandType {None, FR, RF};
StrandType strand;
std::string gfa; // used for inspect
@@ -140,6 +150,7 @@ struct ProgramOptions {
std::string chromFile;
std::string bedFile;
std::string technology;
+ std::string platform;
std::string tagsequence;
std::string tccFile;
std::string ecFile;
@@ -158,6 +169,8 @@ ProgramOptions() :
iterations(500),
skip(1),
seed(42),
+ error_rate(0.0),
+ threshold(0.8),
fld(0.0),
sd(0.0),
min_range(1),
@@ -169,11 +182,13 @@ ProgramOptions() :
matrix_to_directories(false),
batch_mode(false),
bus_mode(false),
+ unmapped(false),
bam(false),
num(false),
plaintext(false),
write_index(false),
single_end(false),
+ long_read(false),
strand_specific(false),
peek(false),
bias(false),
@@ -187,7 +202,9 @@ ProgramOptions() :
single_overhang(false),
aa(false),
distinguish(false),
- d_list_overhang(1)
+ d_list_overhang(1),
+ do_union(false),
+ no_jump(false)
{}
};
=====================================
src/main.cpp
=====================================
@@ -213,6 +213,7 @@ void ParseOptionsEM(int argc, char **argv, ProgramOptions& opt) {
int plaintext_flag = 0;
int write_index_flag = 0;
int single_flag = 0;
+ int long_read_flag = 0;
int single_overhang_flag = 0;
int strand_FR_flag = 0;
int strand_RF_flag = 0;
@@ -220,14 +221,17 @@ void ParseOptionsEM(int argc, char **argv, ProgramOptions& opt) {
int pbam_flag = 0;
int gbam_flag = 0;
int fusion_flag = 0;
+ int do_union_flag = 0;
+ int no_jump_flag = 0;
- const char *opt_string = "t:i:l:s:o:n:m:d:b:g:c:p:";
+ const char *opt_string = "t:i:l:P:s:o:n:m:d:b:g:c:p:";
static struct option long_options[] = {
// long args
{"verbose", no_argument, &verbose_flag, 1},
{"plaintext", no_argument, &plaintext_flag, 1},
{"write-index", no_argument, &write_index_flag, 1},
{"single", no_argument, &single_flag, 1},
+ {"long", no_argument, &long_read_flag, 1},
{"single-overhang", no_argument, &single_overhang_flag, 1},
{"fr-stranded", no_argument, &strand_FR_flag, 1},
{"rf-stranded", no_argument, &strand_RF_flag, 1},
@@ -240,6 +244,7 @@ void ParseOptionsEM(int argc, char **argv, ProgramOptions& opt) {
{"threads", required_argument, 0, 't'},
{"index", required_argument, 0, 'i'},
{"fragment-length", required_argument, 0, 'l'},
+ {"platform", required_argument, 0, 'P'},
{"sd", required_argument, 0, 's'},
{"output-dir", required_argument, 0, 'o'},
{"iterations", required_argument, 0, 'n'},
@@ -248,6 +253,8 @@ void ParseOptionsEM(int argc, char **argv, ProgramOptions& opt) {
{"gtf", required_argument, 0, 'g'},
{"chromosomes", required_argument, 0, 'c'},
{"priors", required_argument, 0, 'p'},
+ {"union", no_argument, &do_union_flag, 1},
+ {"no-jump", no_argument, &no_jump_flag, 1},
{0,0,0,0}
};
int c;
@@ -274,6 +281,10 @@ void ParseOptionsEM(int argc, char **argv, ProgramOptions& opt) {
stringstream(optarg) >> opt.fld;
break;
}
+ case 'P': {
+ stringstream(optarg) >> opt.platform;
+ break;
+ }
case 's': {
stringstream(optarg) >> opt.sd;
break;
@@ -335,6 +346,11 @@ void ParseOptionsEM(int argc, char **argv, ProgramOptions& opt) {
opt.single_end = true;
}
+ if (long_read_flag) {
+ opt.long_read = true;
+ opt.single_end = true;
+ }
+
if (single_overhang_flag) {
opt.single_overhang = true;
}
@@ -365,13 +381,22 @@ void ParseOptionsEM(int argc, char **argv, ProgramOptions& opt) {
if (fusion_flag) {
opt.fusion = true;
}
+
+ if (do_union_flag) {
+ opt.do_union = true;
+ }
+
+ if (no_jump_flag) {
+ opt.no_jump = true;
+ }
}
void ParseOptionsTCCQuant(int argc, char **argv, ProgramOptions& opt) {
- const char *opt_string = "o:i:T:e:f:l:s:t:g:G:b:d:p:";
+ const char *opt_string = "o:i:T:e:f:P:l:s:t:g:G:b:d:p:";
int matrix_to_files = 0;
int matrix_to_directories = 0;
int plaintext_flag = 0;
+ int long_read_flag = 0;
static struct option long_options[] = {
{"plaintext", no_argument, &plaintext_flag, 1},
{"matrix-to-files", no_argument, &matrix_to_files, 1},
@@ -380,6 +405,8 @@ void ParseOptionsTCCQuant(int argc, char **argv, ProgramOptions& opt) {
{"txnames", required_argument, 0, 'T'},
{"threads", required_argument, 0, 't'},
{"fragment-file", required_argument, 0, 'f'},
+ {"long", no_argument, &long_read_flag, 1},
+ {"platform", required_argument, 0, 'P'},
{"fragment-length", required_argument, 0, 'l'},
{"sd", required_argument, 0, 's'},
{"output-dir", required_argument, 0, 'o'},
@@ -411,6 +438,11 @@ void ParseOptionsTCCQuant(int argc, char **argv, ProgramOptions& opt) {
stringstream(optarg) >> opt.fldFile;
break;
}
+ case 'P': {
+ stringstream(optarg) >> opt.platform;
+ std::transform(opt.platform.begin(), opt.platform.end(),opt.platform.begin(), ::toupper);
+ break;
+ }
case 'l': {
stringstream(optarg) >> opt.fld;
break;
@@ -458,6 +490,9 @@ void ParseOptionsTCCQuant(int argc, char **argv, ProgramOptions& opt) {
default: break;
}
}
+ if (long_read_flag) {
+ opt.long_read = true;
+ }
if (matrix_to_files) {
opt.matrix_to_files = true;
}
@@ -507,6 +542,8 @@ void ParseOptionsBus(int argc, char **argv, ProgramOptions& opt) {
int verbose_flag = 0;
int gbam_flag = 0;
int paired_end_flag = 0;
+ int long_read_flag = 0;
+ int unmapped_flag = 0;
int aa_flag = 0;
int strand_FR_flag = 0;
int strand_RF_flag = 0;
@@ -514,8 +551,10 @@ void ParseOptionsBus(int argc, char **argv, ProgramOptions& opt) {
int interleaved_flag = 0;
int batch_barcodes_flag = 0;
int dfk_onlist_flag = 0;
+ int do_union_flag = 0;
+ int no_jump_flag = 0;
- const char *opt_string = "i:o:x:t:lbng:c:T:B:N:";
+ const char *opt_string = "i:o:x:t:lbng:c:T:P:r:e:B:N:";
static struct option long_options[] = {
{"verbose", no_argument, &verbose_flag, 1},
{"dfk-onlist", no_argument, &dfk_onlist_flag, 1},
@@ -535,16 +574,24 @@ void ParseOptionsBus(int argc, char **argv, ProgramOptions& opt) {
{"rf-stranded", no_argument, &strand_RF_flag, 1},
{"unstranded", no_argument, &unstranded_flag, 1},
{"paired", no_argument, &paired_end_flag, 1},
+ {"long", no_argument, &long_read_flag, 1},
+ {"platform", required_argument, 0, 'P'},
+ {"threshold", required_argument, 0, 'r'},
+ //{"error-rate", required_argument, 0, 'e'},
+ {"unmapped", no_argument, &unmapped_flag, 1},
{"aa", no_argument, &aa_flag, 1},
{"inleaved", no_argument, &interleaved_flag, 1},
{"numReads", required_argument, 0, 'N'},
{"batch-barcodes", no_argument, &batch_barcodes_flag, 1},
+ {"union", no_argument, &do_union_flag, 1},
+ {"no-jump", no_argument, &no_jump_flag, 1},
{0,0,0,0}
};
int list_flag = 0;
int c;
int option_index = 0;
+
while (true) {
c = getopt_long(argc,argv,opt_string, long_options, &option_index);
@@ -589,6 +636,19 @@ void ParseOptionsBus(int argc, char **argv, ProgramOptions& opt) {
opt.num = true;
break;
}
+ case 'P': {
+ stringstream(optarg) >> opt.platform;
+ std::transform(opt.platform.begin(), opt.platform.end(),opt.platform.begin(), ::toupper);
+ break;
+ }
+ case 'e': {
+ stringstream(optarg) >> opt.error_rate;
+ break;
+ }
+ case 'r': {
+ stringstream(optarg) >> opt.threshold;
+ break;
+ }
case 'N': {
stringstream(optarg) >> opt.max_num_reads;
if (opt.max_num_reads == 0) {
@@ -668,6 +728,14 @@ void ParseOptionsBus(int argc, char **argv, ProgramOptions& opt) {
opt.single_end = true;
}
+ if (long_read_flag) {
+ opt.long_read = true;
+ }
+
+ if (unmapped_flag) {
+ opt.unmapped = true;
+ }
+
if (interleaved_flag) {
opt.input_interleaved_nfiles = 1;
}
@@ -680,6 +748,14 @@ void ParseOptionsBus(int argc, char **argv, ProgramOptions& opt) {
opt.dfk_onlist = true;
}
+ if (do_union_flag) {
+ opt.do_union = true;
+ }
+
+ if (no_jump_flag) {
+ opt.no_jump = true;
+ }
+
opt.single_overhang = true;
// throw warning when --aa is passed with paired-end arg
@@ -845,6 +921,16 @@ void ParseOptionsH5Dump(int argc, char **argv, ProgramOptions& opt) {
}
bool CheckOptionsBus(ProgramOptions& opt) {
+ // Initialize BUS options (to ensure no variables are uninitialized for good measure); BUS options shouldn't be intialized before this
+ opt.busOptions.nfiles = 1;
+ opt.busOptions.keep_fastq_comments = false;
+ opt.busOptions.paired = false;
+ opt.busOptions.long_read = false;
+ opt.busOptions.unmapped = false;
+ opt.busOptions.error_rate = 0.0;
+ opt.busOptions.threshold = 0.8;
+ opt.busOptions.aa = false;
+
bool ret = true;
cerr << endl;
@@ -875,6 +961,26 @@ bool CheckOptionsBus(ProgramOptions& opt) {
ret = false;
}
+ if (opt.long_read && !(0 < opt.threshold && opt.threshold < 1)) {
+ std::cerr << "Threshold not in (0,1). Setting default threshold for unmapped kmers to 0.8" << std::endl;
+ opt.threshold = 0.8;
+ }
+
+ if (opt.do_union && (opt.long_read || opt.aa)) {
+ std::cerr << "--union is not compatible with this mode" << std::endl;
+ ret = false;
+ }
+ if (opt.no_jump && (opt.long_read || opt.aa)) {
+ std::cerr << "--no-jump is not compatible with this mode" << std::endl;
+ ret = false;
+ }
+
+ if (opt.long_read) { //opt.error_rate <= 0) {
+ //hiding for release, not used for this version
+ //std::cerr << "No sequencing error-rate: invalid error-rate; must be greater than zero" << std::endl;
+ //ret = false;
+ }
+
// check files
bool read_stdin = opt.files.size() == 1 && !opt.batch_mode && !opt.bam && opt.files[0] == "-"; // - means read from stdin
if (opt.files.size() == 0 && !opt.batch_mode) {
@@ -1104,6 +1210,11 @@ bool CheckOptionsBus(ProgramOptions& opt) {
busopt.seq.push_back(BUSOptionSubstr(1,0,0));
busopt.paired = true;
}
+ if (opt.long_read) {
+ busopt.long_read = true;
+ busopt.paired = false;
+ busopt.error_rate = opt.error_rate;
+ }
busopt.umi.push_back(BUSOptionSubstr(-1,-1,-1));
}
return ret;
@@ -1164,6 +1275,8 @@ bool CheckOptionsBus(ProgramOptions& opt) {
auto& busopt = opt.busOptions;
busopt.aa = opt.aa;
+ busopt.long_read = opt.long_read;
+ busopt.threshold = opt.threshold;
busopt.paired = false;
busopt.keep_fastq_comments = false;
if (opt.bam) busopt.nfiles = 1; // Note: only 10xV2 has been tested
@@ -1273,7 +1386,9 @@ bool CheckOptionsBus(ProgramOptions& opt) {
if (!opt.single_end) {
busopt.nfiles++;
busopt.seq.push_back(BUSOptionSubstr(3,0,0));
- busopt.paired = true;
+ if (!opt.long_read) {
+ busopt.paired = true;
+ }
}
} else if (opt.technology == "BDWTA") {
busopt.nfiles = 2;
@@ -1299,7 +1414,7 @@ bool CheckOptionsBus(ProgramOptions& opt) {
//bool invalid = ParseTechnology(opt.technology, values, files, errorList, bcValues);
bool valid = ParseTechnology(opt.technology, busopt, errorList);
- if (busopt.seq.size() == 2 && !opt.single_end) {
+ if (busopt.seq.size() == 2 && !opt.single_end && !opt.long_read) {
busopt.paired = true;
}
@@ -1357,7 +1472,7 @@ bool CheckOptionsBus(ProgramOptions& opt) {
}
}
- if (!opt.single_end && !opt.busOptions.paired) {
+ if (!opt.single_end && !opt.busOptions.paired && !opt.long_read) {
cerr << "Error: Paired reads are not compatible with the specified technology" << endl;
ret = false;
}
@@ -1526,12 +1641,12 @@ bool CheckOptionsEM(ProgramOptions& opt, bool emonly = false) {
}
/*
- if (opt.strand_specific && !opt.single_end) {
+ if (opt.strand_specific && !opt.single_end && !opt.long_read) {
cerr << "Error: strand-specific mode requires single-end mode" << endl;
ret = false;
}*/
- if (!opt.single_end) {
+ if (!opt.single_end && !opt.long_read) {
if (opt.files.size() % 2 != 0) {
cerr << "Error: paired-end mode requires an even number of input files" << endl
<< " (use --single for processing single-end reads)" << endl;
@@ -1544,8 +1659,18 @@ bool CheckOptionsEM(ProgramOptions& opt, bool emonly = false) {
cerr << "Error: cannot supply mean/sd without supplying both -l and -s" << endl;
ret = false;
}
+
+
+ if (opt.do_union && (opt.long_read || opt.aa)) {
+ std::cerr << "--union is not compatible with this mode" << std::endl;
+ ret = false;
+ }
+ if (opt.no_jump && (opt.long_read || opt.aa)) {
+ std::cerr << "--no-jump is not compatible with this mode" << std::endl;
+ ret = false;
+ }
- if (opt.single_end && (opt.fld == 0.0 || opt.sd == 0.0)) {
+ if ((opt.single_end && !opt.long_read) && (opt.fld == 0.0 || opt.sd == 0.0)) {
cerr << "Error: fragment length mean and sd must be supplied for single-end reads using -l and -s" << endl;
ret = false;
} else if (opt.fld == 0.0 && ret) {
@@ -1998,6 +2123,10 @@ void usageBus() {
<< " --rf-stranded Strand specific reads for UMI-tagged reads, first read reverse" << endl
<< " --unstranded Treat all read as non-strand-specific" << endl
<< " --paired Treat reads as paired" << endl
+ << " --long Treat reads as long" << endl
+ //<< " --error_rate Estimated error rate of long reads (required for --long)" << endl
+ << " --threshold Threshold for rate of unmapped kmers per read" << endl
+ //<< " --unmapped Computed ratio of unmapped kmers for first 1M reads and output unmapped reads" << endl
<< " --aa Align to index generated from a FASTA-file containing amino acid sequences" << endl
<< " --inleaved Specifies that input is an interleaved FASTQ file" << endl
<< " --batch-barcodes Records both batch and extracted barcode in BUS file" << endl
@@ -2104,6 +2233,8 @@ void usageTCCQuant(bool valid_input = true) {
<< " (default: equivalence classes are taken from the index)" << endl
<< "-f, --fragment-file=FILE File containing fragment length distribution" << endl
<< " (default: effective length normalization is not performed)" << endl
+ << "--long Use version of EM for long reads " << endl
+ << "-P, --platform. [PacBio or ONT] used for sequencing " << endl
<< "-l, --fragment-length=DOUBLE Estimated average fragment length" << endl
<< "-s, --sd=DOUBLE Estimated standard deviation of fragment length" << endl
<< " (note: -l, -s values only should be supplied when" << endl
@@ -2207,7 +2338,8 @@ int main(int argc, char *argv[]) {
usageBus();
return 0;
}
- ParseOptionsBus(argc-1, argv+1,opt);
+
+ ParseOptionsBus(argc-1, argv+1,opt);
int64_t num_processed = 0;
if (!CheckOptionsBus(opt)) {
usageBus();
@@ -2228,10 +2360,32 @@ int main(int argc, char *argv[]) {
opt.bus_mode = true; // bus_mode = !batch_mode (however, if either is true, means we're writing BUS file)
opt.single_end = false;
}
-
+
KmerIndex index(opt);
index.load(opt);
+ if (opt.long_read) {
+ double error_rate_threshold_tmp = ((1.0/opt.error_rate - 2*index.k) * opt.error_rate);
+ //std::cerr << "Suggested threshold for novel reads to " << error_rate_threshold_tmp << std::endl;
+ if (1 > opt.threshold && opt.threshold > 0) {
+ //std::cerr << "Using supplied threshold " << opt.threshold << std::endl;
+ } else if (0 < error_rate_threshold_tmp && error_rate_threshold_tmp < 1) {
+ opt.threshold = error_rate_threshold_tmp;
+ std::cerr << "Using computed threshold " << opt.threshold << std::endl;
+ } else {
+ opt.threshold = .8;
+ std::cerr << "Supplied and computed threshold are invalid, using default value of " << opt.threshold << std::endl;
+ }
+ /***
+ int suggested_k = int((1.0/opt.error_rate)/2.0) - 1;
+ if (suggested_k % 2 == 0) {
+ std::cerr << "Suggested kmer length for error rate is: " << suggested_k-1 << std::endl;
+ } else {
+ std::cerr << "Suggested kmer length for error rate is: " << suggested_k << std::endl;
+ }
+ ***/
+ }
+
bool guessChromosomes = true;
Transcriptome model; // empty
if (opt.genomebam) {
@@ -2263,21 +2417,56 @@ int main(int argc, char *argv[]) {
if (!opt.single_end || opt.technology.empty() || opt.busOptions.paired || opt.busOptions.umi[0].fileno == -1) {
index.write((opt.output + "/index.saved"), false, opt.threads);
}
- // Write out fragment length distributions if reads paired-end:
- if (!opt.single_end) {
+ // Write out fragment length distributions if reads paired-end or long:
+ if (!opt.single_end || opt.long_read) {
+ std::remove((opt.output + "/flens.txt").c_str());
std::ofstream flensout_f((opt.output + "/flens.txt"));
for (size_t id = 0; id < opt.batch_ids.size(); id++) {
- std::vector<uint32_t> fld = MP.batchFlens[id];
- for ( size_t i = 0 ; i < fld.size(); ++i ) {
- if (i != 0) {
- flensout_f << " ";
- }
- flensout_f << fld[i];
- }
- flensout_f << "\n";
- }
- flensout_f.close();
- }
+ if (opt.long_read) {
+ //Should I be using batchFlens?
+ std::vector<uint32_t> fld_lr = MP.batchFlens_lr[id];
+ std::vector<uint32_t> fld_lr_c = MP.batchFlens_lr_c[id];
+ for ( size_t i = 0 ; i < fld_lr.size(); ++i ) {
+ if (i != 0) {
+ flensout_f << " ";
+ }
+ if (fld_lr_c[i] > 0.5) {
+ //Good results with comment below.
+ //flensout_f << std::fabs((double)fld_lr[i] / (double)fld_lr_c[i] - index.k);//index.target_lens_[i] - (double)fld_lr[i] / (double)fld_lr_c[i] - k); // take mean of recorded uniquely aligning read lengths
+ flensout_f << std::fabs(((double)fld_lr[i] / (double)fld_lr_c[i]) - index.k);
+ } else {
+ flensout_f << std::fabs(index.target_lens_[i] - index.k);//index.target_lens_[i]);
+ }
+ }
+ flensout_f << "\n";
+ } else {
+ std::vector<uint32_t> fld = MP.batchFlens[id];
+ for ( size_t i = 0 ; i < fld.size(); ++i ) {
+ if (i != 0) {
+ flensout_f << " ";
+ }
+ flensout_f << fld[i];
+ }
+ flensout_f << "\n";
+ }
+ }
+ flensout_f.close();
+
+ if (opt.unmapped) {
+ std::ofstream um_f((opt.output + "/unmapped_ratio.txt"));
+ for (size_t id = 0; id < opt.batch_ids.size(); id++) {
+ std::vector<double> unmapped_l = MP.tc.unmapped_list;
+ for ( size_t i = 0 ; i < unmapped_l.size(); ++i ) {
+ if (i != 0) {
+ um_f << ",";
+ }
+ um_f << unmapped_l[i];
+ }
+ um_f << ",";
+ }
+ um_f.close();
+ }
+ }
} else {
num_processed = ProcessBUSReads(MP, opt);
for (int i = 0; i <= 32; i++) {
@@ -2318,12 +2507,14 @@ int main(int argc, char *argv[]) {
}
}
std::vector<uint32_t> fld;
- if (opt.busOptions.paired) {
+ std::vector<uint32_t> fld_c;
+ if (opt.busOptions.paired && !opt.long_read) {
fld = collection.flens; // copy
collection.compute_mean_frag_lens_trunc();
// Write out index:
index.write((opt.output + "/index.saved"), false, opt.threads);
// Write out fragment length distribution:
+ std::remove((opt.output + "/flens.txt").c_str());
std::ofstream flensout_f((opt.output + "/flens.txt"));
for ( size_t i = 0 ; i < fld.size(); ++i ) {
if (i != 0) {
@@ -2333,12 +2524,44 @@ int main(int argc, char *argv[]) {
}
flensout_f << "\n";
flensout_f.close();
- } else if (opt.busOptions.umi[0].fileno == -1) {
+ } else if (opt.long_read) {
+ opt.busOptions.threshold = opt.threshold;
+ fld = collection.flens_lr; // copy
+ fld_c = collection.flens_lr_c; //copy
+ // Write out index:
+ index.write((opt.output + "/index.saved"), false, opt.threads);
+ // Write out fragment length distribution:
+ std::remove((opt.output + "/flens.txt").c_str());
+ std::ofstream flensout_f((opt.output + "/flens.txt"));
+ for ( size_t i = 0 ; i < fld.size(); ++i ) {
+ if (i > 0.0001) {
+ flensout_f << " ";
+ }
+ if (fld_c[i] > 0.5) {
+ flensout_f << std::fabs((double)fld[i] / (double)fld_c[i] - index.k);//index.target_lens_[i] - (double)fld[i] / (double)fld_c[i] - k); // take mean of recorded uniquely aligning read lengths
+ } else {
+ flensout_f << std::fabs(index.target_lens_[i] - index.k); //index.target_lens_[i] - 31);
+ }
+ }
+ flensout_f << "\n";
+ flensout_f.close();
+ std::cerr << "Finished fragment length write out line 2487" << std::endl;
+ if (opt.unmapped) {
+ std::ofstream um_f((opt.output + "/unmapped_ratio.txt"));
+ std::vector<double> unmapped_l = collection.unmapped_list;
+ for ( size_t i = 0 ; i < unmapped_l.size(); ++i ) {
+ if (i != 0) {
+ um_f << ",";
+ }
+ um_f << unmapped_l[i];
+ }
+ um_f.close();
+ }
+ } else if (opt.busOptions.umi[0].fileno == -1) {
// Write out index:
index.write((opt.output + "/index.saved"), false, opt.threads);
}
}
-
writeECList(ecfilename, index);
// write transcript names
@@ -2348,6 +2571,7 @@ int main(int argc, char *argv[]) {
}
transout_f.close();
+
for (const auto& elem : index.ecmapinv) {
if (elem.first.cardinality() == 1) {
num_unique += collection.counts[elem.second];
@@ -2366,6 +2590,7 @@ int main(int argc, char *argv[]) {
std::string(std::to_string(num_unique)),
KALLISTO_VERSION,
std::string(std::to_string(index.INDEX_VERSION)),
+ std::string(std::to_string(index.k)),
start_time,
call,
opt.aa ? std::to_string(collection.cardinality_clashes) : "");
@@ -2496,6 +2721,7 @@ int main(int argc, char *argv[]) {
std::string(std::to_string(num_unique)),
KALLISTO_VERSION,
std::string(std::to_string(index.INDEX_VERSION)),
+ std::string(std::to_string(index.k)),
start_time,
call,
opt.aa ? std::to_string(collection.cardinality_clashes) : "");
@@ -2707,7 +2933,7 @@ int main(int argc, char *argv[]) {
size_t num_trans = index.num_trans;
const bool calcEffLen = !opt.fldFile.empty() || opt.fld != 0.0;
- if (calcEffLen && !opt.fldFile.empty()) { // Parse supplied fragment length distribution file
+ if (calcEffLen && !opt.fldFile.empty() && (!opt.long_read || opt.platform == "PACBIO")) { // Parse supplied fragment length distribution file
std::ifstream infld((opt.fldFile));
if (infld.is_open()) {
std::string line;
@@ -2728,14 +2954,14 @@ int main(int argc, char *argv[]) {
}
tmp_vec.push_back(tmp_val_num);
}
- if (tmp_vec.size() != MAX_FRAG_LEN) {
+ if (tmp_vec.size() != MAX_FRAG_LEN && tmp_vec.size() != index.target_lens_.size()) {
std::cerr << "Error: Fragment length distribution file contains a line with "
<< tmp_vec.size() << " values; expected: " << MAX_FRAG_LEN << std::endl;
exit(1);
}
FLDs.push_back(tmp_vec);
}
- if (FLDs.size() != 1 && FLDs.size() != nrow) {
+ if (FLDs.size() != 1 && (FLDs.size() != nrow && FLDs.size() != index.target_lens_.size())) {
std::cerr << "Error: Fragment length distribution file contains "
<< FLDs.size() << " valid lines; expected: " << nrow << std::endl;
exit(1);
@@ -2753,7 +2979,7 @@ int main(int argc, char *argv[]) {
}
const bool gene_level_counting = !opt.genemap.empty() || !opt.gtfFile.empty();
- std::cerr << "[quant] Running EM algorithm..."; std::cerr.flush();
+ std::cerr << "[quant] Running EM algorithm..." << std::endl;
std::vector<double> priors;
if (opt.priors != "") {
@@ -2768,7 +2994,7 @@ int main(int argc, char *argv[]) {
}
std::vector<double> fl_means;
- if (calcEffLen) {
+ if (calcEffLen && (!opt.long_read || !(opt.platform == "ONT"))) {
if (opt.fld != 0.0) {
collection.init_mean_fl_trunc(opt.fld, opt.sd);
} else {
@@ -2790,7 +3016,7 @@ int main(int argc, char *argv[]) {
}
EMAlgorithm em(collection.counts, index, collection, fl_means, opt);
- em.set_priors(priors);
+ if (opt.priors != "") em.set_priors(priors);
em.run(10000, 50, false, false);
if (isMatrixFile) { // Update abundances matrix
@@ -2952,8 +3178,6 @@ int main(int argc, char *argv[]) {
}
}; // end of EM_lambda
- priors.clear();
-
std::vector<std::thread> workers;
int num_ids = nrow;
View it on GitLab: https://salsa.debian.org/med-team/kallisto/-/compare/7d125706ac5f607a89e1b3fa6533f9c93b0a0919...84c31f65218466b3a00c0bff495fc1ed5e563935
--
View it on GitLab: https://salsa.debian.org/med-team/kallisto/-/compare/7d125706ac5f607a89e1b3fa6533f9c93b0a0919...84c31f65218466b3a00c0bff495fc1ed5e563935
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20241013/441c2443/attachment-0001.htm>
More information about the debian-med-commit
mailing list