[med-svn] [salmon] 02/04: Imported Upstream version 0.4.2+ds1
Michael Crusoe
misterc-guest at moszumanska.debian.org
Sat Jan 30 08:16:07 UTC 2016
This is an automated email from the git hooks/post-receive script.
misterc-guest pushed a commit to branch master
in repository salmon.
commit b6b142af4f009ec66dd631e277f40b1e79434477
Author: Michael R. Crusoe <crusoe at ucdavis.edu>
Date: Sat Jan 30 00:07:34 2016 -0800
Imported Upstream version 0.4.2+ds1
---
include/CollapsedIterativeOptimizer.hpp.bak | 2069 ---------------------------
src/SalmonQuantify.cpp.bak | 1314 -----------------
2 files changed, 3383 deletions(-)
diff --git a/include/CollapsedIterativeOptimizer.hpp.bak b/include/CollapsedIterativeOptimizer.hpp.bak
deleted file mode 100644
index cc9e4bd..0000000
--- a/include/CollapsedIterativeOptimizer.hpp.bak
+++ /dev/null
@@ -1,2069 +0,0 @@
-/**
->HEADER
- Copyright (c) 2013 Rob Patro robp at cs.cmu.edu
-
- This file is part of Sailfish.
-
- Sailfish is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- Sailfish is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with Sailfish. If not, see <http://www.gnu.org/licenses/>.
-<HEADER
-**/
-
-
-#ifndef COLLAPSED_ITERATIVE_OPTIMIZER_HPP
-#define COLLAPSED_ITERATIVE_OPTIMIZER_HPP
-
-#include <algorithm>
-#include <cassert>
-#include <cmath>
-#include <unordered_map>
-#include <map>
-#include <vector>
-#include <unordered_set>
-#include <mutex>
-#include <thread>
-#include <sstream>
-#include <exception>
-#include <random>
-#include <queue>
-#include <atomic>
-#include <fstream>
-#include <functional>
-#include <limits>
-#include <numeric>
-#include "btree_map.h"
-
-/** Boost Includes */
-#include <boost/dynamic_bitset/dynamic_bitset.hpp>
-#include <boost/range/irange.hpp>
-#include <boost/program_options.hpp>
-#include <boost/shared_ptr.hpp>
-#include <boost/scoped_ptr.hpp>
-#include <boost/heap/fibonacci_heap.hpp>
-#include <boost/accumulators/statistics/stats.hpp>
-#include <boost/accumulators/framework/accumulator_set.hpp>
-#include <boost/accumulators/statistics/p_square_quantile.hpp>
-#include <boost/accumulators/statistics/count.hpp>
-#include <boost/accumulators/statistics/median.hpp>
-#include <boost/accumulators/statistics/weighted_mean.hpp>
-#include <boost/lockfree/queue.hpp>
-#include <boost/thread/thread.hpp>
-#include <boost/math/special_functions/digamma.hpp>
-#include <boost/math/distributions/beta.hpp>
-#include <boost/filesystem.hpp>
-#include <boost/functional/hash.hpp>
-
-
-//#include <Eigen/Core>
-#include <jellyfish/sequence_parser.hpp>
-#include <jellyfish/parse_read.hpp>
-#include <jellyfish/mer_counting.hpp>
-#include <jellyfish/misc.hpp>
-#include <jellyfish/compacted_hash.hpp>
-
-#include "tbb/concurrent_unordered_set.h"
-#include "tbb/concurrent_vector.h"
-#include "tbb/concurrent_unordered_map.h"
-#include "tbb/concurrent_queue.h"
-#include "tbb/parallel_for.h"
-#include "tbb/parallel_for_each.h"
-#include "tbb/parallel_reduce.h"
-#include "tbb/blocked_range.h"
-#include "tbb/task_scheduler_init.h"
-#include "tbb/partitioner.h"
-
-#include "BiasIndex.hpp"
-#include "ezETAProgressBar.hpp"
-#include "LookUpTableUtils.hpp"
-#include "SalmonMath.hpp"
-
-template <typename ReadHash>
-class CollapsedIterativeOptimizer {
-
-private:
- /**
- * Type aliases
- */
- using TranscriptID = uint32_t;
- using KmerID = uint64_t;
- using Count = uint32_t;
- using KmerQuantity = double;
-    using Promiscuity = double;
- using KmerMap = tbb::concurrent_unordered_map< uint64_t, tbb::concurrent_vector<uint32_t> >;
-
- struct TranscriptGeneVectors;
- using TranscriptIDVector = std::vector<TranscriptID>;
- using KmerIDMap = std::vector<TranscriptIDVector>;
-
-
- using TranscriptKmerSet = std::tuple<TranscriptID, std::vector<KmerID>>;
- using StringPtr = std::string*;
- using TranscriptScore = uint64_t;
- using HashArray = jellyfish::invertible_hash::array<uint64_t, atomic::gcc, allocators::mmap>;
- using ReadLength = size_t;
-
- // Necessary forward declaration
- struct TranscriptData;
- using HeapPair = std::tuple<TranscriptScore, TranscriptID>;
- using Handle = typename boost::heap::fibonacci_heap<HeapPair>::handle_type;
- using BlockedIndexRange = tbb::blocked_range<size_t>;
-
- struct TranscriptGeneVectors {
- tbb::concurrent_vector<uint32_t> transcripts;
- tbb::concurrent_vector<uint32_t> genes;
- };
-
- struct TranscriptData {
- TranscriptID id;
- StringPtr header;
- std::map<KmerID, KmerQuantity> binMers;
- KmerQuantity mean;
- size_t length;
- };
-
- class TranscriptInfo {
- public:
-
- TranscriptInfo() : binMers(std::unordered_map<KmerID, KmerQuantity>()),
- //logLikes(std::vector<KmerQuantity>()),
- //weights(std::vector<KmerQuantity>()),
- mean(0.0), fracLow(0.0), fracHigh(0.0), length(0), effectiveLength(0) { updated.store(0); /* weightNum.store(0); totalWeight.store(0.0);*/ }
- TranscriptInfo(TranscriptInfo&& other) {
- std::swap(binMers, other.binMers);
- //std::swap(weights, other.weights);
- //std::swap(logLikes, other.logLikes);
- //totalWeight.store(other.totalWeight.load());
- //weightNum.store(other.weightNum.load());
- updated.store(other.updated.load());
- mean = other.mean;
- length = other.length;
- effectiveLength = other.effectiveLength;
- }
- //std::atomic<double> totalWeight;
- //btree::btree_map<KmerID, KmerQuantity> binMers;
- //std::vector<KmerQuantity> weights;
- //std::vector<KmerQuantity> logLikes;
- //std::atomic<uint32_t> weightNum;
- std::atomic<uint32_t> updated;
- std::unordered_map<KmerID, KmerQuantity> binMers;
- KmerQuantity mean;
- KmerQuantity fracLow, fracHigh;
- ReadLength length;
- ReadLength effectiveLength;
- };
-
- // This struct represents a "job" (transcript) that needs to be processed
- struct TranscriptJob {
- StringPtr header;
- StringPtr seq;
- TranscriptID id;
- };
-
- struct TranscriptResult {
- TranscriptData *data;
- TranscriptKmerSet *ks;
- };
-
- struct BinmerUpdates {
- std::vector<KmerID> zeroedBinmers;
- std::vector<KmerID> updatedBinmers;
- };
-
- bool loggedCounts_;
- uint32_t numThreads_;
- size_t merLen_;
- ReadHash & readHash_;
- BiasIndex& biasIndex_;
- std::string kmerEquivClassFname_;
-
-    // The number of occurrences above which a kmer is considered promiscuous.
- size_t promiscuousKmerCutoff_ {std::numeric_limits<size_t>::max()};
-
- // Map each kmer to the set of transcripts it occurs in
- KmerIDMap transcriptsForKmer_;
-
- // The actual data for each transcript
- std::vector<TranscriptInfo> transcripts_;
-
- TranscriptGeneMap& transcriptGeneMap_;
-
- tbb::concurrent_unordered_set<uint64_t> genePromiscuousKmers_;
-
-    std::vector<Promiscuity> kmerGroupPromiscuities_;
-    std::vector<Promiscuity> kmerGroupBiases_;
- std::vector<KmerQuantity> kmerGroupCounts_;
- std::vector<Count> kmerGroupSizes_;
-
-
- /**
- * logs all counts in the kmerGroupCounts_ variable. Groups with a
- * count of zero are assigned the special value LOG_0.
- */
- inline void logKmerGroupCounts_() {
- std::for_each(kmerGroupCounts_.begin(), kmerGroupCounts_.end(),
- [](KmerQuantity& count) {
- if (count > 0) {
- count = std::log(count);
- } else {
- count = salmon::math::LOG_0;
- }
- });
- }
-
- /**
- * Compute the "Inverse Document Frequency" (IDF) of a kmer within a set of transcripts.
- * The inverse document frequency is the log of the number of documents (i.e. transcripts)
-     * divided by the number of documents containing this term (i.e. kmer).
- * @param k [kmer id for which the IDF should be computed]
- * @return [IDF(k)]
- */
- inline double _idf( uint64_t k ) {
- double df = transcriptsForKmer_[k].size();
- return (df > 0.0) ? std::log(transcripts_.size() / df) : 0.0;
- }
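-
-    // Illustrative worked example (editorial sketch, not from the original
-    // file): with transcripts_.size() == 10000 and a kmer appearing in 10
-    // transcripts, _idf gives log(10000 / 10) ≈ 6.91, while a kmer present
-    // in every transcript gives log(1) = 0, i.e. no distinguishing power.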
-
- /**
- * Returns true if this kmer should be considered in our estimation, false
- * otherwise
- * @param mer [kmer id to test for consideration]
- * @return [true if we consider the kmer with this id, false otherwise]
- */
- inline bool _considered( uint64_t mer ) {
-        // The kmer is only considered if it exists in the transcript set
-        // (i.e. it's possible to cover) and it's less promiscuous than the
-        // cutoff.  Note: the check is currently disabled, so every kmer is
-        // considered.
-        return true;
- }
-
- /**
-     * The weight attributed to each appearance of the kmer with the given ID.
- * If the kmer with ID k occurs in m different transcripts, then
- * weight_(k) = 1 / m.
- * @param k [The ID of the kmer whose weight is to be computed]
- * @return [The weight of each appearance of the kmer with the given ID]
- */
- KmerQuantity weight_( KmerID k ) {
- return 1.0 / (kmerGroupPromiscuities_[k] );
- }
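-
-    // Worked example (illustrative, not from the original file): a kmer group
-    // appearing in m = 4 distinct transcripts has promiscuity 4, so each of
-    // its occurrences contributes weight_(k) = 1/4 = 0.25, splitting its
-    // evidence evenly across the candidate transcripts.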
-
- KmerQuantity _computeMedian( const TranscriptInfo& ti ) {
-
- using namespace boost::accumulators;
- using Accumulator = accumulator_set<double, stats<tag::median(with_p_square_quantile)>>;
-
- Accumulator acc;
- for (auto binmer : ti.binMers) {
- acc(binmer.second);
- }
-
- return median(acc);
- }
-
- /**
- * Computes the sum of kmer counts within the transcript given by ti, but clamping
- * all non-zero counts to the given quantile. For example, if quantile was 0.25, and
-     * x and y represented the 1st and 3rd quartile of kmer counts, then every nonzero count c
- * would be transformed as c = max(x, min(y,c));
- *
- * @param ti [description]
- * @param quantile [description]
- * @return [description]
- */
- KmerQuantity _computeSumQuantile( const TranscriptInfo& ti, double quantile ) {
- using namespace boost::accumulators;
- using accumulator_t = accumulator_set<double, stats<tag::p_square_quantile> >;
- //using tail_t = accumulator_set<double, stats<tag::tail
- KmerQuantity sum = 0.0;
-
-
- accumulator_t accLow(quantile_probability = quantile);
- accumulator_t accHigh(quantile_probability = 1.0-quantile);
- for ( auto binmer : ti.binMers ) {
- accLow(binmer.second);
- accHigh(binmer.second);
- }
-
- auto cutLow = p_square_quantile(accLow);
- auto cutHigh = p_square_quantile(accHigh);
-
- for ( auto binmer : ti.binMers ) {
- KmerQuantity res = std::min( cutHigh, std::max(cutLow, binmer.second ) );
- if (res != binmer.second) {
- std::cerr << "cutLow = " << cutLow << ", cutHigh = " << cutHigh << " :: ";
- std::cerr << "res = " << res << ", binmer.second = " << binmer.second << "\n";
- }
- sum += std::min( cutHigh, std::max(cutLow, binmer.second ) );
- }
- return sum;
- }
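-
-    // Worked example (illustrative): with quantile = 0.25 and counts
-    // {2, 4, 6, 8, 100}, cutLow ≈ 4 (the 25th percentile) and cutHigh ≈ 8
-    // (the 75th percentile), so the outlier 100 contributes only ≈ 8 and the
-    // clamped sum is ≈ 4 + 4 + 6 + 8 + 8 = 30 rather than 120.  (P² estimates
-    // are streaming approximations, so the cut points are not exact.)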
-
- KmerQuantity _computeSum( const TranscriptInfo& ti ) {
- KmerQuantity sum = 0.0;
- for ( auto binmer : ti.binMers ) {
- sum += kmerGroupBiases_[binmer.first] * binmer.second;
- }
- return sum;
- }
-
- KmerQuantity _computeSumClamped( const TranscriptInfo& ti ) {
- if (ti.binMers.size() < 5) {return _computeSum(ti);}
- KmerQuantity sum = 0.0;
-
- auto maxQuant = std::numeric_limits<KmerQuantity>::max();
- auto minQuant = 0.0;
-
-      KmerQuantity lowestCount = maxQuant;
-      KmerQuantity secondLowestCount = maxQuant;
-
-      KmerQuantity highestCount = minQuant;
-      KmerQuantity secondHighestCount = minQuant;
-
-      // Track the two smallest and two largest counts in a single pass.
-      for ( auto binmer : ti.binMers ) {
-        if (binmer.second < lowestCount) {
-          secondLowestCount = lowestCount;
-          lowestCount = binmer.second;
-        } else if (binmer.second < secondLowestCount) {
-          secondLowestCount = binmer.second;
-        }
-        if (binmer.second > highestCount) {
-          secondHighestCount = highestCount;
-          highestCount = binmer.second;
-        } else if (binmer.second > secondHighestCount) {
-          secondHighestCount = binmer.second;
-        }
-      }
- //std::cerr << lowestCount << ", " << secondLowestCount << ", " << highestCount << ", " << secondHighestCount << "\n";
-
- for ( auto binmer : ti.binMers ) {
- if (binmer.second > lowestCount and binmer.second < highestCount) {
- sum += kmerGroupBiases_[binmer.first] * binmer.second;
- } else if (binmer.second >= highestCount ) {
- sum += secondHighestCount;
- } else if (binmer.second <= lowestCount ) {
- sum += secondLowestCount;
- }
- }
-
- return sum;
- }
-
-
-    // Takes a mutable reference: a discarded transcript is reset in place.
-    bool _discard( TranscriptInfo& ti) {
- if ( ti.mean == 0.0 ) {
- return false;
- } else {
- ti.mean = 0.0;
- ti.binMers.clear();
- return true;
- }
- }
-
-    // NOTE: this helper relies on the `weights` member that is commented out
-    // in TranscriptInfo above, and is itself referenced only from disabled
-    // code; it will fail to compile if it is ever instantiated.
-    KmerQuantity _computeSumVec( const TranscriptInfo& ti ) {
-      KmerQuantity sum = 0.0;
-      for ( auto w : ti.weights ) {
- sum += w;
- }
- return sum;
- }
-
- KmerQuantity _computeClampedMean( const TranscriptInfo& ti ) {
- return (ti.effectiveLength > 0.0) ? (_computeSumClamped(ti) / ti.effectiveLength) : 0.0;
- }
-
- KmerQuantity _computeMean( const TranscriptInfo& ti ) {
- return (ti.effectiveLength > 0.0) ? (_computeSum(ti) / ti.effectiveLength) : 0.0;
- //return (ti.effectiveLength > 0.0) ? (ti.totalWeight.load() / ti.effectiveLength) : 0.0;
- //return (ti.effectiveLength > 0.0) ? (_computeSumVec(ti) / ti.effectiveLength) : 0.0;
- }
-
- KmerQuantity _computeWeightedMean( const TranscriptInfo& ti ) {
- using namespace boost::accumulators;
- accumulator_set<double, stats<tag::count, tag::weighted_mean>, double> acc;
-
- for ( auto binmer : ti.binMers ) {
- if ( this->genePromiscuousKmers_.find(binmer.first) == this->genePromiscuousKmers_.end() ){
- acc(binmer.second, weight=kmerGroupBiases_[binmer.first] * weight_(binmer.first));
- }
- }
-
- auto nnz = count(acc);
-
- if ( nnz < ti.effectiveLength ) {
- acc(0.0, weight=ti.effectiveLength-nnz);
- }
-
- auto sum = sum_of_weights(acc);
- return sum > 0.0 ? weighted_mean(acc) : 0.0;
- }
-
- double _effectiveLength( const TranscriptInfo& ts ) {
- double length = 0.0;
- for ( auto binmer : ts.binMers ) {
- length += weight_(binmer.first);
- }
- return length;
- }
-
- template <typename T>
- T dotProd_(std::vector<T>& u, std::vector<T>& v) {
-
- auto dot = tbb::parallel_reduce(
- BlockedIndexRange(size_t(0), v.size()),
- T(0.0), // identity element for summation
- [&]( const BlockedIndexRange& r, T current_sum ) -> T {
- for (size_t i=r.begin(); i!=r.end(); ++i) {
- current_sum += (u[i]*v[i]);
- }
- return current_sum; // body returns updated value of the accumulator
- },
- []( double s1, double s2 ) -> double {
- return s1+s2; // "joins" two accumulated values
- });
-
- return dot;
- }
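-
-    // Usage sketch (illustrative): for u = {1, 2, 3} and v = {4, 5, 6},
-    // dotProd_(u, v) returns 1*4 + 2*5 + 3*6 = 32; tbb::parallel_reduce
-    // computes partial sums over sub-ranges and joins them with the second
-    // lambda, so the result matches a sequential inner product.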
-
- void normalizeTranscriptMeans_(){
- //auto sumMean = 0.0;
- //for ( auto ti : transcripts_ ) { sumMean += ti.mean; }
-
- auto sumMean = tbb::parallel_reduce(
- BlockedIndexRange(size_t(0), transcripts_.size()),
- double(0.0), // identity element for summation
- [&, this]( const BlockedIndexRange& r, double current_sum ) -> double {
- for (size_t i=r.begin(); i!=r.end(); ++i) {
- double x = this->transcripts_[i].mean;
- current_sum += x;
- }
- return current_sum; // body returns updated value of the accumulator
- },
- []( double s1, double s2 ) -> double {
- return s1+s2; // "joins" two accumulated values
- });
-
- // compute the new mean for each transcript
- tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(transcripts_.size())),
- [this, sumMean](const BlockedIndexRange& range) -> void {
- for(size_t tid = range.begin(); tid != range.end(); ++tid) {
- this->transcripts_[tid].mean /= sumMean;
- }
- });
-
- }
-
- template <typename T>
- T psumAccumulate(const tbb::blocked_range<T*>& r, T value) {
- return std::accumulate(r.begin(),r.end(),value);
- }
-
- template <typename T>
- T psum_(std::vector<T>& v) {
- auto func = std::bind( std::mem_fn(&CollapsedIterativeOptimizer<ReadHash>::psumAccumulate<T>),
- this, std::placeholders::_1, std::placeholders::_2 );
- auto sum = tbb::parallel_reduce(
- tbb::blocked_range<T*>(&v[0], &v[v.size()]),
- T{0}, // identity element for summation
- func,
- std::plus<T>()
- );
- return sum;
- }
-
- template <typename T>
- T pdiff_(std::vector<T>& v0, std::vector<T>& v1) {
- auto diff = tbb::parallel_reduce(
- BlockedIndexRange(size_t(0), v0.size()),
- double(0.0), // identity element for difference
- [&](const BlockedIndexRange& r, double currentDiff ) -> double {
- for (size_t i=r.begin(); i!=r.end(); ++i) {
- currentDiff += v0[i] - v1[i];
- }
- return currentDiff; // body returns updated value of the accumulator
- },
- []( double s1, double s2 ) -> double {
- return s1+s2; // "joins" two accumulated values
- }
- );
-
- return diff;
- }
-
- template <typename T>
- T pabsdiff_(std::vector<T>& v0, std::vector<T>& v1) {
- auto diff = tbb::parallel_reduce(
- BlockedIndexRange(size_t(0), v0.size()),
- double(0.0), // identity element for difference
- [&]( const BlockedIndexRange& r, double currentDiff ) -> double {
- for (size_t i=r.begin(); i!=r.end(); ++i) {
-                       currentDiff += std::abs(v0[i] - v1[i]);
- }
- return currentDiff; // body returns updated value of the accumulator
- },
- []( double s1, double s2 ) -> double {
- return s1+s2; // "joins" two accumulated values
- }
- );
-
- return diff;
- }
-
- template <typename T>
- std::vector<T> relAbsDiff_(std::vector<T>& v0, std::vector<T>& v1, T minVal) {
- std::vector<T> relDiff(v0.size(), T());
- // compute the new mean for each transcript
- tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(v0.size())),
- [&v0, &v1, &relDiff, minVal](const BlockedIndexRange& range ) -> void {
- for (auto tid : boost::irange(range.begin(), range.end())) {
- T oldVal = v0[tid];
- T newVal = v1[tid];
-                             if (oldVal > minVal or newVal > minVal) {
-                               // guard against division by zero when oldVal == 0
-                               relDiff[tid] = std::abs(newVal - oldVal) / std::max(oldVal, minVal);
-                             }
- }
- });
- return relDiff;
- }
-
- void normalize_(std::vector<double>& means) {
- auto sumMean = psum_(means);
- auto invSumMean = 1.0 / sumMean;
-
- // compute the new mean for each transcript
- tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(transcripts_.size())),
- [&means, invSumMean](const BlockedIndexRange& range ) -> void {
- for (auto tid : boost::irange(range.begin(), range.end())) {
- means[tid] *= invSumMean;
- }
- });
- }
-
- double averageCount(const TranscriptInfo& ts){
- if ( ts.binMers.size() == 0 ) { return 0.0; }
- double sum = 0.0;
- for ( auto binmer : ts.binMers ) {
- sum += kmerGroupBiases_[binmer.first] * binmer.second;
- }
- return sum / ts.binMers.size();
-
- }
-
- /**
-     * This function should be run only <b>after</b> an EM loop.
-     * It estimates kmer-specific biases based on how much a kmer's count
-     * deviates from its corresponding transcript's mean.
- */
- void computeKmerFidelities_() {
-
-      // For each transcript, compute its overall fidelity. This is related
- // to the variance of coverage across the transcript. The more uniform
- // the coverage, the more we believe the transcript.
- std::vector<double> transcriptFidelities(transcripts_.size(), 0.0);
- tbb::parallel_for( BlockedIndexRange(size_t(0), transcripts_.size()),
- [this, &transcriptFidelities]( const BlockedIndexRange& range ) -> void {
- for (auto tid = range.begin(); tid != range.end(); ++tid) {
- double sumDiff = 0.0;
- //if (tid >= this->transcripts_.size()) { std::cerr << "attempting to access transcripts_ out of range\n";}
- auto& ts = this->transcripts_[tid];
-
- //std::cerr << "transcript " << tid << "\n";
- for ( auto& b : ts.binMers ) {
- //if (b.first >= this->kmerGroupBiases_.size()) { std::cerr << "attempting to access kmerGroupBiases_ out of range\n";}
- auto scaledMean = this->kmerGroupSizes_[b.first] * ts.mean;
- auto diff = std::abs(b.second - scaledMean);
- sumDiff += diff;//*diff;
- }
-                 // The remaining positions have 0 coverage, so each has an error
-                 // of |0 - \mu_t| = \mu_t. There are l(t) - ts.binMers.size() of these.
- sumDiff += ts.mean * (ts.length - ts.binMers.size());
- auto fidelity = (ts.length > 0.0) ? sumDiff / ts.length : 0.0;
- fidelity = 1.0 / (fidelity + 1.0);
- //if (tid >= transcriptFidelities.size()) { std::cerr << "attempting to access transcriptFidelities out of range\n";}
- transcriptFidelities[tid] = fidelity;
- //std::cerr << "fidelity (" << tid << ") = " << fidelity << "\n";
- }
- });
-
- tbb::parallel_for(BlockedIndexRange(size_t(0), transcriptsForKmer_.size()),
- [this, &transcriptFidelities](const BlockedIndexRange& range) -> void {
- // Each transcript this kmer group appears in votes on the bias of this kmer.
- // Underrepresented kmers get bias values > 1.0 while overrepresented kmers get
-         // bias values < 1.0. The vote of each transcript is weighted by its fidelity.
- for (auto kid = range.begin(); kid != range.end(); ++kid) {
- double totalBias = 0.0;
- double totalFidelity = 0.0;
- for( auto tid : this->transcriptsForKmer_[kid] ) {
- auto& transcript = this->transcripts_[tid];
- auto fidelity = transcriptFidelities[tid];
- auto totalMean = transcript.mean * this->kmerGroupSizes_[kid];
- auto curAlloc = transcript.binMers[kid];
- totalBias += (curAlloc > 0.0) ? fidelity * (totalMean / curAlloc) : 0.0;
- totalFidelity += fidelity;
- }
- double bias = totalBias / totalFidelity;
- double alpha = 0.25; //std::min( 1.0, 10.0*averageFidelity / confidence);
- double prevBias = this->kmerGroupBiases_[kid];
- this->kmerGroupBiases_[kid] = alpha * bias + (1.0 - alpha) * prevBias;
- }
- }
- );
-
-
- }
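-
-    // Worked example (illustrative numbers, not from the original file):
-    // suppose a kmer group of size 1 occurs in two transcripts with
-    // fidelities 0.8 and 0.4, where the first expects twice the mass it
-    // currently holds (totalMean/curAlloc = 2.0) and the second half (0.5).
-    //   bias = (0.8 * 2.0 + 0.4 * 0.5) / (0.8 + 0.4) = 1.8 / 1.2 = 1.5
-    // With alpha = 0.25 and a previous bias of 1.0 the smoothed update is
-    //   0.25 * 1.5 + 0.75 * 1.0 = 1.125, nudging this kmer's bias upward.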
-
- double logLikelihood_(std::vector<double>& means) {
-
- const auto numTranscripts = transcripts_.size();
- std::vector<double> likelihoods(numTranscripts, 0.0);
-
- // Compute the log-likelihood
- tbb::parallel_for(BlockedIndexRange(size_t(0), numTranscripts),
- // for each transcript
- [&likelihoods, &means, this](const BlockedIndexRange& range) ->void {
- auto epsilon = 1e-40;
- for (auto tid = range.begin(); tid != range.end(); ++tid) {
- auto& ti = transcripts_[tid];
- double relativeAbundance = means[tid];
-
- if (ti.binMers.size() > 0 ) { likelihoods[tid] = 1.0; }
- // For each kmer in this transcript
- for ( auto& binmer : ti.binMers ) {
- likelihoods[tid] *= binmer.second /
- (this->kmerGroupBiases_[binmer.first] * this->kmerGroupCounts_[binmer.first]);
- }
- likelihoods[tid] = (relativeAbundance > epsilon and likelihoods[tid] > epsilon) ?
- std::log(relativeAbundance * likelihoods[tid]) : 0.0;
- }
- });
-
- return psum_(likelihoods);
-
- }
-
- double expectedLogLikelihood_(std::vector<double>& means) {
- // alpha in arXiv:1104.3889v2
- std::vector<double> sampProbs(means.size(), 0.0);
-
- tbb::parallel_for(BlockedIndexRange(size_t(0), means.size()),
- // for each transcript
- [&sampProbs, &means, this](const BlockedIndexRange& range) ->void {
- for (auto tid = range.begin(); tid != range.end(); ++tid) {
- auto& ti = transcripts_[tid];
- double relativeAbundance = means[tid];
- sampProbs[tid] = ti.length * relativeAbundance;
- }});
-
- normalize_(sampProbs);
-
- return (means.size() > 0) ? logLikelihood2_(sampProbs) / means.size() : 0.0;
- }
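-
-    // Interpretation (editorial note): sampProbs[t] ∝ length(t) * mean(t) is
-    // the probability that a uniformly drawn kmer originates from transcript
-    // t (the "alpha" of the preprint cited above), so logLikelihood2_
-    // evaluated at these probabilities approximates the per-kmer expected
-    // log-likelihood.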
-
- double logLikelihood2_(std::vector<double>& sampProbs) {
-
- std::vector<double> likelihoods(transcriptsForKmer_.size(), 0.0);
-
- // Compute the log-likelihood
- tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(transcriptsForKmer_.size())),
- // for each transcript
- [&likelihoods, &sampProbs, this](const BlockedIndexRange& range) ->void {
- for (auto kid = range.begin(); kid != range.end(); ++kid) {
- double kmerLikelihood = 0.0;
- KmerQuantity totalKmerMass = 0.0;
- for (auto& tid : this->transcriptsForKmer_[kid]) {
- double kmerMass{this->transcripts_[tid].binMers[kid]};
- kmerLikelihood += kmerMass * (sampProbs[tid] / this->transcripts_[tid].length);
- totalKmerMass += kmerMass;
- }
-            // Only record a log-likelihood for kmer groups that carry some mass.
- if (totalKmerMass > 0) {
- if (kmerLikelihood < 1e-20) {
- std::cerr << "kmer group: " << kid << " has probability (" << kmerLikelihood << "); too low\n";
- } else {
- likelihoods[kid] = std::log(kmerLikelihood);
- }
- }
- }
- });
-
- return psum_(likelihoods);
-
- }
-
-
- // Since there's no built in hash code for vectors
- template <typename T>
- class my_hasher{
- public:
- size_t operator()(const T& x) const {
- if (x.size() == 0 ) { return 0; }
- size_t seed = x[0];
- for (auto i : boost::irange(size_t(1), x.size())) {
- boost::hash_combine(seed, static_cast<size_t>(x[i]));
- }
- return seed;
- }
- };
-
-/**
- * Collapses all kmers which share the same transcript multiset. Such kmers can be
- * treated as a "batch" with a count whose value is the sum of the individual "batch"
- * members' counts (a toy example follows the function body below).
- * @param isActiveKmer [A bitvector which designates, for each kmer,
- * whether or not that kmer is active in the current
- * read set.]
- */
- void collapseKmers_( boost::dynamic_bitset<>& isActiveKmer ) {
-
- auto numTranscripts = transcriptGeneMap_.numTranscripts();
-
- /**
- * Map from a vector of transcript IDs to the list of kmers that have this
- * transcript list. This allows us to collapse all kmers that exist in the
- * exact same set of transcripts into a single kmer group.
- */
- tbb::concurrent_unordered_map< TranscriptIDVector,
- tbb::concurrent_vector<KmerID>,
- my_hasher<std::vector<TranscriptID>> > m;
-
- // Asynchronously print out the progress of our hashing procedure
- std::atomic<size_t> prog{0};
- std::thread t([this, &prog]() -> void {
- ez::ezETAProgressBar pb(this->transcriptsForKmer_.size());
- pb.start();
- size_t prevProg{0};
- while ( prevProg < this->transcriptsForKmer_.size() ) {
- if (prog > prevProg) {
- auto diff = prog - prevProg;
- pb += diff;
- prevProg += diff;
- }
- boost::this_thread::sleep_for(boost::chrono::seconds(1));
- }
- if (!pb.isDone()) { pb.done(); }
- });
-
-    // For every kmer, compute its kmer group.
- tbb::parallel_for(BlockedIndexRange(size_t(0), transcriptsForKmer_.size()),
- [&](const BlockedIndexRange& range ) -> void {
- for (auto j = range.begin(); j != range.end(); ++j) {
- if (isActiveKmer[j]) {
- m[ transcriptsForKmer_[j] ].push_back(j);
- }
- ++prog;
- }
- });
-
- // wait for the parallel hashing to finish
- t.join();
-
- // TESTING
- std::ofstream eqFile("KMER_EQUIV_CLASSES.txt");
- for (auto& kv : m ) {
- for (auto e : kv.second) { eqFile << e << "\t"; }
- eqFile << "\n";
- }
- eqFile.close();
- // END TESTING
-
- std::cerr << "Out of " << transcriptsForKmer_.size() << " potential kmers, "
- << "there were " << m.size() << " distinct groups\n";
-
- size_t totalKmers = 0;
- size_t index = 0;
- std::vector<KmerQuantity> kmerGroupCounts(m.size());
-    std::vector<Promiscuity> kmerGroupPromiscuities(m.size());
- std::vector<TranscriptIDVector> transcriptsForKmer(m.size());
- kmerGroupSizes_.resize(m.size(), 0);
-
- using namespace boost::accumulators;
- std::cerr << "building collapsed transcript map\n";
- for ( auto& kv : m ) {
-
- // For each transcript covered by this kmer group, add this group to the set of kmer groups contained in
- // the transcript. For efficiency, we also compute the kmer promiscuity values for each kmer
- // group here --- the promiscuity of a kmer group is simply the number of distinct transcripts in
- // which this group of kmers appears.
- auto prevTID = std::numeric_limits<TranscriptID>::max();
- KmerQuantity numDistinctTranscripts = 0.0;
- for ( auto& tid : kv.first ) {
- transcripts_[tid].binMers[index] += 1;
- // Since the transcript IDs are sorted we just have to check
- // if this id is different from the previous one
- if (tid != prevTID) { numDistinctTranscripts += 1.0; }
- prevTID = tid;
- }
- // Set the promiscuity and the set of transcripts for this kmer group
- kmerGroupPromiscuities[index] = numDistinctTranscripts;
- transcriptsForKmer[index] = kv.first;
-
-        // Aggregate the counts attributable to each kmer into its respective
- // group's counts.
- for (auto kid : kv.second) {
- kmerGroupCounts[index] += readHash_.atIndex(kid);
- }
- kmerGroupSizes_[index] = kv.second.size();
-
- // Update the total number of kmers we're accounting for
- // and the index of the current kmer group.
- totalKmers += kv.second.size();
- ++index;
- }
-
- std::cerr << "Verifying that the unique set encodes " << totalKmers << " kmers\n";
- std::cerr << "collapsedCounts.size() = " << transcriptsForKmer.size() << "\n";
-
- // update the relevant structures holding info for the full kmer
- // set with those holding the info for our collapsed kmer sets
- std::swap(kmerGroupPromiscuities, kmerGroupPromiscuities_);
- std::swap(kmerGroupCounts, kmerGroupCounts_);
- std::swap(transcriptsForKmer, transcriptsForKmer_);
-
- /*
- uint64_t groupCounts = 0;
- for(auto c : kmerGroupCounts_) { groupCounts += c; }
- auto tmp = psum_(kmerGroupCounts_);
- std::cerr << "groupCount(" << groupCounts << ") - parallelCount(" << tmp << ") = " << groupCounts - tmp << "\n";
- std::atomic<uint64_t> individualCounts{0};
- tbb::parallel_for(size_t{0}, readHash_.size(),
- [&](size_t i) {
- if (isActiveKmer[i]) {
- individualCounts += readHash_.atIndex(i);
- } });
- auto diff = groupCounts - individualCounts;
- std::cerr << "groupTotal(" << groupCounts << ") - totalNumKmers(" << individualCounts << ") = " << diff << "\n";
- */
- }
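-
-  /**
-   * Toy example of the collapsing above (illustrative, not from the original
-   * file): if kmers k1 and k2 both occur exactly in transcripts {t1, t3} with
-   * counts 5 and 7, they form one kmer group with count 12, promiscuity 2,
-   * and size 2; every subsequent EM update then touches one group instead of
-   * two kmers, shrinking the E-step's work proportionally.
-   */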
-
-
- /**
- * NEW! Assuming that equivalence classes were computed in the index
- **/
- void prepareCollapsedMaps_(
- const std::string& kmerEquivClassFname,
- bool discardZeroCountKmers) {
-
- using std::cerr;
-
- cerr << "reading Kmer equivalence classes \n";
-
- auto memberships = LUTTools::readKmerEquivClasses(kmerEquivClassFname);
- size_t numKmers{readHash_.size()};
- size_t numKmerClasses{(*std::max_element(memberships.begin(), memberships.end())) + 1};
- boost::dynamic_bitset<> isActiveKmer(numKmers);
-
- kmerGroupCounts_.resize(numKmerClasses, 0.0);
- kmerGroupSizes_.resize(numKmerClasses, 0.0);
- kmerGroupPromiscuities_.resize(numKmerClasses, 0.0);
-
- cerr << "updating Kmer group counts\n";
- // Update the kmer group counts using the information from the read hash
- for (auto kid : boost::irange(size_t{0}, numKmers)) {
-
- size_t count = readHash_.atIndex(kid);
- if (!discardZeroCountKmers or count != 0) {
- auto kmerClassID = memberships[kid];
- isActiveKmer[kmerClassID] = true;
- kmerGroupCounts_[kmerClassID] += count;
- kmerGroupSizes_[kmerClassID]++;
- }
-
- }
-
- cerr << "updating transcript map\n";
- for (auto kmerClassID : boost::irange(size_t{0}, numKmerClasses)) {
-
- // For each transcript covered by this kmer group, add this group to the set of kmer groups contained in
- // the transcript. For efficiency, we also compute the kmer promiscuity values for each kmer
- // group here --- the promiscuity of a kmer group is simply the number of distinct transcripts in
- // which this group of kmers appears.
- auto prevTID = std::numeric_limits<TranscriptID>::max();
- KmerQuantity numDistinctTranscripts = 0.0;
-
- //cerr << "numKmerClasses = " << numKmerClasses << ", kmerClassID = " << kmerClassID << ", transcriptsForKmer_.size() = " << transcriptsForKmer_.size() << "\n";
- for (auto& tid : transcriptsForKmer_[kmerClassID]) {
- transcripts_[tid].binMers[kmerClassID] += 1;
- // Since the transcript IDs are sorted we just have to check
- // if this id is different from the previous one
- if (tid != prevTID) { numDistinctTranscripts += 1.0; }
- prevTID = tid;
- }
- // Set the promiscuity and the set of transcripts for this kmer group
- kmerGroupPromiscuities_[kmerClassID] = numDistinctTranscripts;
- //kmerGroupPromiscuities_[kmerClassID] = transcriptsForKmer_[kmerClassID].size();
- }
- cerr << "done\n";
-
- //promiscuousKmerCutoff_ = 50;
- for (auto kmerClassID : boost::irange(size_t{0}, numKmerClasses)) {
- if (kmerGroupPromiscuities_[kmerClassID] > promiscuousKmerCutoff_ ) {
- kmerGroupCounts_[kmerClassID] = 0;
- }
- }
-      // By now we should have properly filled out the variables
- // kmerGroupPromiscuities_
- // kmerGroupCounts_
- // transcripts_
- }
-
-
- /**
- * This function should be called before performing any optimization procedure.
- * It builds all of the necessary data-structures which are used during the transcript
- * quantification procedure.
- * @param klutfname [The name of the file containing the kmer lookup table.]
- * @param tlutfname [The name of the file containing the transcript lookup table.]
- */
- void initialize_(
- const std::string& klutfname,
- const std::string& tlutfname,
- const std::string& kmerEquivClassFname,
- const bool discardZeroCountKmers) {
-
- // So we can concisely identify each transcript
- TranscriptID transcriptIndex {0};
-
- size_t numTranscripts = transcriptGeneMap_.numTranscripts();
- size_t numKmers = readHash_.size();
- auto merSize = readHash_.kmerLength();
-
- size_t numActors = numThreads_;
- std::vector<std::thread> threads;
-
- transcripts_.resize(transcriptGeneMap_.numTranscripts());
-
- // Get the kmer look-up-table from file
- LUTTools::readKmerLUT(klutfname, transcriptsForKmer_);
-
-/* size_t numContainingTranscripts = transcriptsForKmer_[kid].size();
- assert(numContainingTranscripts > 0);
- if (numContainingTranscripts == 1 or
- std::all_of(transcriptsForKmer_[kid].begin(), transcriptsForKmer_[kid].end(), [this, kid](TranscriptID tid) -> bool {
- return tid == this->transcriptsForKmer_[kid][0]; } )) {
- uniquelyAnchoredTranscripts.insert(transcriptsForKmer_[kid][0]);
- }
-
- }
-*/
- //std::cerr << "\nIn the original set, there were " << uniquelyAnchoredTranscripts.size() << " transcripts with unique nonzero anchors\n";
-
- std::cerr << "\n";
- // collapseKmers_(isActiveKmer); // equiv-classes
- prepareCollapsedMaps_(kmerEquivClassFname, discardZeroCountKmers);
-
-
- // we have no k-mer-specific biases currently
- kmerGroupBiases_.resize(transcriptsForKmer_.size(), 1.0);
-
- // Get transcript lengths
- std::ifstream ifile(tlutfname, std::ios::binary);
- size_t numRecords {0};
- ifile.read(reinterpret_cast<char *>(&numRecords), sizeof(numRecords));
-
- std::cerr << "Transcript LUT contained " << numRecords << " records\n";
- for (auto i : boost::irange(size_t(0), numRecords)) {
- auto ti = LUTTools::readTranscriptInfo(ifile);
- // copy over the length, then we're done.
- transcripts_[ti->transcriptID].length = ti->length;
- transcripts_[ti->transcriptID].effectiveLength = ti->length - merSize + 1;
- }
- ifile.close();
- // --- done ---
-
- // tbb::parallel_for( size_t(0), size_t(transcripts_.size()),
- // [this]( size_t idx ) {
- // auto& transcript = this->transcripts_[idx];
- // transcript.effectiveLength = transcript.effectiveLength - transcript.binMers.size();
- // for (auto binmer : transcript.binMers) {
- // transcript.effectiveLength += this->_weight(binmer.first);
- // }
- // });
-
- size_t numRes = 0;
- std::cerr << "\n\nRemoving duplicates from kmer transcript lists ... ";
- tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(transcriptsForKmer_.size())),
- [&numRes, this](const BlockedIndexRange& range) -> void {
- for (auto idx = range.begin(); idx != range.end(); ++idx) {
- auto& transcripts = this->transcriptsForKmer_[idx];
-                          // should already be sorted -- assert it until this check is removed
-                          assert(std::is_sorted(transcripts.begin(), transcripts.end()));
- // Uniqify the transcripts
- auto it = std::unique(transcripts.begin(), transcripts.end());
- transcripts.resize(std::distance(transcripts.begin(), it));
- ++numRes;
- }
- });
-
- std::cerr << "done\n";
-
- std::cerr << "Computing kmer group promiscuity rates\n";
- /* -- done
- kmerGroupPromiscuities_.resize(transcriptsForKmer_.size());
- tbb::parallel_for( size_t{0}, kmerGroupPromiscuities_.size(),
- [this]( KmerID kid ) -> void { this->kmerGroupPromiscuities_[kid] = this->_weight(kid); }
- );
- */
-
- tbb::parallel_for(BlockedIndexRange(size_t{0}, transcripts_.size()),
- [&, this](const BlockedIndexRange& range) -> void {
- for (auto tid = range.begin(); tid != range.end(); ++tid) {
- auto& ti = this->transcripts_[tid];
- for (auto& binmer : ti.binMers) {
- if (binmer.second > promiscuousKmerCutoff_) {
- ti.effectiveLength -= 1.0;
- }
- }
- }
- });
-
- /**
- * gene-promiscuous kmers can never be added to a transcript's counts, so
-     * it's unfair to consider them in the transcript's effective length.
- */
- /*
- std::for_each( genePromiscuousKmers_.begin(), genePromiscuousKmers_.end(),
- [this]( KmerID kmerId ) -> void {
- for ( auto tid : transcriptsForKmer_[kmerId] ) {
- transcripts_[tid].effectiveLength -= 1.0;
- }
- });
- */
- std::cerr << "done\n";
-
- //return mappedReads;
- }
-
- void _dumpCoverage( const boost::filesystem::path &cfname) {
- auto memberships = LUTTools::readKmerEquivClasses(kmerEquivClassFname_);
-
- size_t numTrans = transcripts_.size();
- size_t numProc = 0;
- std::ifstream kmerStructFile("zm_transcript.map", std::ios::in | std::ios::binary);
- std::unordered_map<std::string, std::vector<KmerID>> kstruct;
- uint64_t tlen{0};
- uint32_t nameLen{0};
- uint64_t tid{0};
- for (size_t i = 0; i < numTrans; ++i) {
- kmerStructFile.read(reinterpret_cast<char*>(&nameLen), sizeof(nameLen));
- std::string name(nameLen, ' ');
- kmerStructFile.read(reinterpret_cast<char*>(&name[0]), nameLen * sizeof(name[0]));
- kmerStructFile.read(reinterpret_cast<char*>(&tlen), sizeof(tlen));
- kstruct[name].resize(tlen);
- auto& tvec = kstruct[name];
- kmerStructFile.read(reinterpret_cast<char*>(&tvec[0]), tlen * sizeof(kstruct[name][0]));
- }
-
- kmerStructFile.close();
-
- std::ofstream ofile(cfname.native());
-
- ofile << "# numtranscripts_\n";
- ofile << "# transcript_name_{1} num_kmer_classes{1} class_1_count class_2_count ... class_{num_eq_clases}_count\n";
- ofile << "# ... \n";
-
- ofile << transcripts_.size() << "\n";
-
- std::cerr << "Dumping coverage statistics to " << cfname << "\n";
-
-
- tbb::concurrent_queue<StringPtr> covQueue;
-
- tbb::parallel_for(BlockedIndexRange(size_t{0}, transcripts_.size()),
- [this, &covQueue, &memberships, &kstruct] (const BlockedIndexRange& range) -> void {
- for (auto index = range.begin(); index != range.end(); ++index) {
- auto& td = this->transcripts_[index];
- const auto& name = this->transcriptGeneMap_.transcriptName(index);
-
- std::stringstream ostream;
- ostream << this->transcriptGeneMap_.transcriptName(index) << " " << kstruct[name].size();
-
- std::map<uint64_t, double> kmerClassRelativeMass;
-
- for (auto bm : kstruct[name]) {
- auto kclass = memberships[bm];
- double totalMass = 0.0;
- for (auto tid : this->transcriptsForKmer_[kclass]) {
- totalMass += this->transcripts_[tid].mean;
- }
- kmerClassRelativeMass[kclass] = (totalMass > 0.0) ? td.mean / totalMass : 0.0;
- }
-
- for (auto bm : kstruct[name]) {
- auto kclass = memberships[bm];
- //double classSize = this->kmerGroupSizes_[kclass];
- //double mass = td.binMers[kclass];
- //ostream << " " << mass / classSize;
- auto count = readHash_.atIndex(bm);
- ostream << " " << count * kmerClassRelativeMass[kclass];
- }
- ostream << "\n";
- std::string* ostr = new std::string(ostream.str());
- covQueue.push(ostr);
- // for boost lockfree
- // while(!covQueue.push(ostr));
- }
- }
- );
-
- ez::ezETAProgressBar pb(transcripts_.size());
- pb.start();
-
- std::string* sptr = nullptr;
- while ( numProc < numTrans ) {
- while( covQueue.try_pop(sptr) ) {
- ofile << (*sptr);
- ++pb;
- ++numProc;
- delete sptr;
- }
- }
-
- ofile.close();
-
- }
-
- void filterByCoverage_( double coverageCutoff, std::vector<double>& means ) {
-
- tbb::parallel_for(BlockedIndexRange(size_t{0}, transcripts_.size()),
- [this, coverageCutoff, &means] (const BlockedIndexRange& range) -> void {
- for (auto index = range.begin(); index != range.end(); ++index) {
- double numCovered{1.0};
- double totalNumKmers{1.0};
- bool unAnchored{true};
- const auto& td = this->transcripts_[index];
-
- for ( auto bm : td.binMers ) {
- double numKmersToCover = this->kmerGroupSizes_[bm.first];
- numCovered += (bm.second > 0.0) ? numKmersToCover : 0.0;
- totalNumKmers += numKmersToCover;
- //if (bm.second > 1000 * numKmersToCover and (std::abs(bm.second - this->kmerGroupCounts_[bm.first]) < 1e-5)) {
- // unAnchored = false;
- // break;
- //}
-
- }
-
- if (unAnchored and ((numCovered / totalNumKmers) < coverageCutoff)) {
- means[index] = this->transcripts_[index].mean = 0.0;
- }
- }
- }
- );
-
- }
-
- void EMUpdate_( const std::vector<double>& meansIn, std::vector<double>& meansOut ) {
- assert(meansIn.size() == meansOut.size());
-
- auto reqNumJobs = transcriptsForKmer_.size();
-
- std::atomic<size_t> numJobs{0};
- std::atomic<size_t> completedJobs{0};
-
- // Print out our progress
- auto pbthread = std::thread(
- [&completedJobs, reqNumJobs]() -> bool {
-        size_t prevNumJobs{0};
- ez::ezETAProgressBar show_progress(reqNumJobs);
- show_progress.start();
- while ( prevNumJobs < reqNumJobs ) {
- if ( prevNumJobs < completedJobs ) {
- show_progress += completedJobs - prevNumJobs;
- }
- prevNumJobs = completedJobs.load();
- boost::this_thread::sleep_for(boost::chrono::seconds(1));
- }
- if (!show_progress.isDone()) { show_progress.done(); }
- return true;
- });
-
- // E-Step : reassign the kmer group counts proportionally to each transcript
- tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(transcriptsForKmer_.size())),
- // for each kmer group
- [&completedJobs, &meansIn, &meansOut, this](const BlockedIndexRange& range) -> void {
- for (auto kid : boost::irange(range.begin(), range.end())) {
- auto kmer = kid;
- // for each transcript containing this kmer group
- auto& transcripts = this->transcriptsForKmer_[kmer];
-
- double totalMass = 0.0;
- for ( auto tid : transcripts ) {
- totalMass += meansIn[tid];
- /** TRY SPARSE
- auto& trans = this->transcripts_[tid];
- totalMass += trans.effectiveLength ? (meansIn[tid] - (trans.binMers[kmer] / trans.effectiveLength)) : 0.0;
- **/
- }
-
- double norm = (totalMass > 0.0) ? 1.0 / totalMass : 0.0;
- for ( auto tid : transcripts ) {
- auto& trans = this->transcripts_[tid];
- auto lastIndex = trans.binMers.size() - 1;
- // binMer based
- //auto idx = trans.weightNum++;
- //auto kmerIt = trans.binMers.find(kmer);
-
- trans.binMers[kmer] = meansIn[tid] * norm *
- kmerGroupBiases_[kmer] * this->kmerGroupCounts_[kmer];
- /** TRY SPARSE
- double myMass = trans.effectiveLength ? (meansIn[tid] - (trans.binMers[kmer] / trans.effectiveLength)) : 0.0;
- trans.binMers[kmer] = myMass * norm *
- kmerGroupBiases_[kmer] * this->kmerGroupCounts_[kmer];
- **/
-
- // If we've seen all of the k-mers that appear in this transcript,
-                          // then we can compute its new estimated abundance.
- if (trans.updated++ == lastIndex) {
- trans.mean = meansOut[tid] = this->_computeMean(trans);
- //trans.mean = meansOut[tid] = this->_computeClampedMean(trans);
-
- //trans.weightNum.store(0);
- trans.updated.store(0);
- }
- //this->transcripts_[tid].binMers[kmer] =
- //meansIn[tid] * norm * kmerGroupBiases_[kmer] * this->kmerGroupCounts_[kmer];
-
-
-
-
- /*
- auto idx = trans.weightNum++;
- auto weight = meansIn[tid] * norm * kmerGroupBiases_[kmer] * this->kmerGroupCounts_[kmer];
- trans.logLikes[idx] = (weight > 0.0) ? std::log(weight / this->kmerGroupCounts_[kmer] ) : 0.0;
- trans.weights[idx] = weight;
- ++trans.updated;
- if (idx == trans.weights.size()-1) {
- while (trans.updated.load() < trans.weightNum.load() ) {}
- meansOut[tid] = this->_computeMean(trans);
- trans.weightNum.store(0);
- trans.updated.store(0);
- }*/
-
- /*
- auto& weight = trans.totalWeight;
- auto current = weight.load();
- auto updated = current + delta;
- while( !weight.compare_exchange_strong(current, updated) ) {
- current = weight.load(); updated = current + delta;
- }
- */
- }
-
-
- ++completedJobs;
- } // for kid in range
-
- });
-
- // wait for all kmer groups to be processed
- pbthread.join();
-
- /*
- double delta = 0.0;
- double norm = 1.0 / transcripts_.size();
- size_t discard = 0;
-
-
- // M-Step
- // compute the new mean for each transcript
- tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(transcripts_.size())),
- [this, &meansOut](const BlockedIndexRange& range) -> void {
- for (auto tid : boost::irange(range.begin(), range.end())) {
- auto& ts = this->transcripts_[tid];
- auto tsNorm = 1.0;//(ts.effectiveLength > 0.0) ? 1.0 / std::sqrt(ts.effectiveLength) : 1.0;
- meansOut[tid] = tsNorm * this->_computeMean( ts );
- ts.weightNum.store(0);
- }
- });
- */
-
- normalize_(meansOut);
- }
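-
-  /**
-   * Sketch of one EM round above with illustrative numbers (not from the
-   * original file): a kmer group carrying mass 90 (as stored in
-   * kmerGroupCounts_, which optimize() log-transforms upstream) shared by
-   * transcripts A and B with current means 0.2 and 0.1 is split in the
-   * E-step as
-   *   A: 90 * 0.2 / 0.3 = 60,   B: 90 * 0.1 / 0.3 = 30
-   * (taking kmerGroupBiases_ == 1).  The M-step then divides each
-   * transcript's total allocation by its effective length via _computeMean,
-   * and normalize_ rescales the means to sum to one.
-   */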
-
-
-public:
- /**
- * Construct the solver with the read and transcript hashes
- */
- CollapsedIterativeOptimizer( ReadHash &readHash, TranscriptGeneMap& transcriptGeneMap,
- BiasIndex& biasIndex, uint32_t numThreads ) :
-        numThreads_(numThreads), merLen_(readHash.kmerLength()),
-        readHash_(readHash), biasIndex_(biasIndex),
-        transcriptGeneMap_(transcriptGeneMap) {}
-
-
- KmerQuantity optimize(const std::string& klutfname,
- const std::string& tlutfname,
- const std::string& kmerEquivClassFname,
- size_t numIt,
- double minMean,
- double maxDelta) {
-
- kmerEquivClassFname_ = kmerEquivClassFname;
- const bool discardZeroCountKmers = true;
- initialize_(klutfname, tlutfname, kmerEquivClassFname, discardZeroCountKmers);
-
- KmerQuantity globalError {0.0};
- bool done {false};
- std::atomic<size_t> numJobs {0};
- std::atomic<size_t> completedJobs {0};
- std::vector<KmerID> kmerList( transcriptsForKmer_.size(), 0 );
- size_t idx = 0;
-
- tbb::task_scheduler_init tbb_init(numThreads_);
-
- std::cerr << "Computing initial coverage estimates ... ";
-
- std::vector<double> means0(transcripts_.size(), 0.0);
- std::vector<double> means1(transcripts_.size(), 0.0);
- std::vector<double> means2(transcripts_.size(), 0.0);
- std::vector<double> meansPrime(transcripts_.size(), 0.0);
-
- std::vector<double> r(transcripts_.size(), 0.0);
- std::vector<double> v(transcripts_.size(), 0.0);
- std::atomic<size_t> uniquelyAnchoredTranscripts{0};
- std::atomic<size_t> nonZeroTranscripts{0};
-
- loggedCounts_ = true;
- logKmerGroupCounts_();
-
- // Compute the initial mean for each transcript
- tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(transcriptGeneMap_.numTranscripts())),
- [this, &uniquelyAnchoredTranscripts, &nonZeroTranscripts, &means0](const BlockedIndexRange& range) -> void {
- for (auto tid = range.begin(); tid != range.end(); ++tid) {
- auto& transcriptData = this->transcripts_[tid];
- KmerQuantity total = 0.0;
- //transcriptData.weights.resize(transcriptData.binMers.size());
- //transcriptData.logLikes.resize(transcriptData.binMers.size());
-
- bool notTotallyPromiscuous{false};
- bool nonZero{false};
-
- for ( auto & kv : transcriptData.binMers ) {
- auto kmer = kv.first;
- if ( this->genePromiscuousKmers_.find(kmer) == this->genePromiscuousKmers_.end() ){
- // count is the number of times kmer appears in transcript (tid)
- auto logCount = std::log(kv.second);
- kv.second = logCount * std::log(this->kmerGroupCounts_[kmer]) * std::log(this->weight_(kmer));
- if (kv.second > 0.0 and kmerGroupPromiscuities_[kmer] == 1) { notTotallyPromiscuous = true; }
- if (kv.second > 0.0) { nonZero = true; }
- //transcriptData.weights[transcriptData.weightNum++] = kv.second;
- //total += kv.second;
- }
- }
- //transcriptData.totalWeight.store(total);
- //transcriptData.weightNum.store(0);
-
- transcriptData.mean = means0[tid] = this->_computeMean(transcriptData);
- //transcriptData.mean = means0[tid] = this->_computeClampedMean(transcriptData);
-
- //transcriptData.mean = notTotallyPromiscuous ? means0[tid] = this->_computeMean(transcriptData) : 0.0;
- if (notTotallyPromiscuous) { ++uniquelyAnchoredTranscripts; }
- if (nonZero) { ++nonZeroTranscripts; }
- //transcriptData.mean = means0[tid] = this->_computeWeightedMean(transcriptData);
- //transcriptData.mean = means0[tid] = 1.0 / this->transcripts_.size();//this->_computeMean(transcriptData);
- //transcriptData.mean = this->_computeWeightedMean(transcriptData);
- //transcriptData.mean = distribution(generator);
- //this->_computeWeightedMean( transcriptData );
- }
- }
- );
- normalizeTranscriptMeans_();
- normalize_(means0);
-
- std::cerr << "\nThere were " << uniquelyAnchoredTranscripts.load() << " uniquely anchored transcripts\n";
- std::cerr << "There were " << nonZeroTranscripts.load() << " transcripts with at least one overlapping k-mer\n";
- std::cerr << "done\n";
- size_t outerIterations = 1;
- /*
- for ( size_t iter = 0; iter < numIt; ++iter ) {
- std::cerr << "EM iteraton: " << iter << "\n";
- EMUpdate_(means0, means1);
- std::swap(means0, means1);
- }
- */
-
- /**
- * Defaults for these values taken from the R implementation of
- * [SQUAREM](http://cran.r-project.org/web/packages/SQUAREM/index.html).
- */
- double minStep0, minStep, maxStep0, maxStep, mStep, nonMonotonicity;
- minStep0 = 1.0; minStep = 1.0;
- maxStep0 = 1.0; maxStep = 1.0;
- mStep = 4.0;
- nonMonotonicity = 1.0;
-
- double negLogLikelihoodOld = std::numeric_limits<double>::infinity();
- double negLogLikelihoodNew = std::numeric_limits<double>::infinity();
-
- std::function<bool(std::vector<double>&, std::vector<double>&)> hasConverged;
- if (std::isfinite(maxDelta)) {
- hasConverged = [maxDelta, this] (std::vector<double>& v0, std::vector<double>& v1) -> bool {
- double minVal = 1e-7;
- auto relDiff = this->relAbsDiff_(v0, v1, minVal);
- double maxRelativeChange = *std::max_element( relDiff.begin(), relDiff.end() );
- std::cerr << "max relative change: " << maxRelativeChange << "\n";
- return maxRelativeChange < maxDelta;
- };
- } else {
- hasConverged = [] (std::vector<double>& v0, std::vector<double>& v1) -> bool {
- std::cerr << "no data-driven convergence criterion specified\n";
- return false;
- };
- }
-
- std::string clearline = " \r\r";
-
-      // Until we've reached the specified maximum number of iterations, or hit our
-      // tolerance threshold
- for ( size_t iter = 0; iter < numIt; ++iter ) {
- std::string jumpBack = "\x1b[A";
- std::cerr << clearline << "SQUAREM iteraton [" << iter << "]\n";
- jumpBack += "\x1b[A";
-
-
- // Theta_1 = EMUpdate(Theta_0)
- std::cerr << clearline << "1/3\n";
- EMUpdate_(means0, means1);
- jumpBack += "\x1b[A\x1b[A";
-
- if (!std::isfinite(negLogLikelihoodOld)) {
- negLogLikelihoodOld = -expectedLogLikelihood_(means0);
- }
-
- // Theta_2 = EMUpdate(Theta_1)
- std::cerr << clearline << "2/3\n";
- EMUpdate_(means1, means2);
- jumpBack += "\x1b[A\x1b[A";
-
- // Check for data-driven convergence criteria
- if (hasConverged(means1, means2)) {
- std::cerr << "convergence criteria met; terminating SQUAREM\n";
- break;
- }
-
- double delta = pabsdiff_(means1, means2);
- std::cerr << clearline << "delta = " << delta << "\n";
- jumpBack += "\x1b[A";
-
- // r = Theta_1 - Theta_0
- // v = (Theta_2 - Theta_1) - r
- tbb::parallel_for(BlockedIndexRange(size_t(0), transcripts_.size()),
- [&means0, &means1, &means2, &r, &v](const BlockedIndexRange& range) -> void {
- for (auto tid = range.begin(); tid != range.end(); ++tid) {
- r[tid] = means1[tid] - means0[tid];
- v[tid] = (means2[tid] - means1[tid]) - r[tid];
- }
- }
- );
-
- double rNorm = std::sqrt(dotProd_(r,r));
- double vNorm = std::sqrt(dotProd_(v,v));
- double alphaS = rNorm / vNorm;
-
- alphaS = std::max(minStep, std::min(maxStep, alphaS));
-
- tbb::parallel_for(BlockedIndexRange(size_t(0), transcripts_.size()),
- [&r, &v, alphaS, &means0, &meansPrime](const BlockedIndexRange& range) -> void {
- for (auto tid = range.begin(); tid != range.end(); ++tid) {
- meansPrime[tid] = std::max(0.0, means0[tid] + 2*alphaS*r[tid] + (alphaS*alphaS)*v[tid]);
- }
- }
- );
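-
-        // SQUAREM step recap (editorial note): with r = theta1 - theta0 and
-        // v = (theta2 - theta1) - r, the update
-        //   theta' = theta0 + 2*alphaS*r + alphaS^2 * v
-        // reduces exactly to theta2 (a plain double EM step) when alphaS = 1,
-        // and extrapolates further along the EM trajectory when alphaS > 1.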
-
- // Stabilization step
- if ( std::abs(alphaS - 1.0) > 0.01) {
- std::cerr << clearline << "alpha = " << alphaS << ". ";
- std::cerr << "Performing a stabilization step.\n";
- EMUpdate_(meansPrime, meansPrime);
- jumpBack += "\x1b[A\x1b[A";
- }
-
-        // Evaluate the likelihood at the accelerated point; if the step made
-        // things worse (beyond the allowed non-monotonicity), fall back below.
- if (std::isfinite(nonMonotonicity)) {
- negLogLikelihoodNew = -expectedLogLikelihood_(meansPrime);
- } else {
- negLogLikelihoodNew = negLogLikelihoodOld;
- }
-
- if (negLogLikelihoodNew > negLogLikelihoodOld + nonMonotonicity) {
- std::swap(meansPrime, means2);
- negLogLikelihoodNew = -expectedLogLikelihood_(meansPrime);
- if (alphaS == maxStep) { maxStep = std::max(maxStep0, maxStep/mStep); }
- alphaS = 1.0;
- }
- std::cerr << clearline << "alpha = " << alphaS << ", ";
-
- if (alphaS == maxStep) { maxStep = mStep * maxStep; }
- if (minStep < 0 and alphaS == minStep) { minStep = mStep * minStep; }
- std::swap(meansPrime, means0);
-
- if (!std::isnan(negLogLikelihoodNew)) {
- std::cerr << "negLogLikelihood = " << negLogLikelihoodNew << "\n";
- jumpBack += "\x1b[A";
- negLogLikelihoodOld = negLogLikelihoodNew;
-
- }
-
- bool doFilter{true};
- if ( doFilter and ((iter == numIt - 1) or (iter > 0 and iter % 10 == 0))) {
- double itFrac = iter / static_cast<double>(numIt);
-          // cutoff = e^{x^2 - 0.12}
- double cutoff = std::min(1.0, std::exp((itFrac*itFrac) - 0.12));//std::max(0.5, std::min(1.0, (iter + 10) / static_cast<double>(numIt)));
- std::cerr << clearline << "Iteration[" << iter << "] :: Filter cutoff = " << cutoff << "\n";
- jumpBack += "\x1b[A";
- filterByCoverage_(cutoff, means0);
- }
-
- // If it's not the final iteration, then jump back and clear the console.
- if (iter < numIt - 1) { std::cerr << jumpBack; }
-
-
- /** Filter out super low-abundance transcripts **/
- /* double epsilon = 10e-8;
- tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(transcripts_.size())),
- [&means0, epsilon, this](const BlockedIndexRange& range) -> void {
- for (auto tid : boost::irange(range.begin(), range.end())) {
- if (means0[tid] < epsilon) {
- means0[tid] = 0.0;
- this->transcripts_[tid].mean = 0.0;
- }
- }
- });
- */
-
- // if (iter > 0 and iter % 5 == 0 and iter < numIt - 1) {
- // std::cerr << "updating kmer biases: . . . ";
- // tbb::parallel_for( BlockedIndexRange(size_t(0), size_t(transcripts_.size())),
- // [&means0, this](const BlockedIndexRange& range) -> void {
- // for (auto tid : boost::irange(range.begin(), range.end())) {
- // this->transcripts_[tid].mean = means0[tid];
- // }
- // });
-
- // computeKmerFidelities_();
- // std::cerr << "done\n";
- // }
-
-
- }
-
-
- /*
- for ( size_t oiter = 0; oiter < outerIterations; ++oiter ) {
- for ( size_t iter = 0; iter < numIt; ++iter ) {
-
- auto reqNumJobs = transcriptsForKmer_.size();
- std::cerr << "iteraton: " << iter << "\n";
-
- globalError = 0.0;
- numJobs = 0;
- completedJobs = 0;
-
- // Print out our progress
- auto pbthread = std::thread(
- [&completedJobs, reqNumJobs]() -> bool {
- auto prevNumJobs = 0;
- ez::ezETAProgressBar show_progress(reqNumJobs);
- show_progress.start();
- while ( prevNumJobs < reqNumJobs ) {
- if ( prevNumJobs < completedJobs ) {
- show_progress += completedJobs - prevNumJobs;
- }
- prevNumJobs = completedJobs.load();
- boost::this_thread::sleep_for(boost::chrono::seconds(1));
- }
- return true;
- });
-
- // E-Step : reassign the kmer group counts proportionally to each transcript
- tbb::parallel_for( size_t(0), size_t(transcriptsForKmer_.size()),
- // for each kmer group
- [&kmerList, &completedJobs, this]( size_t kid ) {
- auto kmer = kid;
- if ( this->genePromiscuousKmers_.find(kmer) == this->genePromiscuousKmers_.end() ){
-
- // for each transcript containing this kmer group
- auto &transcripts = this->transcriptsForKmer_[kmer];
- if ( transcripts.size() > 0 ) {
-
- double totalMass = 0.0;
- for ( auto tid : transcripts ) {
- totalMass += this->transcripts_[tid].mean;
- }
-
- if ( totalMass > 0.0 ) {
- double norm = 1.0 / totalMass;
- for ( auto tid : transcripts ) {
- if ( this->transcripts_[tid].mean > 0.0 ) {
- this->transcripts_[tid].binMers[kmer] =
- this->transcripts_[tid].mean * norm * kmerGroupBiases_[kmer] * this->kmerGroupCounts_[kmer];
- }
- }
- }
-
- }
- }
- ++completedJobs;
- });
-
- // wait for all kmer groups to be processed
- pbthread.join();
-
- // reset the job counter
- completedJobs = 0;
-
- double delta = 0.0;
- double norm = 1.0 / transcripts_.size();
-
- std::vector<KmerQuantity> prevMeans( transcripts_.size(), 0.0 );
- tbb::parallel_for( size_t(0), size_t(transcripts_.size()),
- [this, &prevMeans]( size_t tid ) -> void { prevMeans[tid] = this->transcripts_[tid].mean; });
-
- std::cerr << "\ncomputing new means ... ";
- size_t discard = 0;
-
- // M-Step
- // compute the new mean for each transcript
- tbb::parallel_for( size_t(0), size_t(transcripts_.size()),
- [this, iter, numIt, norm, minMean, &discard]( size_t tid ) -> void {
- auto& ts = this->transcripts_[tid];
- auto tsNorm = 1.0;//(ts.effectiveLength > 0.0) ? 1.0 / std::sqrt(ts.effectiveLength) : 1.0;
- //ts.mean = tsNorm * this->_computeWeightedMean( ts );
- //ts->mean = tsNorm * this->averageCount( ts );
- ts.mean = tsNorm * this->_computeMean( ts );
- //ts.mean = tsNorm * this->_computeMedian( ts );
- });
-
- normalizeTranscriptMeans_();
- for( auto tid : boost::irange(size_t{0}, prevMeans.size()) ){
- delta += std::abs( transcripts_[tid].mean - prevMeans[tid] );
- }
-
- std::cerr << "done\n";
- std::cerr << "total variation in mean = " << delta << "\n";
- std::cerr << "discarded " << discard << " transcripts in this round whose mean was below " << minMean << "\n";
- }
-
- std::cerr << "end of outer iteration " << oiter << " recomputing biases\n";
- // Thresholding
- tbb::parallel_for( size_t(0), size_t(transcripts_.size()),
- [this, minMean](size_t tid) -> void {
- auto& ts = this->transcripts_[tid];
- if (ts.mean < minMean) {
- ts.mean = 0.0;
- for (auto& kv : ts.binMers) {
- kv.second = 0.0;
- }
- }
- });
- //computeKmerFidelities_();
- }
- */
-
- }
-
-
- void writeAbundances(const boost::filesystem::path& outputFilePath,
- const std::string& headerLines,
- double minAbundance,
- bool haveCI) {
-
-    haveCI = false;  // confidence-interval output is currently disabled
-
- auto writeCoverageInfo = true;
- if ( writeCoverageInfo ) {
- boost::filesystem::path covPath = outputFilePath.parent_path();
- covPath /= "equivClassCoverage.txt";
- _dumpCoverage( covPath );
- }
-
- std::cerr << "Writing output\n";
- ez::ezETAProgressBar pb(transcripts_.size());
- pb.start();
-
- auto estimatedGroupTotal = psum_(kmerGroupCounts_);
- auto totalNumKmers = readHash_.totalLength() * (std::ceil(readHash_.averageLength()) - readHash_.kmerLength() + 1);
- std::vector<KmerQuantity> fracTran(transcripts_.size(), 0.0);
- std::vector<KmerQuantity> fracTranLow(transcripts_.size(), 0.0);
- std::vector<KmerQuantity> fracTranHigh(transcripts_.size(), 0.0);
-
- std::vector<KmerQuantity> numKmersFromTranscript(transcripts_.size(), 0.0);
-
- // Compute nucleotide fraction (\nu_i in RSEM)
- tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(transcripts_.size())),
- [this, &numKmersFromTranscript, &fracTran, totalNumKmers, estimatedGroupTotal](const BlockedIndexRange& range) -> void {
- for (auto tid = range.begin(); tid != range.end(); ++tid) {
- auto& ts = this->transcripts_[tid];
- //fracTran[tid] = this->_computeMean(ts);
-
- // EM
- numKmersFromTranscript[tid] = this->_computeSum(ts);
-
- // VB
- //numKmersFromTranscript[tid] = ts.mean * estimatedGroupTotal;//this->_computeSum(ts);
- //fracTran[tid] = this->_computeClampedMean( ts );
- }
- });
- // noise transcript
- // fracTran[transcripts_.size()] = totalNumKmers - estimatedGroupTotal;
-
- // Normalize to obtain the fraction of kmers that originated from each transcript
- normalize_(numKmersFromTranscript);
- auto& fracNuc = numKmersFromTranscript;
-
-
- // Compute transcript fraction (\tau_i in RSEM)
- tbb::parallel_for(BlockedIndexRange(size_t(0), transcripts_.size()),
- [&, this](const BlockedIndexRange& range) -> void {
- for (auto i = range.begin(); i != range.end(); ++i) {
- auto& ts = transcripts_[i];
-
- fracTran[i] = this->_computeMean(ts);
-
- //fracTran[i] = (ts.effectiveLength > 0) ? ts.mean / ts.effectiveLength : 0.0;
- fracTranLow[i] = (ts.effectiveLength > 0) ? ts.fracLow / ts.effectiveLength : 0.0;
- fracTranHigh[i] = (ts.effectiveLength > 0) ? ts.fracHigh / ts.effectiveLength : 0.0;
-
- }
- });
- //normalize_(fracTran);
-
- auto fracNorm = 1.0 / psum_(fracTran);
- tbb::parallel_for(BlockedIndexRange(size_t(0), transcripts_.size()),
- [&, fracNorm](const BlockedIndexRange& range) -> void {
- for (auto i = range.begin(); i != range.end(); ++i) {
- auto& ts = transcripts_[i];
- fracTran[i] *= fracNorm;
- fracTranLow[i] *= fracNorm;
- fracTranHigh[i] *= fracNorm;
- }
- });
-
-
- std::ofstream ofile( outputFilePath.string() );
- size_t index = 0;
- double million = std::pow(10.0, 6);
- double billion = std::pow(10.0, 9);
- double estimatedReadLength = readHash_.averageLength();
- double kmersPerRead = ((estimatedReadLength - merLen_) + 1);
- double totalMappedKmers = std::accumulate(numKmersFromTranscript.begin(),
- numKmersFromTranscript.end(),
- 0.0);
- double totalMappedReads = totalMappedKmers / kmersPerRead;
-
- ofile << headerLines;
-
- ofile << "# " <<
- "Transcript" << '\t' <<
- "Length" << '\t' <<
- "TPM" << '\t' <<
- "RPKM" << '\t' <<
- "KPKM" << '\t' <<
- "EstimatedNumReads";
-
- if (haveCI) {
- ofile << '\t' << "TPM_LOW" << '\t' << "TPM_HIGH";
- }
-
- ofile << '\n';
-
- for ( auto i : boost::irange(size_t{0}, transcripts_.size()) ) {
- auto& ts = transcripts_[i];
- // expected # of kmers coming from transcript i
-
- // auto ci = estimatedGroupTotal * fracNuc[i];
- auto ci = numKmersFromTranscript[i];
-
- // expected # of reads coming from transcript i
- auto ri = ci / kmersPerRead;
- double effectiveLength = ts.length - std::floor(estimatedReadLength) + 1;
- double effectiveLengthKmer = ts.length - merLen_ + 1;
-
- auto kpkm = (effectiveLengthKmer > 0) ?
- (ci * billion) / (effectiveLengthKmer * totalMappedKmers) : 0.0;
- kpkm = (kpkm < minAbundance) ? 0.0 : kpkm;
-
- auto rpkm = (effectiveLength > 0) ?
- (ri * billion) / (effectiveLength * totalMappedReads) : 0.0;
- rpkm = (kpkm < minAbundance) ? 0.0 : rpkm;
-
- // PREVIOUS
- // auto ci = estimatedGroupTotal * fracNuc[i];
- /*
- auto kpkm = (effectiveLengthKmer > 0) ?
- (billion * (fracNuc[i] / ts.effectiveLength)) : 0.0;
- kpkm = (kpkm < minAbundance) ? 0.0 : kpkm;
- auto rpkm = (effectiveLength > 0) ?
- (billion * (fracNuc[i] / ts.effectiveLength)) : 0.0;
- rpkm = (kpkm < minAbundance) ? 0.0 : rpkm;
-
- */
-
-
- auto tpm = fracTran[i] * million;
- tpm = (kpkm < minAbundance) ? 0.0 : tpm;
-
- ofile << transcriptGeneMap_.transcriptName(index) <<
- '\t' << ts.length << '\t' <<
- tpm << '\t' <<
- rpkm << '\t' <<
- kpkm << '\t' <<
- ri;
- if (haveCI) {
- ofile << '\t' <<
- fracTranLow[i] * million << '\t' <<
- fracTranHigh[i] * million;
- }
- ofile << '\n';
-
- ++index;
- ++pb;
- }
- ofile.close();
- }
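For reference, the arithmetic in the output loop above corresponds to the usual abundance metrics. With c_i the expected number of k-mers from transcript i, r_i = c_i / kmersPerRead, effective lengths as computed per transcript, C the total mapped k-mers, and R = C / kmersPerRead the total mapped reads:

    \mathrm{KPKM}_i = \frac{10^9 \, c_i}{\tilde{\ell}^{(k)}_i \, C}, \qquad
    \mathrm{RPKM}_i = \frac{10^9 \, r_i}{\tilde{\ell}_i \, R}, \qquad
    \mathrm{TPM}_i  = 10^6 \, \tau_i

where tau_i is the normalized transcript fraction (fracTran). All three are zeroed when KPKM falls below minAbundance.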
-
-
- /**
- * Instead of using the EM (SQUAREM) algorithm, infer the posterior distribution
- * using Streaming, Distributed, Asynchronous (SDA) variational Bayes
- */
- KmerQuantity optimizeVB(const std::string& klutfname,
- const std::string& tlutfname,
- const std::string& kmerEquivClassFname,
- size_t numIt,
- double minMean,
- double maxDelta) {
-
-
- kmerEquivClassFname_ = kmerEquivClassFname;
- // Prepare the necessary structures
- const bool discardZeroCountKmers = false;
- initialize_(klutfname, tlutfname, kmerEquivClassFname, discardZeroCountKmers);
-
- const size_t numTranscripts = transcripts_.size();
- const size_t numKmers = transcriptsForKmer_.size();
-
- // Set the appropriate, user-specified, convergence criteria
- std::function<bool(std::vector<double>&, std::vector<double>&)> hasConverged;
- if (std::isfinite(maxDelta)) {
- hasConverged = [maxDelta, this] (std::vector<double>& v0, std::vector<double>& v1) -> bool {
- double maxVal = *std::max_element(v1.begin(), v1.end());
- std::cerr << "maxVal: " << maxVal << "\n";
- double minVal = 1e-7;
- auto relDiff = this->relAbsDiff_(v0, v1, minVal);
- double maxRelativeChange = *std::max_element( relDiff.begin(), relDiff.end() );
- std::cerr << "max relative change: " << maxRelativeChange << "\n";
- return maxRelativeChange < maxDelta;
- };
- } else {
- hasConverged = [] (std::vector<double>& v0, std::vector<double>& v1) -> bool {
- std::cerr << "no data-driven convergence criterion specified\n";
- return false;
- };
- }
-
-
- // We'll use these to check convergence
- std::vector<double> meansOld(transcripts_.size(), 0.0);
- std::vector<double> meansNew(transcripts_.size(), 0.0);
-
-
- std::atomic<size_t> uniquelyAnchoredTranscripts{0};
- std::atomic<size_t> nonZeroTranscripts{0};
-
- const double DirichletPriorAlpha = 0.1;
- std::vector<double> priorAlphas(transcripts_.size(), 0.0);
- std::vector<double> posteriorAlphas(transcripts_.size(), 0.0);
- /*
- std::vector<double> kmerWeights(numKmers, 0.0);
- for (auto tid : boost::irange(size_t{0}, numTranscripts)) {
- auto& transcriptData = transcripts_[tid];
- for ( auto & kv : transcriptData.binMers ) {
- kmerWeights[kv.first] += kv.second * weight_(kv.first);
- }
- }
- for (auto kw : kmerWeights) {
- if (std::abs(kw - 1.0) > 1e-3) {
- std::cerr << "Kmer had weight " << kw << "\n";
- }
- }
- */
-
- // Compute the initial mean for each transcript
- tbb::parallel_for(BlockedIndexRange(size_t(0), numTranscripts),
- [this, DirichletPriorAlpha, &uniquelyAnchoredTranscripts, &nonZeroTranscripts, &meansOld, &posteriorAlphas](const BlockedIndexRange& range) -> void {
- for (auto tid = range.begin(); tid != range.end(); ++tid) {
- auto& transcriptData = this->transcripts_[tid];
- KmerQuantity total = 0.0;
-
- bool notTotallyPromiscuous{false};
- bool nonZero{false};
-
- auto effLength = transcriptData.effectiveLength;
- for ( auto & kv : transcriptData.binMers ) {
- auto kmer = kv.first;
-
- // count is the number of occurrences of the k-mer in this transcript
- auto count = kv.second;
-
- // weight(kmer) = 1 / total # occurrences
- // count = # occurrences in this transcript
- // kmerGroupCounts(kmer) = # of observations of kmer in the read set
- // The weight attributed to this transcript (kv.second) is:
- // count * weight(kmer) * kmerGroupCounts(kmer) = (# occurrences in ts / total # occurrences) * total count
- kv.second = (effLength > 0.0) ?
- (count * this->kmerGroupCounts_[kmer] * this->weight_(kmer)) :
- 0.0;
-
- if (kv.second > 0.0 and kmerGroupPromiscuities_[kmer] == 1) { notTotallyPromiscuous = true; }
- if (kv.second > 0.0) { nonZero = true; }
- }
-
- //transcriptData.mean = this->_computeMean(transcriptData);
- auto weightedKmerSum = this->_computeSum(transcriptData);
- // Set \alpha_t = \alpha_0 + \mu_t
- posteriorAlphas[tid] = DirichletPriorAlpha; //+ weightedKmerSum;
- if (notTotallyPromiscuous) { ++uniquelyAnchoredTranscripts; }
- if (nonZero) { ++nonZeroTranscripts; }
- }
- }
- );
-
- auto posteriorAlphaSum = psum_(posteriorAlphas);
- std::cerr << "posteriorAlphaSum : " << posteriorAlphaSum << " [before iterations]\n";
- for (size_t i : boost::irange(size_t{0}, numTranscripts)) {
- meansOld[i] = posteriorAlphas[i] / posteriorAlphaSum;
- }
-
- std::vector<double> expctedLogThetas(transcripts_.size(), 0.0);
- std::vector<double> logRho(transcripts_.size(), -std::numeric_limits<double>::infinity());
- std::string clearline = " \r\r";
-
-
- for (size_t currIt : boost::irange(size_t{0}, numIt)) {
- std::string jumpBack = "\x1b[A";
- std::cerr << clearline << "iteration : " << currIt << "\n";
- // log rho_{ntsoa} = E_{theta}[log theta_t] + log P(S_n | T_n) + [other terms sum to 0]
- // E_{theta}[log theta_t] = digamma(alpha_t) - digamma(\sum_{t'} alpha_{t'})
- double sumAlpha = psum_(posteriorAlphas);
- double digammaSumAlpha = boost::math::digamma(sumAlpha);
- for (size_t i : boost::irange(size_t{0}, numTranscripts)) {
- auto& ts = transcripts_[i];
- //double digammaAlphaT = boost::math::digamma(posteriorAlphas[i]);
-
- logRho[i] = (ts.effectiveLength > 0 and posteriorAlphas[i] > 0.0) ?
- (boost::math::digamma(posteriorAlphas[i]) - digammaSumAlpha) + std::log(1.0 / ts.effectiveLength) :
- -std::numeric_limits<double>::infinity();
- }
-
- std::atomic<size_t> numUpdated{0};
-
- // E-Step : reassign the kmer group counts proportionally to each transcript
- tbb::parallel_for(BlockedIndexRange(size_t(0), numKmers),
- // for each kmer group
- [&meansOld, &meansNew, &logRho, &posteriorAlphas, &numUpdated, DirichletPriorAlpha, this](const BlockedIndexRange& range) -> void {
- for (auto kid : boost::irange(range.begin(), range.end())) {
- auto kmer = kid;
- // for each transcript containing this kmer group
- auto& transcripts = this->transcriptsForKmer_[kmer];
-
- double totalMass = 0.0;
- for ( auto tid : transcripts ) {
- auto el = this->transcripts_[tid].effectiveLength;
- double rho = (el > 0) ? std::exp(logRho[tid]) : 0.0;
- totalMass += rho;
- }
-
- double norm = (totalMass > 0.0) ? (1.0 / totalMass) : 0.0;
- for ( auto tid : transcripts ) {
- auto& trans = this->transcripts_[tid];
- auto lastIndex = trans.binMers.size() - 1;
- auto el = trans.effectiveLength;
- double rho = (el > 0) ? std::exp(logRho[tid]) : 0.0;
- trans.binMers[kmer] = rho * norm *
- kmerGroupBiases_[kmer] * this->kmerGroupCounts_[kmer];
-
- // If we've seen all of the k-mers that appear in this transcript,
- // then we can compute its new estimated abundance
- if (trans.updated++ == lastIndex) {
- ++numUpdated;
-
- // We're folding the length term into the binMer weights now
- // should this computeSum be a computeMean?
- //trans.mean = this->_computeSum(trans);//this->_computeMean(trans);
- auto sumWeightedReadMass = this->_computeSum(trans);
-
- // with filter
- // posteriorAlphas[tid] = DirichletPriorAlpha + sumWeightedReadMass;
- posteriorAlphas[tid] = (posteriorAlphas[tid] > 0.0) ? DirichletPriorAlpha + sumWeightedReadMass : 0.0;
-
- // without filter
- //posteriorAlphas[tid] = DirichletPriorAlpha + trans.mean;
- if (std::isnan(posteriorAlphas[tid])) {
- std::cerr << "posterior is nan at " << tid << "\n";
- std::cerr << "mean: " << trans.mean << "\n";
- std::cerr << "norm: " << norm << "\n";
- }
- trans.updated.store(0);
- }
- }
- } // for kid in range
- });
-
- std::cerr << clearline << "numUpdated = " << numUpdated << " : numTranscripts = " << numTranscripts << "\n";
- jumpBack += "\x1b[A";
-
- auto posteriorAlphaSum = psum_(posteriorAlphas);
- std::cerr << clearline << "posterior alpha sum: " << posteriorAlphaSum << "\n";
- jumpBack += "\x1b[A";
- for (size_t i : boost::irange(size_t{0}, numTranscripts)) {
- meansNew[i] = posteriorAlphas[i] / posteriorAlphaSum;
- transcripts_[i].mean = meansNew[i];
- }
-
- // Check for data-driven convergence criteria
- if (hasConverged(meansOld, meansNew)) {
- std::cerr << clearline << "convergence criteria met; terminating VB\n";
- break;
- }
- jumpBack += "\x1b[A";
-
- std::swap(meansNew, meansOld);
-
- bool doFilter{false};
- double itFrac = currIt / static_cast<double>(numIt);
- double cutoff = std::min(1.0, std::exp(itFrac - 0.3));
- if (doFilter and currIt > 0 and currIt % 100 == 0) {
- // cutoff = min(1, e^{itFrac - 0.3})
- filterByCoverage_(cutoff, posteriorAlphas);
- }
-
- std::cerr << clearline << "cutoff = " << cutoff << "\n";
- jumpBack += "\x1b[A";
- std::cerr << jumpBack;
- }
-
- // Compute the confidence intervals for the expression values
- // The marginal for each transcript fraction takes the form of a Beta distribution
- // we compute the confidence intervals by the appropriate quantiles of the distribution
- double sumOfAlphas = psum_(posteriorAlphas);
- tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(transcriptGeneMap_.numTranscripts())),
- [this, &posteriorAlphas, sumOfAlphas](const BlockedIndexRange& range) -> void {
- for (auto tid = range.begin(); tid != range.end(); ++tid) {
- auto& transcriptData = this->transcripts_[tid];
- auto alphaT = posteriorAlphas[tid];
- if (alphaT > 0) {
- boost::math::beta_distribution<double> marginalDist(alphaT, sumOfAlphas - alphaT);
- transcriptData.mean = boost::math::mean(marginalDist);
- transcriptData.fracLow = boost::math::quantile(marginalDist, 0.025);
- transcriptData.fracHigh = boost::math::quantile(marginalDist, 0.975);
- } else {
- transcriptData.mean = transcriptData.fracLow = transcriptData.fracHigh = 0.0;
- }
- }
- });
-
- }
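In summary, each VB iteration above performs the standard mean-field updates for a Dirichlet-multinomial mixture. With alpha_0 = DirichletPriorAlpha and effective length l_t:

    \mathbb{E}_q[\log \theta_t] = \psi(\alpha_t) - \psi\Bigl(\sum_{t'} \alpha_{t'}\Bigr), \qquad
    \rho_t \propto \exp\bigl(\mathbb{E}_q[\log \theta_t]\bigr) / \tilde{\ell}_t, \qquad
    \alpha_t \leftarrow \alpha_0 + \sum_g \hat{n}_{t,g}

and the marginal for each transcript fraction is Beta(alpha_t, sum_t' alpha_t' - alpha_t), whose 2.5% and 97.5% quantiles give the reported interval (this is the computation in the final parallel_for).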
-
-};
-
-#endif // COLLAPSED_ITERATIVE_OPTIMIZER_HPP
diff --git a/src/SalmonQuantify.cpp.bak b/src/SalmonQuantify.cpp.bak
deleted file mode 100644
index 0c2acd3..0000000
--- a/src/SalmonQuantify.cpp.bak
+++ /dev/null
@@ -1,1314 +0,0 @@
-/**
->HEADER
- Copyright (c) 2013 Rob Patro robp at cs.cmu.edu
-
- This file is part of Salmon.
-
- Salmon is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- Salmon is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with Salmon. If not, see <http://www.gnu.org/licenses/>.
-<HEADER
-**/
-
-
-#include <algorithm>
-#include <atomic>
-#include <cassert>
-#include <cmath>
-#include <unordered_map>
-#include <map>
-#include <vector>
-#include <unordered_set>
-#include <mutex>
-#include <thread>
-#include <sstream>
-#include <exception>
-#include <random>
-#include <queue>
-#include "btree_map.h"
-#include "btree_set.h"
-
-
-/** BWA Includes */
-#include <cstdio>
-#include <unistd.h>
-#include <cstdlib>
-#include <cstring>
-#include <cctype>
-
-extern "C" {
-#include "bwa.h"
-#include "bwamem.h"
-#include "kvec.h"
-#include "utils.h"
-#include "kseq.h"
-#include "utils.h"
-}
-
-/** Boost Includes */
-#include <boost/filesystem.hpp>
-#include <boost/container/flat_map.hpp>
-#include <boost/dynamic_bitset/dynamic_bitset.hpp>
-#include <boost/range/irange.hpp>
-#include <boost/program_options.hpp>
-#include <boost/shared_ptr.hpp>
-#include <boost/scoped_ptr.hpp>
-#include <boost/lockfree/queue.hpp>
-#include <boost/thread/thread.hpp>
-
-#include "tbb/concurrent_unordered_set.h"
-#include "tbb/concurrent_vector.h"
-#include "tbb/concurrent_unordered_map.h"
-#include "tbb/concurrent_queue.h"
-#include "tbb/parallel_for.h"
-#include "tbb/parallel_for_each.h"
-#include "tbb/parallel_reduce.h"
-#include "tbb/blocked_range.h"
-#include "tbb/task_scheduler_init.h"
-#include "tbb/partitioner.h"
-
-#if HAVE_LOGGER
-#include "g2logworker.h"
-#include "g2log.h"
-#endif
-
-#include "cereal/types/vector.hpp"
-#include "cereal/archives/binary.hpp"
-
-#include "jellyfish/mer_dna.hpp"
-
-#include "ClusterForest.hpp"
-#include "PerfectHashIndex.hpp"
-#include "LookUpTableUtils.hpp"
-#include "SalmonMath.hpp"
-#include "Transcript.hpp"
-#include "LibraryFormat.hpp"
-#include "SailfishUtils.hpp"
-#include "ReadLibrary.hpp"
-
-#include "PairSequenceParser.hpp"
-
-//#include "google/dense_hash_map"
-
-extern unsigned char nst_nt4_table[256];
-char* bwa_pg = const_cast<char*>("cha");
-
-typedef pair_sequence_parser<char**> sequence_parser;
-
-using TranscriptID = uint32_t;
-using TranscriptIDVector = std::vector<TranscriptID>;
-using KmerIDMap = std::vector<TranscriptIDVector>;
-using my_mer = jellyfish::mer_dna_ns::mer_base_static<uint64_t, 1>;
-
-uint32_t transcript(uint64_t enc) {
- uint32_t t = (enc & 0xFFFFFFFF00000000) >> 32;
- return t;
-}
-
-uint32_t offset(uint64_t enc) {
- uint32_t o = enc & 0xFFFFFFFF;
- return o;
-}
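These two helpers decode a packed k-mer location: the upper 32 bits hold the transcript ID and the lower 32 bits the offset within it. A minimal sketch of the matching encoder (hypothetical; the real encoder lives in the index-building code):

    #include <cstdint>

    // Hypothetical inverse of transcript()/offset() above.
    uint64_t encodeLocation(uint32_t t, uint32_t o) {
        return (static_cast<uint64_t>(t) << 32) | o;
    }
    // Round trip: transcript(encodeLocation(7, 42)) == 7 and
    // offset(encodeLocation(7, 42)) == 42.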
-
-class Alignment {
- public:
- Alignment(TranscriptID transcriptIDIn, uint32_t kCountIn = 1, double logProbIn = salmon::math::LOG_0) :
- transcriptID_(transcriptIDIn), kmerCount(kCountIn), logProb(logProbIn) {}
-
- inline TranscriptID transcriptID() { return transcriptID_; }
- uint32_t kmerCount;
- double logProb;
-
- private:
- TranscriptID transcriptID_;
-};
-
-void processMiniBatch(
- double logForgettingMass,
- std::vector<std::vector<Alignment>>& batchHits,
- std::vector<Transcript>& transcripts,
- ClusterForest& clusterForest
- ) {
-
- using salmon::math::LOG_0;
- using salmon::math::logAdd;
- using salmon::math::logSub;
-
- size_t numTranscripts{transcripts.size()};
-
- bool burnedIn{true};
- // Build reverse map from transcriptID => hit id
- using HitID = uint32_t;
- btree::btree_map<TranscriptID, std::vector<Alignment*>> hitsForTranscript;
- size_t hitID{0};
- for (auto& hv : batchHits) {
- for (auto& tid : hv) {
- hitsForTranscript[tid.transcriptID()].push_back(&tid);
- }
- ++hitID;
- }
-
- double clustTotal = std::log(batchHits.size()) + logForgettingMass;
- {
- // E-step
-
- // Iterate over each group of alignments (a group consists of all alignments reported
- // for a single read). Distribute the read's mass proportionally among the
- // current hits.
- for (auto& alnGroup : batchHits) {
- if (alnGroup.size() == 0) { continue; }
- double sumOfAlignProbs{LOG_0};
- // update the cluster-level properties
- bool transcriptUnique{true};
- auto firstTranscriptID = alnGroup.front().transcriptID();
- std::unordered_set<size_t> observedTranscripts;
- for (auto& aln : alnGroup) {
- auto transcriptID = aln.transcriptID();
- auto& transcript = transcripts[transcriptID];
- transcriptUnique = transcriptUnique and (transcriptID == firstTranscriptID);
-
- double refLength = transcript.RefLength > 0 ? transcript.RefLength : 1.0;
-
- double logFragProb = salmon::math::LOG_1;
-
- // The alignment probability is the product of a transcript-level term (based on abundance) and an
- // alignment-level term, which below is P(Q_1) * P(Q_2) * P(F | T)
- double logRefLength = std::log(refLength);
- double transcriptLogCount = transcript.mass();
- if ( transcriptLogCount != LOG_0 ) {
- double errLike = salmon::math::LOG_1;
- if (burnedIn) {
- //errLike = errMod.logLikelihood(aln, transcript);
- }
-
- aln.logProb = std::log(std::pow(aln.kmerCount,2.0)) + (transcriptLogCount - logRefLength);// + qualProb + errLike;
- //aln.logProb = (transcriptLogCount - logRefLength);// + qualProb + errLike;
-
-
- sumOfAlignProbs = logAdd(sumOfAlignProbs, aln.logProb);
- if (observedTranscripts.find(transcriptID) == observedTranscripts.end()) {
- transcripts[transcriptID].addTotalCount(1);
- observedTranscripts.insert(transcriptID);
- }
- } else {
- aln.logProb = LOG_0;
- }
- }
-
- // normalize the hits
- if (sumOfAlignProbs == LOG_0) { std::cerr << "0 probability fragment; skipping\n"; continue; }
- for (auto& aln : alnGroup) {
- aln.logProb -= sumOfAlignProbs;
- auto transcriptID = aln.transcriptID();
- auto& transcript = transcripts[transcriptID];
- /*
- double r = uni(eng);
- if (!burnedIn and r < std::exp(aln.logProb)) {
- errMod.update(aln, transcript, aln.logProb, logForgettingMass);
- if (aln.fragType() == ReadType::PAIRED_END) {
- double fragLength = aln.fragLen();//std::abs(aln.read1->core.pos - aln.read2->core.pos) + aln.read2->core.l_qseq;
- fragLengthDist.addVal(fragLength, logForgettingMass);
- }
- }
- */
-
- } // end normalize
-
- // update the single target transcript
- if (transcriptUnique) {
- transcripts[firstTranscriptID].addUniqueCount(1);
- clusterForest.updateCluster(firstTranscriptID, 1, logForgettingMass);
- } else { // or the appropriate clusters
- clusterForest.mergeClusters<Alignment>(alnGroup.begin(), alnGroup.end());
- clusterForest.updateCluster(alnGroup.front().transcriptID(), 1, logForgettingMass);
- }
-
- } // end read group
- }// end timer
-
- double individualTotal = LOG_0;
- {
- // M-step
- double totalMass{0.0};
- for (auto kv = hitsForTranscript.begin(); kv != hitsForTranscript.end(); ++kv) {
- auto transcriptID = kv->first;
- // The target must be a valid transcript
- if (transcriptID >= numTranscripts) { std::cerr << "index " << transcriptID << " out of bounds\n"; }
-
- auto& transcript = transcripts[transcriptID];
-
- // The prior probability
- double hitMass{LOG_0};
-
- // The set of alignments that match transcriptID
- auto& hits = kv->second;
- std::for_each(hits.begin(), hits.end(), [&](Alignment* aln) -> void {
- if (!std::isfinite(aln->logProb)) { std::cerr << "hitMass = " << aln->logProb << "\n"; }
- hitMass = logAdd(hitMass, aln->logProb);
- });
-
- double updateMass = logForgettingMass + hitMass;
- individualTotal = logAdd(individualTotal, updateMass);
- totalMass = logAdd(totalMass, updateMass);
- transcript.addMass(updateMass);
- } // end for
- } // end timer
- //if (processedReads >= 5000000 and !burnedIn) { burnedIn = true; }
-}
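Put compactly, processMiniBatch performs one online-EM step in log space. For a read r hitting transcript t with k_{r,t} shared k-mers, transcript log-mass m_t, and reference length l_t:

    \log p(t \mid r) = 2\log k_{r,t} + m_t - \log \ell_t \quad (\text{normalized with logAdd over } r\text{'s hits})
    m_t \leftarrow \operatorname{logAdd}\bigl(m_t,\ \log F + \operatorname{logAdd}_{r \ni t} \log p(t \mid r)\bigr)

where log F is the batch's forgetting mass passed in as logForgettingMass.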
-
-uint32_t basesCovered(std::vector<uint32_t>& kmerHits) {
- std::sort(kmerHits.begin(), kmerHits.end());
- uint32_t covered{0};
- uint32_t lastHit{0};
- uint32_t kl{20};
- for (auto h : kmerHits) {
- covered += std::min(h - lastHit, kl);
- lastHit = h;
- }
- return covered;
-}
-
-uint32_t basesCovered(std::vector<uint32_t>& posLeft, std::vector<uint32_t>& posRight) {
- return basesCovered(posLeft) + basesCovered(posRight);
-}
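A small worked example of the coverage heuristic above (note the hard-coded k-mer length kl = 20, and that the function sorts its argument in place):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    void exampleBasesCovered() {
        std::vector<uint32_t> hits{30, 0, 5};
        // sorted: 0, 5, 30 -> min(0,20) + min(5,20) + min(25,20) = 25
        assert(basesCovered(hits) == 25);
    }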
-
-class KmerVote {
- public:
- KmerVote(uint32_t vp, uint32_t rp, uint32_t vl) : votePos(vp), readPos(rp), voteLen(vl) {}
- uint32_t votePos{0};
- uint32_t readPos{0};
- uint32_t voteLen{0};
-};
-
-
-class TranscriptHitList {
- public:
- uint32_t bestHitPos{0};
- uint32_t bestHitScore{0};
- std::vector<KmerVote> votes;
-
- void addVote(uint32_t tpos, uint32_t readPos, uint32_t voteLen) {
- uint32_t transcriptPos = (readPos > tpos) ? 0 : tpos - readPos;
- votes.emplace_back(transcriptPos, readPos, voteLen);
- }
-
- void addVoteRC(uint32_t tpos, uint32_t readPos, uint32_t voteLen) {
- uint32_t transcriptPos = (readPos > tpos) ? 0 : tpos + readPos;
- votes.emplace_back(transcriptPos, readPos, voteLen);
- }
-
-
- uint32_t totalNumHits() { return votes.size(); }
-
- bool computeBestHit() {
- std::sort(votes.begin(), votes.end(),
- [](const KmerVote& v1, const KmerVote& v2) -> bool {
- if (v1.votePos == v2.votePos) {
- return v1.readPos < v2.readPos;
- }
- return v1.votePos < v2.votePos;
- });
-
- /*
- std::cerr << "(" << votes.size() << ") { ";
- for (auto v : votes) {
- std::cerr << v.votePos << " ";
- }
- std::cerr << "}\n";
- */
- uint32_t maxClusterPos{0};
- uint32_t maxClusterCount{0};
-
- struct VoteInfo {
- uint32_t coverage;
- uint32_t rightmostBase;
- };
-
- boost::container::flat_map<uint32_t, VoteInfo> hitMap;
- int32_t currClust{votes.front().votePos};
- for (size_t j = 0; j < votes.size(); ++j) {
-
- uint32_t votePos = votes[j].votePos;
- uint32_t readPos = votes[j].readPos;
- uint32_t voteLen = votes[j].voteLen;
-
- if (votePos >= currClust) {
- if (votePos - currClust > 10) {
- currClust = votePos;
- }
- auto& hmEntry = hitMap[currClust];
-
- hmEntry.coverage += std::min(voteLen, (readPos + voteLen) - hmEntry.rightmostBase);
- hmEntry.rightmostBase = readPos + voteLen;
- } else if (votePos < currClust) {
- std::cerr << "CHA?!?\n";
- if (currClust - votePos > 10) {
- currClust = votePos;
- }
- auto& hmEntry = hitMap[currClust];
- hmEntry.coverage += std::min(voteLen, (readPos + voteLen) - hmEntry.rightmostBase);
- hmEntry.rightmostBase = readPos + voteLen;
- }
-
- if (hitMap[currClust].coverage > maxClusterCount) {
- maxClusterCount = hitMap[currClust].coverage;
- maxClusterPos = currClust;
- }
-
- }
-
- bestHitPos = maxClusterPos;
- bestHitScore = maxClusterCount;
- return true;
- }
-};
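A brief usage sketch for TranscriptHitList: votes that imply the same read start position land in one cluster, and computeBestHit reports that cluster's position and covered bases.

    void exampleHitList() {
        TranscriptHitList hl;
        // Three 20-mer votes at read offsets 0, 10, 20, all consistent
        // with the read starting at transcript position 100.
        hl.addVote(100, 0, 20);
        hl.addVote(110, 10, 20);
        hl.addVote(120, 20, 20);
        hl.computeBestHit();
        // hl.bestHitPos == 100; hl.bestHitScore == 40 (bases covered
        // by the overlapping 20-mers spanning read positions 0..40).
    }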
-
-// To use the parser in the following, we get "jobs" until none is
-// available. A job behaves like a pointer to the type
-// jellyfish::sequence_list (see whole_sequence_parser.hpp).
-void add_sizes(sequence_parser* parser, std::atomic<uint64_t>* total_fwd, std::atomic<uint64_t>* total_bwd,
- std::atomic<uint64_t>& totalHits,
- std::atomic<uint64_t>& rn,
- PerfectHashIndex& phi,
- std::vector<Transcript>& transcripts,
- std::atomic<uint64_t>& batchNum,
- double& logForgettingMass,
- std::mutex& ffMutex,
- ClusterForest& clusterForest,
- std::vector<uint64_t>& offsets,
- std::vector<uint64_t>& kmerLocs,
- uint32_t sampleRate
- //google::dense_hash_map<uint64_t, uint64_t>& khash
- //std::vector<std::vector<uint64_t>>& kmerLocMap
- ) {
- uint64_t count_fwd = 0, count_bwd = 0;
- auto INVALID = phi.INVALID;
- auto merLen = phi.kmerLength();
-
- double forgettingFactor{0.65};
-
- std::vector<std::vector<Alignment>> hitLists;
- hitLists.resize(5000);
-
- uint64_t leftHitCount{0};
- uint64_t hitListCount{0};
-
- class DirHit {
- uint32_t fwd{0};
- uint32_t rev{0};
- };
-
- size_t locRead{0};
- while(true) {
- sequence_parser::job j(*parser); // Get a job from the parser: a bunch of reads (at most max_read_group)
- if(j.is_empty()) break; // If got nothing, quit
-
- my_mer kmer;
- my_mer rkmer;
-
- for(size_t i = 0; i < j->nb_filled; ++i) { // For all the reads we got
-
- hitLists.resize(j->nb_filled);
-
- /*
- btree::btree_map<uint64_t, uint32_t> eqLeftFwdPos;
- btree::btree_map<uint64_t, uint32_t> eqLeftBwdPos;
- btree::btree_map<uint64_t, uint32_t> eqRightFwdPos;
- btree::btree_map<uint64_t, uint32_t> eqRightBwdPos;
- */
-
- std::unordered_map<uint64_t, TranscriptHitList> leftFwdHits;
- std::unordered_map<uint64_t, TranscriptHitList> leftBwdHits;
-
- std::unordered_map<uint64_t, TranscriptHitList> rightFwdHits;
- std::unordered_map<uint64_t, TranscriptHitList> rightBwdHits;
-
- uint32_t totFwdLeft = 0;
- uint32_t totBwdLeft = 0;
- uint32_t totFwdRight = 0;
- uint32_t totBwdRight = 0;
- uint32_t readLength = 0;
-
- uint32_t leftReadLength{0};
- uint32_t rightReadLength{0};
-
- uint64_t prevEQ{std::numeric_limits<uint64_t>::max()};
-
- count_fwd += j->data[i].first.seq.size(); // Add up the size of the sequence
- count_bwd += j->data[i].second.seq.size(); // Add up the size of the sequence
-
- //---------- Left End ----------------------//
- {
- const char* start = j->data[i].first.seq.c_str();
- uint32_t readLen = j->data[i].first.seq.size();
- leftReadLength = readLen;
- const char* const end = start + readLen;
- uint32_t cmlen{0};
- uint32_t rbase{0};
- uint32_t phase{0};
- // iterate over the read base-by-base
- while(start < end) {
- ++rbase; ++readLength;
- char base = *start; ++start;
- auto c = jellyfish::mer_dna::code(base);
- kmer.shift_left(c);
- rkmer.shift_right(jellyfish::mer_dna::complement(c));
- switch(c) {
- case jellyfish::mer_dna::CODE_IGNORE:
- break;
- case jellyfish::mer_dna::CODE_COMMENT:
- std::cerr << "ERROR: unexpected character " << base << " in read!\n";
- // Fall through
- case jellyfish::mer_dna::CODE_RESET:
- cmlen = 0;
- kmer.polyA(); rkmer.polyA();
- break;
-
- default:
- if(++cmlen >= merLen) {
- if (phase++ % sampleRate != 0) { continue; }
- cmlen = merLen;
- auto merID = phi.index(kmer.get_bits(0, 2*merLen));
- //auto merIt = khash.find(kmer.get_bits(0, 2*merLen));
- //auto merID = (merIt == khash.end()) ? INVALID : merIt->second;
-
-
- if (merID != INVALID) {
- // Locations
- auto first = offsets[merID];
- auto last = offsets[merID+1];
- for (size_t ei = first; ei < last; ++ei) {
- uint64_t e = kmerLocs[ei];
- //for (uint64_t e : kmerLocMap[merID]) {
- leftFwdHits[transcript(e)].addVote(offset(e), rbase - merLen, merLen);
- }
- /*
- auto eqClass = memberships[merID];
- eqLeftFwdPos[eqClass]++;//.push_back(rbase);
- */
- totFwdLeft++;
- }
-
- auto rmerID = phi.index(rkmer.get_bits(0, 2*merLen));
- //auto rmerIt = khash.find(rkmer.get_bits(0, 2*merLen));
- //auto rmerID = (rmerIt == khash.end()) ? INVALID : rmerIt->second;
-
- if (rmerID != INVALID) {
- // Locations
- auto first = offsets[rmerID];
- auto last = offsets[rmerID+1];
- for (size_t ei = first ; ei < last; ++ei) {
- uint64_t e = kmerLocs[ei];
- //for (uint64_t e : kmerLocMap[rmerID]) {
- leftFwdHits[transcript(e)].addVoteRC(offset(e), rbase - merLen, merLen);
- }
-
- /*
- auto eqClass = memberships[rmerID];
- eqLeftBwdPos[eqClass]++;//.push_back(rbase);
- */
- totBwdLeft++;
- }
-
- }
-
- } // end switch(c)
- } // while (start < end)
- }
-
- hitLists[i].clear();
- auto& hitList = hitLists[i];
- prevEQ = std::numeric_limits<uint64_t>::max();
- //---------- Right End ----------------------//
- {
- kmer.polyA();
- rkmer.polyA();
- const char* start = j->data[i].second.seq.c_str();
- uint32_t readLen = j->data[i].second.seq.size();
- rightReadLength = readLen;
- const char* const end = start + readLen;
- uint32_t cmlen{0};
- uint32_t rbase{0};
-
- uint32_t phase{0};
- // iterate over the read base-by-base
- while(start < end) {
- ++rbase; ++readLength;
- char base = *start; ++start;
- auto c = jellyfish::mer_dna::code(base);
- kmer.shift_left(c);
- rkmer.shift_right(jellyfish::mer_dna::complement(c));
- switch(c) {
- case jellyfish::mer_dna::CODE_IGNORE:
- break;
- case jellyfish::mer_dna::CODE_COMMENT:
- std::cerr << "ERROR: unexpected character " << base << " in read!\n";
- // Fall through
- case jellyfish::mer_dna::CODE_RESET:
- cmlen = 0;
- kmer.polyA(); rkmer.polyA();
- break;
-
- default:
- if(++cmlen >= merLen) {
- if (phase++ % sampleRate != 0) { continue; }
- cmlen = merLen;
- auto merID = phi.index(kmer.get_bits(0, 2*merLen));
- //auto merIt = khash.find(kmer.get_bits(0, 2*merLen));
- //auto merID = (merIt == khash.end()) ? INVALID : merIt->second;
-
- if (merID != INVALID) {
- // Locations
- auto first = offsets[merID];
- auto last = offsets[merID+1];
- for (size_t ei = first; ei < last; ++ei) {
- uint64_t e = kmerLocs[ei];
- //for (uint64_t e : kmerLocMap[merID]) {
- rightFwdHits[transcript(e)].addVote(offset(e), rbase - merLen, merLen);
- }
- /*
- auto eqClass = memberships[merID];
- eqRightFwdPos[eqClass]++;//.push_back(rbase);
- */
- totFwdRight++;
-
- }
-
-
- auto rmerID = phi.index(rkmer.get_bits(0, 2*merLen));
- //auto rmerIt = khash.find(rkmer.get_bits(0, 2*merLen));
- //auto rmerID = (rmerIt == khash.end()) ? INVALID : rmerIt->second;
-
- if (rmerID != INVALID) {
- // Locations
- auto first = offsets[rmerID];
- auto last = offsets[rmerID+1];
- for (size_t ei = first; ei < last; ++ei) {
- uint64_t e = kmerLocs[ei];
- //for (uint64_t e : kmerLocMap[rmerID]) {
- rightFwdHits[transcript(e)].addVoteRC(offset(e), rbase - merLen, merLen);
- }
- /*
- auto eqClass = memberships[rmerID];
- eqRightBwdPos[eqClass]++;//.push_back(rbase);
- */
- totBwdRight++;
- }
-
-
-
- }
-
- } // end switch(c)
- } // while (start < end)
- }
-
- /*
- std::unordered_map<TranscriptID, uint32_t> leftHits;
- std::unordered_map<TranscriptID, uint32_t> rightHits;
-
- auto& eqLeftPos = (totFwdLeft >= totBwdLeft) ? eqLeftFwdPos : eqLeftBwdPos;
- auto& eqRightPos = (totFwdRight >= totBwdRight) ? eqRightFwdPos : eqRightBwdPos;
-
- for (auto& leqPos: eqLeftPos) {
- for (auto t : klut[leqPos.first]) {
- // --- counts
- leftHits[t] += leqPos.second;//.size();
-
- // --- positions
- //auto& thits = leftHits[t];
- //thits.insert(thits.end(), leqPos.second.begin(), leqPos.second.end());
- }
- }
-
- for (auto& reqPos : eqRightPos) {
- for (auto t : klut[reqPos.first]) {
- auto it = leftHits.find(t);
- if (it != leftHits.end() and it->second > 40) {
- // --- counts
- rightHits[t] += reqPos.second;//.size();
-
- // --- positions
- //auto score = basesCovered(it->second);
- //if (score < 70) { continue; }
- //auto& thits = rightHits[t];
- //thits.insert(thits.end(), reqPos.second.begin(), reqPos.second.end());
- }
- }
- }
-
- uint32_t readHits{0};
- for (auto& rightHit : rightHits) {
- if (rightHit.second > 40) {
- // --- counts
- uint32_t score = leftHits[rightHit.first] + rightHit.second;
- hitList.emplace_back(rightHit.first, score);
- readHits += score;
-
- // --- positions
- //auto rscore = basesCovered(rightHit.second);
- //if (rscore >= 70) {
- //uint32_t score = leftHits[rightHit.first] + rightHit.second;
- //hitList.emplace_back(rightHit.first, score);
- //readHits += score;
- }
- }
- leftHitCount += leftHits.size();
- hitListCount += hitList.size();
-
- */
-
- size_t readHits{0};
- std::unordered_map<TranscriptID, uint32_t> leftHitCounts;
- std::unordered_map<TranscriptID, uint32_t> rightHitCounts;
-
- auto& leftHits = leftFwdHits;//(totFwdLeft >= totBwdLeft) ? leftFwdHits : leftBwdHits;
- auto& rightHits = rightFwdHits;//(totFwdRight >= totBwdRight) ? rightFwdHits : rightBwdHits;
-
- for (auto& tHitList : leftHits) {
- // Coverage score
- tHitList.second.computeBestHit();
- //leftHitCounts[tHitList.first] = tHitList.second.totalNumHits();
- ++leftHitCount;
- }
-
- double cutoffLeft{ 0.75 * leftReadLength};
- double cutoffRight{ 0.75 * rightReadLength};
-
- double cutoffLeftCount{ 0.80 * leftReadLength - merLen + 1};
- double cutoffRightCount{ 0.80 * rightReadLength - merLen + 1};
- for (auto& tHitList : rightHits) {
- auto it = leftHits.find(tHitList.first);
- // Simple counting score
- /*
- if (it != leftHits.end() and it->second.totalNumHits() >= cutoffLeftCount) {
- if (tHitList.second.totalNumHits() < cutoffRightCount) { continue; }
- uint32_t score = it->second.totalNumHits() + tHitList.second.totalNumHits();
- */
- // Coverage score
- if (it != leftHits.end() and it->second.bestHitScore >= cutoffLeft) {
- tHitList.second.computeBestHit();
- if (tHitList.second.bestHitScore < cutoffRight) { continue; }
- uint32_t score = it->second.bestHitScore + tHitList.second.bestHitScore;
-
-
- hitList.emplace_back(tHitList.first, score);
- readHits += score;
- ++hitListCount;
- }
- }
-
-
- totalHits += hitList.size() > 0;
- locRead++;
- ++rn;
- if (rn % 50000 == 0) {
- std::cerr << "\r\rprocessed read " << rn;
- std::cerr << "\n leftHits.size() " << leftHits.size()
- << ", rightHits.size() " << rightHits.size() << ", hit list of size = " << hitList.size() << "\n";
- std::cerr << "average leftHits = " << leftHitCount / static_cast<float>(locRead)
- << ", average hitList = " << hitListCount / static_cast<float>(locRead) << "\n";
- }
-
- if (hitList.size() > 100) { hitList.clear(); }
-
- double invHits = 1.0 / readHits;
- for (auto t : hitList) {
- transcripts[t.transcriptID()].addSharedCount( t.kmerCount * invHits );
- }
-
- } // end for i < j->nb_filled
-
- auto oldBatchNum = batchNum++;
- if (oldBatchNum > 1) {
- ffMutex.lock();
- logForgettingMass += forgettingFactor * std::log(static_cast<double>(oldBatchNum-1)) -
- std::log(std::pow(static_cast<double>(oldBatchNum), forgettingFactor) - 1);
- ffMutex.unlock();
- }
-
- //
- processMiniBatch(logForgettingMass, hitLists, transcripts, clusterForest);
-
- }
-
- *total_fwd += count_fwd;
- *total_bwd += count_bwd;
-}
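Both worker functions update the shared forgetting mass with the same streaming rule. With forgetting factor f = 0.65 and mini-batch index n > 1:

    \log F_n = \log F_{n-1} + f \log(n - 1) - \log\bigl(n^{f} - 1\bigr)

F_n grows with n, so more recent mini-batches contribute relatively more mass to the online update than early ones.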
-
-// To use the parser in the following, we get "jobs" until none is
-// available. A job behaves like a pointer to the type
-// jellyfish::sequence_list (see whole_sequence_parser.hpp).
-void processReadsMEM(sequence_parser* parser, std::atomic<uint64_t>* total_fwd, std::atomic<uint64_t>* total_bwd,
- std::atomic<uint64_t>& totalHits,
- std::atomic<uint64_t>& rn,
- bwaidx_t *idx,
- std::vector<Transcript>& transcripts,
- std::atomic<uint64_t>& batchNum,
- double& logForgettingMass,
- std::mutex& ffMutex,
- ClusterForest& clusterForest
- ) {
- uint64_t count_fwd = 0, count_bwd = 0;
-
- double forgettingFactor{0.65};
-
- std::vector<std::vector<Alignment>> hitLists;
- hitLists.resize(5000);
-
- uint64_t leftHitCount{0};
- uint64_t hitListCount{0};
-
- class DirHit {
- uint32_t fwd{0};
- uint32_t rev{0};
- };
-
-
- smem_i *itr = smem_itr_init(idx->bwt);
- const bwtintv_v *a;
- int minLen{20};
- int minIWidth{20};
- int splitWidth{0};
-
- size_t locRead{0};
- while(true) {
- sequence_parser::job j(*parser); // Get a job from the parser: a bunch of reads (at most max_read_group)
- if(j.is_empty()) break; // If got nothing, quit
-
- my_mer kmer;
- my_mer rkmer;
-
- for(size_t i = 0; i < j->nb_filled; ++i) { // For all the reads we got
-
- hitLists.resize(j->nb_filled);
-
- std::unordered_map<uint64_t, TranscriptHitList> leftFwdHits;
- std::unordered_map<uint64_t, TranscriptHitList> leftBwdHits;
-
- std::unordered_map<uint64_t, TranscriptHitList> rightFwdHits;
- std::unordered_map<uint64_t, TranscriptHitList> rightBwdHits;
-
- uint32_t totFwdLeft = 0;
- uint32_t totBwdLeft = 0;
- uint32_t totFwdRight = 0;
- uint32_t totBwdRight = 0;
- uint32_t readLength = 0;
-
- uint32_t leftReadLength{0};
- uint32_t rightReadLength{0};
-
- uint64_t prevEQ{std::numeric_limits<uint64_t>::max()};
-
- count_fwd += j->data[i].first.seq.size(); // Add up the size of the sequence
- count_bwd += j->data[i].second.seq.size(); // Add up the size of the sequence
-
- //---------- Left End ----------------------//
- {
- std::string& readStr = j->data[i].first.seq;
- uint32_t readLen = j->data[i].first.seq.size();
-
- for (int p = 0; p < readLen; ++p) {
- readStr[p] = nst_nt4_table[static_cast<int>(readStr[p])];
- }
-
- char* readPtr = const_cast<char*>(readStr.c_str());
-
- smem_set_query(itr, readLen, reinterpret_cast<uint8_t*>(readPtr));
-
- // while there are more matches on the query
- while ((a = smem_next(itr, minLen<<1, splitWidth)) != 0) {
- for (size_t mi = 0; mi < a->n; ++mi) {
- bwtintv_t *p = &a->a[mi];
- if (static_cast<uint32_t>(p->info) - (p->info>>32) < minLen) continue;
- uint32_t qstart = static_cast<uint32_t>(p->info>>32);
- uint32_t qend = static_cast<uint32_t>(p->info);
- long numHits = static_cast<long>(p->x[2]);
-
- if ( p->x[2] <= minIWidth) {
- for (bwtint_t k = 0; k < p->x[2]; ++k) {
- bwtint_t pos;
- int len, isRev, refID;
- len = static_cast<uint32_t>(p->info - (p->info >> 32));
- pos = bns_depos(idx->bns, bwt_sa(idx->bwt, p->x[0] + k), &isRev);
- if (isRev) { pos -= len - 1; }
- bns_cnt_ambi(idx->bns, pos, len, &refID);
- long hitLoc = static_cast<long>(pos - idx->bns->anns[refID].offset) + 1;
- leftFwdHits[refID].addVote(hitLoc, hitLoc, len);
- } // for k
- } // if <= minIWidth
-
- } // for mi
- } // for all query matches
- }
-
- hitLists[i].clear();
- auto& hitList = hitLists[i];
- prevEQ = std::numeric_limits<uint64_t>::max();
- //---------- Right End ----------------------//
- {
- std::string& readStr = j->data[i].second.seq;
- uint32_t readLen = j->data[i].second.seq.size();
-
- for (int p = 0; p < readLen; ++p) {
- readStr[p] = nst_nt4_table[static_cast<int>(readStr[p])];
- }
-
- char* readPtr = const_cast<char*>(readStr.c_str());
- smem_set_query(itr, readLen, reinterpret_cast<uint8_t*>(readPtr));
-
- // while there are more matches on the query
- while ((a = smem_next(itr, minLen<<1, splitWidth)) != 0) {
- for (size_t mi = 0; mi < a->n; ++mi) {
- bwtintv_t *p = &a->a[mi];
- if (static_cast<uint32_t>(p->info) - (p->info>>32) < minLen) continue;
- uint32_t qstart = static_cast<uint32_t>(p->info>>32);
- uint32_t qend = static_cast<uint32_t>(p->info);
- long numHits = static_cast<long>(p->x[2]);
-
- if ( p->x[2] <= minIWidth) {
- for (bwtint_t k = 0; k < p->x[2]; ++k) {
- bwtint_t pos;
- int len, isRev, refID;
- len = static_cast<uint32_t>(p->info - (p->info >> 32));
- pos = bns_depos(idx->bns, bwt_sa(idx->bwt, p->x[0] + k), &isRev);
- if (isRev) { pos -= len - 1; }
- bns_cnt_ambi(idx->bns, pos, len, &refID);
- long hitLoc = static_cast<long>(pos - idx->bns->anns[refID].offset) + 1;
- rightFwdHits[refID].addVote(hitLoc, hitLoc, len);
- } // for k
- } // if <= minIWidth
-
- } // for mi
- } // for all query matches
-
- } // end right
-
- size_t readHits{0};
- std::unordered_map<TranscriptID, uint32_t> leftHitCounts;
- std::unordered_map<TranscriptID, uint32_t> rightHitCounts;
-
- auto& leftHits = leftFwdHits;
- auto& rightHits = rightFwdHits;
-
- for (auto& tHitList : leftHits) {
- // Coverage score
- tHitList.second.computeBestHit();
- ++leftHitCount;
- }
-
- double cutoffLeft{ 0.75 * leftReadLength};
- double cutoffRight{ 0.75 * rightReadLength};
-
- for (auto& tHitList : rightHits) {
- auto it = leftHits.find(tHitList.first);
- // Coverage score
- if (it != leftHits.end() and it->second.bestHitScore >= cutoffLeft) {
- tHitList.second.computeBestHit();
- if (tHitList.second.bestHitScore < cutoffRight) { continue; }
- uint32_t score = it->second.bestHitScore + tHitList.second.bestHitScore;
- hitList.emplace_back(tHitList.first, score);
- readHits += score;
- ++hitListCount;
- }
- }
-
- totalHits += hitList.size() > 0;
- locRead++;
- ++rn;
- if (rn % 50000 == 0) {
- std::cerr << "\r\rprocessed read " << rn;
- std::cerr << "\n leftHits.size() " << leftHits.size()
- << ", rightHits.size() " << rightHits.size() << ", hit list of size = " << hitList.size() << "\n";
-// std::cerr << "average leftHits = " << leftHitCount / static_cast<float>(locRead)
-// << ", average hitList = " << hitListCount / static_cast<float>(locRead) << "\n";
- }
-
- if (hitList.size() > 100) { hitList.clear(); }
-
- double invHits = 1.0 / readHits;
- for (auto t : hitList) {
- transcripts[t.transcriptID()].addSharedCount( t.kmerCount * invHits );
- }
-
- } // end for i < j->nb_filled
-
- auto oldBatchNum = batchNum++;
- if (oldBatchNum > 1) {
- ffMutex.lock();
- logForgettingMass += forgettingFactor * std::log(static_cast<double>(oldBatchNum-1)) -
- std::log(std::pow(static_cast<double>(oldBatchNum), forgettingFactor) - 1);
- ffMutex.unlock();
- }
-
- processMiniBatch(logForgettingMass, hitLists, transcripts, clusterForest);
- }
- smem_itr_destroy(itr);
- *total_fwd += count_fwd;
- *total_bwd += count_bwd;
-}
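The SMEM loops above rely on bwa's packing of the query interval into bwtintv_t::info: the upper 32 bits hold the interval start and the lower 32 bits the end, while x[2] is the number of suffix-array occurrences. A minimal decoding sketch (field layout as used in this file; not re-verified against every bwa release):

    #include <cstdint>

    inline void smemQueryInterval(uint64_t info,
                                  uint32_t& qstart, uint32_t& qend, uint32_t& len) {
        qstart = static_cast<uint32_t>(info >> 32); // query start
        qend   = static_cast<uint32_t>(info);       // query end (exclusive)
        len    = qend - qstart;                     // compared against minLen above
    }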
-
-
-
-int salmonQuantify(int argc, char *argv[]) {
- using std::cerr;
- using std::vector;
- using std::string;
- namespace bfs = boost::filesystem;
- namespace po = boost::program_options;
-
- uint32_t maxThreads = std::thread::hardware_concurrency();
- uint32_t sampleRate{1};
-
- vector<string> unmatedReadFiles;
- vector<string> mate1ReadFiles;
- vector<string> mate2ReadFiles;
-
- po::options_description generic("salmon quant options");
- generic.add_options()
- ("version,v", "print version string")
- ("help,h", "produce help message")
- ("index,i", po::value<string>(), "sailfish index.")
- ("libtype,l", po::value<std::string>(), "Format string describing the library type")
- ("unmated_reads,r", po::value<vector<string>>(&unmatedReadFiles)->multitoken(),
- "List of files containing unmated reads of (e.g. single-end reads)")
- ("mates1,1", po::value<vector<string>>(&mate1ReadFiles)->multitoken(),
- "File containing the #1 mates")
- ("mates2,2", po::value<vector<string>>(&mate2ReadFiles)->multitoken(),
- "File containing the #2 mates")
- ("threads,p", po::value<uint32_t>()->default_value(maxThreads), "The number of threads to use concurrently.")
- ("sample,s", po::value<uint32_t>(&sampleRate)->default_value(1), "Sample rate --- only consider every s-th k-mer in a read.")
- ("output,o", po::value<std::string>(), "Output quantification file");
-
- po::variables_map vm;
- try {
- auto orderedOptions = po::command_line_parser(argc,argv).
- options(generic).run();
-
- po::store(orderedOptions, vm);
-
- if ( vm.count("help") ) {
- auto hstring = R"(
-Quant
-==========
-Perform streaming k-mer-group-based estimation of
-transcript abundance from RNA-seq reads
-)";
- std::cout << hstring << std::endl;
- std::cout << generic << std::endl;
- std::exit(1);
- }
-
- po::notify(vm);
-
-
- for (auto& opt : orderedOptions.options) {
- std::cerr << "[ " << opt.string_key << "] => {";
- for (auto& val : opt.value) {
- std::cerr << " " << val;
- }
- std::cerr << " }\n";
- }
-
- bfs::path outputDirectory(vm["output"].as<std::string>());
- bfs::create_directory(outputDirectory);
- if (!(bfs::exists(outputDirectory) and bfs::is_directory(outputDirectory))) {
- std::cerr << "Couldn't create output directory " << outputDirectory << "\n";
- std::cerr << "exiting\n";
- std::exit(1);
- }
-
- bfs::path indexDirectory(vm["index"].as<string>());
- bfs::path logDirectory = outputDirectory.parent_path() / "logs";
-
-#if HAVE_LOGGER
- bfs::create_directory(logDirectory);
- if (!(bfs::exists(logDirectory) and bfs::is_directory(logDirectory))) {
- std::cerr << "Couldn't create log directory " << logDirectory << "\n";
- std::cerr << "exiting\n";
- std::exit(1);
- }
- std::cerr << "writing logs to " << logDirectory.string() << "\n";
- g2LogWorker logger(argv[0], logDirectory.string());
- g2::initializeLogging(&logger);
-#endif
-
- vector<ReadLibrary> readLibraries;
- for (auto& opt : orderedOptions.options) {
- if (opt.string_key == "libtype") {
- LibraryFormat libFmt = salmon::utils::parseLibraryFormatString(opt.value[0]);
- if (libFmt.check()) {
- std::cerr << libFmt << "\n";
- } else {
- std::stringstream ss;
- ss << libFmt << " is invalid!";
- throw std::invalid_argument(ss.str());
- }
- readLibraries.emplace_back(libFmt);
- } else if (opt.string_key == "mates1") {
- readLibraries.back().addMates1(opt.value);
- } else if (opt.string_key == "mates2") {
- readLibraries.back().addMates2(opt.value);
- } else if (opt.string_key == "unmated_reads") {
- readLibraries.back().addUnmated(opt.value);
- }
- }
-
- for (auto& rl : readLibraries) { rl.checkValid(); }
-
- auto& rl = readLibraries.front();
- // Handle all the proper cases here
- char* readFiles[] = { const_cast<char*>(rl.mates1().front().c_str()),
- const_cast<char*>(rl.mates2().front().c_str()) };
-
- uint32_t nbThreads = vm["threads"].as<uint32_t>();
-
- size_t maxReadGroup{2000}; // Number of reads in each "job"
- size_t concurrentFile{1}; // Number of files to read simultaneously
- sequence_parser parser(4 * nbThreads, maxReadGroup, concurrentFile,
- readFiles, readFiles + 2);
-
- // kmer-based
- /*
- bfs::path sfIndexPath = indexDirectory / "transcriptome.sfi";
- std::string sfTranscriptIndexFile(sfIndexPath.string());
- std::cerr << "reading index . . . ";
- auto phi = PerfectHashIndex::fromFile(sfTranscriptIndexFile);
- std::cerr << "done\n";
- std::cerr << "index contained " << phi.numKeys() << " kmers\n";
-
- size_t nkeys = phi.numKeys();
- size_t merLen = phi.kmerLength();
- */
-
- /*
- google::dense_hash_map<uint64_t, uint64_t> khash;
- khash.set_empty_key(std::numeric_limits<uint64_t>::max());
- uint64_t i{0};
- for (auto k : phi.kmers()) {
- khash[k] = phi.index(k);
- }
- size_t merLen = 20;
- std::cerr << "kmer length = " << merLen << "\n";
- my_mer::k(merLen);
-
- bfs::path tlutPath = indexDirectory / "transcriptome.tlut";
- // Get transcript lengths
- std::ifstream ifile(tlutPath.string(), std::ios::binary);
- size_t numRecords {0};
- ifile.read(reinterpret_cast<char *>(&numRecords), sizeof(numRecords));
-
- std::vector<Transcript> transcripts_tmp;
-
- std::cerr << "Transcript LUT contained " << numRecords << " records\n";
- //transcripts_.resize(numRecords);
- for (auto i : boost::irange(size_t(0), numRecords)) {
- auto ti = LUTTools::readTranscriptInfo(ifile);
- // copy over the length, then we're done.
- transcripts_tmp.emplace_back(ti->transcriptID, ti->name.c_str(), ti->length);
- }
-
- std::sort(transcripts_tmp.begin(), transcripts_tmp.end(),
- [](const Transcript& t1, const Transcript& t2) -> bool {
- return t1.id < t2.id;
- });
- std::vector<Transcript> transcripts_;
- for (auto& t : transcripts_tmp) {
- transcripts_.emplace_back(t.id, t.RefName.c_str(), t.RefLength);
- }
- transcripts_tmp.clear();
- ifile.close();
- // --- done ---
-
- */
-
- bwaidx_t *idx;
-
- { // mem-based
- bfs::path indexPath = indexDirectory / "bwaidx";
- if ((idx = bwa_idx_load(indexPath.string().c_str(), BWA_IDX_BWT|BWA_IDX_BNS)) == 0) return 1;
- }
-
- size_t numRecords = idx->bns->n_seqs;
- std::vector<Transcript> transcripts_tmp;
-
- std::cerr << "Transcript LUT contained " << numRecords << " records\n";
- //transcripts_.resize(numRecords);
- for (auto i : boost::irange(size_t(0), numRecords)) {
- uint32_t id = i;
- char* name = idx->bns->anns[i].name;
- uint32_t len = idx->bns->anns[i].len;
- // copy over the length, then we're done.
- transcripts_tmp.emplace_back(id, name, len);
- }
-
- std::sort(transcripts_tmp.begin(), transcripts_tmp.end(),
- [](const Transcript& t1, const Transcript& t2) -> bool {
- return t1.id < t2.id;
- });
- std::vector<Transcript> transcripts_;
- for (auto& t : transcripts_tmp) {
- transcripts_.emplace_back(t.id, t.RefName.c_str(), t.RefLength);
- }
- transcripts_tmp.clear();
- // --- done ---
-
- /*
- { // k-mer based
- bfs::path locPath = indexDirectory / "fullLookup.kmap";
- // Make sure we have the file we need to look up k-mer hits
- if (!boost::filesystem::exists(locPath)) {
- std::cerr << "could not find k-mer location index; expected at " << locPath << "\n";
- std::cerr << "please ensure that you've run salmon index before attempting to run salmon quant\n";
- std::exit(1);
- }
-
- std::cerr << "Loading k-mer location index from " << locPath << " . . .";
-
- //std::vector<std::vector<uint64_t>> kmerLocMap;
- std::vector<uint64_t> offsets;
- std::vector<uint64_t> kmerLocs;
-
- std::ifstream binstream(locPath.string(), std::ios::binary);
- cereal::BinaryInputArchive archive(binstream);
- //archive(kmerLocMap);
- archive(offsets, kmerLocs);
- binstream.close();
- std::cerr << "done\n";
- */
- std::vector<std::thread> threads;
- std::atomic<uint64_t> total_fwd(0);
- std::atomic<uint64_t> total_bwd(0);
- std::atomic<uint64_t> totalHits(0);
- std::atomic<uint64_t> rn{0};
- std::atomic<uint64_t> batchNum{0};
-
- size_t numTranscripts{transcripts_.size()};
- ClusterForest clusterForest(numTranscripts, transcripts_);
-
- double logForgettingMass{salmon::math::LOG_1};
- std::mutex ffmutex;
- for(int i = 0; i < nbThreads; ++i) {
- std::cerr << "here\n";
- /*
- threads.push_back(std::thread(add_sizes, &parser, &total_fwd, &total_bwd,
- std::ref(totalHits), std::ref(rn),
- std::ref(phi),
- std::ref(transcripts_),
- std::ref(batchNum),
- std::ref(logForgettingMass),
- std::ref(ffmutex),
- std::ref(clusterForest),
- std::ref(offsets),
- std::ref(kmerLocs),
- sampleRate
- //std::ref(khash)
- //std::ref(kmerLocMap)
- ));
- */
- threads.push_back(std::thread(processReadsMEM, &parser, &total_fwd, &total_bwd,
- std::ref(totalHits), std::ref(rn),
- idx,
- std::ref(transcripts_),
- std::ref(batchNum),
- std::ref(logForgettingMass),
- std::ref(ffmutex),
- std::ref(clusterForest)
- ));
- }
-
- for(int i = 0; i < nbThreads; ++i)
- threads[i].join();
-
- std::cerr << "\n\n";
- std::cerr << "processed " << rn << " total reads\n";
- std::cout << "Total bases: " << total_fwd << " " << total_bwd << "\n";
- std::cout << "Had a hit for " << totalHits / static_cast<double>(rn) * 100.0 << "% of the reads\n";
-
- size_t tnum{0};
-
- std::cerr << "writing output \n";
-
- bfs::path outputFile = outputDirectory / "quant.sf";
- std::ofstream output(outputFile.string());
- output << "# SDAFish v0.01\n";
- output << "# ClusterID\tName\tLength\tFPKM\tNumReads\n";
-
- const double logBillion = std::log(1000000000.0);
- const double logNumFragments = std::log(static_cast<double>(rn));
- auto clusters = clusterForest.getClusters();
- size_t clusterID = 0;
- for(auto cptr : clusters) {
- double logClusterMass = cptr->logMass();
- double logClusterCount = std::log(static_cast<double>(cptr->numHits()));
-
- if (logClusterMass == salmon::math::LOG_0) {
- std::cerr << "Warning: cluster " << clusterID << " has 0 mass!\n";
- }
-
- bool requiresProjection{false};
-
- auto& members = cptr->members();
- size_t clusterSize{0};
- for (auto transcriptID : members) {
- Transcript& t = transcripts_[transcriptID];
- t.uniqueCounts = t.uniqueCount();
- t.totalCounts = t.totalCount();
- }
-
- for (auto transcriptID : members) {
- Transcript& t = transcripts_[transcriptID];
- double logTranscriptMass = t.mass();
- double logClusterFraction = logTranscriptMass - logClusterMass;
- t.projectedCounts = std::exp(logClusterFraction + logClusterCount);
- requiresProjection |= t.projectedCounts > static_cast<double>(t.totalCounts) or
- t.projectedCounts < static_cast<double>(t.uniqueCounts);
- ++clusterSize;
- }
-
- if (clusterSize > 1 and requiresProjection) {
- cptr->projectToPolytope(transcripts_);
- }
-
- // Now posterior has the transcript fraction
- size_t tidx = 0;
- for (auto transcriptID : members) {
- auto& transcript = transcripts_[transcriptID];
- double logLength = std::log(transcript.RefLength);
- double fpkmFactor = std::exp(logBillion - logLength - logNumFragments);
- double count = transcripts_[transcriptID].projectedCounts;
- double countTotal = transcripts_[transcriptID].totalCounts;
- double countUnique = transcripts_[transcriptID].uniqueCounts;
- double fpkm = count > 0 ? fpkmFactor * count : 0.0;
- output << clusterID << '\t' << transcript.RefName << '\t' <<
- transcript.RefLength << '\t' << fpkm << '\t' << countTotal << '\t' << countUnique << '\t' << count << '\t' << transcript.mass() << '\n';
- ++tidx;
- }
-
- ++clusterID;
- }
- output.close();
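The FPKM written for each transcript follows the usual definition, evaluated in log space to avoid underflow. With projected count c_t, transcript length l_t, and N total processed reads:

    \mathrm{FPKM}_t = \frac{10^9 \, \hat{c}_t}{\ell_t \, N}
                    = \hat{c}_t \exp\bigl(\log 10^9 - \log \ell_t - \log N\bigr)

which is exactly the fpkmFactor computation in the loop above.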
-
- } catch (po::error &e) {
- std::cerr << "Exception : [" << e.what() << "]. Exiting.\n";
- std::exit(1);
- } catch (std::exception& e) {
- std::cerr << "Exception : [" << e.what() << "]\n";
- std::cerr << argv[0] << " quant was invoked improperly.\n";
- std::cerr << "For usage information, try " << argv[0] << " quant --help\nExiting.\n";
- }
-
-
- return 0;
-}
-
-
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/salmon.git