[med-svn] [Git][med-team/kallisto][upstream] New upstream version 0.46.2+dfsg
Steffen Möller
gitlab at salsa.debian.org
Fri May 15 12:14:47 BST 2020
Steffen Möller pushed to branch upstream at Debian Med / kallisto
Commits:
53c35c62 by Steffen Moeller at 2020-05-15T13:11:58+02:00
New upstream version 0.46.2+dfsg
- - - - -
20 changed files:
- CMakeLists.txt
- src/BUSData.cpp
- src/BUSData.h
- src/Bootstrap.cpp
- src/Bootstrap.h
- src/CMakeLists.txt
- src/H5Writer.cpp
- src/H5Writer.h
- src/ProcessReads.cpp
- src/ProcessReads.h
- src/PseudoBam.cpp
- src/PseudoBam.h
- src/common.h
- src/h5utils.cpp
- src/h5utils.h
- − src/kseq.h
- src/main.cpp
- test/Snakefile
- + test/sc_reads_1.fastq.gz
- + test/sc_reads_2.fastq.gz
Changes:
=====================================
CMakeLists.txt
=====================================
@@ -4,6 +4,13 @@ project(kallisto)
include(GNUInstallDirs)
+
+option(USE_HDF5 "Compile with HDF5 support" OFF) #OFF by default
+
+if(USE_HDF5)
+ add_compile_definitions("USE_HDF5=ON")
+endif(USE_HDF5)
+
set(EXT_PROJECTS_DIR ${PROJECT_SOURCE_DIR}/ext)
set(CMAKE_CXX_FLAGS_PROFILE "-g")
=====================================
src/BUSData.cpp
=====================================
@@ -33,4 +33,21 @@ uint64_t stringToBinary(const char* s, const size_t len, uint32_t &flag) {
flag = (numN & 3) | (posN & 31) << 2;
}
return r;
+}
+
+std::string binaryToString(uint64_t x, size_t len) {
+ std::string s(len, 'N');
+ size_t sh = len-1;
+ for (size_t i = 0; i < len; i++) {
+ char c = 'N';
+ switch((x >> (2*sh)) & 0x03ULL) {
+ case 0x00: c = 'A'; break;
+ case 0x01: c = 'C'; break;
+ case 0x02: c = 'G'; break;
+ case 0x03: c = 'T'; break;
+ }
+ sh--;
+ s.at(i) = c;
+ }
+ return std::move(s);
}
\ No newline at end of file
=====================================
src/BUSData.h
=====================================
@@ -37,5 +37,5 @@ struct BUSData {
uint64_t stringToBinary(const std::string &s, uint32_t &flag);
uint64_t stringToBinary(const char* s, const size_t len, uint32_t &flag);
-
+std::string binaryToString(uint64_t x, size_t len);
#endif // KALLISTO_BUSDATA_H
\ No newline at end of file
=====================================
src/Bootstrap.cpp
=====================================
@@ -1,4 +1,5 @@
#include "Bootstrap.h"
+#include "PlaintextWriter.h"
EMAlgorithm Bootstrap::run_em() {
auto counts = mult_.sample();
@@ -19,7 +20,7 @@ BootstrapThreadPool::BootstrapThreadPool(
const MinCollector& tc,
const std::vector<double>& eff_lens,
const ProgramOptions& p_opts,
- H5Writer& h5writer,
+ BootstrapWriter *bswriter,
const std::vector<double>& mean_fls
) :
n_threads_(n_threads),
@@ -30,7 +31,7 @@ BootstrapThreadPool::BootstrapThreadPool(
tc_(tc),
eff_lens_(eff_lens),
opt_(p_opts),
- writer_(h5writer),
+ writer_(bswriter),
mean_fls_(mean_fls)
{
for (size_t i = 0; i < n_threads_; ++i) {
@@ -78,7 +79,7 @@ void BootstrapWorker::operator() (){
std::unique_lock<std::mutex> lock(pool_.write_lock_);
++pool_.n_complete_;
std::cerr << "[bstrp] number of EM bootstraps complete: " << pool_.n_complete_ << "\r";
- pool_.writer_.write_bootstrap(res, cur_id);
+ pool_.writer_->write_bootstrap(res, cur_id);
// release write lock
} else {
// can write out plaintext in parallel
=====================================
src/Bootstrap.h
=====================================
@@ -9,7 +9,7 @@
#include "weights.h"
#include "EMAlgorithm.h"
#include "Multinomial.hpp"
-#include "H5Writer.h"
+
class Bootstrap {
// needs:
@@ -49,6 +49,21 @@ private:
const ProgramOptions& opt;
};
+class BootstrapWriter {
+ public:
+ virtual ~BootstrapWriter() {};
+
+ virtual void init(const std::string& fname, int num_bootstrap, int num_processed,
+ const std::vector<int>& fld, const std::vector<int>& preBias, const std::vector<double>& postBias, uint compression, size_t index_version,
+ const std::string& shell_call, const std::string& start_time) = 0;
+
+ virtual void write_main(const EMAlgorithm& em,
+ const std::vector<std::string>& targ_ids,
+ const std::vector<int>& lengths) = 0;
+
+ virtual void write_bootstrap(const EMAlgorithm& em, int bs_id) = 0;
+};
+
class BootstrapThreadPool {
friend class BootstrapWorker;
@@ -61,7 +76,7 @@ class BootstrapThreadPool {
const MinCollector& tc,
const std::vector<double>& eff_lens,
const ProgramOptions& p_opts,
- H5Writer& h5writer,
+ BootstrapWriter *bswriter,
const std::vector<double>& mean_fls
);
@@ -84,7 +99,7 @@ class BootstrapThreadPool {
const MinCollector& tc_;
const std::vector<double>& eff_lens_;
const ProgramOptions& opt_;
- H5Writer& writer_;
+ BootstrapWriter *writer_;
const std::vector<double>& mean_fls_;
};
=====================================
src/CMakeLists.txt
=====================================
@@ -27,23 +27,29 @@ if(LINK MATCHES static)
SET_TARGET_PROPERTIES(kallisto kallisto_core PROPERTIES LINK_SEARCH_END_STATIC 1)
endif(LINK MATCHES static)
-find_package( HDF5 REQUIRED )
+
+if(USE_HDF5)
+ find_package( HDF5 REQUIRED )
+endif(USE_HDF5)
+
find_package( ZLIB REQUIRED )
if ( ZLIB_FOUND )
include_directories( ${ZLIB_INCLUDE_DIRS} )
+ target_link_libraries(kallisto kallisto_core ${ZLIB_LIBRARIES})
else()
message(FATAL_ERROR "zlib not found. Required for to output files" )
endif( ZLIB_FOUND )
-if(HDF5_FOUND)
- include_directories( ${HDF5_INCLUDE_DIRS} )
- target_link_libraries( kallisto_core ${HDF5_LIBRARIES} )
- target_link_libraries( kallisto ${HDF5_LIBRARIES} )
-else()
- message(FATAL_ERROR "HDF5 not found. Required to output files")
-endif()
-
+if(USE_HDF5)
+ if(HDF5_FOUND)
+ include_directories( ${HDF5_INCLUDE_DIRS} )
+ target_link_libraries( kallisto_core ${HDF5_LIBRARIES} )
+ target_link_libraries( kallisto ${HDF5_LIBRARIES} )
+ else()
+ message(FATAL_ERROR "HDF5 not found. Required to output files")
+ endif()
+endif(USE_HDF5)
if(LINK MATCHES static)
if (UNIX AND NOT APPLE)
=====================================
src/H5Writer.cpp
=====================================
@@ -1,3 +1,4 @@
+#ifdef USE_HDF5
#include "H5Writer.h"
void H5Writer::init(const std::string& fname, int num_bootstrap, int num_processed,
@@ -201,3 +202,5 @@ void H5Converter::rw_from_counts(hid_t group_id, const std::string& count_name,
plaintext_writer(out_fname, targ_ids_, alpha, eff_lengths_, lengths_);
}
+
+#endif // USE_HDF5
\ No newline at end of file
=====================================
src/H5Writer.h
=====================================
@@ -1,3 +1,4 @@
+#ifdef USE_HDF5
#ifndef KALLISTO_H5WRITER_H
#define KALLISTO_H5WRITER_H
@@ -5,21 +6,22 @@
#include "h5utils.h"
#include "PlaintextWriter.h"
+#include "Bootstrap.h"
-class H5Writer {
+class H5Writer : public BootstrapWriter {
public:
H5Writer() : primed_(false) {}
~H5Writer();
- void init(const std::string& fname, int num_bootstrap, int num_processed,
+ virtual void init(const std::string& fname, int num_bootstrap, int num_processed,
const std::vector<int>& fld, const std::vector<int>& preBias, const std::vector<double>& postBias, uint compression, size_t index_version,
const std::string& shell_call, const std::string& start_time);
- void write_main(const EMAlgorithm& em,
+ virtual void write_main(const EMAlgorithm& em,
const std::vector<std::string>& targ_ids,
const std::vector<int>& lengths);
- void write_bootstrap(const EMAlgorithm& em, int bs_id);
+ virtual void write_bootstrap(const EMAlgorithm& em, int bs_id);
private:
bool primed_;
@@ -76,3 +78,4 @@ class H5Converter {
};
#endif // KALLISTO_H5WRITER_H
+#endif // USE_HDF5
\ No newline at end of file
=====================================
src/ProcessReads.cpp
=====================================
@@ -167,6 +167,9 @@ int64_t ProcessReads(MasterProcessor& MP, const ProgramOptions& opt) {
// for each file
std::cerr << "[quant] finding pseudoalignments for the reads ..."; std::cerr.flush();
+ if (opt.verbose) {
+ std::cerr << std::endl;
+ }
/*if (opt.pseudobam) {
bam_hdr_t *t = createPseudoBamHeader(index);
@@ -177,7 +180,11 @@ int64_t ProcessReads(MasterProcessor& MP, const ProgramOptions& opt) {
MP.processReads();
numreads = MP.numreads;
nummapped = MP.nummapped;
- std::cerr << " done" << std::endl;
+ if (opt.verbose) {
+ std::cerr << std::endl << "[quant] done " << std::endl;
+ } else {
+ std::cerr << " done" << std::endl;
+ }
//std::cout << "betterCount = " << betterCount << ", out of betterCand = " << betterCand << std::endl;
@@ -237,16 +244,70 @@ int64_t ProcessBUSReads(MasterProcessor& MP, const ProgramOptions& opt) {
// for each file
std::cerr << "[quant] finding pseudoalignments for the reads ..."; std::cerr.flush();
- /*if (opt.pseudobam) {
- bam_hdr_t *t = createPseudoBamHeader(index);
- index.writePseudoBamHeader(std::cout);
- }*/
+ if (opt.genomebam) {
+ /*
+ // open bam files for writing
+ MP.bamh = createPseudoBamHeaderGenome(MP.model);
+ MP.bamfps = new htsFile*[MP.numSortFiles];
+ for (int i = 0; i < MP.numSortFiles; i++) {
+ MP.bamfps[i] = sam_open((opt.output + "/tmp." + std::to_string(i) + ".bam").c_str(), "wb1");
+ int r = sam_hdr_write(MP.bamfps[i], MP.bamh);
+ }
+
+ // assign breakpoints to chromosomes
+ MP.breakpoints.clear();
+ MP.breakpoints.assign(MP.numSortFiles -1 , (((uint64_t)-1)<<32));
+ std::vector<std::vector<std::pair<uint32_t, uint32_t>>> chrWeights(MP.model.chr.size());
+ for (const auto& t : MP.model.genes) {
+ if (t.chr != -1 && t.stop > 0) {
+ chrWeights[t.chr].push_back({t.stop, 1});
+ }
+ }
+
+ double sum = 0;
+ for (const auto &chrw : chrWeights) {
+ for (const auto &p : chrw) {
+ sum += p.second;
+ }
+ }
+ double bpLimit = sum / (MP.numSortFiles-1);
+
+ for (auto& chrw : chrWeights) {
+ // sort each by stop point
+ std::sort(chrw.begin(), chrw.end(), [](std::pair<uint32_t, uint32_t> a, std::pair<uint32_t, uint32_t> b) { return a.first < b.first;});
+ }
+
+ double bp = 0.0;
+ int ifile = 0;
+ for (int i = 0; i < chrWeights.size(); i++) {
+ const auto &chrw = chrWeights[i];
+ for (const auto &x : chrw) {
+ bp += x.second;
+ if (bp > bpLimit) {
+ uint64_t pos = ((uint64_t) i) << 32 | ((uint64_t) x.first+1) << 1;
+ MP.breakpoints[ifile] = pos;
+ ifile++;
+ while (bp > bpLimit) {
+ bp -= bpLimit;
+ }
+ }
+ }
+ }
+ */
+ }
+
+
+
MP.processReads();
numreads = MP.numreads;
nummapped = MP.nummapped;
- std::cerr << " done" << std::endl;
+ if (opt.verbose) {
+ std::cerr << std::endl << "[quant] done " << std::endl;
+ } else {
+ std::cerr << " done" << std::endl;
+ }
//std::cout << "betterCount = " << betterCount << ", out of betterCand = " << betterCand << std::endl;
@@ -865,6 +926,7 @@ ReadProcessor::ReadProcessor(const KmerIndex& index, const ProgramOptions& opt,
if (opt.batch_mode) {
assert(id != -1);
batchSR.files = opt.batch_files[id];
+ batchSR.nfiles = opt.batch_files[id].size();
batchSR.reserveNfiles(opt.batch_files[id].size());
if (opt.umi) {
batchSR.umi_files = {opt.umi_files[id]};
@@ -951,6 +1013,7 @@ void ReadProcessor::processBuffer() {
std::vector<int> vtmp;
std::vector<int> u;
+
u.reserve(1000);
v1.reserve(1000);
v2.reserve(1000);
@@ -1199,13 +1262,17 @@ void ReadProcessor::processBuffer() {
*/
}
+ if (mp.opt.verbose && numreads > 0 && numreads % 1000000 == 0 ) {
+ int nmap = mp.nummapped;
+ for (int i = 0; i < counts.size(); i++) {
+ nmap += counts[i];
+ }
+ nmap += newEcs.size();
-
-
- /*
- if (opt.verbose && numreads % 100000 == 0 ) {
- std::cerr << "[quant] Processed " << pretty_num(numreads) << std::endl;
- }*/
+ std::cerr << '\r' << (numreads/1000000) << "M reads processed ("
+ << std::fixed << std::setw( 3 ) << std::setprecision( 1 ) << ((100.0*nmap)/double(numreads))
+ << "% pseudoaligned)"; std::cerr.flush();
+ }
}
}
@@ -1223,7 +1290,7 @@ void ReadProcessor::clear() {
BUSProcessor::BUSProcessor(const KmerIndex& index, const ProgramOptions& opt, const MinCollector& tc, MasterProcessor& mp, int _id, int _local_id) :
- paired(!opt.single_end), bam(opt.bam), num(opt.num), tc(tc), index(index), mp(mp), id(_id), local_id(_local_id) {
+ paired(!opt.single_end), bam(opt.bam), num(opt.num), tc(tc), index(index), mp(mp), id(_id), local_id(_local_id), numreads(0) {
// initialize buffer
bufsize = mp.bufsize;
buffer = new char[bufsize];
@@ -1288,17 +1355,16 @@ void BUSProcessor::operator()() {
// release the reader lock
}
// do the same for BUS ?!?
- /*
+
pseudobatch.aln.clear();
- pseudobatch.batch_id = readbatch_id;*/
+ pseudobatch.batch_id = readbatch_id;
// process our sequences
processBuffer();
// update the results, MP acquires the lock
std::vector<std::pair<int, std::string>> ec_umi;
std::vector<std::pair<std::vector<int>, std::string>> new_ec_umi;
- PseudoAlignmentBatch pseudobatch;
- mp.update(counts, newEcs, ec_umi, new_ec_umi, paired ? seqs.size()/2 : seqs.size(), flens, bias5, pseudobatch, bv, newB, &bc_len[0], &umi_len[0], id, local_id);
+ mp.update(counts, newEcs, ec_umi, new_ec_umi, seqs.size() / mp.opt.busOptions.nfiles , flens, bias5, pseudobatch, bv, newB, &bc_len[0], &umi_len[0], id, local_id);
clear();
}
}
@@ -1316,28 +1382,35 @@ void BUSProcessor::processBuffer() {
memset(&bc_len[0], 0, 33);
memset(&umi_len[0], 0, 33);
- const char* s1 = 0;
- const char* s2 = 0;
- const char* s3 = 0;
- int l1=0,l2=0,l3=0;
-
- const char *s[3] = {0,0,0};
- int l[3] = {0,0,0};
+ const BUSOptions& busopt = mp.opt.busOptions;
+
char buffer[100];
memset(buffer, 0, 100);
char *umi = &(buffer[50]);
char *bc = &(buffer[0]);
+ //char *seqbuffer[1000];
+ std::string seqbuffer;
+ seqbuffer.reserve(1000);
+
+ // actually process the sequence
- // actually process the sequences
- const BUSOptions& busopt = mp.opt.busOptions;
int incf, jmax;
if (bam) {
incf = 1;
jmax = 2;
+
} else {
incf = busopt.nfiles-1;
jmax = busopt.nfiles;
}
+
+ std::vector<const char*> s(jmax, nullptr);
+ std::vector<int> l(jmax,0);
+
+ bool singleSeq = busopt.seq.size() ==1 ;
+ const BUSOptionSubstr seqopt = busopt.seq.front();
+
+
//int incf = (bam) ? 1 : busopt.nfiles-1;
for (int i = 0; i + incf < seqs.size(); i++) {
for (int j = 0; j < jmax /*(bam) ? 2 : busopt.nfiles*/; j++) {
@@ -1347,8 +1420,24 @@ void BUSProcessor::processBuffer() {
i += incf;
// find where the sequence is
- const char *seq = s[busopt.seq.fileno] + busopt.seq.start;
- int seqlen = l[busopt.seq.fileno];
+ const char *seq = nullptr;
+
+ size_t seqlen = 0;
+ if (singleSeq) {
+ seq = s[seqopt.fileno] + seqopt.start;
+ seqlen = l[seqopt.fileno];
+ } else {
+ seqbuffer.clear();
+ for (int j = 0; j < busopt.seq.size(); j++) {
+ const auto &sopt = busopt.seq[j];
+ int cplen = (sopt.stop == 0) ? l[sopt.fileno]-sopt.start : sopt.stop - sopt.start;
+ seqbuffer.append(s[sopt.fileno] + sopt.start, cplen);
+ seqbuffer.push_back('N');
+ }
+ seqbuffer.push_back(0);
+ seq = seqbuffer.c_str();
+ seqlen = seqbuffer.size();
+ }
// copy the umi
int umilen = (busopt.umi.start == busopt.umi.stop) ? l[busopt.umi.fileno] - busopt.umi.start : busopt.umi.stop - busopt.umi.start;
@@ -1430,8 +1519,43 @@ void BUSProcessor::processBuffer() {
// push back BUS record
b.ec = ec;
bv.push_back(b);
+ }
+ }
+
+ if (mp.opt.pseudobam) {
+ PseudoAlignmentInfo info;
+ info.id = i;
+ info.paired = false;
+ uint32_t f = 0;
+ info.barcode = stringToBinary(bc, blen, f);
+ info.UMI = stringToBinary(umi, umilen, f);
+ if (!u.empty()) {
+ info.r1empty = v.empty();
+ KmerEntry val;
+ info.k1pos = findFirstMappingKmer(v,val);
+ info.k2pos = -1;
+
+ if (ec != -1) {
+ info.ec_id = ec;
+ } else {
+ info.ec_id = -1;
+ info.u = u; // copy
+ }
}
+ pseudobatch.aln.push_back(std::move(info));
}
+
+ if (mp.opt.verbose && numreads > 0 && numreads % 1000000 == 0 ) {
+ int nmap = mp.nummapped;
+ for (int i = 0; i < counts.size(); i++) {
+ nmap += counts[i];
+ }
+ nmap += newEcs.size();
+
+ std::cerr << '\r' << (numreads/1000000) << "M reads processed ("
+ << std::fixed << std::setw( 3 ) << std::setprecision( 1 ) << ((100.0*nmap)/double(numreads))
+ << "% pseudoaligned)"; std::cerr.flush();
+ }
}
}
@@ -1458,6 +1582,7 @@ AlnProcessor::AlnProcessor(const KmerIndex& index, const ProgramOptions& opt, Ma
/* need to check this later */
assert(id != -1);
batchSR.files = opt.batch_files[id];
+ batchSR.nfiles = opt.batch_files[id].size();
batchSR.reserveNfiles(opt.batch_files[id].size());
if (opt.umi) {
batchSR.umi_files = {opt.umi_files[id]};
@@ -1976,26 +2101,56 @@ void AlnProcessor::processBufferGenome() {
std::vector<std::pair<int,double>> ua;
u.reserve(1000);
ua.reserve(1000);
+
+ int bclen = 0;
+ int umilen = 0;
int genome_auxlen = default_genome_auxlen;
if (!useEM) {
genome_auxlen -= 7;
}
+ if (mp.opt.bus_mode) {
+ //todo replace with what is written to busfile
+ bclen = mp.opt.busOptions.getBCLength();
+ if (bclen == 0) {
+ bclen = 32;
+ }
+ umilen = mp.opt.busOptions.getUMILength();
+ if (umilen == 0) {
+ umilen = 32;
+ }
+
+ genome_auxlen += (bclen + umilen) + 8;
+ }
+ std::string bc, umi;
Kmer km1,km2;
KmerEntry val1, val2;
+ if (mp.opt.bus_mode) {
+ paired = false;
+ }
+ bool readpaired = paired || (mp.opt.bus_mode && mp.opt.busOptions.nfiles == 2);
+
+
for (int i = 0; i < n; i++) {
bam1_t b1,b2, b1c, b2c;
- int si1 = (paired) ? 2*i : i;
- int si2 = (paired) ? 2*i +1 : -1;
+ int si1 = (readpaired) ? 2*i : i;
+ int si2 = (readpaired) ? 2*i +1 : -1;
+
int rlen1 = seqs[si1].second;
int rlen2;
- if (paired) {
+ if (readpaired) {
rlen2 = seqs[si2].second;
}
+ if (mp.opt.busOptions.seq.front().fileno == 1) {
+ std::swap(si1,si2);
+ std::swap(rlen1,rlen2);
+ }
+
+
// fill in the bam core info
b1.core.tid = -1;
b1.core.pos = -1;
@@ -2029,6 +2184,22 @@ void AlnProcessor::processBufferGenome() {
fillBamRecord(b2, nullptr, seqs[si2].first, names[si2].first, quals[si2].first, seqs[si2].second, names[si2].second, pi.r2empty, genome_auxlen);
// b2.id = idnum++;
}
+
+ if (mp.opt.bus_mode) {
+ b1.data[b1.l_data] = 'C';
+ b1.data[b1.l_data+1] = 'R';
+ b1.data[b1.l_data+2] = 'Z';
+ bc = binaryToString(pi.barcode, bclen);
+ memcpy(b1.data + b1.l_data + 3, bc.c_str(), bclen+1);
+ b1.l_data += bclen + 4;
+
+ b1.data[b1.l_data] = 'U';
+ b1.data[b1.l_data+1] = 'R';
+ b1.data[b1.l_data+2] = 'Z';
+ umi = binaryToString(pi.UMI, umilen);
+ memcpy(b1.data + b1.l_data + 3, umi.c_str(), umilen+2);
+ b1.l_data += umilen + 4;
+ }
if (pi.r1empty && pi.r2empty) {
bv.push_back(b1);
@@ -2111,7 +2282,7 @@ void AlnProcessor::processBufferGenome() {
// set aux
float zero = 0.0;
- assert(b1.l_data + genome_auxlen <= b1.m_data);
+ //assert(b1.l_data + genome_auxlen <= b1.m_data);
if (useEM) {
b1.data[b1.l_data] = 'Z';
@@ -2121,7 +2292,7 @@ void AlnProcessor::processBufferGenome() {
b1.l_data += 7;
if (paired) {
- assert(b2.l_data + genome_auxlen <= b2.m_data);
+ //assert(b2.l_data + genome_auxlen <= b2.m_data);
b2.data[b2.l_data] = 'Z';
b2.data[b2.l_data+1] = 'W';
b2.data[b2.l_data+2] = 'f';
@@ -2165,7 +2336,6 @@ void AlnProcessor::processBufferGenome() {
if (!pi.r1empty) {
km1 = Kmer(seqs[si1].first + pi.k1pos);
strInfo1 = strandednessInfo(km1, val1, ua);
-
}
if (paired && !pi.r2empty) {
km2 = Kmer(seqs[si2].first + pi.k2pos);
@@ -2743,7 +2913,8 @@ bool FastqSequenceReader::fetchSequences(char *buf, const int limit, std::vector
umis.emplace_back(std::move(umi));
}
- flags.push_back(numreads++);
+ numreads++;
+ flags.push_back(numreads);
} else {
return true; // read it next time
}
@@ -2759,7 +2930,7 @@ bool FastqSequenceReader::fetchSequences(char *buf, const int limit, std::vector
}
// move constructor
-#if 1
+
FastqSequenceReader::FastqSequenceReader(FastqSequenceReader&& o) :
nfiles(o.nfiles),
numreads(o.numreads),
@@ -2770,34 +2941,16 @@ FastqSequenceReader::FastqSequenceReader(FastqSequenceReader&& o) :
files(std::move(o.files)),
umi_files(std::move(o.umi_files)),
f_umi(std::move(o.f_umi)),
- current_file(o.current_file) {
+ current_file(o.current_file),
+ seq(std::move(o.seq)) {
o.fp.resize(nfiles);
o.l.resize(nfiles, 0);
o.nl.resize(nfiles, 0);
+ o.seq.resize(nfiles, nullptr);
o.state = false;
}
-#endif
-#if 0
-FastqSequenceReader::FastqSequenceReader(FastqSequenceReader&& o) {
- fp = (std::move(o.fp));
- l = (std::move(o.l));
- nl = (std::move(o.nl));
- paired = (o.paired);
- nfiles = (o.nfiles);
- files = (std::move(o.files)),
- umi_files = (std::move(o.umi_files));
- f_umi = (std::move(o.f_umi));
- current_file= (o.current_file);
- seq = (std::move(o.seq));
- o.fp.resize(nfiles);
- o.l.resize(nfiles, 0);
- o.nl.resize(nfiles, 0);
- o.state = false;
- o.seq.resize(nfiles, nullptr);
-}
-#endif
const std::string BamSequenceReader::seq_enc = "=ACMGRSVTWYHKDBN";
BamSequenceReader::~BamSequenceReader() {
=====================================
src/ProcessReads.h
=====================================
@@ -161,7 +161,7 @@ public:
char *umi;
int l_umi;
int err;
- char seq[256]; // Is there a better limit?
+
private:
static const std::string seq_enc;
@@ -334,6 +334,7 @@ public:
int64_t numreads;
int id;
int local_id;
+ PseudoAlignmentBatch pseudobatch;
int bc_len[33];
int umi_len[33];
=====================================
src/PseudoBam.cpp
=====================================
@@ -407,6 +407,8 @@ void writePseudoAlignmentBatch(std::ofstream& of, const PseudoAlignmentBatch& ba
of.write((char*)&k1, 1);
of.write((char*)&k2, 1);
of.write((char*)&x.ec_id,sizeof(int32_t));
+ of.write((char*)&x.barcode, sizeof(uint64_t));
+ of.write((char*)&x.UMI, sizeof(uint64_t));
if (x.ec_id == -1) {
// exceptional case, no ec_id, yet, but need to write the v vector
uint32_t sz = x.u.size();
@@ -445,6 +447,8 @@ void readPseudoAlignmentBatch(std::ifstream& in, PseudoAlignmentBatch& batch) {
info.k1pos = (k1 == 255) ? -1 : k1;
info.k2pos = (k2 == 255) ? -1 : k2;
in.read((char*)&info.ec_id, sizeof(int32_t));
+ in.read((char*)&info.barcode, sizeof(uint64_t));
+ in.read((char*)&info.UMI, sizeof(uint64_t));
if (info.ec_id == -1) {
uint32_t sz;
in.read((char*)&sz, sizeof(uint32_t));
=====================================
src/PseudoBam.h
=====================================
@@ -31,7 +31,10 @@ struct PseudoAlignmentInfo {
int k2pos;
int32_t ec_id;
std::vector<int32_t> u;
- PseudoAlignmentInfo() : id(-1), r1empty (true), r2empty(true), paired(true), k1pos(-1), k2pos(-1), ec_id(-1) {}
+ uint64_t barcode;
+ uint64_t UMI;
+
+ PseudoAlignmentInfo() : id(-1), r1empty (true), r2empty(true), paired(true), k1pos(-1), k2pos(-1), ec_id(-1), barcode(0), UMI(0) {}
};
=====================================
src/common.h
=====================================
@@ -1,7 +1,7 @@
#ifndef KALLISTO_COMMON_H
#define KALLISTO_COMMON_H
-#define KALLISTO_VERSION "0.46.1"
+#define KALLISTO_VERSION "0.46.2"
#include <string>
#include <vector>
@@ -24,7 +24,7 @@ struct BUSOptions {
BUSOptionSubstr umi;
std::vector<BUSOptionSubstr> bc;
- BUSOptionSubstr seq;
+ std::vector<BUSOptionSubstr> seq;
int getBCLength() const {
int r =0 ;
=====================================
src/h5utils.cpp
=====================================
@@ -1,3 +1,4 @@
+#ifdef USE_HDF5
#include "h5utils.h"
// allocate a contiguous block of memory, dependent on the largest string
@@ -138,3 +139,4 @@ void read_vector(
delete [] pool;
}
+#endif // USE_HDF5
\ No newline at end of file
=====================================
src/h5utils.h
=====================================
@@ -1,3 +1,4 @@
+#ifdef USE_HDF5
#ifndef KALLISTO_H5_UTILS
#define KALLISTO_H5_UTILS
@@ -144,3 +145,4 @@ void read_dataset(hid_t group_id,
// end: reading utils
#endif // KALLISTO_H5_UTILS
+#endif // USE_HDF5
\ No newline at end of file
=====================================
src/kseq.h deleted
=====================================
@@ -1,242 +0,0 @@
-/* The MIT License
-
- Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor at live.co.uk>
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-*/
-
-/* Last Modified: 05MAR2012 */
-
-#ifndef AC_KSEQ_H
-#define AC_KSEQ_H
-
-#include <ctype.h>
-#include <string.h>
-#include <stdlib.h>
-
-#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
-#define KS_SEP_TAB 1 // isspace() && !' '
-#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows)
-#define KS_SEP_MAX 2
-
-#define __KS_TYPE(type_t) \
- typedef struct __kstream_t { \
- unsigned char *buf; \
- int begin, end, is_eof; \
- type_t f; \
- } kstream_t;
-
-#define ks_err(ks) ((ks)->end == -1)
-#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
-#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
-
-#define __KS_BASIC(type_t, __bufsize) \
- static inline kstream_t *ks_init(type_t f) \
- { \
- kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
- ks->f = f; \
- ks->buf = (unsigned char*)malloc(__bufsize); \
- return ks; \
- } \
- static inline void ks_destroy(kstream_t *ks) \
- { \
- if (ks) { \
- free(ks->buf); \
- free(ks); \
- } \
- }
-
-#define __KS_GETC(__read, __bufsize) \
- static inline int ks_getc(kstream_t *ks) \
- { \
- if (ks_err(ks)) return -3; \
- if (ks->is_eof && ks->begin >= ks->end) return -1; \
- if (ks->begin >= ks->end) { \
- ks->begin = 0; \
- ks->end = __read(ks->f, ks->buf, __bufsize); \
- if (ks->end == 0) { ks->is_eof = 1; return -1;} \
- if (ks->end == -1) { ks->is_eof = 1; return -3;}\
- } \
- return (int)ks->buf[ks->begin++]; \
- }
-
-#ifndef KSTRING_T
-#define KSTRING_T kstring_t
-typedef struct __kstring_t {
- size_t l, m;
- char *s;
-} kstring_t;
-#endif
-
-#ifndef kroundup32
-#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
-#endif
-
-#define __KS_GETUNTIL(__read, __bufsize) \
- static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
- { \
- int gotany = 0; \
- if (dret) *dret = 0; \
- str->l = append? str->l : 0; \
- for (;;) { \
- int i; \
- if (ks_err(ks)) return -3; \
- if (ks->begin >= ks->end) { \
- if (!ks->is_eof) { \
- ks->begin = 0; \
- ks->end = __read(ks->f, ks->buf, __bufsize); \
- if (ks->end == 0) { ks->is_eof = 1; break; } \
- if (ks->end == -1) { ks->is_eof = 1; return -3; } \
- } else break; \
- } \
- if (delimiter == KS_SEP_LINE) { \
- for (i = ks->begin; i < ks->end; ++i) \
- if (ks->buf[i] == '\n') break; \
- } else if (delimiter > KS_SEP_MAX) { \
- for (i = ks->begin; i < ks->end; ++i) \
- if (ks->buf[i] == delimiter) break; \
- } else if (delimiter == KS_SEP_SPACE) { \
- for (i = ks->begin; i < ks->end; ++i) \
- if (isspace(ks->buf[i])) break; \
- } else if (delimiter == KS_SEP_TAB) { \
- for (i = ks->begin; i < ks->end; ++i) \
- if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
- } else i = 0; /* never come to here! */ \
- if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
- str->m = str->l + (i - ks->begin) + 1; \
- kroundup32(str->m); \
- str->s = (char*)realloc(str->s, str->m); \
- } \
- gotany = 1; \
- memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
- str->l = str->l + (i - ks->begin); \
- ks->begin = i + 1; \
- if (i < ks->end) { \
- if (dret) *dret = ks->buf[i]; \
- break; \
- } \
- } \
- if (!gotany && ks_eof(ks)) return -1; \
- if (str->s == 0) { \
- str->m = 1; \
- str->s = (char*)calloc(1, 1); \
- } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
- str->s[str->l] = '\0'; \
- return str->l; \
- } \
- static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
- { return ks_getuntil2(ks, delimiter, str, dret, 0); }
-
-#define KSTREAM_INIT(type_t, __read, __bufsize) \
- __KS_TYPE(type_t) \
- __KS_BASIC(type_t, __bufsize) \
- __KS_GETC(__read, __bufsize) \
- __KS_GETUNTIL(__read, __bufsize)
-
-#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
-
-#define __KSEQ_BASIC(SCOPE, type_t) \
- SCOPE kseq_t *kseq_init(type_t fd) \
- { \
- kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
- s->f = ks_init(fd); \
- return s; \
- } \
- SCOPE void kseq_destroy(kseq_t *ks) \
- { \
- if (!ks) return; \
- free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
- ks_destroy(ks->f); \
- free(ks); \
- }
-
-/* Return value:
- >=0 length of the sequence (normal)
- -1 end-of-file
- -2 truncated quality string
- -3 error reading stream
- */
-#define __KSEQ_READ(SCOPE) \
- SCOPE int kseq_read(kseq_t *seq) \
- { \
- int c,r; \
- kstream_t *ks = seq->f; \
- if (seq->last_char == 0) { /* then jump to the next header line */ \
- while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '@'); \
- if (c < 0) return c; /* end of file or error*/ \
- seq->last_char = c; \
- } /* else: the first header char has been read in the previous call */ \
- seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
- if ((r=ks_getuntil(ks, 0, &seq->name, &c)) < 0) return r; /* normal exit: EOF or error */ \
- if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
- if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
- seq->seq.m = 256; \
- seq->seq.s = (char*)malloc(seq->seq.m); \
- } \
- while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '+' && c != '@') { \
- if (c == '\n') continue; /* skip empty lines */ \
- seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
- ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
- } \
- if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
- if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
- seq->seq.m = seq->seq.l + 2; \
- kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
- seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
- } \
- seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
- if (c != '+') return seq->seq.l; /* FASTA */ \
- if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \
- seq->qual.m = seq->seq.m; \
- seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
- } \
- while ((c = ks_getc(ks)) >= 0 && c != '\n'); /* skip the rest of '+' line */ \
- if (c == -1) return -2; /* error: no quality string */ \
- while ((c = ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l)); \
- if (c == -3) return -3; /* stream error */ \
- seq->last_char = 0; /* we have not come to the next header line */ \
- if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
- return seq->seq.l; \
- }
-
-#define __KSEQ_TYPE(type_t) \
- typedef struct { \
- kstring_t name, comment, seq, qual; \
- int last_char; \
- kstream_t *f; \
- } kseq_t;
-
-#define KSEQ_INIT2(SCOPE, type_t, __read) \
- KSTREAM_INIT(type_t, __read, 16384) \
- __KSEQ_TYPE(type_t) \
- __KSEQ_BASIC(SCOPE, type_t) \
- __KSEQ_READ(SCOPE)
-
-#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
-
-#define KSEQ_DECLARE(type_t) \
- __KS_TYPE(type_t) \
- __KSEQ_TYPE(type_t) \
- extern kseq_t *kseq_init(type_t fd); \
- void kseq_destroy(kseq_t *ks); \
- int kseq_read(kseq_t *seq);
-
-#endif
=====================================
src/main.cpp
=====================================
@@ -23,6 +23,7 @@
#include "Inspect.h"
#include "Bootstrap.h"
#include "H5Writer.h"
+#include "PlaintextWriter.h"
#include "GeneModel.h"
#include "Merge.h"
@@ -238,6 +239,7 @@ void ParseOptionsEM(int argc, char **argv, ProgramOptions& opt) {
}
case 'c': {
stringstream(optarg) >> opt.chromFile;
+ break;
}
case 'd': {
stringstream(optarg) >> opt.seed;
@@ -542,8 +544,12 @@ void ListSingleCellTechnologies() {
}
void ParseOptionsBus(int argc, char **argv, ProgramOptions& opt) {
- const char *opt_string = "i:o:x:t:lbn";
+ int verbose_flag = 0;
+ int gbam_flag = 0;
+
+ const char *opt_string = "i:o:x:t:lbng:c:";
static struct option long_options[] = {
+ {"verbose", no_argument, &verbose_flag, 1},
{"index", required_argument, 0, 'i'},
{"output-dir", required_argument, 0, 'o'},
{"technology", required_argument, 0, 'x'},
@@ -551,6 +557,9 @@ void ParseOptionsBus(int argc, char **argv, ProgramOptions& opt) {
{"threads", required_argument, 0, 't'},
{"bam", no_argument, 0, 'b'},
{"num", no_argument, 0, 'n'},
+ {"genomebam", no_argument, &gbam_flag, 1},
+ {"gtf", required_argument, 0, 'g'},
+ {"chromosomes", required_argument, 0, 'c'},
{0,0,0,0}
};
@@ -596,6 +605,14 @@ void ParseOptionsBus(int argc, char **argv, ProgramOptions& opt) {
opt.num = true;
break;
}
+ case 'g': {
+ stringstream(optarg) >> opt.gtfFile;
+ break;
+ }
+ case 'c': {
+ stringstream(optarg) >> opt.chromFile;
+ break;
+ }
default: break;
}
}
@@ -605,6 +622,16 @@ void ParseOptionsBus(int argc, char **argv, ProgramOptions& opt) {
exit(1);
}
+ if (verbose_flag) {
+ opt.verbose = true;
+ }
+
+ if (gbam_flag) {
+ opt.pseudobam = true;
+ opt.genomebam = true;
+ }
+
+
// all other arguments are fast[a/q] files to be read
for (int i = optind; i < argc; i++) {
opt.files.push_back(argv[i]);
@@ -612,107 +639,115 @@ void ParseOptionsBus(int argc, char **argv, ProgramOptions& opt) {
}
-bool ParseTechnology(const std::string &techstr, std::vector<BUSOptionSubstr> &values, std::vector<int> &files, std::vector<std::string> &errorList, std::vector<BUSOptionSubstr> &bcValues) {
- int lastIndex = 0; //the last place in the string a colon and punctuation was found
- int currValue = 1; //tells us the index of the three sequences needed barcode, umi, sequence
- vector<int> numbers; //stores the numbers in a pairs
- const char punctuationCompare = ',';
- const char colonCompare = ':';
- bool colon;
- bool duplicate = false;
- std::string stringVal;
- int val;
-
- for(int i = 1; i < techstr.length(); i++) {
- colon = false;
- if(techstr[i] == colonCompare) {
- colon = true;
- }
-
- if(techstr[i] == punctuationCompare || colon) {
- stringVal = techstr.substr(lastIndex, i-lastIndex);
+bool ParseTechnology(const std::string &techstr, BUSOptions& busopt, std::vector<std::string> &errorList) {
+ auto i1 = techstr.find(':');
+ if (i1 == std::string::npos) {
+ errorList.push_back("Error: technology string must contain two colons (:), none found: \"" + techstr + "\"");
+ return false;
+ }
+ auto i2 = techstr.find(':', i1+1);
+ if (i2 == std::string::npos) {
+ errorList.push_back("Error: technology string must contain two colons (:), only one found: \"" + techstr + "\"");
+ return false;
+ }
+ auto ip = techstr.find(':', i2+1);
+ if (ip != std::string::npos) {
+ errorList.push_back("Error: technology string must contain two colons (:), three found: \"" + techstr + "\"");
+ return false;
+ }
+ auto bcstr = techstr.substr(0, i1);
+ auto umistr = techstr.substr(i1+1,i2-i1-1);
+ auto seqstr = techstr.substr(i2+1);
+
+
+
+ int maxnf = 0;
+
+ auto convert_commas_to_vector = [&](const std::string &s, std::vector<BUSOptionSubstr> &v) -> bool {
+ std::vector<int> vv;
+ v.clear();
+ std::stringstream ss(s);
+ std::string t;
+ while (std::getline(ss, t, ',')) {
try {
- val = stoi(stringVal);
- numbers.push_back(val);
- if(numbers.size() == 1) {
- if(!files.empty()) {
- for(int j = 0; j < files.size(); j++) {
- if(val == files[j]) {
- duplicate = true;
- break;
- }
- }
- if(!duplicate) {
- files.push_back(val);
- } else {
- duplicate = false;
- }
- } else {
- files.push_back(val);
- }
- }
- lastIndex = i + 1;
- } catch(const std:: invalid_argument& ia) {
- errorList.push_back("Error: Invalid argument");
- return true;
+ int i = stoi(t);
+ vv.push_back(i);
+ } catch (std::invalid_argument &e) {
+ errorList.push_back("Error: converting to int: \"" + t + "\"");
+ return false;
}
+ }
- if(colon) {
- if(numbers.size() != 3) {
- errorList.push_back("Error: Wrong number of pairs provided");
- return true;
+ int nv = vv.size();
+ if (nv % 3 == 0) {
+ for (int i = 0; i+2 < nv; i+=3) {
+ int f = vv[i];
+ int a = vv[i+1];
+ int b = vv[i+2];
+ if (f < 0) {
+ errorList.push_back("Error: invalid file number (" + to_string(f) + ") " + s);
+ }
+ if (a < 0) {
+ errorList.push_back("Error: invalid start (" + to_string(a) + ") " + s);
+ }
+ if (b != 0 && b <= a) {
+ errorList.push_back("Error: invalid stop (" + to_string(b) + ") has to be after start (" + to_string(a) + ") " + s);
+ }
+ v.push_back(BUSOptionSubstr(f,a,b));
+ if (f > maxnf) {
+ maxnf = f;
}
}
+ } else {
+ errorList.push_back("Error: number of values has to be multiple of 3 " + s);
+ return false;
}
- if(numbers.size() == 3) {
- if(currValue == 1) {
- bcValues.push_back(BUSOptionSubstr(numbers[0], numbers[1], numbers[2]));
- } else {
- values.push_back(BUSOptionSubstr(numbers[0], numbers[1], numbers[2]));
- }
- numbers.clear();
- }
- if(colon) {
- currValue++;
- }
- }
- if (files.empty()) {
- errorList.push_back(std::string("Error: parsing technology string \"") + techstr + "\"");
+ busopt.nfiles = maxnf+1;
return true;
+ };
+
+
+
+ std::vector<BUSOptionSubstr> v;
+ if (!convert_commas_to_vector(bcstr,v)) {
+ return false;
}
+ if (v.empty()) {
+ errorList.push_back("Error: empty barcode list " + bcstr);
+ return false;
+ }
+ busopt.bc = std::move(v);
- std::sort(files.begin(), files.end());
- for(int k = 0; k < files.size()-1; k++) {
- if(files[k]+1 != files[k+1]) {
- errorList.push_back("Error: files aren't correctly referenced");
- return true;
- }
+ if (!convert_commas_to_vector(umistr, v)) {
+ return false;
}
+ if (v.empty()) {
+ errorList.push_back("Error: empty UMI list " + umistr);
+ return false;
+ }
+ if (v.size() != 1) {
+ errorList.push_back("Error: only a single UMI list allow " + umistr);
+ return false;
+ }
+ busopt.umi = std::move(v.front());
- stringVal = techstr.substr(lastIndex, techstr.length()-lastIndex);
- if(numbers.size() == 2) {
- try {
- val = stoi(stringVal);
- numbers.push_back(val);
- values.push_back(BUSOptionSubstr(numbers[0], numbers[1], numbers[2]));
- numbers.clear();
- } catch(const std:: invalid_argument& ia) {
- errorList.push_back("Error: Invalid argument");
- return true;
- }
- } else {
- errorList.push_back("Error: Wrong number of pairs provided");
- return true;
+
+ if (!convert_commas_to_vector(seqstr, v)) {
+ return false;
}
- if(currValue != 3) {
- errorList.push_back("Error: Wrong number of substrings provided");
- return true;
+ if (v.empty()) {
+ errorList.push_back("Error: empty sequence list " + bcstr);
+ return false;
}
- return false;
+
+ busopt.seq = std::move(v);
+
+ return true;
}
+
void ParseOptionsH5Dump(int argc, char **argv, ProgramOptions& opt) {
int peek_flag = 0;
const char *opt_string = "o:";
@@ -828,55 +863,55 @@ bool CheckOptionsBus(ProgramOptions& opt) {
if (opt.bam) { // Note: only 10xV2 has been tested
busopt.nfiles = 1;
if (opt.technology == "10XV2") {
- busopt.seq = BUSOptionSubstr(1,0,0); // second file, entire string
+ busopt.seq.push_back(BUSOptionSubstr(1,0,0)); // second file, entire string
busopt.umi = BUSOptionSubstr(0,16,26); // first file [16:26]
busopt.bc.push_back(BUSOptionSubstr(0,0,16));
} else if (opt.technology == "10XV3") {
- busopt.seq = BUSOptionSubstr(1,0,0);
+ busopt.seq.push_back(BUSOptionSubstr(1,0,0));
busopt.umi = BUSOptionSubstr(0,16,28);
busopt.bc.push_back(BUSOptionSubstr(0,0,16));
// } else if (opt.technology == "10XV1") {
} else if (opt.technology == "SURECELL") {
- busopt.seq = BUSOptionSubstr(1,0,0);
+ busopt.seq.push_back(BUSOptionSubstr(1,0,0));
busopt.umi = BUSOptionSubstr(0,18,26);
busopt.bc.push_back(BUSOptionSubstr(0,0,18));
} else if (opt.technology == "DROPSEQ") {
- busopt.seq = BUSOptionSubstr(1,0,0);
+ busopt.seq.push_back(BUSOptionSubstr(1,0,0));
busopt.umi = BUSOptionSubstr(0,12,20);
busopt.bc.push_back(BUSOptionSubstr(0,0,12));
} else if (opt.technology == "INDROPSV1") {
busopt.nfiles = 2;
- busopt.seq = BUSOptionSubstr(1,0,0);
+ busopt.seq.push_back(BUSOptionSubstr(1,0,0));
busopt.umi = BUSOptionSubstr(0,42,48);
busopt.bc.push_back(BUSOptionSubstr(0,0,11));
busopt.bc.push_back(BUSOptionSubstr(0,30,38));
} else if (opt.technology == "INDROPSV2") {
busopt.nfiles = 2;
- busopt.seq = BUSOptionSubstr(0,0,0);
+ busopt.seq.push_back(BUSOptionSubstr(0,0,0));
busopt.umi = BUSOptionSubstr(1,42,48);
busopt.bc.push_back(BUSOptionSubstr(1,0,11));
busopt.bc.push_back(BUSOptionSubstr(1,30,38));
} else if (opt.technology == "INDROPSV3") {
busopt.nfiles = 3;
- busopt.seq = BUSOptionSubstr(2,0,0);
+ busopt.seq.push_back(BUSOptionSubstr(2,0,0));
busopt.umi = BUSOptionSubstr(1,8,14);
busopt.bc.push_back(BUSOptionSubstr(0,0,8));
busopt.bc.push_back(BUSOptionSubstr(1,0,8));
} else if (opt.technology == "CELSEQ") {
- busopt.seq = BUSOptionSubstr(1,0,0);
+ busopt.seq.push_back(BUSOptionSubstr(1,0,0));
busopt.umi = BUSOptionSubstr(0,8,12);
busopt.bc.push_back(BUSOptionSubstr(0,0,8));
} else if (opt.technology == "CELSEQ2") {
- busopt.seq = BUSOptionSubstr(1,0,0);
+ busopt.seq.push_back(BUSOptionSubstr(1,0,0));
busopt.umi = BUSOptionSubstr(0,0,6);
busopt.bc.push_back(BUSOptionSubstr(0,6,12));
} else if (opt.technology == "SCRBSEQ") {
- busopt.seq = BUSOptionSubstr(1,0,0);
+ busopt.seq.push_back(BUSOptionSubstr(1,0,0));
busopt.umi = BUSOptionSubstr(0,6,16);
busopt.bc.push_back(BUSOptionSubstr(0,0,6));
} else if (opt.technology == "INDROPSV3") {
- busopt.seq = BUSOptionSubstr(1,0,0);
+ busopt.seq.push_back(BUSOptionSubstr(1,0,0));
busopt.umi = BUSOptionSubstr(0,0,6);
busopt.bc.push_back(BUSOptionSubstr(0,6,16));
} else {
@@ -884,14 +919,18 @@ bool CheckOptionsBus(ProgramOptions& opt) {
vector<BUSOptionSubstr> values;
vector<BUSOptionSubstr> bcValues;
vector<std::string> errorList;
- bool invalid = ParseTechnology(opt.technology, values, files, errorList, bcValues);
- if(!invalid) {
+ //bool invalid = ParseTechnology(opt.technology, values, files, errorList, bcValues);
+ bool valid = ParseTechnology(opt.technology, busopt, errorList);
+
+ if(!valid) {
+ /*
busopt.nfiles = files.size();
for(int i = 0; i < bcValues.size(); i++) {
busopt.bc.push_back(bcValues[i]);
}
busopt.umi = values[0];
- busopt.seq = values[1];
+ busopt.seq.push_back(values[1]);
+ */
} else {
for(int j = 0; j < errorList.size(); j++) {
cerr << errorList[j] << endl;
@@ -903,67 +942,67 @@ bool CheckOptionsBus(ProgramOptions& opt) {
} else {
if (opt.technology == "10XV2") {
busopt.nfiles = 2;
- busopt.seq = BUSOptionSubstr(1,0,0); // second file, entire string
+ busopt.seq.push_back(BUSOptionSubstr(1,0,0)); // second file, entire string
busopt.umi = BUSOptionSubstr(0,16,26); // first file [16:26]
busopt.bc.push_back(BUSOptionSubstr(0,0,16));
} else if (opt.technology == "10XV3") {
busopt.nfiles = 2;
- busopt.seq = BUSOptionSubstr(1,0,0);
+ busopt.seq.push_back(BUSOptionSubstr(1,0,0));
busopt.umi = BUSOptionSubstr(0,16,28);
busopt.bc.push_back(BUSOptionSubstr(0,0,16));
} else if (opt.technology == "10XV1") {
busopt.nfiles = 3;
- busopt.seq = BUSOptionSubstr(0,0,0);
- busopt.umi = BUSOptionSubstr(1,0,0);
- busopt.bc.push_back(BUSOptionSubstr(2,0,0));
+ busopt.seq.push_back(BUSOptionSubstr(2,0,0));
+ busopt.umi = BUSOptionSubstr(1,0,10);
+ busopt.bc.push_back(BUSOptionSubstr(0,0,14));
} else if (opt.technology == "SURECELL") {
busopt.nfiles = 2;
- busopt.seq = BUSOptionSubstr(1,0,0);
+ busopt.seq.push_back(BUSOptionSubstr(1,0,0));
busopt.umi = BUSOptionSubstr(0,51,59);
busopt.bc.push_back(BUSOptionSubstr(0,0,6));
busopt.bc.push_back(BUSOptionSubstr(0,21,27));
busopt.bc.push_back(BUSOptionSubstr(0,42,48));
} else if (opt.technology == "DROPSEQ") {
busopt.nfiles = 2;
- busopt.seq = BUSOptionSubstr(1,0,0);
+ busopt.seq.push_back(BUSOptionSubstr(1,0,0));
busopt.umi = BUSOptionSubstr(0,12,20);
busopt.bc.push_back(BUSOptionSubstr(0,0,12));
} else if (opt.technology == "INDROPSV1") {
busopt.nfiles = 2;
- busopt.seq = BUSOptionSubstr(1,0,0);
+ busopt.seq.push_back(BUSOptionSubstr(1,0,0));
busopt.umi = BUSOptionSubstr(0,42,48);
busopt.bc.push_back(BUSOptionSubstr(0,0,11));
busopt.bc.push_back(BUSOptionSubstr(0,30,38));
} else if (opt.technology == "INDROPSV2") {
busopt.nfiles = 2;
- busopt.seq = BUSOptionSubstr(0,0,0);
+ busopt.seq.push_back(BUSOptionSubstr(0,0,0));
busopt.umi = BUSOptionSubstr(1,42,48);
busopt.bc.push_back(BUSOptionSubstr(1,0,11));
busopt.bc.push_back(BUSOptionSubstr(1,30,38));
} else if (opt.technology == "INDROPSV3") {
busopt.nfiles = 3;
- busopt.seq = BUSOptionSubstr(2,0,0);
+ busopt.seq.push_back(BUSOptionSubstr(2,0,0));
busopt.umi = BUSOptionSubstr(1,8,14);
busopt.bc.push_back(BUSOptionSubstr(0,0,8));
busopt.bc.push_back(BUSOptionSubstr(1,0,8));
} else if (opt.technology == "CELSEQ") {
busopt.nfiles = 2;
- busopt.seq = BUSOptionSubstr(1,0,0);
+ busopt.seq.push_back(BUSOptionSubstr(1,0,0));
busopt.umi = BUSOptionSubstr(0,8,12);
busopt.bc.push_back(BUSOptionSubstr(0,0,8));
} else if (opt.technology == "CELSEQ2") {
busopt.nfiles = 2;
- busopt.seq = BUSOptionSubstr(1,0,0);
+ busopt.seq.push_back(BUSOptionSubstr(1,0,0));
busopt.umi = BUSOptionSubstr(0,0,6);
busopt.bc.push_back(BUSOptionSubstr(0,6,12));
} else if (opt.technology == "SCRBSEQ") {
busopt.nfiles = 2;
- busopt.seq = BUSOptionSubstr(1,0,0);
+ busopt.seq.push_back(BUSOptionSubstr(1,0,0));
busopt.umi = BUSOptionSubstr(0,6,16);
busopt.bc.push_back(BUSOptionSubstr(0,0,6));
} else if (opt.technology == "INDROPSV3") {
busopt.nfiles = 3;
- busopt.seq = BUSOptionSubstr(2,0,0);
+ busopt.seq.push_back(BUSOptionSubstr(2,0,0));
busopt.umi = BUSOptionSubstr(1,8,14);
busopt.bc.push_back(BUSOptionSubstr(0,0,8));
busopt.bc.push_back(BUSOptionSubstr(1,0,8));
@@ -971,15 +1010,20 @@ bool CheckOptionsBus(ProgramOptions& opt) {
vector<int> files;
vector<BUSOptionSubstr> values;
vector<BUSOptionSubstr> bcValues;
- vector<std::string> errorList;
- bool invalid = ParseTechnology(opt.technology, values, files, errorList, bcValues);
- if(!invalid) {
+ vector<std::string> errorList;
+ //bool invalid = ParseTechnology(opt.technology, values, files, errorList, bcValues);
+ bool valid = ParseTechnology(opt.technology, busopt, errorList);
+
+
+ if(valid) {
+ /*
busopt.nfiles = files.size();
for(int i = 0; i < bcValues.size(); i++) {
busopt.bc.push_back(bcValues[i]);
}
busopt.umi = values[0];
- busopt.seq = values[1];
+ busopt.seq.push_back(values[1]);
+ */
} else {
for(int j = 0; j < errorList.size(); j++) {
cerr << errorList[j] << endl;
@@ -991,6 +1035,30 @@ bool CheckOptionsBus(ProgramOptions& opt) {
}
}
+ if (opt.genomebam) {
+ if (opt.busOptions.seq.size() != 1) {
+ cerr << "Error: BAM output is currently only supported for technologies with a single CDNA read file" << endl;
+ ret = false;
+ }
+ if (!opt.gtfFile.empty()) {
+ if (!checkFileExists(opt.gtfFile)) {
+ cerr << "Error: GTF file " << opt.gtfFile << " does not exist" << endl;
+ ret = false;
+ }
+ } else {
+ cerr << "Error: need GTF file for genome alignment" << endl;
+ ret = false;
+ }
+ if (!opt.chromFile.empty()) {
+ if (!checkFileExists(opt.chromFile)) {
+ cerr << "Error: Chromosome file not found: " << opt.chromFile << endl;
+ ret = false;
+ }
+ }
+ }
+
+
+
if (ret && !opt.bam && opt.files.size() % opt.busOptions.nfiles != 0) {
cerr << "Error: Number of files (" << opt.files.size() << ") does not match number of input files required by "
<< "technology " << opt.technology << " (" << opt.busOptions.nfiles << ")" << endl;
@@ -1223,6 +1291,14 @@ bool CheckOptionsEM(ProgramOptions& opt, bool emonly = false) {
ret = false;
}
+ #ifndef USE_HDF5
+ if (opt.bootstrap > 0 && !opt.plaintext) {
+ cerr << "Warning: kallisto was not compiled with HDF5 support so no bootstrapping" << endl
+ << "will be performed. Run quant with --plaintext option or recompile with" << endl
+ << "HDF5 support to obtain bootstrap estimates." << endl;
+ opt.bootstrap = 0;
+ }
+ #endif
return ret;
}
@@ -1648,7 +1724,8 @@ void usageBus() {
<< "-l, --list List all single-cell technologies supported" << endl
<< "-t, --threads=INT Number of threads to use (default: 1)" << endl
<< "-b, --bam Input file is a BAM file" << endl
- << "-n, --num Output number of read in flag column (incompatible with --bam)" << endl;
+ << "-n, --num Output number of read in flag column (incompatible with --bam)" << endl
+ << " --verbose Print out progress information every 1M proccessed reads" << endl;
}
void usageIndex() {
@@ -1715,7 +1792,8 @@ void usageEM(bool valid_input = true) {
<< "-g, --gtf GTF file for transcriptome information" << endl
<< " (required for --genomebam)" << endl
<< "-c, --chromosomes Tab separated file with chromosome names and lengths" << endl
- << " (optional for --genomebam, but recommended)" << endl;
+ << " (optional for --genomebam, but recommended)" << endl
+ << " --verbose Print out progress information every 1M proccessed reads" << endl;
}
@@ -1853,12 +1931,28 @@ int main(int argc, char *argv[]) {
exit(1);
} else {
int num_trans, index_version;
- int64_t num_processed, num_pseudoaligned, num_unique;
+ int64_t num_processed = 0;
+ int64_t num_pseudoaligned = 0;
+ int64_t num_unique = 0;
+
opt.bus_mode = true;
opt.single_end = false;
KmerIndex index(opt);
index.load(opt);
+
+
+ bool guessChromosomes = false;
Transcriptome model; // empty
+ if (opt.genomebam) {
+ if (!opt.chromFile.empty()) {
+ model.loadChromosomes(opt.chromFile);
+ } else {
+ guessChromosomes = true;
+ }
+ model.parseGTF(opt.gtfFile, index, opt, guessChromosomes);
+ }
+
+
MinCollector collection(index, opt);
MasterProcessor MP(index, opt, collection, model);
num_processed = ProcessBUSReads(MP, opt);
@@ -1939,6 +2033,18 @@ int main(int argc, char *argv[]) {
std::string(std::to_string(index_version)),
start_time,
call);
+
+ if (opt.pseudobam) {
+ std::vector<double> fl_means(index.target_lens_.size(),0.0);
+ EMAlgorithm em(collection.counts, index, collection, fl_means, opt);
+ MP.processAln(em, false);
+ }
+
+
+ cerr << endl;
+ if (num_pseudoaligned == 0) {
+ exit(1); // exit with error
+ }
}
} else if (cmd == "merge") {
if (argc == 2) {
@@ -2053,12 +2159,15 @@ int main(int argc, char *argv[]) {
std::string call = argv_to_string(argc, argv);
+
+ #ifdef USE_HDF5
H5Writer writer;
if (!opt.plaintext) {
writer.init(opt.output + "/abundance.h5", opt.bootstrap, num_processed, fld, preBias, em.post_bias_, 6,
index.INDEX_VERSION, call, start_time);
writer.write_main(em, index.target_names_, index.target_lens_);
}
+ #endif // USE_HDF5
for (int i = 0; i < index.num_trans; i++) {
num_unique += collection.counts[i];
@@ -2090,7 +2199,9 @@ int main(int argc, char *argv[]) {
// this happens if nothing aligns, then we write an empty bootstrap file
for (int b = 0; b < opt.bootstrap; b++) {
if (!opt.plaintext) {
+ #ifdef USE_HDF5
writer.write_bootstrap(em, b); // em is empty
+ #endif
} else {
plaintext_writer(opt.output + "/bs_abundance_" + std::to_string(b) + ".tsv",
em.target_names_, em.alpha_, em.eff_lens_, index.target_lens_); // re-use empty input
@@ -2116,9 +2227,10 @@ int main(int argc, char *argv[]) {
<< opt.bootstrap << endl;
n_threads = opt.bootstrap;
}
-
+ #ifdef USE_HDF5
BootstrapThreadPool pool(opt.threads, seeds, collection.counts, index,
- collection, em.eff_lens_, opt, writer, fl_means);
+ collection, em.eff_lens_, opt, &writer, fl_means);
+ #endif
} else {
for (auto b = 0; b < B; ++b) {
Bootstrap bs(collection.counts, index, collection, em.eff_lens_, seeds[b], fl_means, opt);
@@ -2126,7 +2238,9 @@ int main(int argc, char *argv[]) {
auto res = bs.run_em();
if (!opt.plaintext) {
+ #ifdef USE_HDF5
writer.write_bootstrap(res, b);
+ #endif
} else {
plaintext_writer(opt.output + "/bs_abundance_" + std::to_string(b) + ".tsv",
em.target_names_, res.alpha_, em.eff_lens_, index.target_lens_);
@@ -2193,13 +2307,13 @@ int main(int argc, char *argv[]) {
em.run(10000, 50, true, opt.bias);
std::string call = argv_to_string(argc, argv);
- H5Writer writer;
+ //H5Writer writer;
if (!opt.plaintext) {
// setting num_processed to 0 because quant-only is for debugging/special ops
- writer.init(opt.output + "/abundance.h5", opt.bootstrap, 0, fld, preBias, em.post_bias_, 6,
+ /* writer.init(opt.output + "/abundance.h5", opt.bootstrap, 0, fld, preBias, em.post_bias_, 6,
index.INDEX_VERSION, call, start_time);
- writer.write_main(em, index.target_names_, index.target_lens_);
+ writer.write_main(em, index.target_names_, index.target_lens_);*/
} else {
plaintext_aux(
opt.output + "/run_info.json",
@@ -2231,7 +2345,7 @@ int main(int argc, char *argv[]) {
// this happens if nothing aligns, then we write an empty bootstrap file
for (int b = 0; b < opt.bootstrap; b++) {
if (!opt.plaintext) {
- writer.write_bootstrap(em, b); // em is empty
+ //writer.write_bootstrap(em, b); // em is empty
} else {
plaintext_writer(opt.output + "/bs_abundance_" + std::to_string(b) + ".tsv",
em.target_names_, em.alpha_, em.eff_lens_, index.target_lens_); // re-use empty input
@@ -2258,8 +2372,8 @@ int main(int argc, char *argv[]) {
n_threads = opt.bootstrap;
}
- BootstrapThreadPool pool(n_threads, seeds, collection.counts, index,
- collection, em.eff_lens_, opt, writer, fl_means);
+ /*BootstrapThreadPool pool(n_threads, seeds, collection.counts, index,
+ collection, em.eff_lens_, opt, writer, fl_means);*/
} else {
for (auto b = 0; b < B; ++b) {
Bootstrap bs(collection.counts, index, collection, em.eff_lens_, seeds[b], fl_means, opt);
@@ -2267,7 +2381,7 @@ int main(int argc, char *argv[]) {
auto res = bs.run_em();
if (!opt.plaintext) {
- writer.write_bootstrap(res, b);
+ //writer.write_bootstrap(res, b);
} else {
plaintext_writer(opt.output + "/bs_abundance_" + std::to_string(b) + ".tsv",
em.target_names_, res.alpha_, em.eff_lens_, index.target_lens_);
@@ -2326,6 +2440,11 @@ int main(int argc, char *argv[]) {
}
}
+ std::ofstream transout_f((opt.output + "/transcripts.txt"));
+ for (const auto &t : index.target_names_) {
+ transout_f << t << "\n";
+ }
+ transout_f.close();
plaintext_aux(
opt.output + "/run_info.json",
@@ -2511,12 +2630,13 @@ int main(int argc, char *argv[]) {
usageh5dump();
exit(1);
}
-
+ #ifdef USE_HDF5
H5Converter h5conv(opt.files[0], opt.output);
if (!opt.peek) {
h5conv.write_aux();
h5conv.convert();
}
+ #endif
} else {
cerr << "Error: invalid command " << cmd << endl;
usage();
=====================================
test/Snakefile
=====================================
@@ -5,13 +5,19 @@ INDEX = "{0}.kidx".format(PRE)
rule all:
input:
- "kallisto_out/abundance.h5"
+ "quant_out/abundance.tsv",
+ "bus_out/output.bus"
rule index:
input: FASTA
output: INDEX
shell:
- "kallisto index -i {output} {input}"
+ """
+ kallisto index -i {output} {input}
+
+ mkdir quant_out
+ mkdir bus_out
+ """
rule kallisto_quant:
input:
@@ -19,15 +25,36 @@ rule kallisto_quant:
"reads_2.fastq.gz",
INDEX
output:
- "kallisto_out",
- "kallisto_out/abundance.h5",
- "kallisto_out/abundance.tsv",
- "kallisto_out/run_info.json"
+ "quant_out/abundance.tsv",
+ "quant_out/run_info.json"
+ shell:
+ """
+ kallisto quant \
+ -i {INDEX} \
+ -b 30 \
+ -o quant_out/ \
+ --genomebam \
+ --gtf {GTF} \
+ --chromosomes chrom.txt \
+ {input[0]} {input[1]}
+ """
+
+rule kallisto_bus:
+ input:
+ "sc_reads_1.fastq.gz",
+ "sc_reads_2.fastq.gz",
+ INDEX
+ output:
+ "bus_out/matrix.ec",
+ "bus_out/transcripts.txt",
+ "bus_out/output.bus",
+ "bus_out/run_info.json"
shell:
- "kallisto quant "
- "-i {INDEX} "
- "-b 30 "
- "-o {output[0]} "
- "--genomebam --gtf {GTF} "
- "--chromosomes chrom.txt "
- "{input[0]} {input[1]}"
+ """
+ kallisto bus \
+ -i {INDEX} \
+ -x 10xv2 \
+ -o bus_out \
+ -t 4 \
+ {input[0]} {input[1]}
+ """
=====================================
test/sc_reads_1.fastq.gz
=====================================
Binary files /dev/null and b/test/sc_reads_1.fastq.gz differ
=====================================
test/sc_reads_2.fastq.gz
=====================================
Binary files /dev/null and b/test/sc_reads_2.fastq.gz differ
View it on GitLab: https://salsa.debian.org/med-team/kallisto/-/commit/53c35c620a0df2920adba9b0cfbd8ea07fcdd7b2
--
View it on GitLab: https://salsa.debian.org/med-team/kallisto/-/commit/53c35c620a0df2920adba9b0cfbd8ea07fcdd7b2
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20200515/80ca58d2/attachment-0001.html>
More information about the debian-med-commit
mailing list