[med-svn] [Git][med-team/kallisto][master] 8 commits: New upstream version 0.48.0+dfsg
Andrius Merkys (@merkys)
gitlab at salsa.debian.org
Wed Jan 19 06:45:37 GMT 2022
Andrius Merkys pushed to branch master at Debian Med / kallisto
Commits:
abea4d08 by Andrius Merkys at 2022-01-19T01:08:32-05:00
New upstream version 0.48.0+dfsg
- - - - -
decabdf9 by Andrius Merkys at 2022-01-19T01:08:38-05:00
Update upstream source from tag 'upstream/0.48.0+dfsg'
Update to upstream version '0.48.0+dfsg'
with Debian dir 4179b2017828b4236a6529799a5037ecde78191f
- - - - -
fb469380 by Andrius Merkys at 2022-01-19T01:15:00-05:00
Enabling function testing.
- - - - -
d15ce768 by Andrius Merkys at 2022-01-19T01:15:28-05:00
Build with tests depend on catch2.
- - - - -
0ac5f529 by Andrius Merkys at 2022-01-19T01:16:00-05:00
Bumping copyright years.
- - - - -
08b06206 by Andrius Merkys at 2022-01-19T01:18:48-05:00
Refreshing patches, disabling Check_busOptions.seq_length_before_accessing.patch (integrated upstream).
- - - - -
651cb73e by Andrius Merkys at 2022-01-19T01:19:47-05:00
New changelog entry.
- - - - -
1e5743ab by Andrius Merkys at 2022-01-19T01:38:54-05:00
Disabling tests with bustools.
- - - - -
29 changed files:
- + .github/workflows/test.yml
- CMakeLists.txt
- debian/changelog
- debian/control
- debian/copyright
- + debian/patches/disable-failing-bustools-test.patch
- debian/patches/htslib1.3
- debian/patches/series
- debian/patches/spelling
- debian/patches/use_debian_packaged_htslib.patch
- debian/rules
- + func_tests/CMakeLists.txt
- + func_tests/runtests.sh
- src/BUSData.cpp
- src/BUSData.h
- src/BUSTools.cpp
- src/BUSTools.h
- src/GeneModel.cpp
- src/GeneModel.h
- src/Inspect.h
- src/KmerIndex.cpp
- src/KmerIndex.h
- src/MinCollector.cpp
- src/PlaintextWriter.cpp
- src/PlaintextWriter.h
- src/ProcessReads.cpp
- src/ProcessReads.h
- src/common.h
- src/main.cpp
Changes:
=====================================
.github/workflows/test.yml
=====================================
@@ -0,0 +1,28 @@
+name: Test
+
+on:
+ workflow_dispatch:
+
+jobs:
+ build-linux:
+ name: Build and test linux
+ runs-on: ubuntu-18.04
+ env:
+ RELEASE_OS: linux
+ steps:
+ - name: checkout branch
+ uses: actions/checkout at master
+ - name: Setup environment
+ id: setup
+ run: echo ::set-output name=RELEASE_VERSION::${GITHUB_REF##*/}
+ - name: make
+ run: |
+ cd ext/htslib && autoheader && autoconf && cd ../..
+ mkdir build && cd build && cmake .. -DBUILD_FUNCTESTING=ON && make && make install && cd ..
+ - name: Setup bustools
+ run: |
+ git clone https://github.com/BUStools/bustools.git
+ cd bustools && mkdir build && cd build && cmake .. && make && make install && cd ../..
+ - name: Functional testing
+ run: |
+ cd build && make test
=====================================
CMakeLists.txt
=====================================
@@ -74,5 +74,16 @@ if (BUILD_TESTING)
add_subdirectory(unit_tests)
endif(BUILD_TESTING)
+option(BUILD_FUNCTESTING "Build functional tests." OFF)
+
+if (BUILD_FUNCTESTING)
+ add_subdirectory(func_tests)
+ message("Functional testing enabled.")
+ add_custom_target(test
+ COMMAND /bin/bash ./func_tests/runtests.sh
+ DEPENDS ./src/kallisto
+ )
+endif(BUILD_FUNCTESTING)
+
# enable_testing()
# add_test(MainTest test/tests)
=====================================
debian/changelog
=====================================
@@ -1,3 +1,15 @@
+kallisto (0.48.0+dfsg-1) UNRELEASED; urgency=medium
+
+ * Team upload.
+ * New upstream version 0.48.0+dfsg
+ * Enabling function testing.
+ * Build with tests depend on catch2.
+ * Bumping copyright years.
+ * Refreshing patches, disabling
+ Check_busOptions.seq_length_before_accessing.patch (integrated upstream).
+
+ -- Andrius Merkys <merkys at debian.org> Wed, 19 Jan 2022 01:19:18 -0500
+
kallisto (0.46.2+dfsg-3) unstable; urgency=medium
* Team Upload.
=====================================
debian/control
=====================================
@@ -4,6 +4,7 @@ Uploaders: Andreas Tille <tille at debian.org>
Section: science
Priority: optional
Build-Depends: debhelper-compat (= 12),
+ catch2,
cmake,
libhdf5-dev,
libhts-dev
=====================================
debian/copyright
=====================================
@@ -34,6 +34,7 @@ License: public_domain.
Files: debian/*
Copyright: 2019 Andreas Tille <tille at debian.org>
+ 2022 Andrius Merkys <merkys at debian.org>
License: BSD-2-Clause
License: BSD-2-Clause
=====================================
debian/patches/disable-failing-bustools-test.patch
=====================================
@@ -0,0 +1,16 @@
+Description: Tests fail with bustools with:
+ terminate called after throwing an instance of 'std::bad_alloc'
+ what(): std::bad_alloc
+ Aborted
+Author: Andrius Merkys <merkys at debian.org>
+--- a/func_tests/runtests.sh
++++ b/func_tests/runtests.sh
+@@ -257,6 +257,8 @@
+ cmdexec "$kallisto quant -o $test_dir/quantlarge -t 12 -i $test_dir/basic7.idx --single -l 5 -s 2 $test_dir/large.fastq.gz $test_dir/large.fastq.gz $test_dir/large.fastq.gz $test_dir/large.fastq.gz $test_dir/large.fastq.gz"
+ checkcmdoutput "cat $test_dir/quantlarge/abundance.tsv" 024e56c50c774aa09e00a441512415aa
+
++echo "Not testing with bustools as tests fail with bustools"
++exit 0
+
+ ### TEST - kallisto bus ###
+
=====================================
debian/patches/htslib1.3
=====================================
@@ -1,8 +1,8 @@
From: Michael R. Crusoe <michael.crusoe at gmail.com>
Subject: Use kseq.h from htslib, not the bundled version
---- kallisto.orig/src/KmerIndex.cpp
-+++ kallisto/src/KmerIndex.cpp
-@@ -4,7 +4,7 @@
+--- a/src/KmerIndex.cpp
++++ b/src/KmerIndex.cpp
+@@ -5,7 +5,7 @@
#include <ctype.h>
#include <zlib.h>
#include <unordered_set>
@@ -11,8 +11,8 @@ Subject: Use kseq.h from htslib, not the bundled version
#ifndef KSEQ_INIT_READY
#define KSEQ_INIT_READY
---- kallisto.orig/src/ProcessReads.cpp
-+++ kallisto/src/ProcessReads.cpp
+--- a/src/ProcessReads.cpp
++++ b/src/ProcessReads.cpp
@@ -1,6 +1,6 @@
/*
#include <zlib.h>
@@ -30,8 +30,8 @@ Subject: Use kseq.h from htslib, not the bundled version
#include "PseudoBam.h"
#include "Fusion.hpp"
#include "BUSData.h"
---- kallisto.orig/src/ProcessReads.h
-+++ kallisto/src/ProcessReads.h
+--- a/src/ProcessReads.h
++++ b/src/ProcessReads.h
@@ -2,7 +2,7 @@
#define KALLISTO_PROCESSREADS_H
@@ -41,8 +41,8 @@ Subject: Use kseq.h from htslib, not the bundled version
#include <string>
#include <vector>
#include <unordered_map>
---- kallisto.orig/unit_tests/test_kmerhashtable.cpp
-+++ kallisto/unit_tests/test_kmerhashtable.cpp
+--- a/unit_tests/test_kmerhashtable.cpp
++++ b/unit_tests/test_kmerhashtable.cpp
@@ -13,7 +13,7 @@
#include "KmerHashTable.h"
=====================================
debian/patches/series
=====================================
@@ -1,5 +1,6 @@
use_debian_packaged_htslib.patch
spelling
htslib1.3
-Check_busOptions.seq_length_before_accessing.patch
+#Check_busOptions.seq_length_before_accessing.patch
gcc11.patch
+disable-failing-bustools-test.patch
=====================================
debian/patches/spelling
=====================================
@@ -1,8 +1,8 @@
From: Michael R. Crusoe
Subject: fix spelling typo
---- kallisto.orig/src/Inspect.h
-+++ kallisto/src/Inspect.h
-@@ -249,7 +249,7 @@
+--- a/src/Inspect.h
++++ b/src/Inspect.h
+@@ -250,7 +250,7 @@
const Contig& c = index.dbGraph.contigs[i];
if (c.seq.size() != c.length + k-1) {
=====================================
debian/patches/use_debian_packaged_htslib.patch
=====================================
@@ -4,7 +4,7 @@ Description: Adapt build system to Debian packaged libhts
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
-@@ -34,7 +34,7 @@ ELSE(LINK MATCHES shared)
+@@ -41,7 +41,7 @@
message("shared build")
ENDIF(LINK MATCHES static)
@@ -13,7 +13,7 @@ Description: Adapt build system to Debian packaged libhts
include(ExternalProject)
ExternalProject_Add(htslib
PREFIX ${PROJECT_SOURCE_DIR}/ext/htslib
-@@ -47,7 +47,7 @@ ExternalProject_Add(htslib
+@@ -54,7 +54,7 @@
)
include_directories(${htslib_PREFIX}/src/htslib)
@@ -24,7 +24,7 @@ Description: Adapt build system to Debian packaged libhts
# add_compile_options(-Wdeprecated-register)
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
-@@ -3,7 +3,9 @@ file(GLOB headers *.h *.hpp)
+@@ -3,7 +3,9 @@
list(REMOVE_ITEM sources main.cpp)
@@ -34,7 +34,7 @@ Description: Adapt build system to Debian packaged libhts
add_library(kallisto_core ${sources} ${headers})
target_include_directories(kallisto_core PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
-@@ -11,7 +13,11 @@ target_include_directories(kallisto_core
+@@ -11,7 +13,11 @@
add_executable(kallisto main.cpp)
find_package( Threads REQUIRED )
@@ -46,7 +46,7 @@ Description: Adapt build system to Debian packaged libhts
if(LINK MATCHES static)
set(BUILD_SHARED_LIBS OFF)
-@@ -56,4 +62,4 @@ else()
+@@ -62,4 +68,4 @@
endif(LINK MATCHES static)
=====================================
debian/rules
=====================================
@@ -5,19 +5,11 @@
export DEB_BUILD_MAINT_OPTIONS=hardening=+all
CMAKE_EXTRA_FLAGS += -DDEBIAN_BUILD=1 \
- -DLINK=shared
-
-# Build time test requires https://github.com/philsquared/Catch
-# -DBUILD_TESTING=ON \
+ -DLINK=shared \
+ -DBUILD_FUNCTESTING=1
%:
dh $@
override_dh_auto_configure:
dh_auto_configure -- $(CMAKE_EXTRA_FLAGS)
-
-### When overriding auto_test make sure DEB_BUILD_OPTIONS will be respected
-#override_dh_auto_test:
-#ifeq (,$(filter nocheck,$(DEB_BUILD_OPTIONS)))
-# do_stuff_for_testing
-#endif
=====================================
func_tests/CMakeLists.txt
=====================================
@@ -0,0 +1,4 @@
+project(FuncTests)
+
+file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/runtests.sh
+ DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
=====================================
func_tests/runtests.sh
=====================================
@@ -0,0 +1,296 @@
+#!/bin/bash
+
+### Setup ###
+
+kallisto="./src/kallisto"
+bustools="bustools"
+test_dir="./func_tests"
+
+cmdexec() {
+ cmd="$1"
+ expect_fail="$2"
+ printf "$cmd\n"
+ if eval "$cmd 2> /dev/null 1> /dev/null"; then
+ if [ "$expect_fail" = "1" ]; then
+ printf "^[Failed - Returned normally but expected failure]\n"
+ exit 1
+ else
+ printf "^[OK]\n"
+ fi
+ else
+ if [ "$expect_fail" = "1" ]; then
+ printf "^[OK - Exited non-zero as expected]\n"
+ else
+ printf "^[Failed]\n"
+ exit 1
+ fi
+ fi
+}
+
+checkcmdoutput() {
+ cmd="$1"
+ correct_md5="$2"
+ printf "$cmd\n"
+ output_md5=$(eval "$cmd"|md5sum|awk '{ print $1 }')
+ if [ "$output_md5" = "$correct_md5" ]; then
+ printf "^[Output OK]\n"
+ else
+ printf "^[Output incorrect! Expected: "
+ printf "$correct_md5"
+ printf " Actual: "
+ printf "$output_md5"
+ printf "]\n"
+ exit 1
+ fi
+}
+
+### FASTA files ###
+
+echo ">t1
+ACGTGATGAGTGAGTCAGT
+>t2
+ACGTGATGAGATGATGAGTCAGT
+>t3
+ACGTGATGTGAGTCAGT
+>t4
+ccccaaaaaa
+>t5
+ttttttgggg" > $test_dir/simple.fasta
+
+echo ">t1
+AAANNNTTTKK
+>t2
+NNNCCCCNCCC" > $test_dir/nonATCG.fasta
+
+echo ">t1
+AAAAAAAAAAAAAAAA
+>t2
+CCCAAAAAAAAAAAAA
+>t3
+CCCGGAAAAAAAAAAA
+>t4
+AAAAAAAAAAAAATTT
+>t5
+TAAAAAAAAAAAAAAT" > $test_dir/polyA.fasta
+
+echo ">t1
+AAATTTCCCGGGAAATTTCCCGGG
+>t2
+AAAtttCCCgggAAAtttCCCGGG
+>t3
+AAATTTCCCGGGAAATTTCCCGGG
+>t4
+AAATTTCCCGGGAAATTTCCCGGG
+>t4
+CCCCCCCCCCCCCCCCCCCCCCCC
+>t5
+CCCCCCCCCCCCCCCCCCCCCCCC
+>t5
+CCCCCCCCCCCCCCCCCCCCCCCC" > $test_dir/duplicates.fasta
+
+### FASTQ files ###
+
+cat $test_dir/simple.fasta $test_dir/nonATCG.fasta $test_dir/polyA.fasta $test_dir/duplicates.fasta | \
+ awk '{if(NR%2){ print "\@" $0 } else { printf "%s\n+\n",$0; i=0; while (i++ < length($0)) printf "J"; printf "\n" } }' \
+ | gzip > $test_dir/small.fastq.gz
+
+
+cat /dev/null > $test_dir/medium.fastq.gz
+for i in {1..700}; do cat $test_dir/small.fastq.gz >> $test_dir/medium.fastq.gz; done
+
+cat /dev/null > $test_dir/large.fastq.gz
+for i in {1..100}; do cat $test_dir/medium.fastq.gz >> $test_dir/large.fastq.gz; done
+
+rm $test_dir/medium.fastq.gz
+
+# Paired-end reads for basic7.idx index
+
+echo "@t1
+ACGTGATG
++
+JJJJJJJJ
+ at t2
+ACGTGATG
++
+JJJJJJJJ
+ at t3
+CCCCCCCC
++
+JJJJJJJJ
+ at t4
+ccccaaaaaa
++
+JJJJJJJJJJ
+ at t5
+ttttttgggg
++
+JJJJJJJJJJ
+" | gzip > $test_dir/simple_pair1.fastq.gz
+
+echo "@t1
+GAGTCAGT
++
+JJJJJJJJ
+ at t2
+TGAGTCAG
++
+JJJJJJJJ
+ at t3
+ACGTGATG
++
+JJJJJJJJ
+ at t4
+ccccaaaaaa
++
+JJJJJJJJJJ
+ at t5
+ccccaaaaaa
++
+JJJJJJJJJJ
+" | gzip > $test_dir/simple_pair2.fastq.gz
+
+# Barcode+UMI FastQ file for 10Xv3
+
+echo "@t1
+GCCCCCCCCCCCCCCGTAAAAAAAAAATGG
++
+JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ
+ at t2
+GCCCCCCCCCCCCCCGTAAAAAAAAAATGG
++
+JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ
+ at t3
+GCCCCCCCCCCCCCCGTAAAAttAAAAtGG
++
+JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ
+ at t4
+GCCCCCCggCCCCCCGtAAAAttAAAAtGG
++
+JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ
+ at t5
+GCCCCCCttCCCCCCGtAAAAggAAAAtGG
++
+JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ
+" | gzip > $test_dir/10xv3.fastq.gz
+
+
+### TEST - kallisto index ###
+
+echo "[INDEX]"
+
+# Test k=7
+
+cmdexec "$kallisto index -i $test_dir/basic7.idx -k 7 $test_dir/simple.fasta"
+checkcmdoutput "$kallisto inspect ./func_tests/basic7.idx 2>&1|head -4" 356d498c030cb270a124ca06ad42a9d5
+
+# Test k=9
+
+cmdexec "$kallisto index -i $test_dir/basic9.idx -k 9 $test_dir/simple.fasta"
+checkcmdoutput "$kallisto inspect ./func_tests/basic9.idx 2>&1|head -4" 9f9246d5073d8e6ba90b8644e4ed9cc9
+
+# Test non-ATCG bases
+
+cmdexec "$kallisto index -i $test_dir/nonATCG.idx -k 5 $test_dir/nonATCG.fasta"
+checkcmdoutput "$kallisto inspect ./func_tests/nonATCG.idx 2>&1|head -4" 5fcab956573ca37c2220d154e7760db2
+
+# Test polyA truncation
+
+cmdexec "$kallisto index -i $test_dir/polyA.idx -k 5 $test_dir/polyA.fasta"
+checkcmdoutput "$kallisto inspect ./func_tests/polyA.idx 2>&1|head -4" 8fe4f599c55a62f87e56067ccdbf865e
+
+# Test k = even number (should fail)
+
+cmdexec "$kallisto index -i $test_dir/basic_fail.idx -k 10 $test_dir/simple.fasta" 1
+
+# Test duplicate transcript names (should fail)
+
+cmdexec "$kallisto index -i $test_dir/duplicates_fail.idx -k 11 $test_dir/duplicates.fasta" 1
+
+# Test duplicate transcript names with --make-unique
+
+cmdexec "$kallisto index -i $test_dir/duplicates.idx -k 11 --make-unique $test_dir/duplicates.fasta"
+checkcmdoutput "$kallisto inspect ./func_tests/duplicates.idx 2>&1|head -4" b57f37128e2f2e66775aa811ed994cec
+
+
+### TEST - kallisto quant ###
+
+# Test single-end small.fastq.gz using various indices
+
+cmdexec "$kallisto quant -o $test_dir/quantbasic -i $test_dir/basic7.idx --single -l 5 -s 2 $test_dir/small.fastq.gz"
+checkcmdoutput "cat $test_dir/quantbasic/abundance.tsv" f1c8927dc29b943758242902d5c45a86
+
+cmdexec "$kallisto quant -o $test_dir/quantnonATCG -i $test_dir/nonATCG.idx --single -l 5 -s 2 $test_dir/small.fastq.gz"
+checkcmdoutput "cat $test_dir/quantnonATCG/abundance.tsv" 9272185ff011f9a840c29c5c40a2d390
+
+cmdexec "$kallisto quant -o $test_dir/quantpolyA -i $test_dir/polyA.idx --single -l 5 -s 2 $test_dir/small.fastq.gz"
+checkcmdoutput "cat $test_dir/quantpolyA/abundance.tsv" 745539c18d4ff07bf6de0938563f2362
+
+cmdexec "$kallisto quant -o $test_dir/quantduplicates -i $test_dir/duplicates.idx --single -l 5 -s 2 $test_dir/small.fastq.gz"
+checkcmdoutput "cat $test_dir/quantduplicates/abundance.tsv" 49a62b913a155598dd6894a2f0eb0905
+
+# Test single-end small.fastq.gz strand-specificity
+
+cmdexec "$kallisto quant -o $test_dir/quantbasicfr -i $test_dir/basic7.idx --single --fr-stranded -l 5 -s 2 $test_dir/small.fastq.gz"
+checkcmdoutput "cat $test_dir/quantbasicfr/abundance.tsv" ce2ed5a3a1bab582fcb62dc02f4d9323
+
+cmdexec "$kallisto quant -o $test_dir/quantbasicrf -i $test_dir/basic7.idx --single --rf-stranded -l 5 -s 2 $test_dir/small.fastq.gz"
+checkcmdoutput "cat $test_dir/quantbasicrf/abundance.tsv" 017e8ba77d7e7b39a60bb7c047e620dc
+
+# Test paired-end small.fastq.gz
+
+cmdexec "$kallisto quant -o $test_dir/quantbasicpaired -i $test_dir/basic7.idx $test_dir/simple_pair1.fastq.gz $test_dir/simple_pair2.fastq.gz"
+checkcmdoutput "cat $test_dir/quantbasicpaired/abundance.tsv" 1cf9cd508c04eff7cbda6e74cc1e44ea
+
+# Test paired-end small.fastq.gz with multiple pairs of files and with strand-specificity
+
+cmdexec "$kallisto quant -o $test_dir/quantbasicpairedfr -i $test_dir/basic7.idx --fr-stranded $test_dir/simple_pair1.fastq.gz $test_dir/simple_pair2.fastq.gz"
+checkcmdoutput "cat $test_dir/quantbasicpairedfr/abundance.tsv" bc6a75291c2eb661a57b55de534dd8f0
+
+cmdexec "$kallisto quant -o $test_dir/quantbasicpairedrf -i $test_dir/basic7.idx --rf-stranded $test_dir/simple_pair1.fastq.gz $test_dir/simple_pair2.fastq.gz"
+checkcmdoutput "cat $test_dir/quantbasicpairedrf/abundance.tsv" dd1ed6ed8fb393616817a3dd506f821f
+
+cmdexec "$kallisto quant -o $test_dir/quantbasicpairedmultfr -i $test_dir/basic7.idx --fr-stranded $test_dir/simple_pair1.fastq.gz $test_dir/simple_pair2.fastq.gz $test_dir/simple_pair2.fastq.gz $test_dir/simple_pair1.fastq.gz"
+checkcmdoutput "cat $test_dir/quantbasicpairedmultfr/abundance.tsv" f810d28aed3969514f1d21120a6fc825
+
+# Test multiple large fastq files with more threads
+
+cmdexec "$kallisto quant -o $test_dir/quantlarge -t 12 -i $test_dir/basic7.idx --single -l 5 -s 2 $test_dir/large.fastq.gz $test_dir/large.fastq.gz $test_dir/large.fastq.gz $test_dir/large.fastq.gz $test_dir/large.fastq.gz"
+checkcmdoutput "cat $test_dir/quantlarge/abundance.tsv" 024e56c50c774aa09e00a441512415aa
+
+
+### TEST - kallisto bus ###
+
+if ! command -v bustools &> /dev/null
+then
+ echo "Error: bustools could not be found"
+ exit 1
+fi
+
+# Try custom "magic string" for -x with multiple large files and many threads
+
+cmdexec "$kallisto bus -o $test_dir/buslarge -t 12 -i $test_dir/basic7.idx -x 0,0,2:0,2,6:1,0,0 $test_dir/large.fastq.gz $test_dir/large.fastq.gz $test_dir/large.fastq.gz $test_dir/large.fastq.gz $test_dir/large.fastq.gz $test_dir/large.fastq.gz"
+cmdexec "$bustools sort -o $test_dir/buslarge/output.s.bus -t 12 $test_dir/buslarge/output.bus"
+checkcmdoutput "bustools text -p $test_dir/buslarge/output.s.bus" f3db94e1c983a1d51b8f8525bb1a5beb
+
+# Try a simple 10XV3 (with unstranded pseudoalignment)
+
+cmdexec "$kallisto bus -o $test_dir/bus10xv3 -t 1 -i $test_dir/basic7.idx -x 10XV3 --unstranded $test_dir/10xv3.fastq.gz $test_dir/small.fastq.gz"
+cmdexec "$bustools sort -o $test_dir/bus10xv3/output.s.bus -t 12 $test_dir/bus10xv3/output.bus"
+checkcmdoutput "$bustools text -p $test_dir/bus10xv3/output.s.bus" f8f76f64d3ef351057f3bae016e2fa8a
+
+
+# Try processing demultiplexed bulk RNA-seq with strand-specificity with EM and kallisto quant-tcc (and compare with quant)
+
+echo "t1 g1
+t2 g1
+t3 g2
+t4 g2
+t5 g3" > $test_dir/t2g_test.txt # Make a transcript-to-gene mapping file
+cmdexec "$kallisto bus --fr-stranded -o $test_dir/busbulklarge -t 12 -i $test_dir/basic7.idx $test_dir/large.fastq.gz"
+cmdexec "$bustools sort -o $test_dir/busbulklarge/output.s.bus -t 12 $test_dir/busbulklarge/output.bus"
+cmdexec "$bustools count --cm -m -o $test_dir/busbulklarge/counts_tcc/ -g $test_dir/t2g_test.txt -t $test_dir/busbulklarge/transcripts.txt -e $test_dir/busbulklarge/matrix.ec $test_dir/busbulklarge/output.bus"
+cmdexec "$kallisto quant-tcc -o $test_dir/quant_tcc_test1/ -l 5 -s 2 -i $test_dir/busbulklarge/index.saved -g $test_dir/t2g_test.txt -e $test_dir/busbulklarge/counts_tcc/output.ec.txt $test_dir/busbulklarge/counts_tcc/output.mtx"
+checkcmdoutput "cat $test_dir/quant_tcc_test1/matrix.abundance.gene.tpm.mtx" ed7c7fa08e283cc6a1b136cc0e1ab039
+
+
+
=====================================
src/BUSData.cpp
=====================================
@@ -50,4 +50,17 @@ std::string binaryToString(uint64_t x, size_t len) {
s.at(i) = c;
}
return std::move(s);
-}
\ No newline at end of file
+}
+
+int hamming(uint64_t a, uint64_t b, size_t len) {
+ uint64_t df = a^b;
+ int d = 0;
+ size_t sh = len-1;
+ for (size_t i = 0; i < len; ++i) {
+ if (((df>>(2*sh)) & 0x03ULL) != 0) {
+ d++;
+ }
+ sh--;
+ }
+ return d;
+}
=====================================
src/BUSData.h
=====================================
@@ -38,4 +38,5 @@ struct BUSData {
uint64_t stringToBinary(const std::string &s, uint32_t &flag);
uint64_t stringToBinary(const char* s, const size_t len, uint32_t &flag);
std::string binaryToString(uint64_t x, size_t len);
-#endif // KALLISTO_BUSDATA_H
\ No newline at end of file
+int hamming(uint64_t a, uint64_t b, size_t len);
+#endif // KALLISTO_BUSDATA_H
=====================================
src/BUSTools.cpp
=====================================
@@ -1,6 +1,6 @@
#include <vector>
#include <fstream>
-#include "BUSData.h"
+#include "BUSTools.h"
void writeBUSHeader(std::ofstream &out, int bclen, int umilen) {
out.write("BUS\0", 4);
@@ -19,4 +19,28 @@ void writeBUSData(std::ofstream &out, const std::vector<BUSData> &bv) {
out.write((char*)(&b), sizeof(b));
}
}
+}
+
+void writeBUSMatrix(const std::string &filename,
+ std::vector<std::vector<std::pair<int,int>>> &data, int cols) {
+ std::ofstream of;
+ of.open(filename.c_str(), std::ios::out | std::ios::binary);
+ writeBUSHeader(of, BUSFORMAT_FAKE_BARCODE_LEN, 1);
+ if (!data.empty()) { // reduntant
+ for (size_t j = 0; j < data.size(); j++) {
+ const auto &v = data[j];
+ for (size_t i = 0; i < v.size(); i++) {
+ if (v[i].second != 0 && v[i].first != -1) {
+ BUSData b;
+ b.barcode = j;
+ b.flags = 0;
+ b.UMI = -1;
+ b.ec = v[i].first;
+ b.count = v[i].second;
+ of.write((char*)(&b), sizeof(b));
+ }
+ }
+ }
+ }
+ of.close();
}
\ No newline at end of file
=====================================
src/BUSTools.h
=====================================
@@ -5,7 +5,11 @@
#include <vector>
#include "BUSData.h"
+const uint32_t BUSFORMAT_FAKE_BARCODE_LEN = 16;
+
void writeBUSData(std::ofstream &out, const std::vector<BUSData> &bv);
void writeBUSHeader(std::ofstream &out, int bclen, int umilen);
+void writeBUSMatrix(const std::string &filename,
+ std::vector<std::vector<std::pair<int,int>>> &data, int cols);
#endif // KALLISTO_BUSTOOLS_H
\ No newline at end of file
=====================================
src/GeneModel.cpp
=====================================
@@ -581,3 +581,57 @@ void Transcriptome::parseGTF(const std::string >f_fn, const KmerIndex& index,
std::cerr << "Warning: " << num_trans_missing << " transcripts were defined in GTF file, but not in the index" << std::endl;
}
}
+
+void Transcriptome::parseGeneMap(const std::string &genemap_fn, const KmerIndex& index, const ProgramOptions& options) {
+ for (int i = 0; i < index.num_trans; i++) {
+ TranscriptModel tr;
+ tr.id = i;
+ tr.chr = -1;
+ tr.gene_id = -1;
+ tr.strand = true;
+ transcripts.push_back(std::move(tr));
+ trNameToId.insert({index.target_names_[i], i});
+ }
+
+ std::ifstream ingenemap((genemap_fn));
+ if (ingenemap.is_open()) {
+ std::string line;
+ while (getline(ingenemap, line)) {
+ if (line.empty()) {
+ continue;
+ }
+ std::stringstream ss(line);
+ std::string txp, gene_name, gene_common_name;
+ ss >> txp >> gene_name >> gene_common_name;
+ if (gene_name.empty()) {
+ std::cerr << "Error: No gene associated with transcript " << txp << " in " << genemap_fn << std::endl;
+ exit(1);
+ }
+ auto it = trNameToId.find(txp);
+ if (it != trNameToId.end()) {
+ TranscriptModel tmodel;
+ tmodel.id = it->second;
+ tmodel.length = index.target_lens_[tmodel.id];
+ auto it2 = geneNameToId.find(gene_name);
+ if (it2 == geneNameToId.end()) {
+ GeneModel gmodel;
+ gmodel.id = genes.size();
+ gmodel.name = gene_name;
+ gmodel.commonName = gene_common_name;
+ geneNameToId.insert({gmodel.name, gmodel.id});
+ genes.push_back(std::move(gmodel));
+ tmodel.gene_id = gmodel.id;
+ } else {
+ tmodel.gene_id = it2->second;
+ }
+ transcripts[tmodel.id] = std::move(tmodel);
+ } else {
+ std::cerr << "Error: Invalid transcript: " << txp << " in " << genemap_fn << std::endl;
+ exit(1);
+ }
+ }
+ } else {
+ std::cerr << "Error: could not open file " << genemap_fn << std::endl;
+ exit(1);
+ }
+}
=====================================
src/GeneModel.h
=====================================
@@ -99,6 +99,7 @@ struct Transcriptome {
void loadChromosomes(const std::string &chrom_fn);
bool translateTrPosition(const int tr, const int pos, const int length, bool strand, TranscriptAlignment &aln) const;
void parseGTF(const std::string >f_fn, const KmerIndex& index, const ProgramOptions& options, bool guessChromosomes);
+ void parseGeneMap(const std::string &genemap_fn, const KmerIndex& index, const ProgramOptions& options);
int addGTFLine(const std::string& line, const KmerIndex& index, bool guessChromosome);
void loadTranscriptome(const KmerIndex& index, std::istream &in, const ProgramOptions& options);
=====================================
src/Inspect.h
=====================================
@@ -3,6 +3,7 @@
#include <iostream>
#include <unordered_set>
+#include <limits>
#include "KmerIndex.h"
#include "GeneModel.h"
=====================================
src/KmerIndex.cpp
=====================================
@@ -1,6 +1,7 @@
#include "KmerIndex.h"
#include <algorithm>
#include <random>
+#include <sstream>
#include <ctype.h>
#include <zlib.h>
#include <unordered_set>
@@ -782,6 +783,12 @@ bool KmerIndex::fwStep(Kmer km, Kmer& end) const {
}
void KmerIndex::load(ProgramOptions& opt, bool loadKmerTable) {
+ if (opt.index.empty() && !loadKmerTable) {
+ // Make an index from transcript and EC files
+ loadTranscriptsFromFile(opt);
+ loadECsFromFile(opt);
+ return;
+ }
std::string& index_in = opt.index;
std::ifstream in;
@@ -967,8 +974,73 @@ void KmerIndex::load(ProgramOptions& opt, bool loadKmerTable) {
buffer=nullptr;
in.close();
+
+ if (!opt.ecFile.empty()) {
+ loadECsFromFile(opt);
+ }
+}
+
+void KmerIndex::loadECsFromFile(const ProgramOptions& opt) {
+ ecmap.clear();
+ ecmapinv.clear();
+ int i = 0;
+ std::ifstream in((opt.ecFile));
+ if (in.is_open()) {
+ std::string line;
+ while (getline(in, line)) {
+ std::stringstream ss(line);
+ int ec;
+ std::string transcripts;
+ ss >> ec >> transcripts;
+ if (i != ec) {
+ std::cerr << "Error: equivalence class file has a misplaced equivalence class."
+ << " Found " << ec << ", expected " << i << std::endl;
+ exit(1);
+ }
+ std::vector<int> tmp_vec;
+ std::stringstream ss2(transcripts);
+ while(ss2.good()) {
+ std::string tmp_ecval;
+ getline(ss2, tmp_ecval, ',');
+ int tmp_ecval_num = std::atoi(tmp_ecval.c_str());
+ if (tmp_ecval_num < 0 || tmp_ecval_num >= num_trans) {
+ std::cerr << "Error: equivalence class file has invalid value: "
+ << tmp_ecval << " in " << transcripts << std::endl;
+ exit(1);
+ }
+ tmp_vec.push_back(tmp_ecval_num);
+ }
+ ecmap.push_back(tmp_vec); // copy
+ ecmapinv.insert({std::move(tmp_vec), i}); // move
+ i++;
+ }
+ } else {
+ std::cerr << "Error: could not open file " << opt.ecFile << std::endl;
+ exit(1);
+ }
+ std::cerr << "[index] number of equivalence classes loaded from file: "
+ << pretty_num(ecmap.size()) << std::endl;
}
+void KmerIndex::loadTranscriptsFromFile(const ProgramOptions& opt) {
+ target_names_.clear();
+ int i = 0;
+ std::ifstream in((opt.transcriptsFile));
+ if (in.is_open()) {
+ std::string txp;
+ while (in >> txp) {
+ target_names_.push_back(txp);
+ i++;
+ }
+ } else {
+ std::cerr << "Error: could not open file " << opt.transcriptsFile << std::endl;
+ exit(1);
+ }
+ num_trans = i;
+ target_lens_.assign(num_trans, 0);
+ std::cerr << "[index] number of targets loaded from file: "
+ << pretty_num(num_trans) << std::endl;
+}
int KmerIndex::mapPair(const char *s1, int l1, const char *s2, int l2, int ec) const {
bool d1 = true;
=====================================
src/KmerIndex.h
=====================================
@@ -121,6 +121,8 @@ struct KmerIndex {
// load methods
void load(ProgramOptions& opt, bool loadKmerTable = true);
void loadTranscriptSequences() const;
+ void loadECsFromFile(const ProgramOptions& opt);
+ void loadTranscriptsFromFile(const ProgramOptions& opt);
void clear();
// positional information
=====================================
src/MinCollector.cpp
=====================================
@@ -1,5 +1,6 @@
#include "MinCollector.h"
#include <algorithm>
+#include <limits>
// utility functions
=====================================
src/PlaintextWriter.cpp
=====================================
@@ -64,6 +64,53 @@ void plaintext_writer(
of.close();
}
+void plaintext_writer_gene(
+ const std::string& out_name,
+ const std::vector<std::string>& targ_ids,
+ const std::vector<double>& alpha,
+ const std::vector<double>& eff_lens,
+ const Transcriptome& model
+){
+
+ std::ofstream of;
+ of.open( out_name );
+
+ if (!of.is_open()) {
+ std::cerr << "Error: Couldn't open file: " << out_name << std::endl;
+
+ exit(1);
+ }
+
+ std::vector<double> gc;
+ std::vector<double> gc_tpm;
+ gc.assign(model.genes.size(), 0.0);
+ gc_tpm.assign(model.genes.size(), 0.0);
+ auto tpm = counts_to_tpm(alpha, eff_lens);
+ for (int i = 0; i < alpha.size(); i++) {
+ if (alpha[i] > 0.0) {
+ int g_id = model.transcripts[i].gene_id;
+ if (g_id != -1) {
+ gc[g_id] += alpha[i];
+ gc_tpm[g_id] += tpm[i];
+ }
+ }
+ }
+
+ of << "gene_id" << "\t"
+ << "gene_name" << "\t"
+ << "est_counts" << "\t"
+ << "tpm" << "\n";
+
+ for (int i = 0; i < gc.size(); i++) {
+ of << model.genes[i].name << '\t'
+ << model.genes[i].commonName << '\t'
+ << gc[i] << '\t'
+ << gc_tpm[i] << "\n";
+ }
+
+ of.close();
+}
+
std::string to_json(const std::string& id, const std::string& val, bool quote,
bool comma, int level) {
std::string out;
@@ -239,7 +286,7 @@ void writeFLD(const std::string &filename, const std::vector<std::pair<double, d
of.close();
}
-void writeGeneList(const std::string &filename, const Transcriptome& model) {
+void writeGeneList(const std::string &filename, const Transcriptome& model, bool writeNamesOnly) {
std::ofstream of;
of.open(filename.c_str(), std::ios::out);
if (!of.is_open()) {
@@ -247,7 +294,11 @@ void writeGeneList(const std::string &filename, const Transcriptome& model) {
exit(1);
}
for (int j = 0; j < model.genes.size(); j++) {
- of << j << "\t" << model.genes[j].name << "\t" << model.genes[j].commonName << "\n";
+ if (writeNamesOnly) {
+ of << model.genes[j].name << "\n";
+ } else {
+ of << j << "\t" << model.genes[j].name << "\t" << model.genes[j].commonName << "\n";
+ }
}
of.close();
}
=====================================
src/PlaintextWriter.h
=====================================
@@ -21,6 +21,14 @@ void plaintext_writer(
const std::vector<int>& lens
);
+void plaintext_writer_gene(
+ const std::string& out_name,
+ const std::vector<std::string>& targ_ids,
+ const std::vector<double>& alpha,
+ const std::vector<double>& eff_lens,
+ const Transcriptome& model
+);
+
std::string to_json(const std::string& id, const std::string& val, bool quote,
bool comma = true, int level = 1);
@@ -53,7 +61,10 @@ void writeECList(
void writeFLD(const std::string &filename, const std::vector<std::pair<double, double>> &flds);
-void writeGeneList(const std::string &filename, const Transcriptome& model);
+void writeGeneList(const std::string &filename, const Transcriptome& model, bool writeNamesOnly = false);
+
+std::vector<double> counts_to_tpm(const std::vector<double>& est_counts,
+ const std::vector<double>& eff_lens);
template<typename T>
void writeSparseBatchMatrix(
=====================================
src/ProcessReads.cpp
=====================================
@@ -26,6 +26,7 @@
#include "BUSData.h"
#include "BUSTools.h"
#include <htslib/kstring.h>
+#include <unordered_set>
void printVector(const std::vector<int>& v, std::ostream& o) {
@@ -352,6 +353,23 @@ void MasterProcessor::processReads() {
}
} else if (opt.bus_mode) {
std::vector<std::thread> workers;
+ parallel_bus_read = opt.threads > 4 && opt.files.size() > opt.busOptions.nfiles && !opt.num && !opt.pseudobam;
+ if (parallel_bus_read) {
+ delete SR;
+ SR = nullptr;
+ assert(opt.files.size() % opt.busOptions.nfiles == 0);
+ int nbatches = opt.files.size() / opt.busOptions.nfiles;
+ std::vector<std::mutex> mutexes(nbatches);
+ parallel_bus_reader_locks.swap(mutexes);
+ for (int i = 0; i < nbatches; i++) {
+ FastqSequenceReader fSR(opt);
+ fSR.files.erase(fSR.files.begin(), fSR.files.begin()+opt.busOptions.nfiles*i);
+ fSR.files.erase(fSR.files.begin()+opt.busOptions.nfiles, fSR.files.end());
+ assert(fSR.files.size() == opt.busOptions.nfiles);
+ FSRs.push_back(std::move(fSR));
+ }
+ }
+
for (int i = 0; i < opt.threads; i++) {
workers.emplace_back(std::thread(BUSProcessor(index,opt,tc,*this)));
}
@@ -375,6 +393,46 @@ void MasterProcessor::processReads() {
}
+ } else if (opt.batch_mode && opt.batch_bus) {
+ std::vector<std::thread> workers;
+ int num_ids = opt.batch_ids.size();
+ int id =0;
+ while (id < num_ids) {
+ // TODO: put in thread pool
+ workers.clear();
+ int nt = std::min(opt.threads, (num_ids - id));
+ if (opt.pseudo_read_files_supplied) { // pseudo_read_files_supplied: read using SR (no batch identifier) rather than batchSR
+ for (int i = 0; i < opt.threads; i++) {
+ workers.emplace_back(std::thread(BUSProcessor(index, opt, tc, *this, id,0)));
+ }
+ for (int i = 0; i < opt.threads; i++) {
+ workers[i].join();
+ }
+ id++;
+ assert(id == num_ids);
+ } else {
+ for (int i = 0; i < nt; i++,id++) {
+ workers.emplace_back(std::thread(BUSProcessor(index, opt, tc, *this, id,i)));
+ }
+ // let the workers do their thing
+ for (int i = 0; i < nt; i++) {
+ workers[i].join(); //wait for them to finish
+ }
+ }
+ }
+
+ // now handle the modification of the mincollector
+ for (int i = 0; i < bus_ecmap.size(); i++) {
+ auto &u = bus_ecmap[i];
+ int ec = index.ecmapinv.size();
+ auto it = bus_ecmapinv.find(u);
+ if (it->second != ec) {
+ std::cout << "Error" << std::endl;
+ exit(1);
+ }
+ index.ecmapinv.insert({u,ec});
+ index.ecmap.push_back(u);
+ }
} else if (opt.batch_mode) {
std::vector<std::thread> workers;
int num_ids = opt.batch_ids.size();
@@ -385,13 +443,24 @@ void MasterProcessor::processReads() {
workers.clear();
int nt = std::min(opt.threads, (num_ids - id));
int first_id = id;
- for (int i = 0; i < nt; i++,id++) {
- tmp_bc[i].assign(tc.counts.size(), 0);
- workers.emplace_back(std::thread(ReadProcessor(index, opt, tc, *this, id,i)));
- }
-
- for (int i = 0; i < nt; i++) {
- workers[i].join();
+ if (opt.pseudo_read_files_supplied) {
+ for (int i = 0; i < opt.threads; i++) {
+ tmp_bc[i].assign(tc.counts.size(), 0);
+ workers.emplace_back(std::thread(ReadProcessor(index, opt, tc, *this, id, 0)));
+ }
+ for (int i = 0; i < opt.threads; i++) {
+ workers[i].join();
+ }
+ id++;
+ assert(id == num_ids);
+ } else {
+ for (int i = 0; i < nt; i++,id++) {
+ tmp_bc[i].assign(tc.counts.size(), 0);
+ workers.emplace_back(std::thread(ReadProcessor(index, opt, tc, *this, id,i)));
+ }
+ for (int i = 0; i < nt; i++) {
+ workers[i].join();
+ }
}
if (!opt.umi) {
@@ -465,7 +534,7 @@ void MasterProcessor::processReads() {
}
int ec = tc.findEC(t.first);
assert(ec != -1);
- ++tmp_counts[ec];
+ tmp_counts[ec] += t.second;
}
auto& bc = batchCounts[id];
for (int j = 0; j < tmp_counts.size(); j++) {
@@ -531,7 +600,7 @@ void MasterProcessor::processReads() {
if (opt.pseudobam) {
pseudobatchf_out.close();
}
- if (opt.bus_mode) {
+ if (opt.bus_mode || opt.batch_bus) {
busf_out.close();
}
}
@@ -737,7 +806,7 @@ void MasterProcessor::update(const std::vector<int>& c, const std::vector<std::v
// acquire the writer lock
std::lock_guard<std::mutex> lock(this->writer_lock);
- if (!opt.batch_mode) {
+ if (!opt.batch_mode || opt.batch_bus) {
for (int i = 0; i < c.size(); i++) {
tc.counts[i] += c[i]; // add up ec counts
nummapped += c[i];
@@ -756,7 +825,7 @@ void MasterProcessor::update(const std::vector<int>& c, const std::vector<std::v
}
}
- if (!opt.batch_mode) {
+ if (!opt.batch_mode || opt.batch_bus) {
for(auto &u : newEcs) {
++newECcount[u];
}
@@ -829,7 +898,7 @@ void MasterProcessor::update(const std::vector<int>& c, const std::vector<std::v
}
}
- if (opt.bus_mode) {
+ if (opt.bus_mode || opt.batch_bus) {
int bus_bc_sum = 0;
int bus_umi_sum = 0;
for (int i = 0; i <= 32; i++) {
@@ -978,7 +1047,7 @@ void ReadProcessor::operator()() {
while (true) {
int readbatch_id;
// grab the reader lock
- if (mp.opt.batch_mode) {
+ if (mp.opt.batch_mode && !mp.opt.pseudo_read_files_supplied) {
if (batchSR.empty()) {
return;
} else {
@@ -1294,12 +1363,23 @@ BUSProcessor::BUSProcessor(const KmerIndex& index, const ProgramOptions& opt, co
// initialize buffer
bufsize = mp.bufsize;
buffer = new char[bufsize];
+ if (opt.batch_mode) {
+ assert(id != -1);
+ batchSR.files = opt.batch_files[id];
+ batchSR.nfiles = opt.batch_files[id].size();
+ batchSR.reserveNfiles(opt.batch_files[id].size());
+ if (opt.umi) {
+ batchSR.umi_files = {opt.umi_files[id]};
+ }
+ batchSR.paired = !opt.single_end;
+ }
+
seqs.reserve(bufsize/50);
newEcs.reserve(1000);
bv.reserve(1000);
counts.reserve((int) (tc.counts.size() * 1.25));
- memset(&bc_len[0],0,33);
- memset(&umi_len[0],0,33);
+ memset(&bc_len[0],0,sizeof(bc_len));
+ memset(&umi_len[0],0,sizeof(umi_len));
clear();
}
@@ -1323,9 +1403,10 @@ BUSProcessor::BUSProcessor(BUSProcessor && o) :
flens(std::move(o.flens)),
bias5(std::move(o.bias5)),
bv(std::move(o.bv)),
+ batchSR(std::move(o.batchSR)),
counts(std::move(o.counts)) {
- memcpy(&bc_len[0], &o.bc_len[0], 33);
- memcpy(&umi_len[0], &o.umi_len[0], 33);
+ memcpy(&bc_len[0], &o.bc_len[0], sizeof(bc_len));
+ memcpy(&umi_len[0], &o.umi_len[0], sizeof(umi_len));
buffer = o.buffer;
o.buffer = nullptr;
o.bufsize = 0;
@@ -1339,22 +1420,42 @@ BUSProcessor::~BUSProcessor() {
}
void BUSProcessor::operator()() {
+ uint64_t parallel_bus_read_counter = 0;
+ std::unordered_set<int> parallel_bus_read_empty;
while (true) {
int readbatch_id;
+ std::vector<std::string> umis;
// grab the reader lock
- {
+ if (mp.opt.batch_mode && !mp.opt.pseudo_read_files_supplied) {
+ if (batchSR.empty()) {
+ return;
+ } else {
+ batchSR.fetchSequences(buffer, bufsize, seqs, names, quals, flags, umis, readbatch_id, mp.opt.pseudobam );
+ }
+ } else if (mp.opt.bus_mode && mp.parallel_bus_read) {
+ int nbatches = mp.opt.files.size() / mp.opt.busOptions.nfiles;
+ int i = parallel_bus_read_counter % nbatches;
+ if (parallel_bus_read_empty.size() >= nbatches) {
+ return;
+ }
+ parallel_bus_read_counter++;
+ std::lock_guard<std::mutex> lock(mp.parallel_bus_reader_locks[i]);
+ if (mp.FSRs[i].empty()) {
+ parallel_bus_read_empty.emplace(i);
+ continue;
+ }
+ mp.FSRs[i].fetchSequences(buffer, bufsize, seqs, names, quals, flags, umis, readbatch_id, mp.opt.pseudobam || mp.opt.fusion);
+ } else {
std::lock_guard<std::mutex> lock(mp.reader_lock);
if (mp.SR->empty()) {
// nothing to do
return;
} else {
// get new sequences
- std::vector<std::string> umis;
- mp.SR->fetchSequences(buffer, bufsize, seqs, names, quals, flags, umis, readbatch_id, false);
+ mp.SR->fetchSequences(buffer, bufsize, seqs, names, quals, flags, umis, readbatch_id, mp.opt.pseudobam || mp.opt.fusion);
}
// release the reader lock
}
- // do the same for BUS ?!?
pseudobatch.aln.clear();
pseudobatch.batch_id = readbatch_id;
@@ -1377,12 +1478,32 @@ void BUSProcessor::processBuffer() {
u.reserve(1000);
v.reserve(1000);
+ v2.reserve(1000);
vtmp.reserve(1000);
- memset(&bc_len[0], 0, 33);
- memset(&umi_len[0], 0, 33);
+ memset(&bc_len[0], 0, sizeof(bc_len));
+ memset(&umi_len[0], 0, sizeof(umi_len));
const BUSOptions& busopt = mp.opt.busOptions;
+ bool bulk_like = mp.opt.batch_bus || busopt.umi[0].fileno == -1; // Treat like bulk: no UMI
+
+ auto &tcount = mp.tlencount;
+ if (mp.opt.batch_bus) {
+ tcount = mp.tlencounts[id];
+ }
+ bool findFragmentLength = tcount < 10000 && busopt.paired;
+ int flengoal = 0;
+ flens.clear();
+ if (findFragmentLength) {
+ flengoal = (10000 - tcount);
+ if (flengoal <= 0) {
+ findFragmentLength = false;
+ flengoal = 0;
+ } else {
+ flens.resize(tc.flens.size(), 0);
+ }
+ }
+
char buffer[100];
memset(buffer, 0, 100);
@@ -1409,6 +1530,15 @@ void BUSProcessor::processBuffer() {
bool singleSeq = busopt.seq.size() ==1 ;
const BUSOptionSubstr seqopt = busopt.seq.front();
+ bool check_tag_sequence = !mp.opt.tagsequence.empty();
+ int taglen = 0;
+ uint64_t tag_binary = 0;
+ int hamming_dist = 1;
+ if (check_tag_sequence) {
+ taglen = mp.opt.tagsequence.length();
+ uint32_t f = 0;
+ tag_binary = stringToBinary(mp.opt.tagsequence, f);
+ }
//int incf = (bam) ? 1 : busopt.nfiles-1;
@@ -1421,42 +1551,97 @@ void BUSProcessor::processBuffer() {
// find where the sequence is
const char *seq = nullptr;
+ const char *seq2 = nullptr;
+
+ bool ignore_umi = false;
+
+ // copy the umi
+ int ulen = 0;
+ bool bad_umi = false;
+ if (bulk_like) {
+ ulen = 1;
+ umi[0] = 'A';
+ umi[ulen] = 0;
+ ignore_umi = true;
+ } else {
+ for (auto &umic : busopt.umi) {
+ int umilen = (0 == umic.stop) ? l[umic.fileno] - umic.start : umic.stop - umic.start;
+ if (l[umic.fileno] < umic.start + umilen || umilen <= 0) {
+ bad_umi = true;
+ break;
+ }
+ if (check_tag_sequence && ulen == 0) {
+ umilen += taglen; // Expand to include tag sequence
+ memcpy(umi, s[umic.fileno] + umic.start - taglen, umilen);
+ } else {
+ memcpy(umi+ulen, s[umic.fileno] + umic.start, umilen);
+ }
+ ulen += umilen;
+ }
+ if (bad_umi) {
+ continue;
+ }
+ umi[ulen] = 0;
+ }
+
+ uint64_t umi_binary = -1;
+ if (check_tag_sequence) {
+ uint32_t f = 0;
+ umi_binary = stringToBinary(umi, ulen, f);
+ if (hamming(tag_binary, umi_binary >> 2*(ulen-taglen), taglen) <= hamming_dist) { // if tag present
+ umi_binary = umi_binary & ~(~0ULL << (2*(ulen - taglen)));
+ ulen = ulen - taglen;
+ if (ulen <= 32) {
+ umi_len[ulen]++;
+ }
+ } else {
+ ignore_umi = true; // tag not present, it's not a UMI-read
+ umi_binary = -1;
+ }
+ } else {
+ if (ulen <= 32) {
+ umi_len[ulen]++;
+ }
+ }
size_t seqlen = 0;
+ size_t seqlen2 = 0;
if (singleSeq) {
- seq = s[seqopt.fileno] + seqopt.start;
- seqlen = l[seqopt.fileno];
+ int seqstart = (ignore_umi && busopt.umi[0].fileno == seqopt.fileno ? busopt.umi[0].start - taglen : seqopt.start);
+ seq = s[seqopt.fileno] + seqstart;
+ seqlen = (seqopt.stop == 0) ? l[seqopt.fileno]-seqstart : seqopt.stop - seqstart;
+ } else if (busopt.paired) {
+ const auto &sopt1 = busopt.seq[0];
+ const auto &sopt2 = busopt.seq[1];
+ int seqstart1 = (ignore_umi && busopt.umi[0].fileno == sopt1.fileno ? busopt.umi[0].start - taglen : sopt1.start);
+ int seqstart2 = (ignore_umi && busopt.umi[0].fileno == sopt2.fileno ? busopt.umi[0].start - taglen : sopt2.start);
+ int cplen1 = (sopt1.stop == 0) ? l[sopt1.fileno]-seqstart1 : sopt1.stop - seqstart1;
+ int cplen2 = (sopt2.stop == 0) ? l[sopt2.fileno]-seqstart2 : sopt2.stop - seqstart2;
+ seq = s[sopt1.fileno] + seqstart1;
+ seq2 = s[sopt2.fileno] + seqstart2;
+ seqlen = cplen1;
+ seqlen2 = cplen2;
} else {
seqbuffer.clear();
for (int j = 0; j < busopt.seq.size(); j++) {
const auto &sopt = busopt.seq[j];
- int cplen = (sopt.stop == 0) ? l[sopt.fileno]-sopt.start : sopt.stop - sopt.start;
- seqbuffer.append(s[sopt.fileno] + sopt.start, cplen);
- seqbuffer.push_back('N');
+ int seqstart = (ignore_umi && busopt.umi[0].fileno == sopt.fileno ? busopt.umi[0].start - taglen : sopt.start);
+ int cplen = (sopt.stop == 0) ? l[sopt.fileno]-seqstart : sopt.stop - seqstart;
+ seqbuffer.append(s[sopt.fileno] + seqstart, cplen);
+ seqbuffer.push_back('N');
}
seqbuffer.push_back(0);
seq = seqbuffer.c_str();
seqlen = seqbuffer.size();
}
- // copy the umi
- int umilen = (busopt.umi.start == busopt.umi.stop) ? l[busopt.umi.fileno] - busopt.umi.start : busopt.umi.stop - busopt.umi.start;
- if (l[busopt.umi.fileno] < busopt.umi.start + umilen) {
- continue; // too short
- }
- memcpy(umi, s[busopt.umi.fileno] + busopt.umi.start, umilen);
- umi[umilen] = 0;
- if (umilen >= 0 && umilen <= 32) {
- umi_len[umilen]++;
- }
-
// auto &bcc = busopt.bc[0];
int blen = 0;
bool bad_bc = false;
for (auto &bcc : busopt.bc) {
- int bclen = (bcc.start == bcc.stop) ? l[bcc.fileno] - bcc.start : bcc.stop - bcc.start;
- if (l[bcc.fileno] < bcc.start + bclen) {
+ int bclen = (0 == bcc.stop) ? l[bcc.fileno] - bcc.start : bcc.stop - bcc.start;
+ if (l[bcc.fileno] < bcc.start + bclen || bclen <= 0) {
bad_bc = true;
break;
}
@@ -1466,6 +1651,11 @@ void BUSProcessor::processBuffer() {
if (bad_bc) {
continue;
}
+ if (mp.opt.batch_bus) {
+ ignore_umi = true;
+ blen = BUSFORMAT_FAKE_BARCODE_LEN;
+ memcpy(bc, binaryToString(id, blen).c_str(), blen); // Create fake barcode that identifies the batch
+ }
bc[blen] = 0;
if (blen >= 0 && blen <= 32) {
@@ -1483,14 +1673,74 @@ void BUSProcessor::processBuffer() {
// process 2nd read
index.match(seq,seqlen, v);
+ if (busopt.paired) {
+ v2.clear();
+ index.match(seq2,seqlen2, v2);
+ }
// collect the target information
int ec = -1;
- int r = tc.intersectKmers(v, v2, false,u);
+ int r = tc.intersectKmers(v, v2, !busopt.paired,u);
if (!u.empty()) {
ec = tc.findEC(u);
}
+ if ((!ignore_umi || bulk_like) && mp.opt.strand_specific && !u.empty()) { // Strand-specificity
+ int p = -1;
+ Kmer km;
+ KmerEntry val;
+ if (!v.empty()) {
+ vtmp.clear();
+ bool firstStrand = (mp.opt.strand == ProgramOptions::StrandType::FR); // FR have first read mapping forward
+ p = findFirstMappingKmer(v,val);
+ km = Kmer((seq+p));
+ bool strand = (val.isFw() == (km == km.rep())); // k-mer maps to fw strand?
+ // might need to optimize this
+ const auto &c = index.dbGraph.contigs[val.contig];
+ for (auto tr : u) {
+ for (auto ctx : c.transcripts) {
+ if (tr == ctx.trid) {
+ if ((strand == ctx.sense) == firstStrand) {
+ // swap out
+ vtmp.push_back(tr);
+ }
+ break;
+ }
+ }
+ }
+ if (vtmp.size() < u.size()) {
+ u = vtmp; // copy
+ }
+ }
+ if (!v2.empty()) {
+ vtmp.clear();
+ bool secondStrand = (mp.opt.strand == ProgramOptions::StrandType::RF);
+ p = findFirstMappingKmer(v2,val);
+ km = Kmer((seq2+p));
+ bool strand = (val.isFw() == (km == km.rep())); // k-mer maps to fw strand?
+ // might need to optimize this
+ const auto &c = index.dbGraph.contigs[val.contig];
+ for (auto tr : u) {
+ for (auto ctx : c.transcripts) {
+ if (tr == ctx.trid) {
+ if ((strand == ctx.sense) == secondStrand) {
+ // swap out
+ vtmp.push_back(tr);
+ }
+ break;
+ }
+ }
+ }
+ if (vtmp.size() < u.size()) {
+ u = vtmp; // copy
+ }
+ }
+ ec = -1;
+ if (!u.empty()) { // update the ec
+ ec = tc.findEC(u);
+ }
+ }
+
// find the ec
if (!u.empty()) {
BUSData b;
@@ -1498,15 +1748,26 @@ void BUSProcessor::processBuffer() {
b.flags = 0;
b.barcode = stringToBinary(bc, blen, f);
b.flags |= f;
- b.UMI = stringToBinary(umi, umilen, f);
+ b.UMI = check_tag_sequence || bulk_like ? umi_binary : stringToBinary(umi, ulen, f);
b.flags |= (f) << 8;
b.count = 1;
//std::cout << std::string(s1,10) << "\t" << b.barcode << "\t" << std::string(s1+10,16) << "\t" << b.UMI << "\n";
if (num) {
b.flags = (uint32_t) flags[i / jmax];
}
-
+
//ec = tc.findEC(u);
+
+ if (busopt.paired && ignore_umi) {
+ if (findFragmentLength && flengoal > 0 && 0 <= ec && ec < index.num_trans && !v.empty() && !v2.empty()) {
+ // try to map the reads
+ int tl = index.mapPair(seq, seqlen, seq2, seqlen2, ec);
+ if (0 < tl && tl < flens.size()) {
+ flens[tl]++;
+ flengoal--;
+ }
+ }
+ }
// count the pseudoalignment
if (ec == -1 || ec >= counts.size()) {
@@ -1524,16 +1785,20 @@ void BUSProcessor::processBuffer() {
if (mp.opt.pseudobam) {
PseudoAlignmentInfo info;
- info.id = i;
- info.paired = false;
+ info.id = i / jmax;
+ info.paired = busopt.paired;
uint32_t f = 0;
info.barcode = stringToBinary(bc, blen, f);
- info.UMI = stringToBinary(umi, umilen, f);
+ info.UMI = check_tag_sequence ? umi_binary : stringToBinary(umi, ulen, f);
if (!u.empty()) {
info.r1empty = v.empty();
KmerEntry val;
info.k1pos = findFirstMappingKmer(v,val);
info.k2pos = -1;
+ if (busopt.paired) {
+ info.r2empty = v2.empty();
+ info.k2pos = findFirstMappingKmer(v2,val);
+ }
if (ec != -1) {
info.ec_id = ec;
@@ -1670,7 +1935,11 @@ void AlnProcessor::operator()() {
mp.SR->fetchSequences(buffer, bufsize, seqs, names, quals, flags, umis, readbatch_id, true);
readPseudoAlignmentBatch(mp.pseudobatchf_in, pseudobatch);
assert(pseudobatch.batch_id == readbatch_id);
- assert(pseudobatch.aln.size() == ((paired) ? seqs.size()/2 : seqs.size())); // sanity checks
+ if (mp.opt.bus_mode) {
+ assert(pseudobatch.aln.size() == seqs.size()/mp.opt.busOptions.nfiles); // sanity checks
+ } else {
+ assert(pseudobatch.aln.size() == ((paired) ? seqs.size()/2 : seqs.size())); // sanity checks
+ }
}
// release the reader lock
}
@@ -1689,7 +1958,6 @@ void AlnProcessor::operator()() {
void AlnProcessor::processBufferTrans() {
-
/* something simple where we can construct the bam records */
std::vector<bam1_t> bv;
@@ -2091,7 +2359,6 @@ void AlnProcessor::processBufferTrans() {
void AlnProcessor::processBufferGenome() {
-
/* something simple where we can construct the bam records */
std::vector<bam1_t> bv;
int idnum = 0;
@@ -2110,14 +2377,21 @@ void AlnProcessor::processBufferGenome() {
genome_auxlen -= 7;
}
if (mp.opt.bus_mode) {
- //todo replace with what is written to busfile
bclen = mp.opt.busOptions.getBCLength();
if (bclen == 0) {
- bclen = 32;
+ for (int i = 0; i <= 32; i++) {
+ if (mp.bus_bc_len[i] > mp.bus_bc_len[bclen]) {
+ bclen = i;
+ }
+ }
}
umilen = mp.opt.busOptions.getUMILength();
if (umilen == 0) {
- umilen = 32;
+ for (int i = 0; i <= 32; i++) {
+ if (mp.bus_umi_len[i] > mp.bus_umi_len[umilen]) {
+ umilen = i;
+ }
+ }
}
genome_auxlen += (bclen + umilen) + 8;
@@ -2129,28 +2403,31 @@ void AlnProcessor::processBufferGenome() {
KmerEntry val1, val2;
if (mp.opt.bus_mode) {
paired = false;
+ if (mp.opt.busOptions.paired) {
+ paired = true;
+ }
}
- bool readpaired = paired || (mp.opt.bus_mode && mp.opt.busOptions.nfiles == 2);
-
-
for (int i = 0; i < n; i++) {
bam1_t b1,b2, b1c, b2c;
- int si1 = (readpaired) ? 2*i : i;
- int si2 = (readpaired) ? 2*i +1 : -1;
-
+ int si1 = (paired) ? 2*i : i;
+ int si2 = (paired) ? 2*i +1 : -1;
+ // For BUS file processing with paired-end reads:
+ if (mp.opt.bus_mode) {
+ const BUSOptions& busopt = mp.opt.busOptions;
+ si1 = i*busopt.nfiles + busopt.seq[0].fileno;
+ if (paired) {
+ si2 = i*busopt.nfiles + busopt.seq[1].fileno;
+ }
+ }
int rlen1 = seqs[si1].second;
int rlen2;
- if (readpaired) {
+ int seq1_offset = 0;
+ int seq2_offset = 0;
+ if (paired) {
rlen2 = seqs[si2].second;
}
- if (mp.opt.busOptions.seq.front().fileno == 1) {
- std::swap(si1,si2);
- std::swap(rlen1,rlen2);
- }
-
-
// fill in the bam core info
b1.core.tid = -1;
b1.core.pos = -1;
@@ -2176,16 +2453,42 @@ void AlnProcessor::processBufferGenome() {
}
PseudoAlignmentInfo &pi = pseudobatch.aln[i];
-
- // fill in the data info
- fillBamRecord(b1, nullptr, seqs[si1].first, names[si1].first, quals[si1].first, seqs[si1].second, names[si1].second, pi.r1empty, genome_auxlen);
- //b1.id = idnum++;
- if (paired) {
- fillBamRecord(b2, nullptr, seqs[si2].first, names[si2].first, quals[si2].first, seqs[si2].second, names[si2].second, pi.r2empty, genome_auxlen);
- // b2.id = idnum++;
- }
- if (mp.opt.bus_mode) {
+ if (!mp.opt.bus_mode) {
+ // fill in the data info
+ fillBamRecord(b1, nullptr, seqs[si1].first, names[si1].first, quals[si1].first, seqs[si1].second, names[si1].second, pi.r1empty, genome_auxlen);
+ //b1.id = idnum++;
+ if (paired) {
+ fillBamRecord(b2, nullptr, seqs[si2].first, names[si2].first, quals[si2].first, seqs[si2].second, names[si2].second, pi.r2empty, genome_auxlen);
+ // b2.id = idnum++;
+ }
+ }
+ else {
+ bool tag_present = !mp.opt.tagsequence.empty() && pi.UMI != -1; // tag+UMI present in the sequence
+ bool umi_first_file = mp.opt.busOptions.umi[0].fileno == mp.opt.busOptions.seq[0].fileno; // UMI-containing file is the first file (file number 0)
+ int tagseqstart;
+ if (tag_present) {
+ tagseqstart = umi_first_file ? mp.opt.busOptions.seq[0].start : mp.opt.busOptions.seq[1].start; // Position where the actual sequence begins when tag+UMI present
+ }
+
+ // fill in the data info
+ if (tag_present && umi_first_file) {
+ rlen1 = seqs[si1].second-tagseqstart;
+ seq1_offset = tagseqstart;
+ fillBamRecord(b1, nullptr, seqs[si1].first+tagseqstart, names[si1].first, quals[si1].first+tagseqstart, seqs[si1].second-tagseqstart, names[si1].second, pi.r1empty, genome_auxlen);
+ } else {
+ fillBamRecord(b1, nullptr, seqs[si1].first, names[si1].first, quals[si1].first, seqs[si1].second, names[si1].second, pi.r1empty, genome_auxlen);
+ }
+ if (paired) {
+ if (tag_present && !umi_first_file) {
+ rlen2 = seqs[si2].second-tagseqstart;
+ seq2_offset = tagseqstart;
+ fillBamRecord(b2, nullptr, seqs[si2].first+tagseqstart, names[si2].first, quals[si2].first+tagseqstart, seqs[si2].second-tagseqstart, names[si2].second, pi.r2empty, genome_auxlen);
+ } else {
+ fillBamRecord(b2, nullptr, seqs[si2].first, names[si2].first, quals[si2].first, seqs[si2].second, names[si2].second, pi.r2empty, genome_auxlen);
+ }
+ }
+
b1.data[b1.l_data] = 'C';
b1.data[b1.l_data+1] = 'R';
b1.data[b1.l_data+2] = 'Z';
@@ -2196,9 +2499,25 @@ void AlnProcessor::processBufferGenome() {
b1.data[b1.l_data] = 'U';
b1.data[b1.l_data+1] = 'R';
b1.data[b1.l_data+2] = 'Z';
- umi = binaryToString(pi.UMI, umilen);
+ if (!mp.opt.tagsequence.empty() && pi.UMI == -1) { // Non-UMI read
+ umi = std::string(umilen, 'N');
+ } else {
+ umi = binaryToString(pi.UMI, umilen);
+ }
memcpy(b1.data + b1.l_data + 3, umi.c_str(), umilen+2);
b1.l_data += umilen + 4;
+ if (paired) {
+ b2.data[b2.l_data] = 'C';
+ b2.data[b2.l_data+1] = 'R';
+ b2.data[b2.l_data+2] = 'Z';
+ memcpy(b2.data + b2.l_data + 3, bc.c_str(), bclen+1);
+ b2.l_data += bclen + 4;
+ b2.data[b2.l_data] = 'U';
+ b2.data[b2.l_data+1] = 'R';
+ b2.data[b2.l_data+2] = 'Z';
+ memcpy(b2.data + b2.l_data + 3, umi.c_str(), umilen+2);
+ b2.l_data += umilen + 4;
+ }
}
if (pi.r1empty && pi.r2empty) {
@@ -2334,11 +2653,11 @@ void AlnProcessor::processBufferGenome() {
std::pair<bool,bool> strInfo1 = {true,true}, strInfo2 = {true,true};
if (!pi.r1empty) {
- km1 = Kmer(seqs[si1].first + pi.k1pos);
+ km1 = Kmer(seqs[si1].first + seq1_offset + pi.k1pos);
strInfo1 = strandednessInfo(km1, val1, ua);
}
if (paired && !pi.r2empty) {
- km2 = Kmer(seqs[si2].first + pi.k2pos);
+ km2 = Kmer(seqs[si2].first + seq2_offset + pi.k2pos);
strInfo2 = strandednessInfo(km2, val2, ua);
}
@@ -2459,7 +2778,7 @@ void AlnProcessor::processBufferGenome() {
b1c.core.tid = tra.first.chr;
if (!pi.r1empty) {
b1c.core.pos = tra.first.chrpos;
- b1c.core.bin = hts_reg2bin(b1c.core.pos, b1c.core.pos + seqs[si1].second-1, 14, 5);
+ b1c.core.bin = hts_reg2bin(b1c.core.pos, b1c.core.pos + seqs[si1].second-seq1_offset-1, 14, 5);
b1c.core.qual = 255;
if (useEM) {
memcpy(b1c.data + b1c.l_data - 4, &prob, 4); // set ZW tag
@@ -2470,7 +2789,7 @@ void AlnProcessor::processBufferGenome() {
b2c.core.tid = tra.second.chr;
if (!pi.r2empty) {
b2c.core.pos = tra.second.chrpos;
- b2c.core.bin = hts_reg2bin(b2c.core.pos, b2c.core.pos + seqs[si2].second, 14, 5);
+ b2c.core.bin = hts_reg2bin(b2c.core.pos, b2c.core.pos + seqs[si2].second-seq2_offset, 14, 5);
b2c.core.qual = 255;
if (useEM) {
memcpy(b2c.data + b2c.l_data - 4, &prob, 4);
@@ -2914,7 +3233,7 @@ bool FastqSequenceReader::fetchSequences(char *buf, const int limit, std::vector
}
numreads++;
- flags.push_back(numreads);
+ flags.push_back(numreads-1);
} else {
return true; // read it next time
}
=====================================
src/ProcessReads.h
=====================================
@@ -179,8 +179,6 @@ public:
}
if (opt.batch_mode) {
- memset(&bus_bc_len[0],0,33);
- memset(&bus_umi_len[0],0,33);
batchCounts.assign(opt.batch_ids.size(), {});
tlencounts.assign(opt.batch_ids.size(), 0);
batchFlens.assign(opt.batch_ids.size(), std::vector<int>(1000,0));
@@ -198,10 +196,16 @@ public:
if (opt.pseudobam) {
pseudobatchf_out.open(opt.output + "/pseudoaln.bin", std::ios::out | std::ios::binary);
}
- if (opt.bus_mode) {
+ if (opt.bus_mode || opt.batch_bus) {
+ memset(&bus_bc_len[0],0,sizeof(bus_bc_len));
+ memset(&bus_umi_len[0],0,sizeof(bus_umi_len));
busf_out.open(opt.output + "/output.bus", std::ios::out | std::ios::binary);
- writeBUSHeader(busf_out, opt.busOptions.getBCLength(), opt.busOptions.getUMILength());
+ if (opt.batch_bus) {
+ writeBUSHeader(busf_out, BUSFORMAT_FAKE_BARCODE_LEN, 1);
+ } else {
+ writeBUSHeader(busf_out, opt.busOptions.getBCLength(), opt.busOptions.getUMILength());
+ }
}
}
@@ -227,10 +231,13 @@ public:
}
std::mutex reader_lock;
+ std::vector<std::mutex> parallel_bus_reader_locks;
+ bool parallel_bus_read;
std::mutex writer_lock;
SequenceReader *SR;
+ std::vector<FastqSequenceReader> FSRs;
MinCollector& tc;
KmerIndex& index;
const Transcriptome& model;
@@ -335,6 +342,7 @@ public:
int id;
int local_id;
PseudoAlignmentBatch pseudobatch;
+ FastqSequenceReader batchSR;
int bc_len[33];
int umi_len[33];
=====================================
src/common.h
=====================================
@@ -1,7 +1,7 @@
#ifndef KALLISTO_COMMON_H
#define KALLISTO_COMMON_H
-#define KALLISTO_VERSION "0.46.2"
+#define KALLISTO_VERSION "0.48.0"
#include <string>
#include <vector>
@@ -22,9 +22,11 @@ struct BUSOptionSubstr {
struct BUSOptions {
int nfiles;
- BUSOptionSubstr umi;
+ std::vector<BUSOptionSubstr> umi;
std::vector<BUSOptionSubstr> bc;
std::vector<BUSOptionSubstr> seq;
+
+ bool paired;
int getBCLength() const {
int r =0 ;
@@ -32,6 +34,8 @@ struct BUSOptions {
for (auto& b : bc) {
if (b.start < 0) {
return 0;
+ } else if (b.stop == 0) {
+ return 0;
} else {
r += b.stop - b.start;
}
@@ -41,11 +45,19 @@ struct BUSOptions {
}
int getUMILength() const {
- if (umi.start >= 0) {
- return umi.stop - umi.start;
- } else {
- return 0;
+ int r =0 ;
+ if (!umi.empty()) {
+ for (auto& u : umi) {
+ if (u.start < 0) {
+ return 0;
+ } else if (u.stop == 0) {
+ return 0;
+ } else {
+ r += u.stop - u.start;
+ }
+ }
}
+ return r;
}
};
@@ -64,6 +76,7 @@ struct ProgramOptions {
int bootstrap;
std::vector<std::string> transfasta;
bool batch_mode;
+ bool pseudo_read_files_supplied;
bool bus_mode;
BUSOptions busOptions;
bool pseudo_quant;
@@ -87,6 +100,8 @@ struct ProgramOptions {
enum class StrandType {None, FR, RF};
StrandType strand;
bool umi;
+ bool batch_bus_write;
+ bool batch_bus;
std::string gfa; // used for inspect
bool inspect_thorough;
bool single_overhang;
@@ -94,6 +109,12 @@ struct ProgramOptions {
std::string chromFile;
std::string bedFile;
std::string technology;
+ std::string tagsequence;
+ std::string tccFile;
+ std::string ecFile;
+ std::string fldFile;
+ std::string transcriptsFile;
+ std::string genemap;
ProgramOptions() :
verbose(false),
@@ -107,6 +128,7 @@ ProgramOptions() :
min_range(1),
bootstrap(0),
batch_mode(false),
+ pseudo_read_files_supplied(false),
bus_mode(false),
pseudo_quant(false),
bam(false),
@@ -123,6 +145,8 @@ ProgramOptions() :
fusion(false),
strand(StrandType::None),
umi(false),
+ batch_bus_write(false),
+ batch_bus(false),
inspect_thorough(false),
single_overhang(false)
{}
=====================================
src/main.cpp
=====================================
@@ -8,6 +8,7 @@
#include <thread>
#include <time.h>
#include <algorithm>
+#include <limits>
#include <cstdio>
@@ -379,23 +380,118 @@ void ParseOptionsEMOnly(int argc, char **argv, ProgramOptions& opt) {
}
}
+void ParseOptionsTCCQuant(int argc, char **argv, ProgramOptions& opt) {
+ const char *opt_string = "o:i:T:e:f:l:s:t:g:G:b:d:";
+ static struct option long_options[] = {
+ {"index", required_argument, 0, 'i'},
+ {"txnames", required_argument, 0, 'T'},
+ {"threads", required_argument, 0, 't'},
+ {"fragment-file", required_argument, 0, 'f'},
+ {"fragment-length", required_argument, 0, 'l'},
+ {"sd", required_argument, 0, 's'},
+ {"output-dir", required_argument, 0, 'o'},
+ {"ec-file", required_argument, 0, 'e'},
+ {"genemap", required_argument, 0, 'g'},
+ {"gtf", required_argument, 0, 'G'},
+ {"bootstrap-samples", required_argument, 0, 'b'},
+ {"seed", required_argument, 0, 'd'},
+ {0,0,0,0}
+ };
+ int c;
+ int option_index = 0;
+ while (true) {
+ c = getopt_long(argc,argv,opt_string, long_options, &option_index);
+
+ if (c == -1) {
+ break;
+ }
+
+ switch (c) {
+ case 0:
+ break;
+ case 't': {
+ stringstream(optarg) >> opt.threads;
+ break;
+ }
+ case 'f': {
+ stringstream(optarg) >> opt.fldFile;
+ break;
+ }
+ case 'l': {
+ stringstream(optarg) >> opt.fld;
+ break;
+ }
+ case 's': {
+ stringstream(optarg) >> opt.sd;
+ break;
+ }
+ case 'o': {
+ opt.output = optarg;
+ break;
+ }
+ case 'i': {
+ opt.index = optarg;
+ break;
+ }
+ case 'e': {
+ opt.ecFile = optarg;
+ break;
+ }
+ case 'g': {
+ opt.genemap = optarg;
+ break;
+ }
+ case 'G': {
+ opt.gtfFile = optarg;
+ break;
+ }
+ case 'b': {
+ stringstream(optarg) >> opt.bootstrap;
+ break;
+ }
+ case 'd': {
+ stringstream(optarg) >> opt.seed;
+ break;
+ }
+ case 'T': {
+ opt.transcriptsFile = optarg;
+ break;
+ }
+ default: break;
+ }
+ }
+ for (int i = optind; i < argc; i++) {
+ opt.files.push_back(argv[i]);
+ }
+ if (opt.files.size() > 0) {
+ opt.tccFile = opt.files[0];
+ }
+}
+
void ParseOptionsPseudo(int argc, char **argv, ProgramOptions& opt) {
int verbose_flag = 0;
int single_flag = 0;
int strand_flag = 0;
+ int strand_FR_flag = 0;
+ int strand_RF_flag = 0;
int pbam_flag = 0;
int gbam_flag = 0;
int umi_flag = 0;
int quant_flag = 0;
+ int bus_flag = 0;
- const char *opt_string = "t:i:l:s:o:b:u:g:";
+ const char *opt_string = "t:i:l:s:o:b:u:g:n";
static struct option long_options[] = {
// long args
{"verbose", no_argument, &verbose_flag, 1},
{"single", no_argument, &single_flag, 1},
//{"strand-specific", no_argument, &strand_flag, 1},
+ {"fr-stranded", no_argument, &strand_FR_flag, 1},
+ {"rf-stranded", no_argument, &strand_RF_flag, 1},
{"pseudobam", no_argument, &pbam_flag, 1},
{"quant", no_argument, &quant_flag, 1},
+ {"bus", no_argument, &bus_flag, 1},
+ {"num", no_argument, 0, 'n'},
{"umi", no_argument, &umi_flag, 'u'},
{"batch", required_argument, 0, 'b'},
// short args
@@ -435,6 +531,10 @@ void ParseOptionsPseudo(int argc, char **argv, ProgramOptions& opt) {
stringstream(optarg) >> opt.sd;
break;
}
+ case 'n': {
+ opt.num = true;
+ break;
+ }
case 'o': {
opt.output = optarg;
break;
@@ -456,6 +556,14 @@ void ParseOptionsPseudo(int argc, char **argv, ProgramOptions& opt) {
opt.umi = true;
opt.single_end = true; // UMI implies single-end reads
}
+
+ if (bus_flag) {
+ if (!opt.num) {
+ opt.batch_bus_write = true;
+ } else {
+ opt.batch_bus = true;
+ }
+ }
// all other arguments are fast[a/q] files to be read
for (int i = optind; i < argc; i++) {
@@ -478,6 +586,16 @@ void ParseOptionsPseudo(int argc, char **argv, ProgramOptions& opt) {
if (strand_flag) {
opt.strand_specific = true;
}
+
+ if (strand_FR_flag) {
+ opt.strand_specific = true;
+ opt.strand = ProgramOptions::StrandType::FR;
+ }
+
+ if (strand_RF_flag) {
+ opt.strand_specific = true;
+ opt.strand = ProgramOptions::StrandType::RF;
+ }
if (pbam_flag) {
opt.pseudobam = true;
@@ -532,6 +650,8 @@ void ListSingleCellTechnologies() {
<< "10xv1 10x version 1 chemistry" << endl
<< "10xv2 10x version 2 chemistry" << endl
<< "10xv3 10x version 3 chemistry" << endl
+ << "Bulk Bulk RNA-seq or Smart-seq2 (multiplexed)" << endl
+ << "BDWTA BD Rhapsody WTA" << endl
<< "CELSeq CEL-Seq" << endl
<< "CELSeq2 CEL-Seq version 2" << endl
<< "DropSeq DropSeq" << endl
@@ -539,27 +659,40 @@ void ListSingleCellTechnologies() {
<< "inDropsv2 inDrops version 2 chemistry" << endl
<< "inDropsv3 inDrops version 3 chemistry" << endl
<< "SCRBSeq SCRB-Seq" << endl
+ << "SmartSeq3 Smart-seq3" << endl
+ << "SPLiT-seq SPLiT-seq" << endl
<< "SureCell SureCell for ddSEQ" << endl
+ << "Visium 10x Visium Spatial Transcriptomics" << endl
<< endl;
}
void ParseOptionsBus(int argc, char **argv, ProgramOptions& opt) {
int verbose_flag = 0;
int gbam_flag = 0;
+ int paired_end_flag = 0;
+ int strand_FR_flag = 0;
+ int strand_RF_flag = 0;
+ int unstranded_flag = 0;
- const char *opt_string = "i:o:x:t:lbng:c:";
+ const char *opt_string = "i:o:x:t:lbng:c:T:B:";
static struct option long_options[] = {
{"verbose", no_argument, &verbose_flag, 1},
{"index", required_argument, 0, 'i'},
{"output-dir", required_argument, 0, 'o'},
{"technology", required_argument, 0, 'x'},
{"list", no_argument, 0, 'l'},
+ {"batch", required_argument, 0, 'B'},
{"threads", required_argument, 0, 't'},
{"bam", no_argument, 0, 'b'},
{"num", no_argument, 0, 'n'},
{"genomebam", no_argument, &gbam_flag, 1},
{"gtf", required_argument, 0, 'g'},
{"chromosomes", required_argument, 0, 'c'},
+ {"tag", required_argument, 0, 'T'},
+ {"fr-stranded", no_argument, &strand_FR_flag, 1},
+ {"rf-stranded", no_argument, &strand_RF_flag, 1},
+ {"unstranded", no_argument, &unstranded_flag, 1},
+ {"paired", no_argument, &paired_end_flag, 1},
{0,0,0,0}
};
@@ -601,6 +734,11 @@ void ParseOptionsBus(int argc, char **argv, ProgramOptions& opt) {
opt.bam = true;
break;
}
+ case 'B': {
+ opt.batch_mode = true;
+ opt.batch_file_name = optarg;
+ break;
+ }
case 'n': {
opt.num = true;
break;
@@ -613,6 +751,10 @@ void ParseOptionsBus(int argc, char **argv, ProgramOptions& opt) {
stringstream(optarg) >> opt.chromFile;
break;
}
+ case 'T': {
+ stringstream(optarg) >> opt.tagsequence;
+ break;
+ }
default: break;
}
}
@@ -631,6 +773,26 @@ void ParseOptionsBus(int argc, char **argv, ProgramOptions& opt) {
opt.genomebam = true;
}
+ if (strand_FR_flag) {
+ opt.strand_specific = true;
+ opt.strand = ProgramOptions::StrandType::FR;
+ }
+
+ if (strand_RF_flag) {
+ opt.strand_specific = true;
+ opt.strand = ProgramOptions::StrandType::RF;
+ }
+
+ if (unstranded_flag) {
+ opt.strand_specific = true;
+ opt.strand = ProgramOptions::StrandType::None;
+ }
+
+ if (paired_end_flag) {
+ opt.single_end = false;
+ } else {
+ opt.single_end = true;
+ }
// all other arguments are fast[a/q] files to be read
for (int i = optind; i < argc; i++) {
@@ -726,11 +888,7 @@ bool ParseTechnology(const std::string &techstr, BUSOptions& busopt, std::vector
errorList.push_back("Error: empty UMI list " + umistr);
return false;
}
- if (v.size() != 1) {
- errorList.push_back("Error: only a single UMI list allow " + umistr);
- return false;
- }
- busopt.umi = std::move(v.front());
+ busopt.umi = std::move(v);
if (!convert_commas_to_vector(seqstr, v)) {
@@ -808,7 +966,7 @@ bool CheckOptionsBus(ProgramOptions& opt) {
}
// check files
- if (opt.files.size() == 0) {
+ if (opt.files.size() == 0 && !opt.batch_mode) {
cerr << ERROR_STR << " Missing read files" << endl;
ret = false;
} else {
@@ -853,67 +1011,264 @@ bool CheckOptionsBus(ProgramOptions& opt) {
<< ", but only " << n << " cores on the machine" << endl;
}
}
-
- if (opt.technology.empty()) {
- cerr << "Error: need to specify technology to use" << endl;
+ if (!opt.technology.empty() && opt.batch_mode) {
+ cerr << "Error: cannot specify both -x and --batch" << endl;
ret = false;
+ }
+
+ ProgramOptions::StrandType strand = ProgramOptions::StrandType::None;
+
+ if (opt.technology.empty()) { // kallisto pseudo
+ if (!opt.num) {
+ opt.batch_bus_write = true;
+ } else {
+ opt.batch_bus = true;
+ }
+ // check for read files
+ if (!opt.batch_mode) {
+ cerr << "[bus] no technology specified; will try running read files as-is" << endl;
+ opt.batch_mode = true;
+ if (!opt.single_end && opt.files.size() % 2 != 0) {
+ cerr << "Error: paired-end mode requires an even number of input files" << endl;
+ ret = false;
+ } else {
+ int i = 0;
+ int sample_id = 0;
+ while (i < opt.files.size()) {
+ opt.batch_ids.push_back("batch" + std::to_string(sample_id));
+ std::string f1,f2;
+ struct stat stFileInfo;
+ f1 = opt.files[i];
+ if (opt.single_end) {
+ opt.batch_files.push_back({f1});
+ auto intstat = stat(f1.c_str(), &stFileInfo);
+ if (intstat != 0) {
+ cerr << ERROR_STR << " file not found " << f1 << endl;
+ ret = false;
+ }
+ } else {
+ i++;
+ f2 = opt.files[i];
+ opt.batch_files.push_back({f1,f2});
+ auto intstat = stat(f1.c_str(), &stFileInfo);
+ if (intstat != 0) {
+ cerr << ERROR_STR << " file not found " << f1 << endl;
+ ret = false;
+ }
+ intstat = stat(f2.c_str(), &stFileInfo);
+ if (intstat != 0) {
+ cerr << ERROR_STR << " file not found " << f2 << endl;
+ ret = false;
+ }
+ }
+ i++;
+ sample_id++;
+ }
+ if (sample_id == 1) {
+ opt.pseudo_read_files_supplied = true; // Only one sample supplied; use SR instead of batchSR to read
+ }
+ }
+ } else if (ret) {
+ cerr << "[bus] no technology specified; will try running read files supplied in batch file" << endl;
+ if (!opt.single_end) {
+ cerr << "[bus] --paired ignored; single/paired-end is inferred from number of files supplied" << endl;
+ }
+ if (opt.files.size() != 0) {
+ cerr << ERROR_STR << " cannot specify batch mode and supply read files" << endl;
+ ret = false;
+ } else {
+ // check for batch files
+ if (opt.batch_mode) {
+ struct stat stFileInfo;
+ auto intstat = stat(opt.batch_file_name.c_str(), &stFileInfo);
+ if (intstat != 0) {
+ cerr << ERROR_STR << " file not found " << opt.batch_file_name << endl;
+ ret = false;
+ }
+ // open the file, parse and fill the batch_files values
+ std::ifstream bfile(opt.batch_file_name);
+ std::string line;
+ std::string id,f1,f2;
+ bool read_first_batch_file_line = false;
+ while (std::getline(bfile,line)) {
+ if (line.size() == 0) {
+ continue;
+ }
+ std::stringstream ss(line);
+ ss >> id;
+ if (id[0] == '#') {
+ continue;
+ }
+ opt.batch_ids.push_back(id);
+ ss >> f1 >> f2;
+ if (!read_first_batch_file_line) {
+ if (f2.empty()) {
+ opt.single_end = true;
+ } else {
+ opt.single_end = false;
+ }
+ read_first_batch_file_line = true;
+ }
+ if (opt.single_end) {
+ opt.batch_files.push_back({f1});
+ intstat = stat(f1.c_str(), &stFileInfo);
+ if (intstat != 0) {
+ cerr << ERROR_STR << " file not found " << f1 << endl;
+ ret = false;
+ }
+ if (!f2.empty()) {
+ cerr << ERROR_STR << " batch file malformatted" << endl;
+ ret = false;
+ break;
+ }
+ } else {
+ opt.batch_files.push_back({f1,f2});
+ intstat = stat(f1.c_str(), &stFileInfo);
+ if (intstat != 0) {
+ cerr << ERROR_STR << " file not found " << f1 << endl;
+ ret = false;
+ }
+ if (f2.empty()) {
+ cerr << ERROR_STR << " batch file malformatted" << endl;
+ ret = false;
+ break;
+ }
+ intstat = stat(f2.c_str(), &stFileInfo);
+ if (intstat != 0) {
+ cerr << ERROR_STR << " file not found " << f2 << endl;
+ ret = false;
+ }
+ }
+ f1.clear();
+ f2.clear();
+ }
+ }
+ }
+ }
+ if (opt.pseudobam) {
+ cerr << "Error: Pseudobam not supported yet in this mode" << endl;
+ ret = false;
+ }
+ if (opt.bam) {
+ cerr << "Error: --bam not supported in this mode" << endl;
+ ret = false;
+ }
+ if (!opt.tagsequence.empty()) {
+ cerr << "Error: --tag not supported in this mode" << endl;
+ ret = false;
+ }
+ if (opt.strand_specific && opt.strand == ProgramOptions::StrandType::None) {
+ opt.strand_specific = false;
+ }
+ if (ret) { // Set up BUS options
+ auto& busopt = opt.busOptions;
+ busopt.seq.push_back(BUSOptionSubstr(0,0,0));
+ busopt.umi.resize(0);
+ busopt.bc.resize(0);
+ if (opt.single_end) {
+ busopt.nfiles = 1;
+ busopt.paired = false;
+ } else {
+ busopt.nfiles = 2;
+ busopt.seq.push_back(BUSOptionSubstr(1,0,0));
+ busopt.paired = true;
+ }
+ busopt.umi.push_back(BUSOptionSubstr(-1,-1,-1));
+ }
+ return ret;
} else {
auto& busopt = opt.busOptions;
if (opt.bam) { // Note: only 10xV2 has been tested
busopt.nfiles = 1;
+ busopt.paired = false;
if (opt.technology == "10XV2") {
busopt.seq.push_back(BUSOptionSubstr(1,0,0)); // second file, entire string
- busopt.umi = BUSOptionSubstr(0,16,26); // first file [16:26]
+ busopt.umi.push_back(BUSOptionSubstr(0,16,26)); // first file [16:26]
busopt.bc.push_back(BUSOptionSubstr(0,0,16));
} else if (opt.technology == "10XV3") {
busopt.seq.push_back(BUSOptionSubstr(1,0,0));
- busopt.umi = BUSOptionSubstr(0,16,28);
+ busopt.umi.push_back(BUSOptionSubstr(0,16,28));
busopt.bc.push_back(BUSOptionSubstr(0,0,16));
// } else if (opt.technology == "10XV1") {
} else if (opt.technology == "SURECELL") {
busopt.seq.push_back(BUSOptionSubstr(1,0,0));
- busopt.umi = BUSOptionSubstr(0,18,26);
+ busopt.umi.push_back(BUSOptionSubstr(0,18,26));
busopt.bc.push_back(BUSOptionSubstr(0,0,18));
} else if (opt.technology == "DROPSEQ") {
busopt.seq.push_back(BUSOptionSubstr(1,0,0));
- busopt.umi = BUSOptionSubstr(0,12,20);
+ busopt.umi.push_back(BUSOptionSubstr(0,12,20));
busopt.bc.push_back(BUSOptionSubstr(0,0,12));
} else if (opt.technology == "INDROPSV1") {
busopt.nfiles = 2;
busopt.seq.push_back(BUSOptionSubstr(1,0,0));
- busopt.umi = BUSOptionSubstr(0,42,48);
+ busopt.umi.push_back(BUSOptionSubstr(0,42,48));
busopt.bc.push_back(BUSOptionSubstr(0,0,11));
busopt.bc.push_back(BUSOptionSubstr(0,30,38));
} else if (opt.technology == "INDROPSV2") {
busopt.nfiles = 2;
busopt.seq.push_back(BUSOptionSubstr(0,0,0));
- busopt.umi = BUSOptionSubstr(1,42,48);
+ busopt.umi.push_back(BUSOptionSubstr(1,42,48));
busopt.bc.push_back(BUSOptionSubstr(1,0,11));
busopt.bc.push_back(BUSOptionSubstr(1,30,38));
} else if (opt.technology == "INDROPSV3") {
busopt.nfiles = 3;
busopt.seq.push_back(BUSOptionSubstr(2,0,0));
- busopt.umi = BUSOptionSubstr(1,8,14);
+ busopt.umi.push_back(BUSOptionSubstr(1,8,14));
busopt.bc.push_back(BUSOptionSubstr(0,0,8));
busopt.bc.push_back(BUSOptionSubstr(1,0,8));
} else if (opt.technology == "CELSEQ") {
busopt.seq.push_back(BUSOptionSubstr(1,0,0));
- busopt.umi = BUSOptionSubstr(0,8,12);
+ busopt.umi.push_back(BUSOptionSubstr(0,8,12));
busopt.bc.push_back(BUSOptionSubstr(0,0,8));
} else if (opt.technology == "CELSEQ2") {
busopt.seq.push_back(BUSOptionSubstr(1,0,0));
- busopt.umi = BUSOptionSubstr(0,0,6);
+ busopt.umi.push_back(BUSOptionSubstr(0,0,6));
busopt.bc.push_back(BUSOptionSubstr(0,6,12));
+ } else if (opt.technology == "SPLIT-SEQ") {
+ busopt.nfiles = 2;
+ busopt.seq.push_back(BUSOptionSubstr(0,0,0));
+ busopt.umi.push_back(BUSOptionSubstr(1,0,10));
+ busopt.bc.push_back(BUSOptionSubstr(1,10,18));
+ busopt.bc.push_back(BUSOptionSubstr(1,48,56));
+ busopt.bc.push_back(BUSOptionSubstr(1,78,86));
} else if (opt.technology == "SCRBSEQ") {
busopt.seq.push_back(BUSOptionSubstr(1,0,0));
- busopt.umi = BUSOptionSubstr(0,6,16);
+ busopt.umi.push_back(BUSOptionSubstr(0,6,16));
busopt.bc.push_back(BUSOptionSubstr(0,0,6));
} else if (opt.technology == "INDROPSV3") {
busopt.seq.push_back(BUSOptionSubstr(1,0,0));
- busopt.umi = BUSOptionSubstr(0,0,6);
+ busopt.umi.push_back(BUSOptionSubstr(0,0,6));
busopt.bc.push_back(BUSOptionSubstr(0,6,16));
+ } else if (opt.technology == "SMARTSEQ3") {
+ busopt.nfiles = 4;
+ busopt.seq.push_back(BUSOptionSubstr(2,22,0));
+ busopt.seq.push_back(BUSOptionSubstr(3,0,0));
+ busopt.umi.push_back(BUSOptionSubstr(2,0,19));
+ busopt.bc.push_back(BUSOptionSubstr(0,0,0));
+ busopt.bc.push_back(BUSOptionSubstr(1,0,0));
+ busopt.paired = true;
+ } else if (opt.technology == "BULK") {
+ busopt.nfiles = 3;
+ busopt.seq.push_back(BUSOptionSubstr(2,0,0));
+ busopt.umi.push_back(BUSOptionSubstr(-1,-1,-1));
+ busopt.bc.push_back(BUSOptionSubstr(0,0,0));
+ busopt.bc.push_back(BUSOptionSubstr(1,0,0));
+ if (!opt.single_end) {
+ busopt.nfiles++;
+ busopt.seq.push_back(BUSOptionSubstr(3,0,0));
+ busopt.paired = true;
+ }
+ } else if (opt.technology == "BDWTA") {
+ busopt.nfiles = 2;
+ busopt.bc.push_back(BUSOptionSubstr(0, 0, 9)); // bc1 CLS1
+ // busopt.bc.push_back(BUSOptionSubstr(0,9,9+12)); // linker
+ busopt.bc.push_back(BUSOptionSubstr(0, 9 + 12, 9 + 12 + 9)); // bc2 CLS2
+ // busopt.bc.push_back(BUSOptionSubstr(0,9+12+9,9+12+9+13)); // linker
+ busopt.bc.push_back(BUSOptionSubstr(0, 9 + 12 + 9 + 13, 9 + 12 + 9 + 13 + 9)); // bc3 CLS3
+ busopt.umi.push_back(BUSOptionSubstr(0, 9 + 12 + 9 + 13 + 9, 9 + 12 + 9 + 13 + 9 + 8)); // umi
+ busopt.seq.push_back(BUSOptionSubstr(1, 0, 0));
} else {
vector<int> files;
vector<BUSOptionSubstr> values;
@@ -922,13 +1277,17 @@ bool CheckOptionsBus(ProgramOptions& opt) {
//bool invalid = ParseTechnology(opt.technology, values, files, errorList, bcValues);
bool valid = ParseTechnology(opt.technology, busopt, errorList);
+ if (busopt.seq.size() == 2 && !opt.single_end) {
+ busopt.paired = true;
+ }
+
if(!valid) {
/*
busopt.nfiles = files.size();
for(int i = 0; i < bcValues.size(); i++) {
busopt.bc.push_back(bcValues[i]);
}
- busopt.umi = values[0];
+ busopt.umi.push_back(values[0]);
busopt.seq.push_back(values[1]);
*/
} else {
@@ -940,72 +1299,116 @@ bool CheckOptionsBus(ProgramOptions& opt) {
}
}
} else {
+ busopt.paired = false;
if (opt.technology == "10XV2") {
busopt.nfiles = 2;
busopt.seq.push_back(BUSOptionSubstr(1,0,0)); // second file, entire string
- busopt.umi = BUSOptionSubstr(0,16,26); // first file [16:26]
+ busopt.umi.push_back(BUSOptionSubstr(0,16,26)); // first file [16:26]
busopt.bc.push_back(BUSOptionSubstr(0,0,16));
+ strand = ProgramOptions::StrandType::FR;
} else if (opt.technology == "10XV3") {
busopt.nfiles = 2;
busopt.seq.push_back(BUSOptionSubstr(1,0,0));
- busopt.umi = BUSOptionSubstr(0,16,28);
+ busopt.umi.push_back(BUSOptionSubstr(0,16,28));
busopt.bc.push_back(BUSOptionSubstr(0,0,16));
+ strand = ProgramOptions::StrandType::FR;
+ } else if (opt.technology == "VISIUM") {
+ busopt.nfiles = 2;
+ busopt.seq.push_back(BUSOptionSubstr(1,0,0));
+ busopt.umi.push_back(BUSOptionSubstr(0,16,28));
+ busopt.bc.push_back(BUSOptionSubstr(0,0,16));
+ strand = ProgramOptions::StrandType::FR;
} else if (opt.technology == "10XV1") {
busopt.nfiles = 3;
busopt.seq.push_back(BUSOptionSubstr(2,0,0));
- busopt.umi = BUSOptionSubstr(1,0,10);
+ busopt.umi.push_back(BUSOptionSubstr(1,0,10));
busopt.bc.push_back(BUSOptionSubstr(0,0,14));
+ strand = ProgramOptions::StrandType::FR;
} else if (opt.technology == "SURECELL") {
busopt.nfiles = 2;
busopt.seq.push_back(BUSOptionSubstr(1,0,0));
- busopt.umi = BUSOptionSubstr(0,51,59);
+ busopt.umi.push_back(BUSOptionSubstr(0,51,59));
busopt.bc.push_back(BUSOptionSubstr(0,0,6));
busopt.bc.push_back(BUSOptionSubstr(0,21,27));
busopt.bc.push_back(BUSOptionSubstr(0,42,48));
+ strand = ProgramOptions::StrandType::FR;
} else if (opt.technology == "DROPSEQ") {
busopt.nfiles = 2;
busopt.seq.push_back(BUSOptionSubstr(1,0,0));
- busopt.umi = BUSOptionSubstr(0,12,20);
+ busopt.umi.push_back(BUSOptionSubstr(0,12,20));
busopt.bc.push_back(BUSOptionSubstr(0,0,12));
} else if (opt.technology == "INDROPSV1") {
busopt.nfiles = 2;
busopt.seq.push_back(BUSOptionSubstr(1,0,0));
- busopt.umi = BUSOptionSubstr(0,42,48);
+ busopt.umi.push_back(BUSOptionSubstr(0,42,48));
busopt.bc.push_back(BUSOptionSubstr(0,0,11));
- busopt.bc.push_back(BUSOptionSubstr(0,30,38));
+ busopt.bc.push_back(BUSOptionSubstr(0,30,38));
} else if (opt.technology == "INDROPSV2") {
busopt.nfiles = 2;
busopt.seq.push_back(BUSOptionSubstr(0,0,0));
- busopt.umi = BUSOptionSubstr(1,42,48);
+ busopt.umi.push_back(BUSOptionSubstr(1,42,48));
busopt.bc.push_back(BUSOptionSubstr(1,0,11));
- busopt.bc.push_back(BUSOptionSubstr(1,30,38));
+ busopt.bc.push_back(BUSOptionSubstr(1,30,38));
} else if (opt.technology == "INDROPSV3") {
busopt.nfiles = 3;
busopt.seq.push_back(BUSOptionSubstr(2,0,0));
- busopt.umi = BUSOptionSubstr(1,8,14);
+ busopt.umi.push_back(BUSOptionSubstr(1,8,14));
busopt.bc.push_back(BUSOptionSubstr(0,0,8));
busopt.bc.push_back(BUSOptionSubstr(1,0,8));
} else if (opt.technology == "CELSEQ") {
busopt.nfiles = 2;
busopt.seq.push_back(BUSOptionSubstr(1,0,0));
- busopt.umi = BUSOptionSubstr(0,8,12);
+ busopt.umi.push_back(BUSOptionSubstr(0,8,12));
busopt.bc.push_back(BUSOptionSubstr(0,0,8));
+ strand = ProgramOptions::StrandType::FR;
} else if (opt.technology == "CELSEQ2") {
busopt.nfiles = 2;
busopt.seq.push_back(BUSOptionSubstr(1,0,0));
- busopt.umi = BUSOptionSubstr(0,0,6);
+ busopt.umi.push_back(BUSOptionSubstr(0,0,6));
busopt.bc.push_back(BUSOptionSubstr(0,6,12));
+ strand = ProgramOptions::StrandType::FR;
+ } else if (opt.technology == "SPLIT-SEQ"){
+ busopt.nfiles = 2;
+ busopt.seq.push_back(BUSOptionSubstr(0,0,0));
+ busopt.umi.push_back(BUSOptionSubstr(1,0,10));
+ busopt.bc.push_back(BUSOptionSubstr(1,10,18));
+ busopt.bc.push_back(BUSOptionSubstr(1,48,56));
+ busopt.bc.push_back(BUSOptionSubstr(1,78,86));
} else if (opt.technology == "SCRBSEQ") {
busopt.nfiles = 2;
busopt.seq.push_back(BUSOptionSubstr(1,0,0));
- busopt.umi = BUSOptionSubstr(0,6,16);
+ busopt.umi.push_back(BUSOptionSubstr(0,6,16));
busopt.bc.push_back(BUSOptionSubstr(0,0,6));
- } else if (opt.technology == "INDROPSV3") {
+ } else if (opt.technology == "SMARTSEQ3") {
+ busopt.nfiles = 4;
+ busopt.seq.push_back(BUSOptionSubstr(2,22,0));
+ busopt.seq.push_back(BUSOptionSubstr(3,0,0));
+ busopt.umi.push_back(BUSOptionSubstr(2,0,19));
+ busopt.bc.push_back(BUSOptionSubstr(0,0,0));
+ busopt.bc.push_back(BUSOptionSubstr(1,0,0));
+ busopt.paired = true;
+ strand = ProgramOptions::StrandType::FR;
+ } else if (opt.technology == "BULK") {
busopt.nfiles = 3;
busopt.seq.push_back(BUSOptionSubstr(2,0,0));
- busopt.umi = BUSOptionSubstr(1,8,14);
- busopt.bc.push_back(BUSOptionSubstr(0,0,8));
- busopt.bc.push_back(BUSOptionSubstr(1,0,8));
+ busopt.umi.push_back(BUSOptionSubstr(-1,-1,-1));
+ busopt.bc.push_back(BUSOptionSubstr(0,0,0));
+ busopt.bc.push_back(BUSOptionSubstr(1,0,0));
+ if (!opt.single_end) {
+ busopt.nfiles++;
+ busopt.seq.push_back(BUSOptionSubstr(3,0,0));
+ busopt.paired = true;
+ }
+ } else if (opt.technology == "BDWTA") {
+ busopt.nfiles = 2;
+ busopt.bc.push_back(BUSOptionSubstr(0, 0, 9)); // bc1 CLS1
+ // busopt.bc.push_back(BUSOptionSubstr(0,9,9+12)); // linker
+ busopt.bc.push_back(BUSOptionSubstr(0, 9 + 12, 9 + 12 + 9)); // bc2 CLS2
+ // busopt.bc.push_back(BUSOptionSubstr(0,9+12+9,9+12+9+13)); // linker
+ busopt.bc.push_back(BUSOptionSubstr(0, 9 + 12 + 9 + 13, 9 + 12 + 9 + 13 + 9)); // bc3 CLS3
+ busopt.umi.push_back(BUSOptionSubstr(0, 9 + 12 + 9 + 13 + 9, 9 + 12 + 9 + 13 + 9 + 8)); // umi
+ busopt.seq.push_back(BUSOptionSubstr(1, 0, 0));
+ strand = ProgramOptions::StrandType::FR;
} else {
vector<int> files;
vector<BUSOptionSubstr> values;
@@ -1014,6 +1417,9 @@ bool CheckOptionsBus(ProgramOptions& opt) {
//bool invalid = ParseTechnology(opt.technology, values, files, errorList, bcValues);
bool valid = ParseTechnology(opt.technology, busopt, errorList);
+ if (busopt.seq.size() == 2 && !opt.single_end) {
+ busopt.paired = true;
+ }
if(valid) {
/*
@@ -1021,7 +1427,7 @@ bool CheckOptionsBus(ProgramOptions& opt) {
for(int i = 0; i < bcValues.size(); i++) {
busopt.bc.push_back(bcValues[i]);
}
- busopt.umi = values[0];
+ busopt.umi.push_back(values[0]);
busopt.seq.push_back(values[1]);
*/
} else {
@@ -1034,10 +1440,50 @@ bool CheckOptionsBus(ProgramOptions& opt) {
}
}
}
+
+ if (opt.tagsequence.empty() && (opt.technology == "SMARTSEQ3")) {
+ opt.tagsequence = "ATTGCGCAATG";
+ cerr << "[bus] Using " << opt.tagsequence << " as UMI tag sequence" << endl;
+ }
+
+ if (opt.strand_specific && opt.strand == ProgramOptions::StrandType::None) {
+ opt.strand_specific = false; // User specified --unstranded
+ } else if (!opt.strand_specific && ret) {
+ opt.strand = strand;
+ if (opt.strand == ProgramOptions::StrandType::FR) {
+ cerr << "[bus] Note: Strand option was not specified; setting it to --fr-stranded for specified technology" << endl;
+ opt.strand_specific = true;
+ } else if (opt.strand == ProgramOptions::StrandType::RF) {
+ cerr << "[bus] Note: Strand option was not specified; setting it to --rf-stranded for specified technology" << endl;
+ opt.strand_specific = true;
+ } else {
+ cerr << "[bus] Note: Strand option was not specified; setting it to --unstranded for specified technology" << endl;
+ }
+ }
+
+ if (!opt.tagsequence.empty()) {
+ opt.busOptions.umi[0].start += opt.tagsequence.length();
+ if (opt.busOptions.umi[0].start >= opt.busOptions.umi[0].stop) {
+ cerr << "Error: Tag sequence must be shorter than UMI sequence" << endl;
+ ret = false;
+ }
+ }
+
+ if (!opt.single_end && !opt.busOptions.paired) {
+ cerr << "Error: Paired reads are not compatible with the specified technology" << endl;
+ ret = false;
+ }
+
+ if (opt.strand_specific) {
+ if (opt.busOptions.seq.size() != 1 && !(opt.busOptions.seq.size() == 2 && opt.busOptions.paired)) {
+ cerr << "Error: Strand-specific read processing is only supported for technologies with a single CDNA read file or paired-end reads" << endl;
+ ret = false;
+ }
+ }
if (opt.genomebam) {
- if (opt.busOptions.seq.size() != 1) {
- cerr << "Error: BAM output is currently only supported for technologies with a single CDNA read file" << endl;
+ if (opt.busOptions.seq.size() != 1 && !(opt.busOptions.seq.size() == 2 && opt.busOptions.paired)) {
+ cerr << "Error: BAM output is currently only supported for technologies with a single CDNA read file or paired-end reads" << endl;
ret = false;
}
if (!opt.gtfFile.empty()) {
@@ -1285,24 +1731,179 @@ bool CheckOptionsEM(ProgramOptions& opt, bool emonly = false) {
//ret = false;
}
}
-
+
+ if (opt.bootstrap < 0) {
+ cerr << "Error: number of bootstrap samples must be a non-negative integer." << endl;
+ ret = false;
+ }
+
+ #ifndef USE_HDF5
+ if (opt.bootstrap > 0 && !opt.plaintext) {
+ cerr << "Warning: kallisto was not compiled with HDF5 support so no bootstrapping" << endl
+ << "will be performed. Run quant with --plaintext option or recompile with" << endl
+ << "HDF5 support to obtain bootstrap estimates." << endl;
+ opt.bootstrap = 0;
+ }
+ #endif
+ return ret;
+}
+
+bool CheckOptionsTCCQuant(ProgramOptions& opt) {
+
+ bool ret = true;
+
+ cerr << endl;
+ // check for index
+ if (opt.index.empty() && opt.transcriptsFile.empty()) {
+ cerr << ERROR_STR << " either a kallisto index file or a transcripts file need to be supplied" << endl;
+ ret = false;
+ } else if (!opt.index.empty() && !opt.transcriptsFile.empty()) {
+ cerr << ERROR_STR << " cannot supply both a kallisto index file and a transcripts file" << endl;
+ ret = false;
+ } else if (!opt.index.empty()) {
+ struct stat stFileInfo;
+ auto intStat = stat(opt.index.c_str(), &stFileInfo);
+ if (intStat != 0) {
+ cerr << ERROR_STR << " kallisto index file not found " << opt.index << endl;
+ ret = false;
+ }
+ } else if (!opt.transcriptsFile.empty()) {
+ struct stat stFileInfo;
+ auto intStat = stat(opt.transcriptsFile.c_str(), &stFileInfo);
+ if (intStat != 0) {
+ cerr << ERROR_STR << " transcripts file not found " << opt.transcriptsFile << endl;
+ ret = false;
+ }
+ }
+
+ if (opt.tccFile.empty()) {
+ cerr << ERROR_STR << " transcript-compatibility counts file missing" << endl;
+ ret = false;
+ } else {
+ struct stat stFileInfo;
+ auto intStat = stat(opt.tccFile.c_str(), &stFileInfo);
+ if (intStat != 0) {
+ cerr << ERROR_STR << " transcript-compatibility counts file not found " << opt.tccFile << endl;
+ ret = false;
+ }
+ }
+
+ if (opt.ecFile.empty() && !opt.transcriptsFile.empty()) {
+ cerr << ERROR_STR << " equivalence class file must be supplied if transcripts file is supplied " << opt.tccFile << endl;
+ ret = false;
+ }
+
+ if (!opt.ecFile.empty()) {
+ struct stat stFileInfo;
+ auto intStat = stat(opt.ecFile.c_str(), &stFileInfo);
+ if (intStat != 0) {
+ cerr << ERROR_STR << " equivalence class file not found " << opt.ecFile << endl;
+ ret = false;
+ }
+ }
+
+ if (!opt.fldFile.empty()) {
+ struct stat stFileInfo;
+ auto intStat = stat(opt.fldFile.c_str(), &stFileInfo);
+ if (intStat != 0) {
+ cerr << ERROR_STR << " fragment length distribution file not found " << opt.fldFile << endl;
+ ret = false;
+ }
+ }
+
+ if (!opt.genemap.empty() && !opt.gtfFile.empty()) {
+ cerr << ERROR_STR << " Cannot supply both --genemap and --gtf" << endl;
+ ret = false;
+ }
+
+ if (!opt.gtfFile.empty()) {
+ struct stat stFileInfo;
+ auto intStat = stat(opt.gtfFile.c_str(), &stFileInfo);
+ if (intStat != 0) {
+ cerr << ERROR_STR << " GTF file not found " << opt.gtfFile << endl;
+ ret = false;
+ }
+ }
+
+ if (!opt.genemap.empty()) {
+ struct stat stFileInfo;
+ auto intStat = stat(opt.genemap.c_str(), &stFileInfo);
+ if (intStat != 0) {
+ cerr << ERROR_STR << " file for mapping transcripts to genes not found " << opt.genemap << endl;
+ ret = false;
+ }
+ }
+
+ if ((opt.fld != 0.0 || opt.sd != 0.0) && !opt.fldFile.empty()) {
+ cerr << ERROR_STR <<" cannot supply mean or sd while also supplying a fragment length distribution file" << endl;
+ ret = false;
+ }
+
+ if ((opt.fld != 0.0 && opt.sd == 0.0) || (opt.sd != 0.0 && opt.fld == 0.0)) {
+ cerr << ERROR_STR << " cannot supply mean/sd without supplying both -l and -s" << endl;
+ ret = false;
+ }
+
+ if (opt.index.empty() && (!opt.fldFile.empty() || opt.fld != 0.0 || opt.sd != 0.0)) {
+ cerr << ERROR_STR << " cannot supply fragment length information unless a kallisto index is provided" << endl;
+ ret = false;
+ }
+
+ if (ret && opt.fld > 0.0 && opt.sd > 0.0) {
+ cerr << "[tcc] fragment length distribution is truncated gaussian with mean = " <<
+ opt.fld << ", sd = " << opt.sd << endl;
+ }
+
+ if (opt.fld < 0.0) {
+ cerr << ERROR_STR << " invalid value for mean fragment length " << opt.fld << endl;
+ ret = false;
+ }
+
+ if (opt.sd < 0.0) {
+ cerr << ERROR_STR << " invalid value for fragment length standard deviation " << opt.sd << endl;
+ ret = false;
+ }
+
+ if (opt.output.empty()) {
+ cerr << ERROR_STR << " need to specify output directory " << opt.output << endl;
+ ret = false;
+ } else {
+ struct stat stFileInfo;
+ auto intStat = stat(opt.output.c_str(), &stFileInfo);
+ if (intStat == 0) {
+ // file/dir exits
+ if (!S_ISDIR(stFileInfo.st_mode)) {
+ cerr << ERROR_STR << " file " << opt.output << " exists and is not a directory" << endl;
+ ret = false;
+ }
+ } else {
+ // create directory
+ if (my_mkdir(opt.output.c_str(), 0777) == -1) {
+ cerr << ERROR_STR << " could not create directory " << opt.output << endl;
+ ret = false;
+ }
+ }
+ }
+
+ if (opt.threads <= 0) {
+ cerr << ERROR_STR << " invalid number of threads " << opt.threads << endl;
+ ret = false;
+ } else {
+ unsigned int n = std::thread::hardware_concurrency();
+ if (n != 0 && n < opt.threads) {
+ cerr << "Warning: you asked for " << opt.threads
+ << ", but only " << n << " cores on the machine" << endl;
+ }
+ }
+
if (opt.bootstrap < 0) {
cerr << "Error: number of bootstrap samples must be a non-negative integer." << endl;
ret = false;
}
- #ifndef USE_HDF5
- if (opt.bootstrap > 0 && !opt.plaintext) {
- cerr << "Warning: kallisto was not compiled with HDF5 support so no bootstrapping" << endl
- << "will be performed. Run quant with --plaintext option or recompile with" << endl
- << "HDF5 support to obtain bootstrap estimates." << endl;
- opt.bootstrap = 0;
- }
- #endif
- return ret;
+ return ret;
}
-
bool CheckOptionsMerge(ProgramOptions& opt) {
bool ret = true;
@@ -1404,12 +2005,24 @@ bool CheckOptionsPseudo(ProgramOptions& opt) {
ret = false;
}
}
+
+ assert(!(opt.batch_bus && opt.batch_bus_write));
+ bool opt_bus_mode = opt.batch_bus || opt.batch_bus_write;
if (opt.pseudo_quant) {
if (!opt.batch_mode) {
cerr << ERROR_STR << " --quant can only be run with in batch mode with --batch" << endl;
ret = false;
}
+ if (opt_bus_mode) {
+ cerr << ERROR_STR << " --quant cannot be run with --bus" << endl;
+ ret = false;
+ }
+ }
+
+ if (opt_bus_mode && opt.umi) {
+ cerr << ERROR_STR << " UMI cannot be run with --bus" << endl;
+ ret = false;
}
// check for read files
@@ -1418,8 +2031,45 @@ bool CheckOptionsPseudo(ProgramOptions& opt) {
cerr << ERROR_STR << " UMI must be run in batch mode, use --batch option" << endl;
ret = false;
}
-
- if (opt.files.size() == 0) {
+ if (opt_bus_mode && ret) {
+ cerr << "--bus specified; will try running read files in batch mode" << endl;
+ opt.batch_ids.push_back("sample");
+ opt.batch_mode = true;
+ opt.pseudo_read_files_supplied = true;
+ if (opt.files.size() != 1 && opt.files.size() != 2) {
+ cerr << ERROR_STR << " A minimum of one and a maximum of two read files must be provided" << endl;
+ ret = false;
+ } else if (opt.single_end && opt.files.size() != 1) {
+ cerr << ERROR_STR << " single-end mode requires exactly one read file" << endl;
+ ret = false;
+ } else {
+ std::string f1,f2;
+ struct stat stFileInfo;
+ if (opt.files.size() == 1) {
+ f1 = opt.files[0];
+ opt.batch_files.push_back({f1});
+ auto intstat = stat(f1.c_str(), &stFileInfo);
+ if (intstat != 0) {
+ cerr << ERROR_STR << " file not found " << f1 << endl;
+ ret = false;
+ }
+ } else {
+ f1 = opt.files[0];
+ f2 = opt.files[1];
+ opt.batch_files.push_back({f1,f2});
+ auto intstat = stat(f1.c_str(), &stFileInfo);
+ if (intstat != 0) {
+ cerr << ERROR_STR << " file not found " << f1 << endl;
+ ret = false;
+ }
+ intstat = stat(f2.c_str(), &stFileInfo);
+ if (intstat != 0) {
+ cerr << ERROR_STR << " file not found " << f2 << endl;
+ ret = false;
+ }
+ }
+ }
+ } else if (opt.files.size() == 0) {
cerr << ERROR_STR << " Missing read files" << endl;
ret = false;
} else {
@@ -1511,6 +2161,12 @@ bool CheckOptionsPseudo(ProgramOptions& opt) {
if (opt.fld != 0.0 || opt.sd != 0.0) {
cerr << "[~warn] you supplied fragment length information for UMI data which will be ignored" << endl;
}
+ } else if (opt_bus_mode) {
+ if (opt.fld != 0.0 || opt.sd != 0.0) {
+ cerr << "[~warn] you supplied fragment length information for --bus mode which will be ignored" << endl;
+ }
+ opt.fld = 0.0;
+ opt.sd = 0.0;
} else {
if ((opt.fld != 0.0 && opt.sd == 0.0) || (opt.sd != 0.0 && opt.fld == 0.0)) {
cerr << "Error: cannot supply mean/sd without supplying both -l and -s" << endl;
@@ -1585,6 +2241,25 @@ bool CheckOptionsPseudo(ProgramOptions& opt) {
cerr << "Error: Pseudobam not supported yet in pseudo mode, use quant mode to obtain BAM files" << endl;
ret = false;
}
+
+ if (!opt_bus_mode && opt.num) {
+ cerr << "Warning: --bus option was not used, so --num option will be ignored" << endl;
+ }
+
+ if (opt_bus_mode && ret) { // Set up BUS options
+ auto& busopt = opt.busOptions;
+ busopt.seq.push_back(BUSOptionSubstr(0,0,0));
+ busopt.umi.resize(0);
+ busopt.bc.resize(0);
+ if (opt.single_end) {
+ busopt.nfiles = 1;
+ busopt.paired = false;
+ } else {
+ busopt.nfiles = 2;
+ busopt.seq.push_back(BUSOptionSubstr(1,0,0));
+ busopt.paired = true;
+ }
+ }
return ret;
}
@@ -1701,8 +2376,8 @@ void usage() {
<< "Where <CMD> can be one of:" << endl << endl
<< " index Builds a kallisto index "<< endl
<< " quant Runs the quantification algorithm " << endl
+ << " quant-tcc Runs quantification on transcript-compatibility counts" << endl
<< " bus Generate BUS files for single-cell data " << endl
- << " pseudo Runs the pseudoalignment step " << endl
<< " merge Merges several batch runs " << endl
<< " h5dump Converts HDF5-formatted results to plaintext" << endl
<< " inspect Inspects and gives information about an index" << endl
@@ -1718,13 +2393,24 @@ void usageBus() {
<< "Required arguments:" << endl
<< "-i, --index=STRING Filename for the kallisto index to be used for" << endl
<< " pseudoalignment" << endl
- << "-o, --output-dir=STRING Directory to write output to" << endl
- << "-x, --technology=STRING Single-cell technology used " << endl << endl
+ << "-o, --output-dir=STRING Directory to write output to" << endl << endl
<< "Optional arguments:" << endl
+ << "-x, --technology=STRING Single-cell technology used " << endl
<< "-l, --list List all single-cell technologies supported" << endl
+ << "-B, --batch=FILE Process files listed in FILE" << endl
<< "-t, --threads=INT Number of threads to use (default: 1)" << endl
<< "-b, --bam Input file is a BAM file" << endl
<< "-n, --num Output number of read in flag column (incompatible with --bam)" << endl
+ << "-T, --tag=STRING 5′ tag sequence to identify UMI reads for certain technologies" << endl
+ << " --fr-stranded Strand specific reads for UMI-tagged reads, first read forward" << endl
+ << " --rf-stranded Strand specific reads for UMI-tagged reads, first read reverse" << endl
+ << " --unstranded Treat all read as non-strand-specific" << endl
+ << " --paired Treat reads as paired" << endl
+ << " --genomebam Project pseudoalignments to genome sorted BAM file" << endl
+ << "-g, --gtf GTF file for transcriptome information" << endl
+ << " (required for --genomebam)" << endl
+ << "-c, --chromosomes Tab separated file with chromosome names and lengths" << endl
+ << " (optional for --genomebam, but recommended)" << endl
<< " --verbose Print out progress information every 1M proccessed reads" << endl;
}
@@ -1800,7 +2486,7 @@ void usageEM(bool valid_input = true) {
void usagePseudo(bool valid_input = true) {
if (valid_input) {
cout << "kallisto " << KALLISTO_VERSION << endl
- << "Computes equivalence classes for reads and quantifies abundances" << endl << endl;
+ << "Computes equivalence classes for reads and quantifies abundances (deprecated)" << endl << endl;
}
cout << "Usage: kallisto pseudo [arguments] FASTQ-files" << endl << endl
@@ -1812,11 +2498,16 @@ void usagePseudo(bool valid_input = true) {
<< "-u --umi First file in pair is a UMI file" << endl
<< "-b --batch=FILE Process files listed in FILE" << endl
<< " --quant Quantify using EM algorithm (only in batch mode)" << endl
+ << " --bus Output a BUS file" << endl
<< " --single Quantify single-end reads" << endl
<< "-l, --fragment-length=DOUBLE Estimated average fragment length" << endl
<< "-s, --sd=DOUBLE Estimated standard deviation of fragment length" << endl
<< " (default: -l, -s values are estimated from paired" << endl
- << " end data, but are required when using --single)" << endl
+ << " end data, but are required when using --single" << endl
+ << " unless outputting a BUS file via --bus)" << endl
+ << " --fr-stranded Strand specific reads, first read forward" << endl
+ << " --rf-stranded Strand specific reads, first read reverse" << endl
+ << "-n, --num Output number of read in BUS file flag column (only with --bus)" << endl
<< "-t, --threads=INT Number of threads to use (default: 1)" << endl;
// << " --pseudobam Output pseudoalignments in SAM format to stdout" << endl;
@@ -1848,6 +2539,38 @@ void usageEMOnly() {
<< " --plaintext Output plaintext instead of HDF5" << endl << endl;
}
+void usageTCCQuant(bool valid_input = true) {
+ if (valid_input) {
+ cout << "kallisto " << KALLISTO_VERSION << endl
+ << "Quantifies abundance from pre-computed transcript-compatibility counts" << endl << endl;
+ }
+
+ cout << "Usage: kallisto quant-tcc [arguments] transcript-compatibility-counts-file" << endl << endl
+ << "Required arguments:" << endl
+ << "-o, --output-dir=STRING Directory to write output to" << endl << endl
+ << "Optional arguments:" << endl
+ << "-i, --index=STRING Filename for the kallisto index to be used" << endl
+ << " (required if file with names of transcripts not supplied)" << endl
+ << "-T, --txnames=STRING File with names of transcripts" << endl
+ << " (required if index file not supplied)" << endl
+ << "-e, --ec-file=FILE File containing equivalence classes" << endl
+ << " (default: equivalence classes are taken from the index)" << endl
+ << "-f, --fragment-file=FILE File containing fragment length distribution" << endl
+ << " (default: effective length normalization is not performed)" << endl
+ << "-l, --fragment-length=DOUBLE Estimated average fragment length" << endl
+ << "-s, --sd=DOUBLE Estimated standard deviation of fragment length" << endl
+ << " (note: -l, -s values only should be supplied when" << endl
+ << " effective length normalization needs to be performed" << endl
+ << " but --fragment-file is not specified)" << endl
+ << "-t, --threads=INT Number of threads to use (default: 1)" << endl
+ << "-g, --genemap File for mapping transcripts to genes" << endl
+ << " (required for obtaining gene-level abundances)" << endl
+ << "-G, --gtf=FILE GTF file for transcriptome information" << endl
+ << " (can be used instead of --genemap for obtaining gene-level abundances)" << endl
+ << "-b, --bootstrap-samples=INT Number of bootstrap samples (default: 0)" << endl
+ << " --seed=INT Seed for the bootstrap sampling (default: 42)" << endl;
+}
+
std::string argv_to_string(int argc, char *argv[]) {
std::string res;
for (int i = 0; i < argc; ++i) {
@@ -1929,7 +2652,115 @@ int main(int argc, char *argv[]) {
if (!CheckOptionsBus(opt)) {
usageBus();
exit(1);
- } else {
+ } else if (opt.batch_mode) { // kallisto pseudo
+ // pseudoalign the reads
+ KmerIndex index(opt);
+ index.load(opt);
+
+ MinCollector collection(index, opt);
+ int64_t num_processed = 0;
+ int64_t num_pseudoaligned = 0;
+ int64_t num_unique = 0;
+
+ Transcriptome model; // empty model
+ if (!opt.gtfFile.empty()) {
+ model.parseGTF(opt.gtfFile, index, opt, true);
+ }
+ MasterProcessor MP(index, opt, collection, model);
+ if (!opt.batch_mode) {
+ num_processed = ProcessReads(MP, opt);
+ collection.write((opt.output + "/pseudoalignments"));
+ } else {
+ num_processed = ProcessBatchReads(MP,opt);
+ }
+
+ std::string call = argv_to_string(argc, argv);
+
+ if (!opt.batch_bus) {
+ for (int id = 0; id < MP.batchCounts.size(); id++) {
+ const auto &cc = MP.batchCounts[id];
+ for (const auto &p : cc) {
+ if (p.first < index.num_trans) {
+ num_unique += p.second;
+ }
+ num_pseudoaligned += p.second;
+ }
+ }
+ } else {
+ for (int i = 0; i < index.num_trans; i++) {
+ num_unique += collection.counts[i];
+ }
+ for (int i = 0; i < collection.counts.size(); i++) {
+ num_pseudoaligned += collection.counts[i];
+ }
+ }
+
+ std::ofstream transout_f((opt.output + "/transcripts.txt"));
+ for (const auto &t : index.target_names_) {
+ transout_f << t << "\n";
+ }
+ transout_f.close();
+
+ plaintext_aux(
+ opt.output + "/run_info.json",
+ std::string(std::to_string(index.num_trans)),
+ std::string(std::to_string(0)), // no bootstraps in pseudo
+ std::string(std::to_string(num_processed)),
+ std::string(std::to_string(num_pseudoaligned)),
+ std::string(std::to_string(num_unique)),
+ KALLISTO_VERSION,
+ std::string(std::to_string(index.INDEX_VERSION)),
+ start_time,
+ call);
+
+ cerr << endl;
+
+ std::string prefix = opt.output + "/matrix";
+ std::string ecfilename = prefix + ".ec";
+ std::string tccfilename = prefix + ".tcc.mtx";
+ std::string cellnamesfilename = prefix + ".cells";
+ std::string genelistname = prefix + ".genelist.txt";
+ std::string busbarcodelistname = prefix + ".barcodes";
+ std::string busoutputname = opt.output + "/output.bus";
+
+ writeECList(ecfilename, index);
+ writeCellIds(cellnamesfilename, opt.batch_ids);
+ if (opt.batch_bus || opt.batch_bus_write) {
+ if (opt.batch_bus_write) {
+ writeBUSMatrix(busoutputname, MP.batchCounts, index.ecmap.size());
+ }
+ if (!MP.batchCounts.empty()) {
+ // Write out fake barcodes that identify each cell
+ std::vector<std::string> fake_bcs;
+ fake_bcs.reserve(MP.batchCounts.size());
+ for (size_t j = 0; j < MP.batchCounts.size(); j++) {
+ fake_bcs.push_back(binaryToString(j, BUSFORMAT_FAKE_BARCODE_LEN));
+ }
+ writeCellIds(busbarcodelistname, fake_bcs);
+ }
+ // Write out index:
+ index.write((opt.output + "/index.saved"), false);
+ // Write out fragment length distributions if reads paired-end:
+ if (!opt.single_end) {
+ std::ofstream flensout_f((opt.output + "/flens.txt"));
+ for (size_t id = 0; id < opt.batch_ids.size(); id++) {
+ std::vector<int> fld = MP.batchFlens[id];
+ for ( size_t i = 0 ; i < fld.size(); ++i ) {
+ if (i != 0) {
+ flensout_f << " ";
+ }
+ flensout_f << fld[i];
+ }
+ flensout_f << "\n";
+ }
+ flensout_f.close();
+ }
+ }
+ if (!opt.gtfFile.empty()) {
+ // write out gene info
+ writeGeneList(genelistname, model);
+ }
+ } else { // kallisto bus -x
int num_trans, index_version;
int64_t num_processed = 0;
int64_t num_pseudoaligned = 0;
@@ -2008,7 +2839,27 @@ int main(int argc, char *argv[]) {
transout_f << t << "\n";
}
transout_f.close();
-
+
+ std::vector<int> fld;
+ if (opt.busOptions.paired) {
+ fld = collection.flens; // copy
+ collection.compute_mean_frag_lens_trunc();
+ // Write out index:
+ index.write((opt.output + "/index.saved"), false);
+ // Write out fragment length distribution:
+ std::ofstream flensout_f((opt.output + "/flens.txt"));
+ for ( size_t i = 0 ; i < fld.size(); ++i ) {
+ if (i != 0) {
+ flensout_f << " ";
+ }
+ flensout_f << fld[i];
+ }
+ flensout_f << "\n";
+ flensout_f.close();
+ } else if (opt.busOptions.umi[0].fileno == -1) {
+ // Write out index:
+ index.write((opt.output + "/index.saved"), false);
+ }
// gather stats
num_unique = 0;
@@ -2395,6 +3246,302 @@ int main(int argc, char *argv[]) {
exit(1);
}
}
+ } else if (cmd == "quant-tcc") {
+ if (argc==2) {
+ usageTCCQuant();
+ return 0;
+ }
+ ParseOptionsTCCQuant(argc-1,argv+1,opt);
+ if (!CheckOptionsTCCQuant(opt)) {
+ usageTCCQuant();
+ exit(1);
+ } else {
+ KmerIndex index(opt);
+ index.load(opt, false); // skip the k-mer map
+ size_t num_ecs = index.ecmap.size();
+ MinCollector collection(index, opt);
+ std::vector<std::vector<std::pair<int32_t, int32_t>>> batchCounts; // Stores TCCs
+ std::ifstream in((opt.tccFile));
+ bool firstline = true;
+ bool isMatrixFile = false;
+ size_t nrow = 0, ncol = 0, nlines = 0;
+ int i = 0, prevRow = 0, prevCol = 0;
+ if (in.is_open()) { // Read in TCC file
+ std::string line;
+ while (getline(in, line)) {
+ if (firstline) { // First line decides whether the file is in matrix format
+ firstline = false;
+ if (line.rfind("%%MatrixMarket", 0) == 0) {
+ std::cerr << "[tcc] Parsing transcript-compatibility counts (TCC) file as a matrix file" << std::endl;
+ isMatrixFile = true;
+ while (getline(in, line) && line.rfind("%", 0) == 0) { } // Skip header comments
+ std::stringstream ss(line);
+ ss >> nrow >> ncol >> nlines;
+ std::cerr << "[tcc] Matrix dimensions: " << pretty_num(nrow) << " x " << pretty_num(ncol) << std::endl;
+ batchCounts.assign(nrow, {});
+ if (opt.bootstrap > 0) {
+ cerr << "[tcc] Warning: Bootstrap can only be used for non-matrix files; will not be performed" << endl;
+ }
+ continue;
+ } else {
+ std::cerr << "[tcc] Transcript-compatibility counts (TCC) file is not in matrix format; "
+ << "it will not be parsed as a matrix file" << std::endl;
+ isMatrixFile = false;
+ batchCounts.assign(1, {});
+ }
+ }
+ std::stringstream ss(line);
+ int row, col, val;
+ if (isMatrixFile) {
+ if (i >= nlines) {
+ std::cerr << "[tcc] Warning: TCC matrix file contains additional lines which will not be read; "
+ << "only " << pretty_num(nlines) << " entries, as specified on the first line, will be read." << std::endl;
+ break;
+ }
+ ss >> row >> col >> val;
+ if (row > nrow || col > ncol) {
+ std::cerr << "Error: TCC matrix file is malformed; "
+ << "row numbers or column numbers exceed the dimensions of the matrix." << std::endl;
+ exit(1);
+ }
+ } else {
+ ss >> col >> val;
+ col += 1; // Because we'll consider non-matrix TCC files to be zero-indexed
+ row = 1;
+ nrow = 1;
+ if (ncol < col) {
+ ncol = col;
+ }
+ }
+ if (row <= 0 || col <= 0) {
+ std::cerr << "Error: Invalid indices in TCC file." << std::endl;
+ exit(1);
+ }
+ if (row < prevRow || (row == prevRow && col <= prevCol)) {
+ std::cerr << "Error: TCC file is not sorted." << std::endl;
+ exit(1);
+ }
+ prevRow = row;
+ prevCol = col;
+ auto &bc = batchCounts[row-1];
+ bc.push_back({col-1, val});
+ i++;
+ }
+ } else {
+ std::cerr << "Error: could not open file " << opt.tccFile << std::endl;
+ exit(1);
+ }
+ if (isMatrixFile && i < nlines) {
+ std::cerr << "Error: Found only " << pretty_num(i) << " entries in TCC matrix file, "
+ << "expected " << pretty_num(nlines) << std::endl;
+ exit(1);
+ }
+ if ((isMatrixFile && ncol != num_ecs) || (!isMatrixFile && ncol > num_ecs)) {
+ std::cerr << "Error: number of equivalence classes does not match index. Found "
+ << pretty_num(ncol) << ", expected " << pretty_num(num_ecs) << std::endl;
+ exit(1);
+ }
+
+ std::vector<std::vector<std::pair<int32_t, double>>> Abundance_mat, Abundance_mat_gene, TPM_mat, TPM_mat_gene;
+ Abundance_mat.resize(nrow, {});
+ Abundance_mat_gene.resize(nrow, {});
+ TPM_mat.resize(nrow, {});
+ TPM_mat_gene.resize(nrow, {});
+ std::vector<std::pair<double, double>> FLD_mat;
+ FLD_mat.resize(nrow, {});
+ std::vector<std::vector<int>> FLDs;
+ Transcriptome model; // empty model
+
+ std::ofstream transout_f((opt.output + "/transcripts.txt"));
+ for (const auto &t : index.target_names_) {
+ transout_f << t << "\n";
+ }
+ transout_f.close();
+ std::string prefix = opt.output + "/matrix";
+ std::string abtsvfilename = opt.output + "/abundance.tsv";
+ std::string gene_abtsvfilename = opt.output + "/abundance.gene.tsv";
+ std::string genelistname = opt.output + "/genes.txt";
+ std::string abfilename = prefix + ".abundance.mtx";
+ std::string abtpmfilename = prefix + ".abundance.tpm.mtx";
+ std::string gene_abfilename = prefix + ".abundance.gene.mtx";
+ std::string gene_abtpmfilename = prefix + ".abundance.gene.tpm.mtx";
+ std::string fldfilename = prefix + ".fld.tsv";
+
+ const bool calcEffLen = !opt.fldFile.empty() || opt.fld != 0.0;
+ if (calcEffLen && !opt.fldFile.empty()) { // Parse supplied fragment length distribution file
+ std::ifstream infld((opt.fldFile));
+ if (infld.is_open()) {
+ std::string line;
+ while (getline(infld, line)) {
+ if (line.empty() || line.rfind("#", 0) == 0) {
+ continue; // Ignore empty lines or lines that begin with a #
+ }
+ std::vector<int> tmp_vec;
+ std::stringstream ss(line);
+ while(ss.good()) {
+ std::string tmp_val;
+ getline(ss, tmp_val, ' ');
+ int tmp_val_num = std::atoi(tmp_val.c_str());
+ if (tmp_val_num < 0) {
+ std::cerr << "Error: Fragment length distribution file contains invalid value: "
+ << tmp_val_num << std::endl;
+ exit(1);
+ }
+ tmp_vec.push_back(tmp_val_num);
+ }
+ if (tmp_vec.size() != MAX_FRAG_LEN) {
+ std::cerr << "Error: Fragment length distribution file contains a line with "
+ << tmp_vec.size() << " values; expected: " << MAX_FRAG_LEN << std::endl;
+ exit(1);
+ }
+ FLDs.push_back(tmp_vec);
+ }
+ if (FLDs.size() != 1 && FLDs.size() != nrow) {
+ std::cerr << "Error: Fragment length distribution file contains "
+ << FLDs.size() << " valid lines; expected: " << nrow << std::endl;
+ exit(1);
+ }
+ } else {
+ std::cerr << "Error: could not open file " << opt.fldFile << std::endl;
+ exit(1);
+ }
+ }
+
+ if (!opt.genemap.empty()) { // Parse supplied gene mapping file
+ model.parseGeneMap(opt.genemap, index, opt);
+ } else if (!opt.gtfFile.empty()) {
+ model.parseGTF(opt.gtfFile, index, opt, true);
+ }
+ const bool gene_level_counting = !opt.genemap.empty() || !opt.gtfFile.empty();
+
+ std::cerr << "[quant] Running EM algorithm..."; std::cerr.flush();
+ auto EM_lambda = [&](int id) {
+ MinCollector collection(index, opt);
+ collection.counts.assign(index.ecmap.size(), 0);
+ const auto& bc = batchCounts[id];
+ for (const auto &p : bc) {
+ collection.counts[p.first] = p.second;
+ }
+
+ std::vector<double> fl_means;
+ if (calcEffLen) {
+ if (opt.fld != 0.0) {
+ collection.init_mean_fl_trunc(opt.fld, opt.sd);
+ } else {
+ std::vector<int> fld;
+ if (FLDs.size() == 1) { // Only one distribution supplied; will use this for everything
+ fld = FLDs[0];
+ } else {
+ fld = FLDs[id];
+ }
+ collection.flens = fld;
+ collection.compute_mean_frag_lens_trunc(false);
+ }
+ fl_means = get_frag_len_means(index.target_lens_, collection.mean_fl_trunc);
+ double mean_fl = collection.get_mean_frag_len(true);
+ double sd_fl = collection.get_sd_frag_len();
+ FLD_mat[id] = {mean_fl, sd_fl};
+ } else { // Set mean fragment lengths to be target lengths so that effective lengths are all one
+ fl_means = std::vector<double>(index.target_lens_.begin(), index.target_lens_.end());
+ }
+
+ EMAlgorithm em(collection.counts, index, collection, fl_means, opt);
+ em.run(10000, 50, false, false);
+
+ if (isMatrixFile) { // Update abundances matrix
+ auto &ab_m = Abundance_mat[id];
+ auto &tpm_m = TPM_mat[id];
+ std::vector<double> gc;
+ std::vector<double> gc_tpm;
+ if (gene_level_counting) {
+ gc.assign(model.genes.size(), 0.0);
+ gc_tpm.assign(model.genes.size(), 0.0);
+ }
+ auto tpm = counts_to_tpm(em.alpha_, em.eff_lens_);
+ for (int i = 0; i < em.alpha_.size(); i++) {
+ if (em.alpha_[i] > 0.0) {
+ ab_m.push_back({i,em.alpha_[i]});
+ tpm_m.push_back({i,tpm[i]});
+ if (gene_level_counting) {
+ int g_id = model.transcripts[i].gene_id;
+ if (g_id != -1) {
+ gc[g_id] += em.alpha_[i];
+ gc_tpm[g_id] += tpm[i];
+ }
+ }
+ }
+ }
+ if (gene_level_counting) {
+ auto &ab_m_g = Abundance_mat_gene[id];
+ auto &tpm_m_g = TPM_mat_gene[id];
+ for (int i = 0; i < gc.size(); i++) {
+ if (gc[i] > 0.0) {
+ ab_m_g.push_back({i, gc[i]});
+ tpm_m_g.push_back({i, gc_tpm[i]});
+ }
+ }
+ }
+ } else { // Write plaintext abundances
+ plaintext_writer(abtsvfilename, em.target_names_,
+ em.alpha_, em.eff_lens_, index.target_lens_);
+ if (gene_level_counting) {
+ plaintext_writer_gene(gene_abtsvfilename, em.target_names_,
+ em.alpha_, em.eff_lens_, model);
+ }
+ }
+
+ if (opt.bootstrap > 0 && !isMatrixFile) {
+ auto B = opt.bootstrap;
+ std::mt19937_64 rand;
+ rand.seed( opt.seed );
+ std::vector<size_t> seeds;
+ for (auto s = 0; s < B; ++s) {
+ seeds.push_back( rand() );
+ }
+ cerr << endl;
+ for (auto b = 0; b < B; ++b) {
+ Bootstrap bs(collection.counts, index, collection, em.eff_lens_, seeds[b], fl_means, opt);
+ cerr << "[bstrp] running EM for the bootstrap: " << b + 1 << "\r";
+ auto res = bs.run_em();
+ plaintext_writer(opt.output + "/bs_abundance_" + std::to_string(b) + ".tsv",
+ em.target_names_, res.alpha_, em.eff_lens_, index.target_lens_);
+ }
+ cerr << endl;
+ }
+ }; // end of EM_lambda
+
+ std::vector<std::thread> workers;
+ int num_ids = nrow;
+ int id = 0;
+ while (id < num_ids) {
+ workers.clear();
+ int nt = std::min(opt.threads, (num_ids - id));
+ int first_id = id;
+ for (int i = 0; i < nt; i++,id++) {
+ workers.emplace_back(std::thread(EM_lambda, id));
+ }
+ for (int i = 0; i < nt; i++) {
+ workers[i].join();
+ }
+ }
+ std::cerr << " done" << std::endl;
+
+ cerr << endl;
+
+
+ if (isMatrixFile) {
+ writeSparseBatchMatrix(abfilename, Abundance_mat, index.num_trans);
+ writeSparseBatchMatrix(abtpmfilename, TPM_mat, index.num_trans);
+ if (gene_level_counting) {
+ writeSparseBatchMatrix(gene_abfilename, Abundance_mat_gene, model.genes.size());
+ writeSparseBatchMatrix(gene_abtpmfilename, TPM_mat_gene, model.genes.size());
+ writeGeneList(genelistname, model, true);
+ }
+ }
+ if (calcEffLen) {
+ writeFLD(fldfilename, FLD_mat);
+ }
+ }
} else if (cmd == "pseudo") {
if (argc==2) {
usagePseudo();
@@ -2429,14 +3576,22 @@ int main(int argc, char *argv[]) {
std::string call = argv_to_string(argc, argv);
-
- for (int id = 0; id < MP.batchCounts.size(); id++) {
- const auto &cc = MP.batchCounts[id];
- for (const auto &p : cc) {
- if (p.first < index.num_trans) {
- num_unique += p.second;
+ if (!opt.batch_bus) {
+ for (int id = 0; id < MP.batchCounts.size(); id++) {
+ const auto &cc = MP.batchCounts[id];
+ for (const auto &p : cc) {
+ if (p.first < index.num_trans) {
+ num_unique += p.second;
+ }
+ num_pseudoaligned += p.second;
}
- num_pseudoaligned += p.second;
+ }
+ } else {
+ for (int i = 0; i < index.num_trans; i++) {
+ num_unique += collection.counts[i];
+ }
+ for (int i = 0; i < collection.counts.size(); i++) {
+ num_pseudoaligned += collection.counts[i];
}
}
@@ -2546,10 +3701,44 @@ int main(int argc, char *argv[]) {
std::string fldfilename = prefix + ".fld.tsv";
std::string genelistname = prefix + ".genelist.txt";
std::string genecountname = prefix + ".genes.mtx";
+ std::string busbarcodelistname = prefix + ".barcodes";
+ std::string busoutputname = opt.output + "/output.bus";
writeECList(ecfilename, index);
writeCellIds(cellnamesfilename, opt.batch_ids);
- writeSparseBatchMatrix(tccfilename, MP.batchCounts, index.ecmap.size());
+ if (opt.batch_bus || opt.batch_bus_write) {
+ if (opt.batch_bus_write) {
+ writeBUSMatrix(busoutputname, MP.batchCounts, index.ecmap.size());
+ }
+ if (!MP.batchCounts.empty()) {
+ // Write out fake barcodes that identify each cell
+ std::vector<std::string> fake_bcs;
+ fake_bcs.reserve(MP.batchCounts.size());
+ for (size_t j = 0; j < MP.batchCounts.size(); j++) {
+ fake_bcs.push_back(binaryToString(j, BUSFORMAT_FAKE_BARCODE_LEN));
+ }
+ writeCellIds(busbarcodelistname, fake_bcs);
+ }
+ // Write out index:
+ index.write((opt.output + "/index.saved"), false);
+ // Write out fragment length distributions if reads paired-end:
+ if (!opt.single_end) {
+ std::ofstream flensout_f((opt.output + "/flens.txt"));
+ for (size_t id = 0; id < opt.batch_ids.size(); id++) {
+ std::vector<int> fld = MP.batchFlens[id];
+ for ( size_t i = 0 ; i < fld.size(); ++i ) {
+ if (i != 0) {
+ flensout_f << " ";
+ }
+ flensout_f << fld[i];
+ }
+ flensout_f << "\n";
+ }
+ flensout_f.close();
+ }
+ } else {
+ writeSparseBatchMatrix(tccfilename, MP.batchCounts, index.ecmap.size());
+ }
if (opt.pseudo_quant) {
writeSparseBatchMatrix(abfilename, Abundance_mat, index.num_trans);
writeFLD(fldfilename, FLD_mat);
View it on GitLab: https://salsa.debian.org/med-team/kallisto/-/compare/a1ccc5194630a474877de4bad34ce53508312a1c...1e5743ab0d054a3a530ba2322e48f4c92336b43c
--
View it on GitLab: https://salsa.debian.org/med-team/kallisto/-/compare/a1ccc5194630a474877de4bad34ce53508312a1c...1e5743ab0d054a3a530ba2322e48f4c92336b43c
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20220119/cf885e4b/attachment-0001.htm>
More information about the debian-med-commit
mailing list